camel-ai 0.2.73a2__py3-none-any.whl → 0.2.73a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,7 +23,7 @@ from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
23
23
 
24
24
  from camel.logger import get_logger
25
25
  from camel.models import BaseModelBackend
26
- from camel.toolkits.base import BaseToolkit
26
+ from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
27
27
  from camel.toolkits.function_tool import FunctionTool
28
28
  from camel.utils import sanitize_filename
29
29
  from camel.utils.commons import dependencies_required
@@ -35,7 +35,7 @@ from .config_loader import ConfigLoader
35
35
  logger = get_logger(__name__)
36
36
 
37
37
 
38
- class HybridBrowserToolkit(BaseToolkit):
38
+ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
39
39
  r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
40
40
  automation with visual, screenshot-based capabilities.
41
41
 
@@ -48,36 +48,36 @@ class HybridBrowserToolkit(BaseToolkit):
48
48
 
49
49
  # Default tool list - core browser functionality
50
50
  DEFAULT_TOOLS: ClassVar[List[str]] = [
51
- "open_browser",
52
- "close_browser",
53
- "visit_page",
54
- "back",
55
- "forward",
56
- "click",
57
- "type",
58
- "switch_tab",
51
+ "browser_open",
52
+ "browser_close",
53
+ "browser_visit_page",
54
+ "browser_back",
55
+ "browser_forward",
56
+ "browser_click",
57
+ "browser_type",
58
+ "browser_switch_tab",
59
59
  ]
60
60
 
61
61
  # All available tools
62
62
  ALL_TOOLS: ClassVar[List[str]] = [
63
- "open_browser",
64
- "close_browser",
65
- "visit_page",
66
- "back",
67
- "forward",
68
- "get_page_snapshot",
69
- "get_som_screenshot",
70
- "get_page_links",
71
- "click",
72
- "type",
73
- "select",
74
- "scroll",
75
- "enter",
76
- "wait_user",
77
- "solve_task",
78
- "switch_tab",
79
- "close_tab",
80
- "get_tab_info",
63
+ "browser_open",
64
+ "browser_close",
65
+ "browser_visit_page",
66
+ "browser_back",
67
+ "browser_forward",
68
+ "browser_get_page_snapshot",
69
+ "browser_get_som_screenshot",
70
+ "browser_get_page_links",
71
+ "browser_click",
72
+ "browser_type",
73
+ "browser_select",
74
+ "browser_scroll",
75
+ "browser_enter",
76
+ "browser_wait_user",
77
+ "browser_solve_task",
78
+ "browser_switch_tab",
79
+ "browser_close_tab",
80
+ "browser_get_tab_info",
81
81
  ]
82
82
 
83
83
  def __init__(
@@ -109,7 +109,8 @@ class HybridBrowserToolkit(BaseToolkit):
109
109
  browser data like cookies and local storage. Useful for
110
110
  maintaining sessions across runs. Defaults to `None` (a
111
111
  temporary directory is used).
112
- stealth (bool): Whether to run the browser in stealth mode to avoid
112
+ stealth (bool): Whether to run the browser in stealth mode to
113
+ avoid
113
114
  bot detection. When enabled, hides WebDriver characteristics,
114
115
  spoofs navigator properties, and implements various
115
116
  anti-detection
@@ -121,11 +122,15 @@ class HybridBrowserToolkit(BaseToolkit):
121
122
  Defaults to `None`.
122
123
  cache_dir (str): The directory to store cached files, such as
123
124
  screenshots. Defaults to `"tmp/"`.
124
- enabled_tools (Optional[List[str]]): List of tool names to enable.
125
- If None, uses DEFAULT_TOOLS. Available tools: open_browser,
126
- close_browser, visit_page, back, forward, get_page_snapshot,
127
- get_som_screenshot, get_page_links, click, type, select,
128
- scroll, enter, wait_user, solve_task.
125
+ enabled_tools (Optional[List[str]]): List of tool names to
126
+ enable.
127
+ If None, uses DEFAULT_TOOLS. Available tools: browser_open,
128
+ browser_close, browser_visit_page, browser_back,
129
+ browser_forward, browser_get_page_snapshot,
130
+ browser_get_som_screenshot, browser_get_page_links,
131
+ browser_click, browser_type, browser_select,
132
+ browser_scroll, browser_enter, browser_wait_user,
133
+ browser_solve_task.
129
134
  Defaults to `None`.
130
135
  browser_log_to_file (bool): Whether to save detailed browser
131
136
  action logs to file.
@@ -134,12 +139,14 @@ class HybridBrowserToolkit(BaseToolkit):
134
139
  Logs are saved to an auto-generated timestamped file.
135
140
  Defaults to `False`.
136
141
  session_id (Optional[str]): A unique identifier for this browser
137
- session. When multiple HybridBrowserToolkit instances are used
142
+ session. When multiple HybridBrowserToolkit instances are
143
+ used
138
144
  concurrently, different session IDs prevent them from sharing
139
145
  the same browser session and causing conflicts. If None, a
140
146
  default session will be used. Defaults to `None`.
141
147
  default_start_url (str): The default URL to navigate to when
142
- open_browser() is called without a start_url parameter or with
148
+ open_browser() is called without a start_url parameter or
149
+ with
143
150
  None. Defaults to `"https://google.com/"`.
144
151
  default_timeout (Optional[int]): Default timeout in milliseconds
145
152
  for browser actions. If None, uses environment variable
@@ -177,6 +184,7 @@ class HybridBrowserToolkit(BaseToolkit):
177
184
  Defaults to `None`.
178
185
  """
179
186
  super().__init__()
187
+ RegisteredAgentToolkit.__init__(self)
180
188
  self._headless = headless
181
189
  self._user_data_dir = user_data_dir
182
190
  self._stealth = stealth
@@ -267,7 +275,7 @@ class HybridBrowserToolkit(BaseToolkit):
267
275
  # Use the session directly - singleton logic is handled in
268
276
  # ensure_browser
269
277
  self._session = temp_session
270
- self._agent: Optional[PlaywrightLLMAgent] = None
278
+ self._playwright_agent: Optional[PlaywrightLLMAgent] = None
271
279
  self._unified_script = self._load_unified_analyzer()
272
280
 
273
281
  @property
@@ -403,9 +411,13 @@ class HybridBrowserToolkit(BaseToolkit):
403
411
  if self.log_to_console:
404
412
  log_msg = f"[BROWSER ACTION] {action_name}"
405
413
  if self.enable_timing_logging:
406
- log_msg += f" | Execution: {log_entry['execution_time_ms']}ms"
414
+ log_msg += (
415
+ f" | Execution: " f"{log_entry['execution_time_ms']}ms"
416
+ )
407
417
  if page_load_time is not None and self.enable_page_loading_logging:
408
- log_msg += f" | Page Load: {log_entry['page_load_time_ms']}ms"
418
+ log_msg += (
419
+ f" | Page Load: " f"{log_entry['page_load_time_ms']}ms"
420
+ )
409
421
  if error:
410
422
  log_msg += f" | ERROR: {error}"
411
423
 
@@ -570,7 +582,8 @@ class HybridBrowserToolkit(BaseToolkit):
570
582
  # Success - return result
571
583
  if attempt > 0:
572
584
  logger.debug(
573
- f"Unified analysis succeeded on attempt {attempt + 1}"
585
+ f"Unified analysis succeeded on attempt "
586
+ f"{attempt + 1}"
574
587
  )
575
588
  return result
576
589
 
@@ -591,7 +604,8 @@ class HybridBrowserToolkit(BaseToolkit):
591
604
  f"{attempt + 1}/{max_retries}): {e}. Retrying..."
592
605
  )
593
606
 
594
- # Wait a bit for page stability before retrying (optimized)
607
+ # Wait a bit for page stability before retrying (
608
+ # optimized)
595
609
  try:
596
610
  await page.wait_for_load_state(
597
611
  'domcontentloaded',
@@ -774,7 +788,8 @@ class HybridBrowserToolkit(BaseToolkit):
774
788
  }
775
789
  except Exception as e:
776
790
  logger.warning(
777
- f"Failed to get tab info from session: {type(e).__name__}: {e}"
791
+ f"Failed to get tab info from session: {type(e).__name__}: "
792
+ f"{e}"
778
793
  )
779
794
 
780
795
  # Try to get actual tab count from session pages directly
@@ -816,7 +831,8 @@ class HybridBrowserToolkit(BaseToolkit):
816
831
  f"{len(fallback_session._pages)} total"
817
832
  )
818
833
  except Exception:
819
- # Keep the original count if we can't check page status
834
+ # Keep the original count if we can't check page
835
+ # status
820
836
  pass
821
837
 
822
838
  if actual_tab_count == 0:
@@ -827,7 +843,8 @@ class HybridBrowserToolkit(BaseToolkit):
827
843
  ):
828
844
  actual_tab_count = 1
829
845
  logger.debug(
830
- "No pages in list but main page exists, assuming "
846
+ "No pages in list but main page exists, "
847
+ "assuming "
831
848
  "1 tab"
832
849
  )
833
850
  else:
@@ -872,7 +889,8 @@ class HybridBrowserToolkit(BaseToolkit):
872
889
  )
873
890
  before_snapshot_time = time.time() - snapshot_start_before
874
891
  logger.info(
875
- f"Pre-action snapshot captured in {before_snapshot_time:.2f}s"
892
+ f"Pre-action snapshot captured in "
893
+ f"{before_snapshot_time:.2f}s"
876
894
  )
877
895
 
878
896
  # Execute action
@@ -976,7 +994,8 @@ class HybridBrowserToolkit(BaseToolkit):
976
994
  **tab_info, # Include tab information
977
995
  }
978
996
 
979
- # If snapshot is unchanged after click, add element details to log
997
+ # If snapshot is unchanged after click, add element details to
998
+ # log
980
999
  if (
981
1000
  snapshot == "snapshot not changed"
982
1001
  and action_type == "click"
@@ -1076,17 +1095,17 @@ class HybridBrowserToolkit(BaseToolkit):
1076
1095
  "web_agent_model required for high-level task planning"
1077
1096
  )
1078
1097
 
1079
- if self._agent is None:
1080
- self._agent = PlaywrightLLMAgent(
1098
+ if self._playwright_agent is None:
1099
+ self._playwright_agent = PlaywrightLLMAgent(
1081
1100
  headless=self._headless,
1082
1101
  user_data_dir=self._user_data_dir,
1083
1102
  model_backend=self._web_agent_model,
1084
1103
  )
1085
- return self._agent
1104
+ return self._playwright_agent
1086
1105
 
1087
1106
  # Public API Methods
1088
1107
 
1089
- async def open_browser(self) -> Dict[str, Any]:
1108
+ async def browser_open(self) -> Dict[str, Any]:
1090
1109
  r"""Starts a new browser session. This must be the first browser
1091
1110
  action.
1092
1111
 
@@ -1096,7 +1115,8 @@ class HybridBrowserToolkit(BaseToolkit):
1096
1115
  Returns:
1097
1116
  Dict[str, Any]: A dictionary with the result of the action:
1098
1117
  - "result" (str): Confirmation of the action.
1099
- - "snapshot" (str): A textual snapshot of interactive elements.
1118
+ - "snapshot" (str): A textual snapshot of interactive
1119
+ elements.
1100
1120
  - "tabs" (List[Dict]): Information about all open tabs.
1101
1121
  - "current_tab" (int): Index of the active tab.
1102
1122
  - "total_tabs" (int): Total number of open tabs.
@@ -1118,13 +1138,13 @@ class HybridBrowserToolkit(BaseToolkit):
1118
1138
  logger.info(f"Navigating to configured default page: {start_url}")
1119
1139
 
1120
1140
  # Use visit_page without creating a new tab
1121
- result = await self.visit_page(start_url)
1141
+ result = await self.browser_visit_page(url=start_url)
1122
1142
 
1123
1143
  # Log success
1124
1144
  if self.enable_action_logging or self.enable_timing_logging:
1125
1145
  execution_time = time.time() - action_start
1126
1146
  await self._log_action(
1127
- action_name="open_browser",
1147
+ action_name="browser_open",
1128
1148
  inputs=inputs,
1129
1149
  outputs={
1130
1150
  "result": "Browser opened and navigated to "
@@ -1140,7 +1160,7 @@ class HybridBrowserToolkit(BaseToolkit):
1140
1160
  if self.enable_action_logging or self.enable_timing_logging:
1141
1161
  execution_time = time.time() - action_start
1142
1162
  await self._log_action(
1143
- action_name="open_browser",
1163
+ action_name="browser_open",
1144
1164
  inputs=inputs,
1145
1165
  outputs=None,
1146
1166
  execution_time=execution_time,
@@ -1149,7 +1169,7 @@ class HybridBrowserToolkit(BaseToolkit):
1149
1169
  raise
1150
1170
 
1151
1171
  @action_logger
1152
- async def close_browser(self) -> str:
1172
+ async def browser_close(self) -> str:
1153
1173
  r"""Closes the browser session, releasing all resources.
1154
1174
 
1155
1175
  This should be called at the end of a task for cleanup.
@@ -1157,18 +1177,18 @@ class HybridBrowserToolkit(BaseToolkit):
1157
1177
  Returns:
1158
1178
  str: A confirmation message.
1159
1179
  """
1160
- if self._agent is not None:
1180
+ if self._playwright_agent is not None:
1161
1181
  try:
1162
- await self._agent.close()
1182
+ await self._playwright_agent.close()
1163
1183
  except Exception:
1164
1184
  pass
1165
- self._agent = None
1185
+ self._playwright_agent = None
1166
1186
 
1167
1187
  await self._session.close()
1168
1188
  return "Browser session closed."
1169
1189
 
1170
1190
  @action_logger
1171
- async def visit_page(self, url: str) -> Dict[str, Any]:
1191
+ async def browser_visit_page(self, url: str) -> Dict[str, Any]:
1172
1192
  r"""Opens a URL in a new browser tab and switches to it.
1173
1193
 
1174
1194
  Args:
@@ -1202,7 +1222,8 @@ class HybridBrowserToolkit(BaseToolkit):
1202
1222
  # By default, we want to create a new tab.
1203
1223
  should_create_new_tab = True
1204
1224
  try:
1205
- # If the browser has just started with a single "about:blank" tab,
1225
+ # If the browser has just started with a single "about:blank"
1226
+ # tab,
1206
1227
  # use that tab instead of creating a new one.
1207
1228
  tab_info_data = await self._get_tab_info_for_output()
1208
1229
  tabs = tab_info_data.get("tabs", [])
@@ -1246,7 +1267,7 @@ class HybridBrowserToolkit(BaseToolkit):
1246
1267
  return {"result": nav_result, "snapshot": snapshot, **tab_info}
1247
1268
 
1248
1269
  @action_logger
1249
- async def back(self) -> Dict[str, Any]:
1270
+ async def browser_back(self) -> Dict[str, Any]:
1250
1271
  r"""Goes back to the previous page in the browser history.
1251
1272
 
1252
1273
  This action simulates using the browser's "back" button in the
@@ -1271,7 +1292,8 @@ class HybridBrowserToolkit(BaseToolkit):
1271
1292
  nav_time = time.time() - nav_start
1272
1293
  logger.info(f"Back navigation completed in {nav_time:.2f}s")
1273
1294
 
1274
- # Minimal wait for page stability (back navigation is usually fast)
1295
+ # Minimal wait for page stability (back navigation is usually
1296
+ # fast)
1275
1297
  import asyncio
1276
1298
 
1277
1299
  await asyncio.sleep(0.2)
@@ -1310,7 +1332,7 @@ class HybridBrowserToolkit(BaseToolkit):
1310
1332
  }
1311
1333
 
1312
1334
  @action_logger
1313
- async def forward(self) -> Dict[str, Any]:
1335
+ async def browser_forward(self) -> Dict[str, Any]:
1314
1336
  r"""Goes forward to the next page in the browser history.
1315
1337
 
1316
1338
  This action simulates using the browser's "forward" button in the
@@ -1349,7 +1371,8 @@ class HybridBrowserToolkit(BaseToolkit):
1349
1371
  )
1350
1372
  snapshot_time = time.time() - snapshot_start
1351
1373
  logger.info(
1352
- f"Forward navigation snapshot captured in {snapshot_time:.2f}s"
1374
+ f"Forward navigation snapshot captured in "
1375
+ f"{snapshot_time:.2f}s"
1353
1376
  )
1354
1377
 
1355
1378
  # Get tab information
@@ -1375,10 +1398,11 @@ class HybridBrowserToolkit(BaseToolkit):
1375
1398
  }
1376
1399
 
1377
1400
  @action_logger
1378
- async def get_page_snapshot(self) -> str:
1401
+ async def browser_get_page_snapshot(self) -> str:
1379
1402
  r"""Gets a textual snapshot of the page's interactive elements.
1380
1403
 
1381
- The snapshot lists elements like buttons, links, and inputs, each with
1404
+ The snapshot lists elements like buttons, links, and inputs,
1405
+ each with
1382
1406
  a unique `ref` ID. This ID is used by other tools (e.g., `click`,
1383
1407
  `type`) to interact with a specific element. This tool provides no
1384
1408
  visual information.
@@ -1407,18 +1431,33 @@ class HybridBrowserToolkit(BaseToolkit):
1407
1431
 
1408
1432
  @dependencies_required('PIL')
1409
1433
  @action_logger
1410
- async def get_som_screenshot(self):
1434
+ async def browser_get_som_screenshot(
1435
+ self,
1436
+ read_image: bool = True,
1437
+ instruction: Optional[str] = None,
1438
+ ):
1411
1439
  r"""Captures a screenshot with interactive elements highlighted.
1412
1440
 
1413
- "SoM" stands for "Set of Marks". This tool takes a screenshot and draws
1441
+ "SoM" stands for "Set of Marks". This tool takes a screenshot and
1442
+ draws
1414
1443
  boxes around clickable elements, overlaying a `ref` ID on each. Use
1415
1444
  this for a visual understanding of the page, especially when the
1416
1445
  textual snapshot is not enough.
1417
1446
 
1447
+ Args:
1448
+ read_image (bool, optional): If `True`, the agent will analyze
1449
+ the screenshot. Requires agent to be registered.
1450
+ (default: :obj:`True`)
1451
+ instruction (Optional[str], optional): A specific question or
1452
+ command for the agent regarding the screenshot, used only if
1453
+ `read_image` is `True`. For example: "Find the login button."
1454
+
1418
1455
  Returns:
1419
1456
  str: A summary message including the file path of the saved
1420
1457
  screenshot, e.g., "Visual webpage screenshot captured with 42
1421
- interactive elements and saved to /path/to/screenshot.png"
1458
+ interactive elements and saved to /path/to/screenshot.png",
1459
+ and optionally the agent's analysis if `read_image` is
1460
+ `True`.
1422
1461
  """
1423
1462
  from PIL import Image
1424
1463
 
@@ -1465,12 +1504,44 @@ class HybridBrowserToolkit(BaseToolkit):
1465
1504
 
1466
1505
  text_result = (
1467
1506
  f"Visual webpage screenshot captured with {len(rects)} "
1468
- f"interactive elements and saved to {file_path}"
1507
+ f"interactive elements."
1469
1508
  )
1470
1509
 
1510
+ # Analyze image if requested and agent is registered
1511
+ if read_image and file_path:
1512
+ if self.agent is None:
1513
+ logger.error(
1514
+ "Cannot analyze screenshot: No agent registered. "
1515
+ "Please pass this toolkit to ChatAgent via "
1516
+ "toolkits_to_register_agent parameter."
1517
+ )
1518
+ text_result += (
1519
+ " Error: No agent registered for image analysis. "
1520
+ "Please pass this toolkit to ChatAgent via "
1521
+ "toolkits_to_register_agent parameter."
1522
+ )
1523
+ else:
1524
+ try:
1525
+ # Load the image and create a message
1526
+ from camel.messages import BaseMessage
1527
+
1528
+ img = Image.open(file_path)
1529
+ inst = instruction if instruction is not None else ""
1530
+ message = BaseMessage.make_user_message(
1531
+ role_name="User",
1532
+ content=inst,
1533
+ image_list=[img],
1534
+ )
1535
+
1536
+ # Get agent's analysis
1537
+ await self.agent.astep(message)
1538
+ except Exception as e:
1539
+ logger.error(f"Error analyzing screenshot: {e}")
1540
+ text_result += f". Error analyzing screenshot: {e}"
1541
+
1471
1542
  return text_result
1472
1543
 
1473
- async def click(self, *, ref: str) -> Dict[str, Any]:
1544
+ async def browser_click(self, *, ref: str) -> Dict[str, Any]:
1474
1545
  r"""Performs a click on an element on the page.
1475
1546
 
1476
1547
  Args:
@@ -1514,7 +1585,7 @@ class HybridBrowserToolkit(BaseToolkit):
1514
1585
 
1515
1586
  return result
1516
1587
 
1517
- async def type(self, *, ref: str, text: str) -> Dict[str, Any]:
1588
+ async def browser_type(self, *, ref: str, text: str) -> Dict[str, Any]:
1518
1589
  r"""Types text into an input element on the page.
1519
1590
 
1520
1591
  Args:
@@ -1542,7 +1613,7 @@ class HybridBrowserToolkit(BaseToolkit):
1542
1613
 
1543
1614
  return result
1544
1615
 
1545
- async def select(self, *, ref: str, value: str) -> Dict[str, Any]:
1616
+ async def browser_select(self, *, ref: str, value: str) -> Dict[str, Any]:
1546
1617
  r"""Selects an option in a dropdown (`<select>`) element.
1547
1618
 
1548
1619
  Args:
@@ -1571,7 +1642,9 @@ class HybridBrowserToolkit(BaseToolkit):
1571
1642
 
1572
1643
  return result
1573
1644
 
1574
- async def scroll(self, *, direction: str, amount: int) -> Dict[str, Any]:
1645
+ async def browser_scroll(
1646
+ self, *, direction: str, amount: int
1647
+ ) -> Dict[str, Any]:
1575
1648
  r"""Scrolls the current page window.
1576
1649
 
1577
1650
  Args:
@@ -1603,8 +1676,9 @@ class HybridBrowserToolkit(BaseToolkit):
1603
1676
 
1604
1677
  return result
1605
1678
 
1606
- async def enter(self) -> Dict[str, Any]:
1607
- r"""Simulates pressing the Enter key on the currently focused element.
1679
+ async def browser_enter(self) -> Dict[str, Any]:
1680
+ r"""Simulates pressing the Enter key on the currently focused
1681
+ element.
1608
1682
 
1609
1683
  This is useful for submitting forms or search queries after using the
1610
1684
  `type` tool.
@@ -1630,12 +1704,13 @@ class HybridBrowserToolkit(BaseToolkit):
1630
1704
  return result
1631
1705
 
1632
1706
  @action_logger
1633
- async def wait_user(
1707
+ async def browser_wait_user(
1634
1708
  self, timeout_sec: Optional[float] = None
1635
1709
  ) -> Dict[str, Any]:
1636
1710
  r"""Pauses execution and waits for human input from the console.
1637
1711
 
1638
- Use this for tasks requiring manual steps, like solving a CAPTCHA. The
1712
+ Use this for tasks requiring manual steps, like solving a CAPTCHA.
1713
+ The
1639
1714
  agent will resume after the user presses Enter in the console.
1640
1715
 
1641
1716
  Args:
@@ -1694,7 +1769,9 @@ class HybridBrowserToolkit(BaseToolkit):
1694
1769
  return {"result": result_msg, "snapshot": snapshot, **tab_info}
1695
1770
 
1696
1771
  @action_logger
1697
- async def get_page_links(self, *, ref: List[str]) -> Dict[str, Any]:
1772
+ async def browser_get_page_links(
1773
+ self, *, ref: List[str]
1774
+ ) -> Dict[str, Any]:
1698
1775
  r"""Gets the destination URLs for a list of link elements.
1699
1776
 
1700
1777
  This is useful to know where a link goes before clicking it.
@@ -1724,12 +1801,13 @@ class HybridBrowserToolkit(BaseToolkit):
1724
1801
  return {"links": links}
1725
1802
 
1726
1803
  @action_logger
1727
- async def solve_task(
1804
+ async def browser_solve_task(
1728
1805
  self, task_prompt: str, start_url: str, max_steps: int = 15
1729
1806
  ) -> str:
1730
1807
  r"""Delegates a complex, high-level task to a specialized web agent.
1731
1808
 
1732
- Use this for multi-step tasks that can be described in a single prompt
1809
+ Use this for multi-step tasks that can be described in a single
1810
+ prompt
1733
1811
  (e.g., "log into my account and check for new messages"). The agent
1734
1812
  will autonomously perform the necessary browser actions.
1735
1813
 
@@ -1794,52 +1872,6 @@ class HybridBrowserToolkit(BaseToolkit):
1794
1872
  self.log_buffer.clear()
1795
1873
  logger.info("Log buffer cleared")
1796
1874
 
1797
- def get_tools(self) -> List[FunctionTool]:
1798
- r"""Get available function tools
1799
- based on enabled_tools configuration."""
1800
- # Map tool names to their corresponding methods
1801
- tool_map = {
1802
- "open_browser": self.open_browser,
1803
- "close_browser": self.close_browser,
1804
- "visit_page": self.visit_page,
1805
- "back": self.back,
1806
- "forward": self.forward,
1807
- "get_page_snapshot": self.get_page_snapshot,
1808
- "get_som_screenshot": self.get_som_screenshot,
1809
- "get_page_links": self.get_page_links,
1810
- "click": self.click,
1811
- "type": self.type,
1812
- "select": self.select,
1813
- "scroll": self.scroll,
1814
- "enter": self.enter,
1815
- "wait_user": self.wait_user,
1816
- "solve_task": self.solve_task,
1817
- "switch_tab": self.switch_tab,
1818
- "close_tab": self.close_tab,
1819
- "get_tab_info": self.get_tab_info,
1820
- }
1821
-
1822
- enabled_tools = []
1823
-
1824
- for tool_name in self.enabled_tools:
1825
- if tool_name == "solve_task" and self._web_agent_model is None:
1826
- logger.warning(
1827
- f"Tool '{tool_name}' is enabled but web_agent_model "
1828
- f"is not provided. Skipping this tool."
1829
- )
1830
- continue
1831
-
1832
- if tool_name in tool_map:
1833
- tool = FunctionTool(
1834
- cast(Callable[..., Any], tool_map[tool_name])
1835
- )
1836
- enabled_tools.append(tool)
1837
- else:
1838
- logger.warning(f"Unknown tool name: {tool_name}")
1839
-
1840
- logger.info(f"Returning {len(enabled_tools)} enabled tools")
1841
- return enabled_tools
1842
-
1843
1875
  def clone_for_new_session(
1844
1876
  self, new_session_id: Optional[str] = None
1845
1877
  ) -> "HybridBrowserToolkit":
@@ -1864,7 +1896,8 @@ class HybridBrowserToolkit(BaseToolkit):
1864
1896
  user_data_dir=self._user_data_dir,
1865
1897
  stealth=self._stealth,
1866
1898
  web_agent_model=self._web_agent_model,
1867
- cache_dir=f"{self._cache_dir.rstrip('/')}_clone_{new_session_id}/",
1899
+ cache_dir=f"{self._cache_dir.rstrip('/')}_clone_"
1900
+ f"{new_session_id}/",
1868
1901
  enabled_tools=self.enabled_tools.copy(),
1869
1902
  browser_log_to_file=self._browser_log_to_file,
1870
1903
  session_id=new_session_id,
@@ -1879,7 +1912,7 @@ class HybridBrowserToolkit(BaseToolkit):
1879
1912
  )
1880
1913
 
1881
1914
  @action_logger
1882
- async def switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
1915
+ async def browser_switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
1883
1916
  r"""Switches to a different browser tab using its ID.
1884
1917
 
1885
1918
  After switching, all actions will apply to the new tab. Use
@@ -1924,7 +1957,7 @@ class HybridBrowserToolkit(BaseToolkit):
1924
1957
  return result
1925
1958
 
1926
1959
  @action_logger
1927
- async def close_tab(self, *, tab_id: str) -> Dict[str, Any]:
1960
+ async def browser_close_tab(self, *, tab_id: str) -> Dict[str, Any]:
1928
1961
  r"""Closes a browser tab using its ID.
1929
1962
 
1930
1963
  Use `get_tab_info` to find the ID of the tab to close. After
@@ -1936,7 +1969,8 @@ class HybridBrowserToolkit(BaseToolkit):
1936
1969
  Returns:
1937
1970
  Dict[str, Any]: A dictionary with the result of the action:
1938
1971
  - "result" (str): Confirmation of the action.
1939
- - "snapshot" (str): A snapshot of the active tab after closure.
1972
+ - "snapshot" (str): A snapshot of the active tab after
1973
+ closure.
1940
1974
  - "tabs" (List[Dict]): Information about remaining tabs.
1941
1975
  - "current_tab" (int): Index of the new active tab.
1942
1976
  - "total_tabs" (int): Total number of remaining tabs.
@@ -1974,7 +2008,7 @@ class HybridBrowserToolkit(BaseToolkit):
1974
2008
  return result
1975
2009
 
1976
2010
  @action_logger
1977
- async def get_tab_info(self) -> Dict[str, Any]:
2011
+ async def browser_get_tab_info(self) -> Dict[str, Any]:
1978
2012
  r"""Gets a list of all open browser tabs and their information.
1979
2013
 
1980
2014
  This includes each tab's index, title, and URL, and indicates which
@@ -1992,3 +2026,52 @@ class HybridBrowserToolkit(BaseToolkit):
1992
2026
  """
1993
2027
  await self._ensure_browser()
1994
2028
  return await self._get_tab_info_for_output()
2029
+
2030
+ def get_tools(self) -> List[FunctionTool]:
2031
+ r"""Get available function tools
2032
+ based on enabled_tools configuration."""
2033
+ # Map tool names to their corresponding methods
2034
+ tool_map = {
2035
+ "browser_open": self.browser_open,
2036
+ "browser_close": self.browser_close,
2037
+ "browser_visit_page": self.browser_visit_page,
2038
+ "browser_back": self.browser_back,
2039
+ "browser_forward": self.browser_forward,
2040
+ "browser_get_page_snapshot": self.browser_get_page_snapshot,
2041
+ "browser_get_som_screenshot": self.browser_get_som_screenshot,
2042
+ "browser_get_page_links": self.browser_get_page_links,
2043
+ "browser_click": self.browser_click,
2044
+ "browser_type": self.browser_type,
2045
+ "browser_select": self.browser_select,
2046
+ "browser_scroll": self.browser_scroll,
2047
+ "browser_enter": self.browser_enter,
2048
+ "browser_wait_user": self.browser_wait_user,
2049
+ "browser_solve_task": self.browser_solve_task,
2050
+ "browser_switch_tab": self.browser_switch_tab,
2051
+ "browser_close_tab": self.browser_close_tab,
2052
+ "browser_get_tab_info": self.browser_get_tab_info,
2053
+ }
2054
+
2055
+ enabled_tools = []
2056
+
2057
+ for tool_name in self.enabled_tools:
2058
+ if (
2059
+ tool_name == "browser_solve_task"
2060
+ and self._web_agent_model is None
2061
+ ):
2062
+ logger.warning(
2063
+ f"Tool '{tool_name}' is enabled but web_agent_model "
2064
+ f"is not provided. Skipping this tool."
2065
+ )
2066
+ continue
2067
+
2068
+ if tool_name in tool_map:
2069
+ tool = FunctionTool(
2070
+ cast(Callable[..., Any], tool_map[tool_name])
2071
+ )
2072
+ enabled_tools.append(tool)
2073
+ else:
2074
+ logger.warning(f"Unknown tool name: {tool_name}")
2075
+
2076
+ logger.info(f"Returning {len(enabled_tools)} enabled tools")
2077
+ return enabled_tools