camel-ai 0.2.71a10__py3-none-any.whl → 0.2.71a12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

@@ -1094,25 +1094,19 @@ class HybridBrowserToolkit(BaseToolkit):
1094
1094
  # Public API Methods
1095
1095
 
1096
1096
  async def open_browser(self) -> Dict[str, Any]:
1097
- r"""Launches a new browser session and navigates to the configured
1098
- default page.
1097
+ r"""Starts a new browser session. This must be the first browser
1098
+ action.
1099
1099
 
1100
- This method initializes the underlying browser instance and
1101
- automatically navigates to the default start URL that was configured
1102
- during toolkit initialization in the first tab. Agents cannot specify
1103
- a custom URL - they must use the visit_page tool to open new tabs
1104
- with other URLs.
1100
+ This method initializes the browser and navigates to a default start
1101
+ page. To visit a specific URL, use `visit_page` after this.
1105
1102
 
1106
1103
  Returns:
1107
- Dict[str, Any]: A dictionary containing:
1108
- - "result": A string confirming that the browser session has
1109
- started and the default page has been loaded.
1110
- - "snapshot": A textual representation of the current page's
1111
- interactive elements. This snapshot is crucial for
1112
- identifying elements for subsequent actions.
1113
- - "tabs": List of all open tabs with their information.
1114
- - "current_tab": Index of the currently active tab.
1115
- - "total_tabs": Total number of open tabs.
1104
+ Dict[str, Any]: A dictionary with the result of the action:
1105
+ - "result" (str): Confirmation of the action.
1106
+ - "snapshot" (str): A textual snapshot of interactive elements.
1107
+ - "tabs" (List[Dict]): Information about all open tabs.
1108
+ - "current_tab" (int): Index of the active tab.
1109
+ - "total_tabs" (int): Total number of open tabs.
1116
1110
  """
1117
1111
  # Add logging if enabled
1118
1112
  action_start = time.time()
@@ -1163,14 +1157,12 @@ class HybridBrowserToolkit(BaseToolkit):
1163
1157
 
1164
1158
  @action_logger
1165
1159
  async def close_browser(self) -> str:
1166
- r"""Closes the current browser session and releases all associated
1167
- resources.
1160
+ r"""Closes the browser session, releasing all resources.
1168
1161
 
1169
- This should be called at the end of a web automation task to ensure a
1170
- clean shutdown of the browser instance.
1162
+ This should be called at the end of a task for cleanup.
1171
1163
 
1172
1164
  Returns:
1173
- str: A confirmation message indicating the session is closed.
1165
+ str: A confirmation message.
1174
1166
  """
1175
1167
  if self._agent is not None:
1176
1168
  try:
@@ -1184,17 +1176,19 @@ class HybridBrowserToolkit(BaseToolkit):
1184
1176
 
1185
1177
  @action_logger
1186
1178
  async def visit_page(self, url: str) -> Dict[str, Any]:
1187
- r"""Navigates to a URL.
1188
-
1189
- This method creates a new tab for the URL instead of navigating
1190
- in the current tab, allowing better multi-tab management.
1179
+ r"""Opens a URL in a new browser tab and switches to it.
1191
1180
 
1192
1181
  Args:
1193
- url (str): The web address to load in the browser.
1182
+ url (str): The web address to load. This should be a valid and
1183
+ existing URL.
1194
1184
 
1195
1185
  Returns:
1196
- Dict[str, Any]: A dictionary containing the result, snapshot, and
1197
- tab information.
1186
+ Dict[str, Any]: A dictionary with the result of the action:
1187
+ - "result" (str): Confirmation of the action.
1188
+ - "snapshot" (str): A textual snapshot of the new page.
1189
+ - "tabs" (List[Dict]): Information about all open tabs.
1190
+ - "current_tab" (int): Index of the new active tab.
1191
+ - "total_tabs" (int): Total number of open tabs.
1198
1192
  """
1199
1193
  if not url or not isinstance(url, str):
1200
1194
  return {
@@ -1260,23 +1254,18 @@ class HybridBrowserToolkit(BaseToolkit):
1260
1254
 
1261
1255
  @action_logger
1262
1256
  async def back(self) -> Dict[str, Any]:
1263
- r"""Navigates the browser back to the previous page in history.
1257
+ r"""Goes back to the previous page in the browser history.
1264
1258
 
1265
- This function simulates clicking the browser's back button, taking
1266
- you to the previously visited page if one exists in the browser
1267
- history.
1259
+ This action simulates using the browser's "back" button in the
1260
+ currently active tab.
1268
1261
 
1269
1262
  Returns:
1270
- Dict[str, Any]: A dictionary containing:
1271
- - "result": A message indicating the outcome of the back
1272
- navigation, e.g., "Back navigation successful." or an error
1273
- message if no previous page exists.
1274
- - "snapshot": A new textual snapshot of the page after
1275
- navigation. If the snapshot is unchanged, it will be the
1276
- string "snapshot not changed".
1277
- - "tabs": List of all open tabs with their information.
1278
- - "current_tab": Index of the currently active tab.
1279
- - "total_tabs": Total number of open tabs.
1263
+ Dict[str, Any]: A dictionary with the result of the action:
1264
+ - "result" (str): Confirmation of the action.
1265
+ - "snapshot" (str): A textual snapshot of the previous page.
1266
+ - "tabs" (List[Dict]): Information about all open tabs.
1267
+ - "current_tab" (int): Index of the active tab.
1268
+ - "total_tabs" (int): Total number of open tabs.
1280
1269
  """
1281
1270
  page = await self._require_page()
1282
1271
 
@@ -1329,23 +1318,18 @@ class HybridBrowserToolkit(BaseToolkit):
1329
1318
 
1330
1319
  @action_logger
1331
1320
  async def forward(self) -> Dict[str, Any]:
1332
- r"""Navigates the browser forward to the next page in history.
1321
+ r"""Goes forward to the next page in the browser history.
1333
1322
 
1334
- This function simulates clicking the browser's forward button, taking
1335
- you to the next page in the browser history if one exists (i.e.,
1336
- if you have previously navigated back).
1323
+ This action simulates using the browser's "forward" button in the
1324
+ currently active tab.
1337
1325
 
1338
1326
  Returns:
1339
- Dict[str, Any]: A dictionary containing:
1340
- - "result": A message indicating the outcome of the forward
1341
- navigation, e.g., "Forward navigation successful." or an
1342
- error message if no next page exists.
1343
- - "snapshot": A new textual snapshot of the page after
1344
- navigation. If the snapshot is unchanged, it will be the
1345
- string "snapshot not changed".
1346
- - "tabs": List of all open tabs with their information.
1347
- - "current_tab": Index of the currently active tab.
1348
- - "total_tabs": Total number of open tabs.
1327
+ Dict[str, Any]: A dictionary with the result of the action:
1328
+ - "result" (str): Confirmation of the action.
1329
+ - "snapshot" (str): A textual snapshot of the next page.
1330
+ - "tabs" (List[Dict]): Information about all open tabs.
1331
+ - "current_tab" (int): Index of the active tab.
1332
+ - "total_tabs" (int): Total number of open tabs.
1349
1333
  """
1350
1334
  page = await self._require_page()
1351
1335
 
@@ -1399,20 +1383,16 @@ class HybridBrowserToolkit(BaseToolkit):
1399
1383
 
1400
1384
  @action_logger
1401
1385
  async def get_page_snapshot(self) -> str:
1402
- r"""Captures a textual representation of the current page's content.
1386
+ r"""Gets a textual snapshot of the page's interactive elements.
1403
1387
 
1404
- This "snapshot" provides a simplified view of the DOM, focusing on
1405
- interactive elements like links, buttons, and input fields. Each
1406
- element is assigned a unique reference ID (`ref`) that can be used in
1407
- other actions like `click` or `type`.
1408
-
1409
- The snapshot is useful for understanding the page structure and
1410
- identifying elements to interact with without needing to parse raw
1411
- HTML. A new snapshot is generated on each call.
1388
+ The snapshot lists elements like buttons, links, and inputs, each with
1389
+ a unique `ref` ID. This ID is used by other tools (e.g., `click`,
1390
+ `type`) to interact with a specific element. This tool provides no
1391
+ visual information.
1412
1392
 
1413
1393
  Returns:
1414
- str: A formatted string representing the interactive elements on
1415
- the page. For example:
1394
+ str: A formatted string representing the interactive elements and
1395
+ their `ref` IDs. For example:
1416
1396
  '- link "Sign In" [ref=1]'
1417
1397
  '- textbox "Username" [ref=2]'
1418
1398
  """
@@ -1435,32 +1415,25 @@ class HybridBrowserToolkit(BaseToolkit):
1435
1415
  @dependencies_required('PIL')
1436
1416
  @action_logger
1437
1417
  async def get_som_screenshot(self):
1438
- r"""Captures a screenshot of the current webpage and visually marks all
1439
- interactive elements. "SoM" stands for "Set of Marks".
1440
-
1441
- This method is essential for tasks requiring visual understanding of
1442
- the page layout. It works by:
1443
- 1. Taking a full-page screenshot.
1444
- 2. Identifying all interactive elements (buttons, links, inputs, etc.).
1445
- 3. Drawing colored boxes and reference IDs (`ref`) over these elements
1446
- on the screenshot.
1447
- 4. Saving the annotated image to a cache directory.
1448
- 5. Returning the image as a base64-encoded string along with a summary.
1418
+ r"""Captures a screenshot with interactive elements highlighted.
1449
1419
 
1450
- Use this when the textual snapshot from `get_page_snapshot` is
1451
- insufficient and visual context is needed to decide the next action.
1420
+ "SoM" stands for "Set of Marks". This tool takes a screenshot and draws
1421
+ boxes around clickable elements, overlaying a `ref` ID on each. Use
1422
+ this for a visual understanding of the page, especially when the
1423
+ textual snapshot is not enough.
1452
1424
 
1453
1425
  Returns:
1454
1426
  ToolResult: An object containing:
1455
- - `text`: A summary string, e.g., "Visual webpage screenshot
1427
+ - `text` (str): A summary, e.g., "Visual webpage screenshot
1456
1428
  captured with 42 interactive elements".
1457
- - `images`: A list containing a single base64-encoded PNG image
1458
- as a data URL.
1429
+ - `images` (List[str]): A list containing one base64-encoded
1430
+ PNG image data URL.
1459
1431
  """
1460
1432
  from PIL import Image
1461
1433
 
1462
1434
  from camel.utils.tool_result import ToolResult
1463
1435
 
1436
+ os.makedirs(self._cache_dir, exist_ok=True)
1464
1437
  # Get screenshot and analysis
1465
1438
  page = await self._require_page()
1466
1439
 
@@ -1516,37 +1489,33 @@ class HybridBrowserToolkit(BaseToolkit):
1516
1489
  return ToolResult(text=text_result, images=[img_data_url])
1517
1490
 
1518
1491
  async def click(self, *, ref: str) -> Dict[str, Any]:
1519
- r"""Clicks on an interactive element on the page.
1492
+ r"""Performs a click on an element on the page.
1520
1493
 
1521
1494
  Args:
1522
- ref (str): The reference ID of the element to click. This ID is
1523
- obtained from the page snapshot (see `get_page_snapshot` or
1495
+ ref (str): The `ref` ID of the element to click. This ID is
1496
+ obtained from a page snapshot (`get_page_snapshot` or
1524
1497
  `get_som_screenshot`).
1525
1498
 
1526
1499
  Returns:
1527
- Dict[str, Any]: A dictionary containing:
1528
- - "result": A message confirming the click action.
1529
- - "snapshot": A new textual snapshot of the page after the
1530
- click, which may have changed as a result of the action. If
1531
- the snapshot is unchanged, it will be the string "snapshot
1532
- not changed".
1533
- - "tabs": List of all open tabs with their information.
1534
- - "current_tab": Index of the currently active tab.
1535
- - "total_tabs": Total number of open tabs.
1500
+ Dict[str, Any]: A dictionary with the result of the action:
1501
+ - "result" (str): Confirmation of the action.
1502
+ - "snapshot" (str): A textual snapshot of the page after the
1503
+ click.
1504
+ - "tabs" (List[Dict]): Information about all open tabs.
1505
+ - "current_tab" (int): Index of the active tab.
1506
+ - "total_tabs" (int): Total number of open tabs.
1536
1507
  """
1537
1508
  self._validate_ref(ref, "click")
1538
1509
 
1539
1510
  analysis = await self._get_unified_analysis()
1540
1511
  elements = analysis.get("elements", {})
1541
1512
  if ref not in elements:
1542
- available_refs = list(elements.keys())
1543
1513
  logger.error(f"Error: Element reference '{ref}' not found. ")
1544
1514
  # Added snapshot to give more context on failure
1545
1515
  snapshot = self._format_snapshot_from_analysis(analysis)
1546
1516
  tab_info = await self._get_tab_info_for_output()
1547
1517
  return {
1548
- "result": f"Error: Element reference '{ref}' not found. "
1549
- f"Available refs: {available_refs}",
1518
+ "result": f"Error: Element reference '{ref}' not found. ",
1550
1519
  "snapshot": snapshot,
1551
1520
  **tab_info,
1552
1521
  }
@@ -1564,20 +1533,20 @@ class HybridBrowserToolkit(BaseToolkit):
1564
1533
  return result
1565
1534
 
1566
1535
  async def type(self, *, ref: str, text: str) -> Dict[str, Any]:
1567
- r"""Types text into an input field, such as a textbox or search bar.
1536
+ r"""Types text into an input element on the page.
1568
1537
 
1569
1538
  Args:
1570
- ref (str): The reference ID of the input element.
1571
- text (str): The text to be typed into the element.
1539
+ ref (str): The `ref` ID of the input element, from a snapshot.
1540
+ text (str): The text to type into the element.
1572
1541
 
1573
1542
  Returns:
1574
- Dict[str, Any]: A dictionary containing:
1575
- - "result": A message confirming the type action.
1576
- - "snapshot": A new textual snapshot of the page after the
1577
- text has been entered.
1578
- - "tabs": List of all open tabs with their information.
1579
- - "current_tab": Index of the currently active tab.
1580
- - "total_tabs": Total number of open tabs.
1543
+ Dict[str, Any]: A dictionary with the result of the action:
1544
+ - "result" (str): Confirmation of the action.
1545
+ - "snapshot" (str): A textual snapshot of the page after
1546
+ typing.
1547
+ - "tabs" (List[Dict]): Information about all open tabs.
1548
+ - "current_tab" (int): Index of the active tab.
1549
+ - "total_tabs" (int): Total number of open tabs.
1581
1550
  """
1582
1551
  self._validate_ref(ref, "type")
1583
1552
  await self._get_unified_analysis() # Ensure aria-ref attributes
@@ -1592,21 +1561,21 @@ class HybridBrowserToolkit(BaseToolkit):
1592
1561
  return result
1593
1562
 
1594
1563
  async def select(self, *, ref: str, value: str) -> Dict[str, Any]:
1595
- r"""Selects an option from a dropdown (`<select>`) element.
1564
+ r"""Selects an option in a dropdown (`<select>`) element.
1596
1565
 
1597
1566
  Args:
1598
- ref (str): The reference ID of the `<select>` element.
1599
- value (str): The value of the `<option>` to be selected. This
1600
- should match the `value` attribute of the option, not the
1601
- visible text.
1567
+ ref (str): The `ref` ID of the `<select>` element.
1568
+ value (str): The `value` attribute of the `<option>` to select,
1569
+ not its visible text.
1602
1570
 
1603
1571
  Returns:
1604
- Dict[str, Any]: A dictionary containing:
1605
- - "result": A message confirming the select action.
1606
- - "snapshot": A new snapshot of the page after the selection.
1607
- - "tabs": List of all open tabs with their information.
1608
- - "current_tab": Index of the currently active tab.
1609
- - "total_tabs": Total number of open tabs.
1572
+ Dict[str, Any]: A dictionary with the result of the action:
1573
+ - "result" (str): Confirmation of the action.
1574
+ - "snapshot" (str): A snapshot of the page after the
1575
+ selection.
1576
+ - "tabs" (List[Dict]): Information about all open tabs.
1577
+ - "current_tab" (int): Index of the active tab.
1578
+ - "total_tabs" (int): Total number of open tabs.
1610
1579
  """
1611
1580
  self._validate_ref(ref, "select")
1612
1581
  await self._get_unified_analysis()
@@ -1621,20 +1590,19 @@ class HybridBrowserToolkit(BaseToolkit):
1621
1590
  return result
1622
1591
 
1623
1592
  async def scroll(self, *, direction: str, amount: int) -> Dict[str, Any]:
1624
- r"""Scrolls the page window up or down by a specified amount.
1593
+ r"""Scrolls the current page window.
1625
1594
 
1626
1595
  Args:
1627
- direction (str): The direction to scroll. Must be either 'up' or
1628
- 'down'.
1596
+ direction (str): The direction to scroll: 'up' or 'down'.
1629
1597
  amount (int): The number of pixels to scroll.
1630
1598
 
1631
1599
  Returns:
1632
- Dict[str, Any]: A dictionary containing:
1633
- - "result": A confirmation of the scroll action.
1634
- - "snapshot": A new snapshot of the page after scrolling.
1635
- - "tabs": List of all open tabs with their information.
1636
- - "current_tab": Index of the currently active tab.
1637
- - "total_tabs": Total number of open tabs.
1600
+ Dict[str, Any]: A dictionary with the result of the action:
1601
+ - "result" (str): Confirmation of the action.
1602
+ - "snapshot" (str): A snapshot of the page after scrolling.
1603
+ - "tabs" (List[Dict]): Information about all open tabs.
1604
+ - "current_tab" (int): Index of the active tab.
1605
+ - "total_tabs" (int): Total number of open tabs.
1638
1606
  """
1639
1607
  if direction not in ("up", "down"):
1640
1608
  tab_info = await self._get_tab_info_for_output()
@@ -1656,25 +1624,17 @@ class HybridBrowserToolkit(BaseToolkit):
1656
1624
  async def enter(self) -> Dict[str, Any]:
1657
1625
  r"""Simulates pressing the Enter key on the currently focused element.
1658
1626
 
1659
- This tool is used to execute or confirm an action after interacting
1660
- with
1661
- an element, such as:
1662
- - Submitting a search query after typing in a search box.
1663
- - Confirming a form submission.
1664
- - Executing a command in a text input field.
1665
-
1666
- The common usage pattern is to first use the 'type' tool to input
1667
- text, which sets the focus, and then call 'enter' without any
1668
- parameters to trigger the action.
1627
+ This is useful for submitting forms or search queries after using the
1628
+ `type` tool.
1669
1629
 
1670
1630
  Returns:
1671
- Dict[str, Any]: A dictionary containing:
1672
- - "result": A confirmation of the Enter key action.
1673
- - "snapshot": A new page snapshot, as this action often
1674
- triggers navigation or page updates.
1675
- - "tabs": List of all open tabs with their information.
1676
- - "current_tab": Index of the currently active tab.
1677
- - "total_tabs": Total number of open tabs.
1631
+ Dict[str, Any]: A dictionary with the result of the action:
1632
+ - "result" (str): Confirmation of the action.
1633
+ - "snapshot" (str): A new page snapshot, as this action often
1634
+ triggers navigation.
1635
+ - "tabs" (List[Dict]): Information about all open tabs.
1636
+ - "current_tab" (int): Index of the active tab.
1637
+ - "total_tabs" (int): Total number of open tabs.
1678
1638
  """
1679
1639
  # Always press Enter on the currently focused element
1680
1640
  action = {"type": "enter"}
@@ -1691,25 +1651,22 @@ class HybridBrowserToolkit(BaseToolkit):
1691
1651
  async def wait_user(
1692
1652
  self, timeout_sec: Optional[float] = None
1693
1653
  ) -> Dict[str, Any]:
1694
- r"""Pauses the agent's execution and waits for human intervention.
1654
+ r"""Pauses execution and waits for human input from the console.
1695
1655
 
1696
- This is useful for tasks that require manual steps, like solving a
1697
- CAPTCHA. The agent will print a message to the console and wait
1698
- until the user presses the Enter key.
1656
+ Use this for tasks requiring manual steps, like solving a CAPTCHA. The
1657
+ agent will resume after the user presses Enter in the console.
1699
1658
 
1700
1659
  Args:
1701
- timeout_sec (Optional[float]): The maximum time to wait in
1702
- seconds. If the timeout is reached, the agent will resume
1703
- automatically. If `None`, it will wait indefinitely.
1660
+ timeout_sec (Optional[float]): Max time to wait in seconds. If
1661
+ `None`, it will wait indefinitely.
1704
1662
 
1705
1663
  Returns:
1706
- Dict[str, Any]: A dictionary containing:
1707
- - "result": A message indicating how the wait ended (e.g.,
1708
- "User resumed." or "Timeout... reached, auto-resumed.").
1709
- - "snapshot": The current page snapshot after the wait.
1710
- - "tabs": List of all open tabs with their information.
1711
- - "current_tab": Index of the currently active tab.
1712
- - "total_tabs": Total number of open tabs.
1664
+ Dict[str, Any]: A dictionary with the result of the action:
1665
+ - "result" (str): A message indicating how the wait ended.
1666
+ - "snapshot" (str): The page snapshot after the wait.
1667
+ - "tabs" (List[Dict]): Information about all open tabs.
1668
+ - "current_tab" (int): Index of the active tab.
1669
+ - "total_tabs" (int): Total number of open tabs.
1713
1670
  """
1714
1671
  import asyncio
1715
1672
 
@@ -1756,20 +1713,18 @@ class HybridBrowserToolkit(BaseToolkit):
1756
1713
 
1757
1714
  @action_logger
1758
1715
  async def get_page_links(self, *, ref: List[str]) -> Dict[str, Any]:
1759
- r"""Retrieves the full URLs for a given list of link reference IDs.
1716
+ r"""Gets the destination URLs for a list of link elements.
1760
1717
 
1761
- This is useful when you need to know the destination of a link before
1762
- clicking it.
1718
+ This is useful to know where a link goes before clicking it.
1763
1719
 
1764
1720
  Args:
1765
- ref (List[str]): A list of reference IDs for link elements,
1766
- obtained from a page snapshot.
1721
+ ref (List[str]): A list of `ref` IDs for link elements, obtained
1722
+ from a page snapshot.
1767
1723
 
1768
1724
  Returns:
1769
1725
  Dict[str, Any]: A dictionary containing:
1770
- - "links": A list of dictionaries, where each dictionary
1771
- represents a found link and has "text", "ref", and "url"
1772
- keys.
1726
+ - "links" (List[Dict]): A list of found links, where each
1727
+ link has "text", "ref", and "url" keys.
1773
1728
  """
1774
1729
  if not ref or not isinstance(ref, list):
1775
1730
  return {"links": []}
@@ -1790,26 +1745,25 @@ class HybridBrowserToolkit(BaseToolkit):
1790
1745
  async def solve_task(
1791
1746
  self, task_prompt: str, start_url: str, max_steps: int = 15
1792
1747
  ) -> str:
1793
- r"""Uses a high-level LLM agent to autonomously complete a task.
1748
+ r"""Delegates a complex, high-level task to a specialized web agent.
1794
1749
 
1795
- This function delegates control to another agent that can reason about
1796
- a task, break it down into steps, and execute browser actions to
1797
- achieve the goal. It is suitable for complex, multi-step tasks.
1750
+ Use this for multi-step tasks that can be described in a single prompt
1751
+ (e.g., "log into my account and check for new messages"). The agent
1752
+ will autonomously perform the necessary browser actions.
1798
1753
 
1799
- Note: `web_agent_model` must be provided during the toolkit's
1800
- initialization to use this function.
1754
+ NOTE: This is a high-level action; for simple interactions, use tools
1755
+ like `click` and `type`. `web_agent_model` must be provided during
1756
+ toolkit initialization.
1801
1757
 
1802
1758
  Args:
1803
- task_prompt (str): A natural language description of the task to
1804
- be completed (e.g., "log into my account on example.com").
1805
- start_url (str): The URL to start the task from.
1806
- max_steps (int): The maximum number of steps the agent is allowed
1807
- to take before stopping.
1759
+ task_prompt (str): A natural language description of the task.
1760
+ start_url (str): The URL to start the task from. This should be a
1761
+ valid and existing URL, as agents may generate non-existent
1762
+ ones.
1763
+ max_steps (int): The maximum number of steps the agent can take.
1808
1764
 
1809
1765
  Returns:
1810
- str: A summary message indicating that the task processing has
1811
- finished. The detailed trace of the agent's actions will be
1812
- printed to the standard output.
1766
+ str: A summary message indicating the task has finished.
1813
1767
  """
1814
1768
  agent = self._ensure_agent()
1815
1769
  await agent.navigate(start_url)
@@ -1944,25 +1898,21 @@ class HybridBrowserToolkit(BaseToolkit):
1944
1898
 
1945
1899
  @action_logger
1946
1900
  async def switch_tab(self, *, tab_index: int) -> Dict[str, Any]:
1947
- r"""Switches to a specific browser tab by its index.
1901
+ r"""Switches to a different browser tab using its index.
1948
1902
 
1949
- This allows you to control which tab is currently active. After
1950
- switching, all subsequent browser actions will operate on the newly
1951
- selected tab.
1903
+ After switching, all actions will apply to the new tab. Use
1904
+ `get_tab_info` to find the index of the tab you want to switch to.
1952
1905
 
1953
1906
  Args:
1954
- tab_index (int): The zero-based index of the tab to switch to.
1955
- Use `get_tab_info` to see available tabs and their indices.
1907
+ tab_index (int): The zero-based index of the tab to activate.
1956
1908
 
1957
1909
  Returns:
1958
- Dict[str, Any]: A dictionary containing:
1959
- - "result": A message indicating success or failure of the
1960
- tab switch.
1961
- - "snapshot": A textual snapshot of the newly active tab's
1962
- content.
1963
- - "tabs": List of all open tabs with their information.
1964
- - "current_tab": Index of the currently active tab.
1965
- - "total_tabs": Total number of open tabs.
1910
+ Dict[str, Any]: A dictionary with the result of the action:
1911
+ - "result" (str): Confirmation of the action.
1912
+ - "snapshot" (str): A snapshot of the newly active tab.
1913
+ - "tabs" (List[Dict]): Information about all open tabs.
1914
+ - "current_tab" (int): Index of the new active tab.
1915
+ - "total_tabs" (int): Total number of open tabs.
1966
1916
  """
1967
1917
  await self._ensure_browser()
1968
1918
  session = await self._get_session()
@@ -1993,24 +1943,21 @@ class HybridBrowserToolkit(BaseToolkit):
1993
1943
 
1994
1944
  @action_logger
1995
1945
  async def close_tab(self, *, tab_index: int) -> Dict[str, Any]:
1996
- r"""Closes a specific browser tab by its index.
1946
+ r"""Closes a browser tab using its index.
1997
1947
 
1998
- After closing a tab, the browser will automatically switch to another
1999
- available tab. If the closed tab was the only one open, the browser
2000
- session will remain active but without any pages.
1948
+ Use `get_tab_info` to find the index of the tab to close. After
1949
+ closing, the browser will switch to another tab if available.
2001
1950
 
2002
1951
  Args:
2003
1952
  tab_index (int): The zero-based index of the tab to close.
2004
1953
 
2005
1954
  Returns:
2006
- Dict[str, Any]: A dictionary containing:
2007
- - "result": A message indicating success or failure of the
2008
- tab closure.
2009
- - "snapshot": A textual snapshot of the currently active tab
2010
- after the closure (empty if no tabs remain).
2011
- - "tabs": List of remaining open tabs.
2012
- - "current_tab": Index of the currently active tab.
2013
- - "total_tabs": Total number of remaining open tabs.
1955
+ Dict[str, Any]: A dictionary with the result of the action:
1956
+ - "result" (str): Confirmation of the action.
1957
+ - "snapshot" (str): A snapshot of the active tab after closure.
1958
+ - "tabs" (List[Dict]): Information about remaining tabs.
1959
+ - "current_tab" (int): Index of the new active tab.
1960
+ - "total_tabs" (int): Total number of remaining tabs.
2014
1961
  """
2015
1962
  await self._ensure_browser()
2016
1963
  session = await self._get_session()
@@ -2046,20 +1993,20 @@ class HybridBrowserToolkit(BaseToolkit):
2046
1993
 
2047
1994
  @action_logger
2048
1995
  async def get_tab_info(self) -> Dict[str, Any]:
2049
- r"""Retrieves information about all currently open browser tabs.
1996
+ r"""Gets a list of all open browser tabs and their information.
2050
1997
 
2051
- This provides a comprehensive overview of the browser state, including
2052
- all open tabs, their titles, URLs, and which one is currently active.
1998
+ This includes each tab's index, title, and URL, and indicates which
1999
+ tab is currently active. Use this to manage multiple tabs.
2053
2000
 
2054
2001
  Returns:
2055
- Dict[str, Any]: A dictionary containing:
2056
- - "tabs": A list of dictionaries, each representing a tab with:
2057
- - "index": The zero-based index of the tab
2058
- - "title": The page title
2059
- - "url": The current URL
2060
- - "is_current": Whether this is the currently active tab
2061
- - "current_tab": Index of the currently active tab
2062
- - "total_tabs": Total number of open tabs
2002
+ Dict[str, Any]: A dictionary with tab information:
2003
+ - "tabs" (List[Dict]): A list of open tabs, each with:
2004
+ - "index" (int): The tab's zero-based index.
2005
+ - "title" (str): The page title.
2006
+ - "url" (str): The current URL.
2007
+ - "is_current" (bool): True if the tab is active.
2008
+ - "current_tab" (int): Index of the active tab.
2009
+ - "total_tabs" (int): Total number of open tabs.
2063
2010
  """
2064
2011
  await self._ensure_browser()
2065
2012
  return await self._get_tab_info_for_output()
@@ -735,11 +735,11 @@
735
735
  function renderTree(node, indent = '') {
736
736
  const lines = [];
737
737
  let meaningfulProps = '';
738
- if (node.disabled) meaningfulProps += ' disabled';
739
- if (node.occluded) meaningfulProps += ' occluded';
738
+ if (node.disabled) meaningfulProps += ' [disabled]';
739
+ if (node.occluded) meaningfulProps += ' [occluded]';
740
740
  if (node.checked !== undefined) meaningfulProps += ` checked=${node.checked}`;
741
741
  if (node.expanded !== undefined) meaningfulProps += ` expanded=${node.expanded}`;
742
- if (node.selected) meaningfulProps += ' selected';
742
+ if (node.selected) meaningfulProps += ' [selected]';
743
743
 
744
744
  // Add level attribute following Playwright's format
745
745
  if (node.level !== undefined) meaningfulProps += ` [level=${node.level}]`;