camel-ai 0.2.73a0__py3-none-any.whl → 0.2.73a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

@@ -11,69 +11,26 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
- # =========
15
14
 
16
- import time
17
- from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
15
+ from typing import Any, List, Literal, Optional
18
16
 
19
- from camel.logger import get_logger
20
- from camel.messages import BaseMessage
21
17
  from camel.models import BaseModelBackend
22
- from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
23
- from camel.toolkits.function_tool import FunctionTool
24
- from camel.utils.commons import dependencies_required
18
+ from camel.toolkits.base import BaseToolkit
25
19
 
26
- from .config_loader import ConfigLoader
27
- from .ws_wrapper import WebSocketBrowserWrapper
28
20
 
29
- logger = get_logger(__name__)
21
+ class HybridBrowserToolkit(BaseToolkit):
22
+ r"""A hybrid browser toolkit that can switch between TypeScript and Python
23
+ implementations.
30
24
 
31
-
32
- class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
33
- r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
34
- automation with visual, screenshot-based capabilities.
35
-
36
- This toolkit now uses TypeScript implementation with Playwright's
37
- _snapshotForAI functionality for enhanced AI integration.
25
+ This wrapper allows users to choose between:
26
+ - 'typescript': WebSocket-based implementation using TypeScript/Node.js
27
+ - 'python': Pure Python implementation using Playwright directly
38
28
  """
39
29
 
40
- # Default tool list - core browser functionality
41
- DEFAULT_TOOLS: ClassVar[List[str]] = [
42
- "browser_open",
43
- "browser_close",
44
- "browser_visit_page",
45
- "browser_back",
46
- "browser_forward",
47
- "browser_click",
48
- "browser_type",
49
- "browser_switch_tab",
50
- ]
51
-
52
- # All available tools
53
- ALL_TOOLS: ClassVar[List[str]] = [
54
- "browser_open",
55
- "browser_close",
56
- "browser_visit_page",
57
- "browser_back",
58
- "browser_forward",
59
- "browser_get_page_snapshot",
60
- "browser_get_som_screenshot",
61
- "browser_get_page_links",
62
- "browser_click",
63
- "browser_type",
64
- "browser_select",
65
- "browser_scroll",
66
- "browser_enter",
67
- "browser_wait_user",
68
- "browser_solve_task",
69
- "browser_switch_tab",
70
- "browser_close_tab",
71
- "browser_get_tab_info",
72
- ]
73
-
74
- def __init__(
75
- self,
30
+ def __new__(
31
+ cls,
76
32
  *,
33
+ mode: Literal["typescript", "python"] = "typescript",
77
34
  headless: bool = True,
78
35
  user_data_dir: Optional[str] = None,
79
36
  stealth: bool = False,
@@ -93,1085 +50,128 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
93
50
  viewport_limit: bool = False,
94
51
  connect_over_cdp: bool = False,
95
52
  cdp_url: Optional[str] = None,
96
- ) -> None:
97
- r"""Initialize the HybridBrowserToolkit.
53
+ **kwargs: Any,
54
+ ) -> Any:
55
+ r"""Create a HybridBrowserToolkit instance with the specified mode.
98
56
 
99
57
  Args:
58
+ mode (Literal["typescript", "python"]): Implementation mode.
59
+ - 'typescript': Uses WebSocket-based TypeScript implementation
60
+ - 'python': Uses pure Python Playwright implementation
61
+ Defaults to "typescript".
100
62
  headless (bool): Whether to run browser in headless mode.
101
- Defaults to True.
63
+ Defaults to True.
102
64
  user_data_dir (Optional[str]): Directory for user data
103
- persistence. Defaults to None.
65
+ persistence. Defaults to None.
104
66
  stealth (bool): Whether to enable stealth mode. Defaults to
105
- False.
67
+ False.
106
68
  web_agent_model (Optional[BaseModelBackend]): Model for web
107
- agent operations. Defaults to None.
69
+ agent operations. Defaults to None.
108
70
  cache_dir (str): Directory for caching. Defaults to "tmp/".
109
71
  enabled_tools (Optional[List[str]]): List of enabled tools.
110
- Defaults to None.
72
+ Defaults to None.
111
73
  browser_log_to_file (bool): Whether to log browser actions to
112
- file. Defaults to False.
74
+ file. Defaults to False.
113
75
  session_id (Optional[str]): Session identifier. Defaults to None.
114
76
  default_start_url (str): Default URL to start with. Defaults
115
- to "https://google.com/".
77
+ to "https://google.com/".
116
78
  default_timeout (Optional[int]): Default timeout in
117
- milliseconds. Defaults to None.
79
+ milliseconds. Defaults to None.
118
80
  short_timeout (Optional[int]): Short timeout in milliseconds.
119
- Defaults to None.
81
+ Defaults to None.
120
82
  navigation_timeout (Optional[int]): Navigation timeout in
121
- milliseconds. Defaults to None.
83
+ milliseconds. Defaults to None.
122
84
  network_idle_timeout (Optional[int]): Network idle timeout in
123
- milliseconds. Defaults to None.
85
+ milliseconds. Defaults to None.
124
86
  screenshot_timeout (Optional[int]): Screenshot timeout in
125
- milliseconds. Defaults to None.
87
+ milliseconds. Defaults to None.
126
88
  page_stability_timeout (Optional[int]): Page stability timeout
127
- in milliseconds. Defaults to None.
89
+ in milliseconds. Defaults to None.
128
90
  dom_content_loaded_timeout (Optional[int]): DOM content loaded
129
- timeout in milliseconds. Defaults to None.
91
+ timeout in milliseconds. Defaults to None.
130
92
  viewport_limit (bool): Whether to filter page snapshot
131
- elements to only those visible in the current viewport.
132
- When True, only elements within the current viewport
133
- bounds will be included in snapshots.
134
- When False (default), all elements on the page are
135
- included. Defaults to False.
93
+ elements to only those visible in the current viewport.
94
+ Defaults to False.
136
95
  connect_over_cdp (bool): Whether to connect to an existing
137
- browser via Chrome DevTools Protocol. Defaults to False.
96
+ browser via Chrome DevTools Protocol. Defaults to False.
97
+ (Only supported in TypeScript mode)
138
98
  cdp_url (Optional[str]): WebSocket endpoint URL for CDP
139
- connection (e.g., 'ws://localhost:9222/devtools/browser/...').
140
- Required when connect_over_cdp is True. Defaults to None.
141
- """
142
- super().__init__()
143
- RegisteredAgentToolkit.__init__(self)
144
-
145
- # Initialize configuration loader
146
- self.config_loader = ConfigLoader.from_kwargs(
147
- headless=headless,
148
- user_data_dir=user_data_dir,
149
- stealth=stealth,
150
- default_start_url=default_start_url,
151
- default_timeout=default_timeout,
152
- short_timeout=short_timeout,
153
- navigation_timeout=navigation_timeout,
154
- network_idle_timeout=network_idle_timeout,
155
- screenshot_timeout=screenshot_timeout,
156
- page_stability_timeout=page_stability_timeout,
157
- dom_content_loaded_timeout=dom_content_loaded_timeout,
158
- viewport_limit=viewport_limit,
159
- cache_dir=cache_dir,
160
- browser_log_to_file=browser_log_to_file,
161
- session_id=session_id,
162
- enabled_tools=enabled_tools,
163
- connect_over_cdp=connect_over_cdp,
164
- cdp_url=cdp_url,
165
- )
166
-
167
- # Legacy attribute access for backward compatibility
168
- browser_config = self.config_loader.get_browser_config()
169
- toolkit_config = self.config_loader.get_toolkit_config()
170
-
171
- self._headless = browser_config.headless
172
- self._user_data_dir = browser_config.user_data_dir
173
- self._stealth = browser_config.stealth
174
- self._web_agent_model = web_agent_model
175
- self._cache_dir = toolkit_config.cache_dir
176
- self._browser_log_to_file = toolkit_config.browser_log_to_file
177
- self._default_start_url = browser_config.default_start_url
178
- self._session_id = toolkit_config.session_id or "default"
179
- self._viewport_limit = browser_config.viewport_limit
180
-
181
- # Store timeout configuration for backward compatibility
182
- self._default_timeout = browser_config.default_timeout
183
- self._short_timeout = browser_config.short_timeout
184
- self._navigation_timeout = browser_config.navigation_timeout
185
- self._network_idle_timeout = browser_config.network_idle_timeout
186
- self._screenshot_timeout = browser_config.screenshot_timeout
187
- self._page_stability_timeout = browser_config.page_stability_timeout
188
- self._dom_content_loaded_timeout = (
189
- browser_config.dom_content_loaded_timeout
190
- )
191
-
192
- # Configure enabled tools
193
- if enabled_tools is None:
194
- self.enabled_tools = self.DEFAULT_TOOLS.copy()
195
- else:
196
- # Validate enabled tools
197
- invalid_tools = [
198
- tool for tool in enabled_tools if tool not in self.ALL_TOOLS
199
- ]
200
- if invalid_tools:
201
- raise ValueError(
202
- f"Invalid tools specified: {invalid_tools}. "
203
- f"Available tools: {self.ALL_TOOLS}"
204
- )
205
- self.enabled_tools = enabled_tools.copy()
206
-
207
- logger.info(f"Enabled tools: {self.enabled_tools}")
208
-
209
- # Initialize WebSocket wrapper
210
- self._ws_wrapper: Optional[WebSocketBrowserWrapper] = None
211
- self._ws_config = self.config_loader.to_ws_config()
212
-
213
- async def _ensure_ws_wrapper(self):
214
- """Ensure WebSocket wrapper is initialized."""
215
- if self._ws_wrapper is None:
216
- self._ws_wrapper = WebSocketBrowserWrapper(self._ws_config)
217
- await self._ws_wrapper.start()
218
-
219
- async def _get_ws_wrapper(self) -> WebSocketBrowserWrapper:
220
- """Get the WebSocket wrapper, initializing if needed."""
221
- await self._ensure_ws_wrapper()
222
- if self._ws_wrapper is None:
223
- raise RuntimeError("Failed to initialize WebSocket wrapper")
224
- return self._ws_wrapper
225
-
226
- def __del__(self):
227
- r"""Cleanup browser resources on garbage collection."""
228
- try:
229
- import sys
230
-
231
- if getattr(sys, "is_finalizing", lambda: False)():
232
- return
233
-
234
- import asyncio
235
-
236
- try:
237
- loop = asyncio.get_event_loop()
238
- if not loop.is_closed() and not loop.is_running():
239
- try:
240
- loop.run_until_complete(
241
- asyncio.wait_for(self.browser_close(), timeout=2.0)
242
- )
243
- except asyncio.TimeoutError:
244
- pass
245
- except (RuntimeError, ImportError):
246
- pass
247
- except Exception:
248
- pass
249
-
250
- @property
251
- def web_agent_model(self) -> Optional[BaseModelBackend]:
252
- """Get the web agent model."""
253
- return self._web_agent_model
254
-
255
- @web_agent_model.setter
256
- def web_agent_model(self, value: Optional[BaseModelBackend]) -> None:
257
- """Set the web agent model."""
258
- self._web_agent_model = value
259
-
260
- @property
261
- def cache_dir(self) -> str:
262
- """Get the cache directory."""
263
- return self._cache_dir
264
-
265
- # Public API Methods
266
-
267
- async def browser_open(self) -> Dict[str, Any]:
268
- r"""Starts a new browser session. This must be the first browser
269
- action.
270
-
271
- This method initializes the browser and navigates to a default start
272
- page. To visit a specific URL, use `visit_page` after this.
273
-
274
- Returns:
275
- Dict[str, Any]: A dictionary with the result of the action:
276
- - "result" (str): Confirmation of the action.
277
- - "snapshot" (str): A textual snapshot of interactive
278
- elements.
279
- - "tabs" (List[Dict]): Information about all open tabs.
280
- - "current_tab" (int): Index of the active tab.
281
- - "total_tabs" (int): Total number of open tabs.
282
- """
283
- try:
284
- ws_wrapper = await self._get_ws_wrapper()
285
- result = await ws_wrapper.open_browser(self._default_start_url)
286
-
287
- # Add tab information
288
- tab_info = await ws_wrapper.get_tab_info()
289
- result.update(
290
- {
291
- "tabs": tab_info,
292
- "current_tab": next(
293
- (
294
- i
295
- for i, tab in enumerate(tab_info)
296
- if tab.get("is_current")
297
- ),
298
- 0,
299
- ),
300
- "total_tabs": len(tab_info),
301
- }
302
- )
303
-
304
- return result
305
- except Exception as e:
306
- logger.error(f"Failed to open browser: {e}")
307
- return {
308
- "result": f"Error opening browser: {e}",
309
- "snapshot": "",
310
- "tabs": [],
311
- "current_tab": 0,
312
- "total_tabs": 0,
313
- }
314
-
315
- async def browser_close(self) -> str:
316
- r"""Closes the browser session, releasing all resources.
317
-
318
- This should be called at the end of a task for cleanup.
99
+ connection. Required when connect_over_cdp is True.
100
+ Defaults to None. (Only supported in TypeScript mode)
101
+ **kwargs: Additional keyword arguments passed to the
102
+ implementation.
319
103
 
320
104
  Returns:
321
- str: A confirmation message.
105
+ HybridBrowserToolkit instance of the specified implementation.
322
106
  """
323
- try:
324
- if self._ws_wrapper:
325
- await self._ws_wrapper.stop()
326
- self._ws_wrapper = None
327
- return "Browser session closed."
328
- except Exception as e:
329
- logger.error(f"Failed to close browser: {e}")
330
- return f"Error closing browser: {e}"
331
-
332
- async def browser_visit_page(self, url: str) -> Dict[str, Any]:
333
- r"""Opens a URL in a new browser tab and switches to it.
334
-
335
- Args:
336
- url (str): The web address to load. This should be a valid and
337
- existing URL.
338
-
339
- Returns:
340
- Dict[str, Any]: A dictionary with the result of the action:
341
- - "result" (str): Confirmation of the action.
342
- - "snapshot" (str): A textual snapshot of the new page.
343
- - "tabs" (List[Dict]): Information about all open tabs.
344
- - "current_tab" (int): Index of the new active tab.
345
- - "total_tabs" (int): Total number of open tabs.
346
- """
347
- try:
348
- ws_wrapper = await self._get_ws_wrapper()
349
- result = await ws_wrapper.visit_page(url)
350
-
351
- # Add tab information
352
- tab_info = await ws_wrapper.get_tab_info()
353
- result.update(
354
- {
355
- "tabs": tab_info,
356
- "current_tab": next(
357
- (
358
- i
359
- for i, tab in enumerate(tab_info)
360
- if tab.get("is_current")
361
- ),
362
- 0,
363
- ),
364
- "total_tabs": len(tab_info),
365
- }
107
+ if mode == "typescript":
108
+ from .hybrid_browser_toolkit_ts import (
109
+ HybridBrowserToolkit as TSToolkit,
366
110
  )
367
111
 
368
- return result
369
- except Exception as e:
370
- logger.error(f"Failed to visit page: {e}")
371
- return {
372
- "result": f"Error visiting page: {e}",
373
- "snapshot": "",
374
- "tabs": [],
375
- "current_tab": 0,
376
- "total_tabs": 0,
377
- }
378
-
379
- async def browser_back(self) -> Dict[str, Any]:
380
- r"""Goes back to the previous page in the browser history.
381
-
382
- This action simulates using the browser's "back" button in the
383
- currently active tab.
384
-
385
- Returns:
386
- Dict[str, Any]: A dictionary with the result of the action:
387
- - "result" (str): Confirmation of the action.
388
- - "snapshot" (str): A textual snapshot of the previous page.
389
- - "tabs" (List[Dict]): Information about all open tabs.
390
- - "current_tab" (int): Index of the active tab.
391
- - "total_tabs" (int): Total number of open tabs.
392
- """
393
- try:
394
- ws_wrapper = await self._get_ws_wrapper()
395
- result = await ws_wrapper.back()
396
-
397
- # Add tab information
398
- tab_info = await ws_wrapper.get_tab_info()
399
- result.update(
400
- {
401
- "tabs": tab_info,
402
- "current_tab": next(
403
- (
404
- i
405
- for i, tab in enumerate(tab_info)
406
- if tab.get("is_current")
407
- ),
408
- 0,
409
- ),
410
- "total_tabs": len(tab_info),
411
- }
112
+ return TSToolkit(
113
+ headless=headless,
114
+ user_data_dir=user_data_dir,
115
+ stealth=stealth,
116
+ web_agent_model=web_agent_model,
117
+ cache_dir=cache_dir,
118
+ enabled_tools=enabled_tools,
119
+ browser_log_to_file=browser_log_to_file,
120
+ session_id=session_id,
121
+ default_start_url=default_start_url,
122
+ default_timeout=default_timeout,
123
+ short_timeout=short_timeout,
124
+ navigation_timeout=navigation_timeout,
125
+ network_idle_timeout=network_idle_timeout,
126
+ screenshot_timeout=screenshot_timeout,
127
+ page_stability_timeout=page_stability_timeout,
128
+ dom_content_loaded_timeout=dom_content_loaded_timeout,
129
+ viewport_limit=viewport_limit,
130
+ connect_over_cdp=connect_over_cdp,
131
+ cdp_url=cdp_url,
132
+ **kwargs,
412
133
  )
413
-
414
- return result
415
- except Exception as e:
416
- logger.error(f"Failed to navigate back: {e}")
417
- return {
418
- "result": f"Error navigating back: {e}",
419
- "snapshot": "",
420
- "tabs": [],
421
- "current_tab": 0,
422
- "total_tabs": 0,
423
- }
424
-
425
- async def browser_forward(self) -> Dict[str, Any]:
426
- r"""Goes forward to the next page in the browser history.
427
-
428
- This action simulates using the browser's "forward" button in the
429
- currently active tab.
430
-
431
- Returns:
432
- Dict[str, Any]: A dictionary with the result of the action:
433
- - "result" (str): Confirmation of the action.
434
- - "snapshot" (str): A textual snapshot of the next page.
435
- - "tabs" (List[Dict]): Information about all open tabs.
436
- - "current_tab" (int): Index of the active tab.
437
- - "total_tabs" (int): Total number of open tabs.
438
- """
439
- try:
440
- ws_wrapper = await self._get_ws_wrapper()
441
- result = await ws_wrapper.forward()
442
-
443
- # Add tab information
444
- tab_info = await ws_wrapper.get_tab_info()
445
- result.update(
446
- {
447
- "tabs": tab_info,
448
- "current_tab": next(
449
- (
450
- i
451
- for i, tab in enumerate(tab_info)
452
- if tab.get("is_current")
453
- ),
454
- 0,
455
- ),
456
- "total_tabs": len(tab_info),
457
- }
134
+ elif mode == "python":
135
+ from ..hybrid_browser_toolkit_py import (
136
+ HybridBrowserToolkit as PyToolkit,
458
137
  )
459
138
 
460
- return result
461
- except Exception as e:
462
- logger.error(f"Failed to navigate forward: {e}")
463
- return {
464
- "result": f"Error navigating forward: {e}",
465
- "snapshot": "",
466
- "tabs": [],
467
- "current_tab": 0,
468
- "total_tabs": 0,
469
- }
470
-
471
- async def browser_get_page_snapshot(self) -> str:
472
- r"""Gets a textual snapshot of the page's interactive elements.
473
-
474
- The snapshot lists elements like buttons, links, and inputs,
475
- each with
476
- a unique `ref` ID. This ID is used by other tools (e.g., `click`,
477
- `type`) to interact with a specific element. This tool provides no
478
- visual information.
479
-
480
- If viewport_limit is enabled, only elements within the current
481
- viewport
482
- will be included in the snapshot.
483
-
484
- Returns:
485
- str: A formatted string representing the interactive elements and
486
- their `ref` IDs. For example:
487
- '- link "Sign In" [ref=1]'
488
- '- textbox "Username" [ref=2]'
489
- """
490
- try:
491
- ws_wrapper = await self._get_ws_wrapper()
492
- return await ws_wrapper.get_page_snapshot(self._viewport_limit)
493
- except Exception as e:
494
- logger.error(f"Failed to get page snapshot: {e}")
495
- return f"Error capturing snapshot: {e}"
496
-
497
- @dependencies_required('PIL')
498
- async def browser_get_som_screenshot(
499
- self,
500
- read_image: bool = True,
501
- instruction: Optional[str] = None,
502
- ) -> str:
503
- r"""Captures a screenshot with interactive elements highlighted.
504
-
505
- "SoM" stands for "Set of Marks". This tool takes a screenshot and
506
- draws
507
- boxes around clickable elements, overlaying a `ref` ID on each. Use
508
- this for a visual understanding of the page, especially when the
509
- textual snapshot is not enough.
510
-
511
- Args:
512
- read_image (bool, optional): If `True`, the agent will analyze
513
- the screenshot. Requires agent to be registered.
514
- (default: :obj:`True`)
515
- instruction (Optional[str], optional): A specific question or
516
- command for the agent regarding the screenshot, used only if
517
- `read_image` is `True`. For example: "Find the login button."
518
-
519
- Returns:
520
- str: A confirmation message indicating the screenshot was
521
- captured, the file path where it was saved, and optionally the
522
- agent's analysis if `read_image` is `True`.
523
- """
524
- import base64
525
- import datetime
526
- import os
527
- import urllib.parse
528
-
529
- from camel.utils import sanitize_filename
530
-
531
- try:
532
- ws_wrapper = await self._get_ws_wrapper()
533
- result = await ws_wrapper.get_som_screenshot()
534
-
535
- # Initialize result text
536
- result_text = result.text
537
- file_path = None
538
-
539
- # Save screenshot to cache directory if images are available
540
- if result.images:
541
- # Ensure cache directory exists (use absolute path)
542
- cache_dir = os.path.abspath(self._cache_dir)
543
- os.makedirs(cache_dir, exist_ok=True)
544
-
545
- # Get current page URL for filename
546
- try:
547
- # Try to get the current page URL from the wrapper
548
- page_info = await ws_wrapper.get_tab_info()
549
- current_tab = next(
550
- (tab for tab in page_info if tab.get('is_current')),
551
- None,
552
- )
553
- url = current_tab['url'] if current_tab else 'unknown'
554
- except Exception:
555
- url = 'unknown'
556
-
557
- # Generate filename
558
- parsed_url = urllib.parse.urlparse(url)
559
- url_name = sanitize_filename(
560
- str(parsed_url.path) or 'homepage', max_length=241
561
- )
562
- timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
563
- file_path = os.path.join(
564
- cache_dir, f"{url_name}_{timestamp}_som.png"
139
+ # Note: Python implementation doesn't support CDP connection
140
+ if connect_over_cdp:
141
+ raise ValueError(
142
+ "CDP connection is only supported in TypeScript mode"
565
143
  )
566
144
 
567
- # Extract base64 data and save to file
568
- for _, image_data in enumerate(result.images):
569
- if image_data.startswith('data:image/png;base64,'):
570
- # Remove data URL prefix
571
- base64_data = image_data.split(',', 1)[1]
572
-
573
- # Decode and save
574
- image_bytes = base64.b64decode(base64_data)
575
- with open(file_path, 'wb') as f:
576
- f.write(image_bytes)
577
-
578
- logger.info(f"Screenshot saved to: {file_path}")
579
-
580
- # Update result text to include file path
581
- result_text += f" (saved to: {file_path})"
582
- break
583
-
584
- # Analyze image if requested and agent is registered
585
- if read_image and file_path:
586
- if self.agent is None:
587
- logger.error(
588
- "Cannot analyze screenshot: No agent registered. "
589
- "Please pass this toolkit to ChatAgent via "
590
- "toolkits_to_register_agent parameter."
591
- )
592
- result_text += (
593
- " Error: No agent registered for image analysis. "
594
- "Please pass this toolkit to ChatAgent via "
595
- "toolkits_to_register_agent parameter."
596
- )
597
- else:
598
- try:
599
- # Load the image and create a message
600
- from PIL import Image
601
-
602
- img = Image.open(file_path)
603
- inst = instruction if instruction is not None else ""
604
- message = BaseMessage.make_user_message(
605
- role_name="User",
606
- content=inst,
607
- image_list=[img],
608
- )
609
-
610
- # Get agent's analysis
611
- response = await self.agent.astep(message)
612
- agent_response = response.msgs[0].content
613
- result_text += f". Agent analysis: {agent_response}"
614
- except Exception as e:
615
- logger.error(f"Error analyzing screenshot: {e}")
616
- result_text += f". Error analyzing screenshot: {e}"
617
-
618
- return result_text
619
- except Exception as e:
620
- logger.error(f"Failed to get screenshot: {e}")
621
- return f"Error capturing screenshot: {e}"
622
-
623
- async def browser_click(self, *, ref: str) -> Dict[str, Any]:
624
- r"""Performs a click on an element on the page.
625
-
626
- Args:
627
- ref (str): The `ref` ID of the element to click. This ID is
628
- obtained from a page snapshot (`get_page_snapshot` or
629
- `get_som_screenshot`).
630
-
631
- Returns:
632
- Dict[str, Any]: A dictionary with the result of the action:
633
- - "result" (str): Confirmation of the action.
634
- - "snapshot" (str): A textual snapshot of the page after the
635
- click.
636
- - "tabs" (List[Dict]): Information about all open tabs.
637
- - "current_tab" (int): Index of the active tab.
638
- - "total_tabs" (int): Total number of open tabs.
639
- """
640
- try:
641
- ws_wrapper = await self._get_ws_wrapper()
642
- result = await ws_wrapper.click(ref)
643
-
644
- # Add tab information
645
- tab_info = await ws_wrapper.get_tab_info()
646
- result.update(
647
- {
648
- "tabs": tab_info,
649
- "current_tab": next(
650
- (
651
- i
652
- for i, tab in enumerate(tab_info)
653
- if tab.get("is_current")
654
- ),
655
- 0,
656
- ),
657
- "total_tabs": len(tab_info),
658
- }
659
- )
660
-
661
- return result
662
- except Exception as e:
663
- logger.error(f"Failed to click element: {e}")
664
- return {
665
- "result": f"Error clicking element: {e}",
666
- "snapshot": "",
667
- "tabs": [],
668
- "current_tab": 0,
669
- "total_tabs": 0,
670
- }
671
-
672
- async def browser_type(self, *, ref: str, text: str) -> Dict[str, Any]:
673
- r"""Types text into an input element on the page.
674
-
675
- Args:
676
- ref (str): The `ref` ID of the input element, from a snapshot.
677
- text (str): The text to type into the element.
678
-
679
- Returns:
680
- Dict[str, Any]: A dictionary with the result of the action:
681
- - "result" (str): Confirmation of the action.
682
- - "snapshot" (str): A textual snapshot of the page after
683
- typing.
684
- - "tabs" (List[Dict]): Information about all open tabs.
685
- - "current_tab" (int): Index of the active tab.
686
- - "total_tabs" (int): Total number of open tabs.
687
- """
688
- try:
689
- ws_wrapper = await self._get_ws_wrapper()
690
- result = await ws_wrapper.type(ref, text)
691
-
692
- # Add tab information
693
- tab_info = await ws_wrapper.get_tab_info()
694
- result.update(
695
- {
696
- "tabs": tab_info,
697
- "current_tab": next(
698
- (
699
- i
700
- for i, tab in enumerate(tab_info)
701
- if tab.get("is_current")
702
- ),
703
- 0,
704
- ),
705
- "total_tabs": len(tab_info),
706
- }
707
- )
708
-
709
- return result
710
- except Exception as e:
711
- logger.error(f"Failed to type text: {e}")
712
- return {
713
- "result": f"Error typing text: {e}",
714
- "snapshot": "",
715
- "tabs": [],
716
- "current_tab": 0,
717
- "total_tabs": 0,
718
- }
719
-
720
- async def browser_select(self, *, ref: str, value: str) -> Dict[str, Any]:
721
- r"""Selects an option in a dropdown (`<select>`) element.
722
-
723
- Args:
724
- ref (str): The `ref` ID of the `<select>` element.
725
- value (str): The `value` attribute of the `<option>` to select,
726
- not its visible text.
727
-
728
- Returns:
729
- Dict[str, Any]: A dictionary with the result of the action:
730
- - "result" (str): Confirmation of the action.
731
- - "snapshot" (str): A snapshot of the page after the
732
- selection.
733
- - "tabs" (List[Dict]): Information about all open tabs.
734
- - "current_tab" (int): Index of the active tab.
735
- - "total_tabs" (int): Total number of open tabs.
736
- """
737
- try:
738
- ws_wrapper = await self._get_ws_wrapper()
739
- result = await ws_wrapper.select(ref, value)
740
-
741
- # Add tab information
742
- tab_info = await ws_wrapper.get_tab_info()
743
- result.update(
744
- {
745
- "tabs": tab_info,
746
- "current_tab": next(
747
- (
748
- i
749
- for i, tab in enumerate(tab_info)
750
- if tab.get("is_current")
751
- ),
752
- 0,
753
- ),
754
- "total_tabs": len(tab_info),
755
- }
756
- )
757
-
758
- return result
759
- except Exception as e:
760
- logger.error(f"Failed to select option: {e}")
761
- return {
762
- "result": f"Error selecting option: {e}",
763
- "snapshot": "",
764
- "tabs": [],
765
- "current_tab": 0,
766
- "total_tabs": 0,
767
- }
768
-
769
- async def browser_scroll(
770
- self, *, direction: str, amount: int = 500
771
- ) -> Dict[str, Any]:
772
- r"""Scrolls the current page window.
773
-
774
- Args:
775
- direction (str): The direction to scroll: 'up' or 'down'.
776
- amount (int): The number of pixels to scroll, default is 500.
777
-
778
- Returns:
779
- Dict[str, Any]: A dictionary with the result of the action:
780
- - "result" (str): Confirmation of the action.
781
- - "snapshot" (str): A snapshot of the page after scrolling.
782
- - "tabs" (List[Dict]): Information about all open tabs.
783
- - "current_tab" (int): Index of the active tab.
784
- - "total_tabs" (int): Total number of open tabs.
785
- """
786
- try:
787
- ws_wrapper = await self._get_ws_wrapper()
788
- result = await ws_wrapper.scroll(direction, amount)
789
-
790
- # Add tab information
791
- tab_info = await ws_wrapper.get_tab_info()
792
- result.update(
793
- {
794
- "tabs": tab_info,
795
- "current_tab": next(
796
- (
797
- i
798
- for i, tab in enumerate(tab_info)
799
- if tab.get("is_current")
800
- ),
801
- 0,
802
- ),
803
- "total_tabs": len(tab_info),
804
- }
805
- )
806
-
807
- return result
808
- except Exception as e:
809
- logger.error(f"Failed to scroll: {e}")
810
- return {
811
- "result": f"Error scrolling: {e}",
812
- "snapshot": "",
813
- "tabs": [],
814
- "current_tab": 0,
815
- "total_tabs": 0,
816
- }
817
-
818
- async def browser_enter(self) -> Dict[str, Any]:
819
- r"""Simulates pressing the Enter key on the currently focused
820
- element.
821
-
822
- This is useful for submitting forms or search queries after using the
823
- `type` tool.
824
-
825
- Returns:
826
- Dict[str, Any]: A dictionary with the result of the action:
827
- - "result" (str): Confirmation of the action.
828
- - "snapshot" (str): A new page snapshot, as this action often
829
- triggers navigation.
830
- - "tabs" (List[Dict]): Information about all open tabs.
831
- - "current_tab" (int): Index of the active tab.
832
- - "total_tabs" (int): Total number of open tabs.
833
- """
834
- try:
835
- ws_wrapper = await self._get_ws_wrapper()
836
- result = await ws_wrapper.enter()
837
-
838
- # Add tab information
839
- tab_info = await ws_wrapper.get_tab_info()
840
- result.update(
841
- {
842
- "tabs": tab_info,
843
- "current_tab": next(
844
- (
845
- i
846
- for i, tab in enumerate(tab_info)
847
- if tab.get("is_current")
848
- ),
849
- 0,
850
- ),
851
- "total_tabs": len(tab_info),
852
- }
853
- )
854
-
855
- return result
856
- except Exception as e:
857
- logger.error(f"Failed to press enter: {e}")
858
- return {
859
- "result": f"Error pressing enter: {e}",
860
- "snapshot": "",
861
- "tabs": [],
862
- "current_tab": 0,
863
- "total_tabs": 0,
864
- }
865
-
866
- async def browser_switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
867
- r"""Switches to a different browser tab using its ID.
868
-
869
- After switching, all actions will apply to the new tab. Use
870
- `get_tab_info` to find the ID of the tab you want to switch to.
871
-
872
- Args:
873
- tab_id (str): The ID of the tab to activate.
874
-
875
- Returns:
876
- Dict[str, Any]: A dictionary with the result of the action:
877
- - "result" (str): Confirmation of the action.
878
- - "snapshot" (str): A snapshot of the newly active tab.
879
- - "tabs" (List[Dict]): Information about all open tabs.
880
- - "current_tab" (int): Index of the new active tab.
881
- - "total_tabs" (int): Total number of open tabs.
882
- """
883
- try:
884
- ws_wrapper = await self._get_ws_wrapper()
885
- result = await ws_wrapper.switch_tab(tab_id)
886
-
887
- # Add tab information
888
- tab_info = await ws_wrapper.get_tab_info()
889
- result.update(
890
- {
891
- "tabs": tab_info,
892
- "current_tab": next(
893
- (
894
- i
895
- for i, tab in enumerate(tab_info)
896
- if tab.get("is_current")
897
- ),
898
- 0,
899
- ),
900
- "total_tabs": len(tab_info),
901
- }
902
- )
903
-
904
- return result
905
- except Exception as e:
906
- logger.error(f"Failed to switch tab: {e}")
907
- return {
908
- "result": f"Error switching tab: {e}",
909
- "snapshot": "",
910
- "tabs": [],
911
- "current_tab": 0,
912
- "total_tabs": 0,
913
- }
914
-
915
- async def browser_close_tab(self, *, tab_id: str) -> Dict[str, Any]:
916
- r"""Closes a browser tab using its ID.
917
-
918
- Use `get_tab_info` to find the ID of the tab to close. After
919
- closing, the browser will switch to another tab if available.
920
-
921
- Args:
922
- tab_id (str): The ID of the tab to close.
923
-
924
- Returns:
925
- Dict[str, Any]: A dictionary with the result of the action:
926
- - "result" (str): Confirmation of the action.
927
- - "snapshot" (str): A snapshot of the active tab after
928
- closure.
929
- - "tabs" (List[Dict]): Information about remaining tabs.
930
- - "current_tab" (int): Index of the new active tab.
931
- - "total_tabs" (int): Total number of remaining tabs.
932
- """
933
- try:
934
- ws_wrapper = await self._get_ws_wrapper()
935
- result = await ws_wrapper.close_tab(tab_id)
936
-
937
- # Add tab information
938
- tab_info = await ws_wrapper.get_tab_info()
939
- result.update(
940
- {
941
- "tabs": tab_info,
942
- "current_tab": next(
943
- (
944
- i
945
- for i, tab in enumerate(tab_info)
946
- if tab.get("is_current")
947
- ),
948
- 0,
949
- ),
950
- "total_tabs": len(tab_info),
951
- }
952
- )
953
-
954
- return result
955
- except Exception as e:
956
- logger.error(f"Failed to close tab: {e}")
957
- return {
958
- "result": f"Error closing tab: {e}",
959
- "snapshot": "",
960
- "tabs": [],
961
- "current_tab": 0,
962
- "total_tabs": 0,
963
- }
145
+ # Note: Python implementation doesn't support viewport_limit
146
+ if viewport_limit:
147
+ import warnings
964
148
 
965
- async def browser_get_tab_info(self) -> Dict[str, Any]:
966
- r"""Gets a list of all open browser tabs and their information.
967
-
968
- This includes each tab's index, title, and URL, and indicates which
969
- tab is currently active. Use this to manage multiple tabs.
970
-
971
- Returns:
972
- Dict[str, Any]: A dictionary with tab information:
973
- - "tabs" (List[Dict]): A list of open tabs, each with:
974
- - "index" (int): The tab's zero-based index.
975
- - "title" (str): The page title.
976
- - "url" (str): The current URL.
977
- - "is_current" (bool): True if the tab is active.
978
- - "current_tab" (int): Index of the active tab.
979
- - "total_tabs" (int): Total number of open tabs.
980
- """
981
- try:
982
- ws_wrapper = await self._get_ws_wrapper()
983
- tab_info = await ws_wrapper.get_tab_info()
984
-
985
- return {
986
- "tabs": tab_info,
987
- "current_tab": next(
988
- (
989
- i
990
- for i, tab in enumerate(tab_info)
991
- if tab.get("is_current")
992
- ),
993
- 0,
994
- ),
995
- "total_tabs": len(tab_info),
996
- }
997
- except Exception as e:
998
- logger.error(f"Failed to get tab info: {e}")
999
- return {
1000
- "tabs": [],
1001
- "current_tab": 0,
1002
- "total_tabs": 0,
1003
- }
1004
-
1005
- # Additional methods for backward compatibility
1006
- async def browser_wait_user(
1007
- self, timeout_sec: Optional[float] = None
1008
- ) -> Dict[str, Any]:
1009
- r"""Pauses execution and waits for human input from the console.
1010
-
1011
- Use this for tasks requiring manual steps, like solving a CAPTCHA.
1012
- The
1013
- agent will resume after the user presses Enter in the console.
1014
-
1015
- Args:
1016
- timeout_sec (Optional[float]): Max time to wait in seconds. If
1017
- `None`, it will wait indefinitely.
1018
-
1019
- Returns:
1020
- Dict[str, Any]: A dictionary with the result of the action:
1021
- - "result" (str): A message indicating how the wait ended.
1022
- - "snapshot" (str): The page snapshot after the wait.
1023
- - "tabs" (List[Dict]): Information about all open tabs.
1024
- - "current_tab" (int): Index of the active tab.
1025
- - "total_tabs" (int): Total number of open tabs.
1026
- """
1027
- import asyncio
1028
-
1029
- prompt = (
1030
- "🕑 Agent waiting for human input. "
1031
- "Complete action in browser, then press Enter..."
1032
- )
1033
- logger.info(f"\n{prompt}\n")
1034
-
1035
- async def _await_enter():
1036
- try:
1037
- await asyncio.to_thread(
1038
- input, ">>> Press Enter to resume <<<\n"
149
+ warnings.warn(
150
+ "viewport_limit is not supported "
151
+ "in Python mode and will be ignored",
152
+ UserWarning,
1039
153
  )
1040
- except (asyncio.CancelledError, Exception):
1041
- # Handle cancellation gracefully
1042
- pass
1043
154
 
1044
- try:
1045
- if timeout_sec is not None:
1046
- logger.info(
1047
- f"Waiting for user input with timeout: {timeout_sec}s"
1048
- )
1049
- start_time = time.time()
1050
- task = asyncio.create_task(_await_enter())
1051
- try:
1052
- await asyncio.wait_for(task, timeout=timeout_sec)
1053
- wait_time = time.time() - start_time
1054
- logger.info(f"User input received after {wait_time:.2f}s")
1055
- result_msg = "User resumed."
1056
- except asyncio.TimeoutError:
1057
- task.cancel()
1058
- # Wait for task to be cancelled properly
1059
- try:
1060
- await task
1061
- except asyncio.CancelledError:
1062
- pass
1063
- raise
1064
- else:
1065
- logger.info("Waiting for user input (no timeout)")
1066
- start_time = time.time()
1067
- await _await_enter()
1068
- wait_time = time.time() - start_time
1069
- logger.info(f"User input received after {wait_time:.2f}s")
1070
- result_msg = "User resumed."
1071
- except asyncio.TimeoutError:
1072
- wait_time = timeout_sec or 0.0
1073
- logger.info(
1074
- f"User input timeout reached after {wait_time}s, "
1075
- f"auto-resuming"
155
+ return PyToolkit(
156
+ headless=headless,
157
+ user_data_dir=user_data_dir,
158
+ stealth=stealth,
159
+ web_agent_model=web_agent_model,
160
+ cache_dir=cache_dir,
161
+ enabled_tools=enabled_tools,
162
+ browser_log_to_file=browser_log_to_file,
163
+ session_id=session_id,
164
+ default_start_url=default_start_url,
165
+ default_timeout=default_timeout,
166
+ short_timeout=short_timeout,
167
+ navigation_timeout=navigation_timeout,
168
+ network_idle_timeout=network_idle_timeout,
169
+ screenshot_timeout=screenshot_timeout,
170
+ page_stability_timeout=page_stability_timeout,
171
+ dom_content_loaded_timeout=dom_content_loaded_timeout,
172
+ **kwargs,
173
+ )
174
+ else:
175
+ raise ValueError(
176
+ f"Invalid mode: {mode}. Must be 'typescript' or 'python'."
1076
177
  )
1077
- result_msg = f"Timeout {timeout_sec}s reached, auto-resumed."
1078
-
1079
- try:
1080
- snapshot = await self.browser_get_page_snapshot()
1081
- tab_info = await self.browser_get_tab_info()
1082
- return {"result": result_msg, "snapshot": snapshot, **tab_info}
1083
- except Exception as e:
1084
- logger.warning(f"Failed to get snapshot after wait: {e}")
1085
- return {
1086
- "result": result_msg,
1087
- "snapshot": "",
1088
- "tabs": [],
1089
- "current_tab": 0,
1090
- "total_tabs": 0,
1091
- }
1092
-
1093
- def clone_for_new_session(
1094
- self, new_session_id: Optional[str] = None
1095
- ) -> "HybridBrowserToolkit":
1096
- r"""Create a new instance of HybridBrowserToolkit with a unique
1097
- session.
1098
-
1099
- Args:
1100
- new_session_id: Optional new session ID. If None, a UUID will be
1101
- generated.
1102
-
1103
- Returns:
1104
- A new HybridBrowserToolkit instance with the same configuration
1105
- but a different session.
1106
- """
1107
- import uuid
1108
-
1109
- if new_session_id is None:
1110
- new_session_id = str(uuid.uuid4())[:8]
1111
-
1112
- return HybridBrowserToolkit(
1113
- headless=self._headless,
1114
- user_data_dir=self._user_data_dir,
1115
- stealth=self._stealth,
1116
- web_agent_model=self._web_agent_model,
1117
- cache_dir=f"{self._cache_dir.rstrip('/')}_clone_"
1118
- f"{new_session_id}/",
1119
- enabled_tools=self.enabled_tools.copy(),
1120
- browser_log_to_file=self._browser_log_to_file,
1121
- session_id=new_session_id,
1122
- default_start_url=self._default_start_url,
1123
- default_timeout=self._default_timeout,
1124
- short_timeout=self._short_timeout,
1125
- navigation_timeout=self._navigation_timeout,
1126
- network_idle_timeout=self._network_idle_timeout,
1127
- screenshot_timeout=self._screenshot_timeout,
1128
- page_stability_timeout=self._page_stability_timeout,
1129
- dom_content_loaded_timeout=self._dom_content_loaded_timeout,
1130
- )
1131
-
1132
- def get_tools(self) -> List[FunctionTool]:
1133
- r"""Get available function tools based
1134
- on enabled_tools configuration."""
1135
- # Map tool names to their corresponding methods
1136
- tool_map = {
1137
- "browser_open": self.browser_open,
1138
- "browser_close": self.browser_close,
1139
- "browser_visit_page": self.browser_visit_page,
1140
- "browser_back": self.browser_back,
1141
- "browser_forward": self.browser_forward,
1142
- "browser_get_page_snapshot": self.browser_get_page_snapshot,
1143
- "browser_get_som_screenshot": self.browser_get_som_screenshot,
1144
- "browser_click": self.browser_click,
1145
- "browser_type": self.browser_type,
1146
- "browser_select": self.browser_select,
1147
- "browser_scroll": self.browser_scroll,
1148
- "browser_enter": self.browser_enter,
1149
- "browser_wait_user": self.browser_wait_user,
1150
- "browser_switch_tab": self.browser_switch_tab,
1151
- "browser_close_tab": self.browser_close_tab,
1152
- "browser_get_tab_info": self.browser_get_tab_info,
1153
- }
1154
-
1155
- enabled_tools = []
1156
-
1157
- for tool_name in self.enabled_tools:
1158
- if (
1159
- tool_name == "browser_solve_task"
1160
- and self._web_agent_model is None
1161
- ):
1162
- logger.warning(
1163
- f"Tool '{tool_name}' is enabled but web_agent_model "
1164
- f"is not provided. Skipping this tool."
1165
- )
1166
- continue
1167
-
1168
- if tool_name in tool_map:
1169
- tool = FunctionTool(
1170
- cast(Callable[..., Any], tool_map[tool_name])
1171
- )
1172
- enabled_tools.append(tool)
1173
- else:
1174
- logger.warning(f"Unknown tool name: {tool_name}")
1175
-
1176
- logger.info(f"Returning {len(enabled_tools)} enabled tools")
1177
- return enabled_tools