camel-ai 0.2.71a2__py3-none-any.whl → 0.2.71a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (32)
  1. camel/__init__.py +1 -1
  2. camel/agents/_types.py +6 -2
  3. camel/agents/chat_agent.py +297 -16
  4. camel/interpreters/docker_interpreter.py +3 -2
  5. camel/loaders/base_loader.py +85 -0
  6. camel/messages/base.py +2 -6
  7. camel/services/agent_openapi_server.py +380 -0
  8. camel/societies/workforce/workforce.py +144 -33
  9. camel/toolkits/__init__.py +7 -4
  10. camel/toolkits/craw4ai_toolkit.py +2 -2
  11. camel/toolkits/file_write_toolkit.py +6 -6
  12. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/__init__.py +2 -2
  13. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/actions.py +47 -11
  14. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/agent.py +21 -11
  15. camel/toolkits/{non_visual_browser_toolkit/nv_browser_session.py → hybrid_browser_toolkit/browser_session.py} +64 -10
  16. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +1008 -0
  17. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/snapshot.py +16 -4
  18. camel/toolkits/{non_visual_browser_toolkit/snapshot.js → hybrid_browser_toolkit/unified_analyzer.js} +202 -23
  19. camel/toolkits/note_taking_toolkit.py +90 -0
  20. camel/toolkits/openai_image_toolkit.py +292 -0
  21. camel/toolkits/slack_toolkit.py +4 -4
  22. camel/toolkits/terminal_toolkit.py +223 -73
  23. camel/types/agents/tool_calling_record.py +4 -1
  24. camel/types/enums.py +24 -24
  25. camel/utils/mcp_client.py +37 -1
  26. camel/utils/tool_result.py +44 -0
  27. {camel_ai-0.2.71a2.dist-info → camel_ai-0.2.71a4.dist-info}/METADATA +58 -5
  28. {camel_ai-0.2.71a2.dist-info → camel_ai-0.2.71a4.dist-info}/RECORD +30 -26
  29. camel/toolkits/dalle_toolkit.py +0 -175
  30. camel/toolkits/non_visual_browser_toolkit/browser_non_visual_toolkit.py +0 -446
  31. {camel_ai-0.2.71a2.dist-info → camel_ai-0.2.71a4.dist-info}/WHEEL +0 -0
  32. {camel_ai-0.2.71a2.dist-info → camel_ai-0.2.71a4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1008 @@
1
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+
15
+ import base64
16
+ import datetime
17
+ import io
18
+ import os
19
+ import time
20
+ import urllib.parse
21
+ from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
22
+
23
+ from camel.logger import get_logger
24
+ from camel.models import BaseModelBackend
25
+ from camel.toolkits.base import BaseToolkit
26
+ from camel.toolkits.function_tool import FunctionTool
27
+ from camel.utils import sanitize_filename
28
+ from camel.utils.commons import dependencies_required
29
+
30
+ from .agent import PlaywrightLLMAgent
31
+ from .browser_session import NVBrowserSession
32
+
33
+ logger = get_logger(__name__)
34
+
35
+
36
class HybridBrowserToolkit(BaseToolkit):
    r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
    automation with visual, screenshot-based capabilities.

    This toolkit exposes a set of actions as CAMEL FunctionTools for agents
    to interact with web pages. It can operate in headless mode and supports
    both programmatic control of browser actions (like clicking and typing)
    and visual analysis of the page layout through screenshots with marked
    interactive elements.

    The names listed in `DEFAULT_TOOLS` and `ALL_TOOLS` are the tool
    identifiers accepted by the `enabled_tools` constructor argument.
    """

    # Configuration constants (all timeouts are expressed in milliseconds)
    DEFAULT_SCREENSHOT_TIMEOUT = 60000  # 60 seconds for screenshots
    PAGE_STABILITY_TIMEOUT = 3000  # 3 seconds for DOM stability
    NETWORK_IDLE_TIMEOUT = 2000  # 2 seconds for network idle

    # Default tool list - core browser functionality, used when the caller
    # does not pass `enabled_tools`
    DEFAULT_TOOLS: ClassVar[List[str]] = [
        "open_browser",
        "close_browser",
        "visit_page",
        "click",
        "type",
        "enter",
    ]

    # All available tools - `enabled_tools` is validated against this list
    ALL_TOOLS: ClassVar[List[str]] = [
        "open_browser",
        "close_browser",
        "visit_page",
        "get_page_snapshot",
        "get_som_screenshot",
        "get_page_links",
        "click",
        "type",
        "select",
        "scroll",
        "enter",
        "wait_user",
        "solve_task",
    ]
78
+
79
def __init__(
    self,
    *,
    headless: bool = True,
    user_data_dir: Optional[str] = None,
    web_agent_model: Optional[BaseModelBackend] = None,
    cache_dir: str = "tmp/",
    enabled_tools: Optional[List[str]] = None,
) -> None:
    r"""Store the configuration, create the cache directory and build the
    browser session.

    Args:
        headless (bool): Whether to run the browser in headless mode.
            (default: :obj:`True`)
        user_data_dir (Optional[str]): Path to a directory for storing
            browser data like cookies and local storage. Useful for
            maintaining sessions across runs. When `None`, a temporary
            directory is used. (default: :obj:`None`)
        web_agent_model (Optional[BaseModelBackend]): The language model
            backend for the high-level `solve_task` agent. Only required
            if `solve_task` is going to be used. (default: :obj:`None`)
        cache_dir (str): Directory for cached files such as screenshots.
            It is created if it does not exist. (default: :obj:`"tmp/"`)
        enabled_tools (Optional[List[str]]): Names of the tools to
            enable. When `None`, `DEFAULT_TOOLS` is used. Available
            tools: open_browser, close_browser, visit_page,
            get_page_snapshot, get_som_screenshot, get_page_links, click,
            type, select, scroll, enter, wait_user, solve_task.
            (default: :obj:`None`)

    Raises:
        ValueError: If `enabled_tools` contains a name that is not in
            `ALL_TOOLS`.
    """
    super().__init__()

    self._headless = headless
    self._user_data_dir = user_data_dir
    self.web_agent_model = web_agent_model
    self.cache_dir = cache_dir
    os.makedirs(self.cache_dir, exist_ok=True)

    # Resolve the tool selection; a caller-supplied list is validated and
    # copied so later mutation by the caller has no effect here.
    if enabled_tools is not None:
        unknown_names = [
            name for name in enabled_tools if name not in self.ALL_TOOLS
        ]
        if unknown_names:
            raise ValueError(
                f"Invalid tools specified: {unknown_names}. "
                f"Available tools: {self.ALL_TOOLS}"
            )
        selected_tools = enabled_tools.copy()
    else:
        selected_tools = self.DEFAULT_TOOLS.copy()
    self.enabled_tools = selected_tools

    logger.info(f"Enabled tools: {self.enabled_tools}")

    # Core components; the LLM agent is created lazily by `_ensure_agent`
    self._session = NVBrowserSession(
        headless=headless, user_data_dir=user_data_dir
    )
    self._agent: Optional[PlaywrightLLMAgent] = None
    self._unified_script = self._load_unified_analyzer()
140
+
141
def __del__(self):
    r"""Best-effort release of browser resources on garbage collection.

    Nothing is attempted while the interpreter is finalizing. Otherwise
    the browser is closed only when an event loop can be obtained that is
    neither closed nor already running. All errors are suppressed.
    """
    try:
        import sys

        interpreter_finalizing = getattr(
            sys, "is_finalizing", lambda: False
        )
        if interpreter_finalizing():
            return

        import asyncio

        try:
            loop = asyncio.get_event_loop()
            loop_unusable = loop.is_closed() or loop.is_running()
            if not loop_unusable:
                loop.run_until_complete(self.close_browser())
        except (RuntimeError, ImportError):
            pass  # Event loop unavailable, skip cleanup
    except Exception:
        pass  # Suppress all errors during garbage collection
159
+
160
def _load_unified_analyzer(self) -> str:
    r"""Read `unified_analyzer.js`, located next to this module, and
    return its text.

    Returns:
        str: The content of the script file.

    Raises:
        FileNotFoundError: If the script file does not exist.
        ValueError: If the script file is empty or whitespace only.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    script_path = os.path.join(module_dir, "unified_analyzer.js")

    try:
        with open(
            script_path, "r", encoding='utf-8', errors='replace'
        ) as script_file:
            source_text = script_file.read()
    except FileNotFoundError:
        raise FileNotFoundError(f"Script not found: {script_path}")

    if not source_text.strip():
        raise ValueError(f"Script is empty: {script_path}")

    logger.debug(f"Loaded unified analyzer ({len(source_text)} chars)")
    return source_text
181
+
182
def _validate_ref(self, ref: str, method_name: str) -> None:
    r"""Check that `ref` is a non-empty string.

    Args:
        ref (str): The element reference ID to check.
        method_name (str): Name of the calling method, used as the prefix
            of the error message.

    Raises:
        ValueError: If `ref` is empty or is not a string.
    """
    if ref and isinstance(ref, str):
        return
    raise ValueError(f"{method_name}: 'ref' must be a non-empty string")
188
+
189
async def _ensure_browser(self):
    r"""Delegate to the session's `ensure_browser`."""
    session = self._session
    await session.ensure_browser()
191
+
192
async def _require_page(self):
    r"""Call the session's `ensure_browser`, then return the result of its
    `get_page`.
    """
    session = self._session
    await session.ensure_browser()
    page = await session.get_page()
    return page
195
+
196
async def _wait_for_page_stability(self):
    r"""Give the page time to settle after an action that may update it.

    Waits for the 'domcontentloaded' state (bounded by
    `PAGE_STABILITY_TIMEOUT`), then tries to reach 'networkidle' (bounded
    by `NETWORK_IDLE_TIMEOUT`), then sleeps for half a second. A failed
    wait is only logged at debug level; it is never raised.
    """
    import asyncio

    page = await self._require_page()

    try:
        # Wait for DOM content to be loaded
        await page.wait_for_load_state(
            'domcontentloaded', timeout=self.PAGE_STABILITY_TIMEOUT
        )
        logger.debug("DOM content loaded")

        # Network idle matters for AJAX/SPA pages, but missing it is not
        # treated as a failure.
        try:
            await page.wait_for_load_state(
                'networkidle', timeout=self.NETWORK_IDLE_TIMEOUT
            )
        except Exception:
            logger.debug("Network idle timeout - continuing anyway")
        else:
            logger.debug("Network idle achieved")

        # Additional small delay for JavaScript execution
        await asyncio.sleep(0.5)
        logger.debug("Page stability wait completed")
    except Exception as e:
        logger.debug(f"Page stability wait failed: {e} - continuing anyway")
227
+
228
async def _get_unified_analysis(self) -> Dict[str, Any]:
    r"""Evaluate the unified analyzer script on the current page.

    Returns:
        Dict[str, Any]: The dictionary produced by the script. If the
            script is not loaded, the evaluation raises, or the result is
            not a dictionary, an empty analysis
            (`{"elements": {}, "metadata": {"elementCount": 0}}`) is
            returned instead.
    """

    def _empty_analysis() -> Dict[str, Any]:
        # A fresh dict per call so callers never share mutable state
        return {"elements": {}, "metadata": {"elementCount": 0}}

    page = await self._require_page()
    try:
        if not self._unified_script:
            logger.error("Unified analyzer script not loaded")
            return _empty_analysis()

        outcome = await page.evaluate(self._unified_script)
        if isinstance(outcome, dict):
            return outcome

        logger.warning(f"Invalid result type: {type(outcome)}")
        return _empty_analysis()
    except Exception as e:
        logger.warning(f"Error in unified analysis: {e}")
        return _empty_analysis()
246
+
247
def _convert_analysis_to_rects(
    self, analysis_data: Dict[str, Any]
) -> Dict[str, Any]:
    r"""Build the rect mapping used for visual marking.

    Args:
        analysis_data (Dict[str, Any]): Analysis result whose "elements"
            entry maps a ref to the data of one element.

    Returns:
        Dict[str, Any]: For every element with a non-empty "coordinates"
            list, an entry with "role", "aria-name" and "rects", where
            "rects" holds only the first coordinate item. Elements
            without coordinates are left out.
    """
    marks: Dict[str, Any] = {}
    for ref, info in analysis_data.get("elements", {}).items():
        boxes = info.get("coordinates", [])
        if not boxes:
            continue
        marks[ref] = {
            "role": info.get("role", "generic"),
            "aria-name": info.get("name", ""),
            "rects": [boxes[0]],
        }
    return marks
263
+
264
def _add_set_of_mark(self, image, rects):
    r"""Return a copy of `image` with an outlined box and a ref label
    drawn for every rect.

    Args:
        image: The image to annotate; it is copied, not modified.
        rects: Mapping of ref to a dict with "role" and "rects", as built
            by `_convert_analysis_to_rects`.

    Returns:
        The annotated copy, or the original `image` unchanged when PIL
        cannot be imported.
    """
    try:
        from PIL import ImageDraw, ImageFont
    except ImportError:
        logger.warning("PIL not available, returning original image")
        return image

    annotated = image.copy()
    canvas = ImageDraw.Draw(annotated)

    # Font preference: Arial, then PIL's default, then no font at all
    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except (OSError, IOError):
        try:
            font = ImageFont.load_default()
        except (OSError, IOError):
            font = None

    # One color per element role; unknown roles use "default"
    palette = {
        "button": "#FF6B6B",
        "link": "#4ECDC4",
        "textbox": "#45B7D1",
        "select": "#96CEB4",
        "checkbox": "#FECA57",
        "radio": "#FF9FF3",
        "default": "#DDA0DD",
    }

    for ref, entry in rects.items():
        boxes = entry.get("rects", [])
        role = entry.get("role", "generic")
        color = palette.get(role, palette["default"])

        for box in boxes:
            left = box.get("x", 0)
            top = box.get("y", 0)
            box_width = box.get("width", 0)
            box_height = box.get("height", 0)

            # Outline of the element
            canvas.rectangle(
                [left, top, left + box_width, top + box_height],
                outline=color,
                width=2,
            )

            # Label size: measured with the font, else estimated
            if font:
                text_box = canvas.textbbox((0, 0), ref, font=font)
                text_width = text_box[2] - text_box[0]
                text_height = text_box[3] - text_box[1]
            else:
                text_width = len(ref) * 8
                text_height = 16

            # The label sits just above the box, clamped to the image
            label_x = max(0, left - 2)
            label_y = max(0, top - text_height - 2)

            canvas.rectangle(
                [
                    label_x,
                    label_y,
                    label_x + text_width + 4,
                    label_y + text_height + 2,
                ],
                fill=color,
            )
            canvas.text(
                (label_x + 2, label_y + 1),
                ref,
                fill="white",
                font=font,
            )

    return annotated
340
+
341
def _format_snapshot_from_analysis(
    self, analysis_data: Dict[str, Any]
) -> str:
    r"""Render the analysed elements as snapshot text, one per line.

    Each line has the form `- <role> "<name>" <props> [ref=<ref>]`; the
    name and the properties are left out when absent. "disabled" is shown
    only when it is `True`; "checked" and "expanded" are shown bare when
    `True` and as `prop=value` for any other non-`None` value.

    Args:
        analysis_data (Dict[str, Any]): Analysis result whose "elements"
            entry maps a ref to the data of one element.

    Returns:
        str: The lines joined by newlines; empty if there are no elements.
    """

    def _render(ref: str, info: Dict[str, Any]) -> str:
        role = info.get("role", "generic")
        segments = [f"- {role}"]

        name = info.get("name", "")
        if name:
            segments.append(f'"{name}"')

        for prop in ("disabled", "checked", "expanded"):
            value = info.get(prop)
            if value is True:
                segments.append(prop)
            elif value is not None and prop != "disabled":
                segments.append(f"{prop}={value}")

        segments.append(f"[ref={ref}]")
        return " ".join(segments)

    elements = analysis_data.get("elements", {})
    return "\n".join(_render(ref, info) for ref, info in elements.items())
372
+
373
async def _exec_with_snapshot(
    self, action: Dict[str, Any]
) -> Dict[str, str]:
    r"""Run an action through the session and report how the page
    snapshot changed.

    A snapshot is taken before and after the action. For "click", "type",
    "select" and "enter" the page is given time to stabilize before the
    second snapshot is taken.

    Args:
        action (Dict[str, Any]): The action description passed to the
            session's `exec_action`; its "type" entry selects the action.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": The value returned by the session for the action.
            - "snapshot": The snapshot after the action, or the string
              "snapshot not changed" if it equals the one taken before.
    """
    kind = action.get("type", "unknown")
    logger.info(f"Executing action: {kind}")

    # Snapshot before the action
    logger.info("Capturing pre-action snapshot...")
    started = time.time()
    before = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    elapsed = time.time() - started
    logger.info(f"Pre-action snapshot captured in {elapsed:.2f}s")

    # The action itself
    logger.info(f"Executing {kind} action...")
    started = time.time()
    result = await self._session.exec_action(action)
    elapsed = time.time() - started
    logger.info(f"Action {kind} completed in {elapsed:.2f}s")

    # These actions may trigger page updates, so let the page settle
    if kind in ("click", "type", "select", "enter"):
        logger.info(f"Waiting for page stability after {kind}...")
        started = time.time()
        await self._wait_for_page_stability()
        elapsed = time.time() - started
        logger.info(f"Page stability wait completed in {elapsed:.2f}s")

    # Snapshot after the action
    logger.info("Capturing post-action snapshot...")
    started = time.time()
    after = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    elapsed = time.time() - started
    logger.info(f"Post-action snapshot captured in {elapsed:.2f}s")

    if before == after:
        logger.debug("Page snapshot unchanged after action")
        return {"result": result, "snapshot": "snapshot not changed"}

    # The snapshot changed: log a hint about its quality
    if "<empty>" in after:
        logger.warning(
            f"Action {kind} resulted in empty snapshot - "
            f"page may still be loading"
        )
    elif len(after.strip()) < 50:
        logger.warning(
            f"Action {kind} resulted in very short snapshot: "
            f"{len(after)} chars"
        )
    else:
        logger.debug(
            f"Action {kind} resulted in updated snapshot: "
            f"{len(after)} chars"
        )

    return {"result": result, "snapshot": after}
448
+
449
async def _extract_links_by_refs(
    self, snapshot: str, page, refs: List[str]
) -> List[Dict[str, str]]:
    r"""Collect text, ref and URL for the requested link refs.

    Args:
        snapshot (str): Snapshot text that is scanned line by line for
            entries of the form `- link "<text>" [ref=<ref>]`.
        page: The page object handed to `_get_link_url_by_ref`.
        refs (List[str]): The reference IDs to look for.

    Returns:
        List[Dict[str, str]]: One dictionary with "text", "ref" and "url"
            for each matching snapshot line, in snapshot order. "url" is
            an empty string when it could not be resolved.
    """
    import re

    link_line = r'- link\s+"([^"]+)"\s+\[ref=([^\]]+)\]'
    wanted = set(refs)
    links = []

    for line in snapshot.split('\n'):
        match = re.search(link_line, line)
        if not match:
            continue
        text, found_ref = match.groups()
        if found_ref not in wanted:
            continue

        try:
            url = await self._get_link_url_by_ref(page, found_ref)
        except Exception as e:
            logger.warning(f"Failed to get URL for ref {found_ref}: {e}")
            url = ""
        links.append({"text": text, "ref": found_ref, "url": url or ""})

    return links
479
+
480
async def _get_link_url_by_ref(self, page, ref: str) -> str:
    r"""Resolve the absolute URL of the element carrying the given ref.

    The element is looked up with the selector `[aria-ref="<ref>"]` and
    its `href` attribute is joined with `page.url`.

    Args:
        page: The page object to query.
        ref (str): The reference ID of the link element.

    Returns:
        str: The absolute URL, or an empty string if the element or its
            `href` is missing or the lookup raised an error.
    """
    try:
        element = await page.query_selector(f'[aria-ref="{ref}"]')
        href = await element.get_attribute('href') if element else None
        if not href:
            return ""

        from urllib.parse import urljoin

        return urljoin(page.url, href)
    except Exception as e:
        logger.warning(f"Failed to get URL for ref {ref}: {e}")
        return ""
494
+
495
def _ensure_agent(self) -> PlaywrightLLMAgent:
    r"""Return the `PlaywrightLLMAgent`, creating it on first use.

    Returns:
        PlaywrightLLMAgent: The agent, built with this toolkit's headless
            setting, user data directory and `web_agent_model`.

    Raises:
        RuntimeError: If no `web_agent_model` was configured.
    """
    model = self.web_agent_model
    if model is None:
        raise RuntimeError(
            "web_agent_model required for high-level task planning"
        )

    if self._agent is not None:
        return self._agent

    self._agent = PlaywrightLLMAgent(
        headless=self._headless,
        user_data_dir=self._user_data_dir,
        model_backend=self.web_agent_model,
    )
    return self._agent
509
+
510
+ # Public API Methods
511
+
512
async def open_browser(
    self, start_url: Optional[str] = "https://search.brave.com/"
) -> Dict[str, str]:
    r"""Launches a new browser session, making it ready for web automation.

    This method initializes the underlying browser instance and, when a
    `start_url` is given, navigates to it right away. If you don't have a
    specific URL to start with, you can use a search engine like
    'https://search.brave.com/'.

    Args:
        start_url (Optional[str]): The initial URL to navigate to after
            the browser is launched. If it is `None` or empty, no
            navigation is performed.
            (default: :obj:`https://search.brave.com/`)

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": The outcome of the navigation when `start_url`
              is given, otherwise a string confirming that the browser
              session has started.
            - "snapshot": A textual representation of the current page's
              interactive elements. This snapshot is crucial for
              identifying elements for subsequent actions.
    """
    logger.info("Starting browser session...")

    launch_began = time.time()
    await self._session.ensure_browser()
    launch_elapsed = time.time() - launch_began
    logger.info(f"Browser session started in {launch_elapsed:.2f}s")

    if not start_url:
        # No navigation requested: report the browser as it is
        logger.info("Capturing initial browser snapshot...")
        capture_began = time.time()
        snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        capture_elapsed = time.time() - capture_began
        logger.info(f"Initial snapshot captured in {capture_elapsed:.2f}s")
        return {"result": "Browser session started.", "snapshot": snapshot}

    logger.info(f"Auto-navigating to start URL: {start_url}")
    return await self.visit_page(start_url)
555
+
556
async def close_browser(self) -> str:
    r"""Closes the current browser session and releases all associated
    resources.

    This should be called at the end of a web automation task to ensure a
    clean shutdown of the browser instance. Closing the high-level agent
    is best-effort: a failure there is logged and does not prevent the
    session itself from being closed.

    Returns:
        str: A confirmation message indicating the session is closed.
    """
    if self._agent is not None:
        try:
            await self._agent.close()
        except Exception as e:
            # Still best-effort, but leave a trace instead of hiding it
            logger.warning(f"Failed to close web agent cleanly: {e}")
        self._agent = None

    await self._session.close()
    return "Browser session closed."
575
+
576
async def visit_page(self, url: str) -> Dict[str, str]:
    r"""Navigates the current browser page to a specified URL.

    Surrounding whitespace is removed from `url`. If the URL has no
    scheme (e.g. "example.com"), "https://" is prepended; "about:" and
    "data:" URLs are passed through unchanged.

    Args:
        url (str): The web address to load in the browser. Must be a
            valid URL.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message indicating the outcome of the navigation,
              e.g., "Navigation successful.". If `url` is not a non-empty
              string, an error message is returned here instead.
            - "snapshot": A new textual snapshot of the page's interactive
              elements after the new page has loaded (empty on error).
    """
    # Whitespace-only input is as unusable as an empty string
    if not isinstance(url, str) or not url.strip():
        return {
            "result": "Error: 'url' must be a non-empty string",
            "snapshot": "",
        }
    url = url.strip()

    # "about:blank" and "data:..." carry a scheme without "://"; prefixing
    # them with "https://" would produce an address that cannot be loaded.
    schemes_without_slashes = ("about:", "data:")
    has_scheme = '://' in url or url.lower().startswith(
        schemes_without_slashes
    )
    if not has_scheme:
        url = f'https://{url}'

    logger.info(f"Navigating to URL: {url}")

    # Navigate to page
    nav_start = time.time()
    nav_result = await self._session.visit(url)
    nav_time = time.time() - nav_start
    logger.info(f"Page navigation completed in {nav_time:.2f}s")

    # Get snapshot
    logger.info("Capturing page snapshot after navigation...")
    snapshot_start = time.time()
    snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    snapshot_time = time.time() - snapshot_start
    logger.info(f"Navigation snapshot captured in {snapshot_time:.2f}s")

    return {"result": nav_result, "snapshot": snapshot}
617
+
618
async def get_page_snapshot(self) -> str:
    r"""Captures a textual representation of the current page's content.

    This "snapshot" provides a simplified view of the DOM, focusing on
    interactive elements like links, buttons, and input fields. Each
    element is assigned a unique reference ID (`ref`) that can be used in
    other actions like `click` or `type`.

    The snapshot is useful for understanding the page structure and
    identifying elements to interact with without needing to parse raw
    HTML. A new snapshot is generated on each call.

    Returns:
        str: A formatted string representing the interactive elements on
            the page. For example:
            '- link "Sign In" [ref=1]'
            '- textbox "Username" [ref=2]'
    """
    logger.info("Capturing page snapshot")

    started = time.time()
    analysis = await self._get_unified_analysis()
    elapsed = time.time() - started
    logger.info(f"Page snapshot analysis completed in {elapsed:.2f}s")

    # Prefer the text supplied by the analyzer; otherwise build it from
    # the element data.
    ready_text = analysis.get("snapshotText", "")
    if ready_text:
        return ready_text
    return self._format_snapshot_from_analysis(analysis)
651
+
652
@dependencies_required('PIL')
async def get_som_screenshot(self):
    r"""Captures a screenshot of the current webpage and visually marks all
    interactive elements. "SoM" stands for "Set of Marks".

    This method is essential for tasks requiring visual understanding of
    the page layout. It works by:
    1. Taking a screenshot of the page as currently displayed.
    2. Identifying all interactive elements (buttons, links, inputs, etc.).
    3. Drawing colored boxes and reference IDs (`ref`) over these elements
       on the screenshot.
    4. Saving the annotated image to a cache directory.
    5. Returning the image as a base64-encoded string along with a summary.

    Use this when the textual snapshot from `get_page_snapshot` is
    insufficient and visual context is needed to decide the next action.

    Returns:
        ToolResult: An object containing:
            - `text`: A summary string, e.g., "Visual webpage screenshot
              captured with 42 interactive elements".
            - `images`: A list containing a single base64-encoded PNG image
              as a data URL.
    """
    from PIL import Image

    from camel.utils.tool_result import ToolResult

    # Get screenshot and analysis
    page = await self._require_page()

    logger.info(
        f"Starting screenshot capture "
        f"with timeout: {self.DEFAULT_SCREENSHOT_TIMEOUT}ms"
    )

    # Only `timeout` is passed, so the capture area is whatever
    # `page.screenshot` uses by default.
    start_time = time.time()
    image_data = await page.screenshot(
        timeout=self.DEFAULT_SCREENSHOT_TIMEOUT
    )
    screenshot_time = time.time() - start_time

    logger.info(f"Screenshot capture completed in {screenshot_time:.2f}s")
    image = Image.open(io.BytesIO(image_data))

    logger.info("Starting unified page analysis...")
    analysis_start_time = time.time()
    analysis_data = await self._get_unified_analysis()
    analysis_time = time.time() - analysis_start_time
    logger.info(f"Unified page analysis completed in {analysis_time:.2f}s")

    logger.info("Processing visual marks on screenshot...")
    mark_start_time = time.time()
    rects = self._convert_analysis_to_rects(analysis_data)
    marked_image = self._add_set_of_mark(image, rects)
    mark_time = time.time() - mark_start_time
    logger.info(f"Visual marks processing completed in {mark_time:.2f}s")

    # Encode the annotated image once; the same bytes are written to the
    # cache file and embedded in the data URL.
    img_buffer = io.BytesIO()
    marked_image.save(img_buffer, format="PNG")
    png_bytes = img_buffer.getvalue()

    # Save screenshot to cache directory, named after the URL path
    parsed_url = urllib.parse.urlparse(page.url)
    url_name = sanitize_filename(str(parsed_url.path), max_length=241)
    timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
    file_path = os.path.join(
        self.cache_dir, f"{url_name}_{timestamp}_som.png"
    )
    with open(file_path, "wb") as image_file:
        image_file.write(png_bytes)

    # Convert to base64
    img_base64 = base64.b64encode(png_bytes).decode('utf-8')
    img_data_url = f"data:image/png;base64,{img_base64}"

    text_result = (
        f"Visual webpage screenshot "
        f"captured with {len(rects)} interactive elements"
    )

    return ToolResult(text=text_result, images=[img_data_url])
735
+
736
async def click(self, *, ref: str) -> Dict[str, str]:
    r"""Clicks on an interactive element on the page.

    Args:
        ref (str): The reference ID of the element to click. This ID is
            obtained from the page snapshot (see `get_page_snapshot` or
            `get_som_screenshot`).

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message confirming the click action, or an
              error message listing the available refs if `ref` is not
              found on the page.
            - "snapshot": A new textual snapshot of the page after the
              click, which may have changed as a result of the action. If
              the snapshot is unchanged, it will be the string "snapshot
              not changed". It is an empty string if `ref` is not found.
    """
    self._validate_ref(ref, "click")

    analysis = await self._get_unified_analysis()
    elements = analysis.get("elements", {})
    if ref not in elements:
        available_refs = list(elements.keys())
        error_msg = (
            f"Error: Element reference '{ref}' not found. "
            f"Available refs: {available_refs}"
        )
        logger.error(error_msg)
        # Keep the documented two-key shape on the error path as well, so
        # callers can always read "snapshot".
        return {"result": error_msg, "snapshot": ""}

    action = {"type": "click", "ref": ref}
    return await self._exec_with_snapshot(action)
769
+
770
async def type(self, *, ref: str, text: str) -> Dict[str, str]:
    r"""Types text into an input field, such as a textbox or search bar.

    Args:
        ref (str): The reference ID of the input element.
        text (str): The text to be typed into the element.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message confirming the type action.
            - "snapshot": A new textual snapshot of the page after the
              text has been entered.
    """
    self._validate_ref(ref, "type")

    # The analysis result is not needed here; it is run so that the
    # aria-ref attributes are in place before the action executes.
    await self._get_unified_analysis()

    return await self._exec_with_snapshot(
        {"type": "type", "ref": ref, "text": text}
    )
788
+
789
async def select(self, *, ref: str, value: str) -> Dict[str, str]:
    r"""Selects an option from a dropdown (`<select>`) element.

    Args:
        ref (str): The reference ID of the `<select>` element.
        value (str): The value of the `<option>` to be selected. This
            should match the `value` attribute of the option, not the
            visible text.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message confirming the select action.
            - "snapshot": A new snapshot of the page after the selection.
    """
    self._validate_ref(ref, "select")

    # Run the analysis before acting; its result is not used here
    await self._get_unified_analysis()

    return await self._exec_with_snapshot(
        {"type": "select", "ref": ref, "value": value}
    )
808
+
809
async def scroll(self, *, direction: str, amount: int) -> Dict[str, str]:
    r"""Scrolls the page window up or down by a specified amount.

    Args:
        direction (str): The direction to scroll. Must be either 'up' or
            'down'.
        amount (int): The number of pixels to scroll.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A confirmation of the scroll action, or an error
              message if `direction` is invalid.
            - "snapshot": A new snapshot of the page after scrolling
              (empty if `direction` is invalid).
    """
    allowed_directions = ("up", "down")
    if direction in allowed_directions:
        return await self._exec_with_snapshot(
            {"type": "scroll", "direction": direction, "amount": amount}
        )

    return {
        "result": "Error: direction must be 'up' or 'down'",
        "snapshot": "",
    }
830
+
831
async def enter(self, *, ref: str) -> Dict[str, str]:
    r"""Simulates pressing the Enter key on a specific element.

    This is often used to submit forms after filling them out.

    Args:
        ref (str): The reference ID of the element to press Enter on.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A confirmation of the action.
            - "snapshot": A new page snapshot, as this action often
              triggers navigation or page updates.
    """
    self._validate_ref(ref, "enter")
    return await self._exec_with_snapshot({"type": "enter", "ref": ref})
848
+
849
async def wait_user(
    self, timeout_sec: Optional[float] = None
) -> Dict[str, str]:
    r"""Pauses the agent's execution and waits for human intervention.

    This is useful for tasks that require manual steps, like solving a
    CAPTCHA. A message is logged and execution is held until the user
    presses the Enter key on the console.

    Args:
        timeout_sec (Optional[float]): The maximum time to wait in
            seconds. If the timeout is reached, the agent will resume
            automatically. If `None`, it will wait indefinitely.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message indicating how the wait ended (e.g.,
              "User resumed." or "Timeout... reached, auto-resumed.").
            - "snapshot": The current page snapshot after the wait.
    """
    import asyncio

    banner = (
        "🕑 Agent waiting for human input. "
        "Complete action in browser, then press Enter..."
    )
    logger.info(f"\n{banner}\n")

    async def _block_until_enter():
        # `input` blocks, so it is run in a worker thread
        await asyncio.to_thread(input, ">>> Press Enter to resume <<<\n")

    try:
        if timeout_sec is None:
            logger.info("Waiting for user input (no timeout)")
            started = time.time()
            await _block_until_enter()
        else:
            logger.info(
                f"Waiting for user input with timeout: {timeout_sec}s"
            )
            started = time.time()
            await asyncio.wait_for(
                _block_until_enter(), timeout=timeout_sec
            )
        waited = time.time() - started
        logger.info(f"User input received after {waited:.2f}s")
        result_msg = "User resumed."
    except asyncio.TimeoutError:
        waited = timeout_sec or 0.0
        logger.info(
            f"User input timeout reached after {waited}s, auto-resuming"
        )
        result_msg = f"Timeout {timeout_sec}s reached, auto-resumed."

    snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    return {"result": result_msg, "snapshot": snapshot}
909
+
910
async def get_page_links(self, *, ref: List[str]) -> Dict[str, Any]:
    r"""Look up the full URLs behind a list of link reference IDs.

    Handy for checking where a link leads before clicking it.

    Args:
        ref (List[str]): Reference IDs of link elements, as found in a
            page snapshot. Must be a non-empty list of non-empty strings.

    Returns:
        Dict[str, Any]: A dictionary with one key:
            - "links": A list of dictionaries, one per link found, each
              with "text", "ref", and "url" keys. The list is empty when
              `ref` fails validation.
    """
    # One bad entry invalidates the whole request.
    refs_ok = (
        bool(ref)
        and isinstance(ref, list)
        and all(item and isinstance(item, str) for item in ref)
    )
    if not refs_ok:
        return {"links": []}

    page = await self._require_page()
    # A full, freshly taken snapshot is used for the lookup.
    fresh_snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    return {
        "links": await self._extract_links_by_refs(
            fresh_snapshot, page, ref
        )
    }
940
+
941
async def solve_task(
    self, task_prompt: str, start_url: str, max_steps: int = 15
) -> str:
    r"""Hand a task to a high-level LLM agent that drives the browser.

    Control is delegated to a separate agent, which is first navigated
    to `start_url` and then asked to process `task_prompt`. This suits
    complex tasks that need several browser actions.

    Note: `web_agent_model` must be supplied when the toolkit is
    initialized for this function to be usable.

    Args:
        task_prompt (str): Natural language description of the task
            (e.g., "log into my account on example.com").
        start_url (str): URL the agent opens before working on the task.
        max_steps (int): Upper bound on the number of steps the agent may
            take. (default: :obj:`15`)

    Returns:
        str: A fixed summary message saying that processing finished and
            pointing to standard output for the detailed trace.
    """
    web_agent = self._ensure_agent()
    # Navigation must complete before the agent starts on the task.
    await web_agent.navigate(start_url)
    await web_agent.process_command(task_prompt, max_steps=max_steps)
    return "Task processing finished - see stdout for detailed trace."
969
+
970
def get_tools(self) -> List[FunctionTool]:
    r"""Build the function tools selected by the `enabled_tools` setting.

    Each name in `self.enabled_tools` is resolved against the toolkit
    methods that can be exposed as tools. Unknown names are skipped with
    a warning, and so is "solve_task" when no `web_agent_model` is set.

    Returns:
        List[FunctionTool]: One `FunctionTool` per enabled, known tool,
            in the order the names appear in `self.enabled_tools`.
    """
    # Tool names match the names of the methods that implement them.
    exposable_names = (
        "open_browser",
        "close_browser",
        "visit_page",
        "get_page_snapshot",
        "get_som_screenshot",
        "get_page_links",
        "click",
        "type",
        "select",
        "scroll",
        "enter",
        "wait_user",
        "solve_task",
    )
    registry = {name: getattr(self, name) for name in exposable_names}

    selected = []
    for requested in self.enabled_tools:
        # solve_task cannot work without a model for the web agent.
        if requested == "solve_task" and self.web_agent_model is None:
            logger.warning(
                f"Tool '{requested}' is enabled but web_agent_model "
                f"is not provided. Skipping this tool."
            )
            continue

        if requested not in registry:
            logger.warning(f"Unknown tool name: {requested}")
            continue

        selected.append(FunctionTool(cast(Callable, registry[requested])))

    logger.info(f"Returning {len(selected)} enabled tools")
    return selected