camel-ai 0.2.71a1__py3-none-any.whl → 0.2.71a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (32)
  1. camel/__init__.py +1 -1
  2. camel/agents/_types.py +6 -2
  3. camel/agents/chat_agent.py +357 -18
  4. camel/messages/base.py +2 -6
  5. camel/messages/func_message.py +32 -5
  6. camel/services/agent_openapi_server.py +380 -0
  7. camel/societies/workforce/single_agent_worker.py +1 -5
  8. camel/societies/workforce/workforce.py +68 -8
  9. camel/tasks/task.py +2 -2
  10. camel/toolkits/__init__.py +2 -2
  11. camel/toolkits/craw4ai_toolkit.py +27 -7
  12. camel/toolkits/file_write_toolkit.py +110 -31
  13. camel/toolkits/human_toolkit.py +19 -14
  14. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/__init__.py +2 -2
  15. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/actions.py +47 -11
  16. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/agent.py +21 -11
  17. camel/toolkits/{non_visual_browser_toolkit/nv_browser_session.py → hybrid_browser_toolkit/browser_session.py} +64 -10
  18. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +1002 -0
  19. camel/toolkits/{non_visual_browser_toolkit → hybrid_browser_toolkit}/snapshot.py +16 -4
  20. camel/toolkits/{non_visual_browser_toolkit/snapshot.js → hybrid_browser_toolkit/unified_analyzer.js} +171 -15
  21. camel/toolkits/jina_reranker_toolkit.py +3 -4
  22. camel/toolkits/terminal_toolkit.py +189 -48
  23. camel/toolkits/video_download_toolkit.py +1 -2
  24. camel/types/agents/tool_calling_record.py +4 -1
  25. camel/types/enums.py +24 -24
  26. camel/utils/message_summarizer.py +148 -0
  27. camel/utils/tool_result.py +44 -0
  28. {camel_ai-0.2.71a1.dist-info → camel_ai-0.2.71a3.dist-info}/METADATA +19 -5
  29. {camel_ai-0.2.71a1.dist-info → camel_ai-0.2.71a3.dist-info}/RECORD +31 -28
  30. camel/toolkits/non_visual_browser_toolkit/browser_non_visual_toolkit.py +0 -446
  31. {camel_ai-0.2.71a1.dist-info → camel_ai-0.2.71a3.dist-info}/WHEEL +0 -0
  32. {camel_ai-0.2.71a1.dist-info → camel_ai-0.2.71a3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1002 @@
1
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+
15
+ import base64
16
+ import datetime
17
+ import io
18
+ import os
19
+ import time
20
+ import urllib.parse
21
+ from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
22
+
23
+ from camel.logger import get_logger
24
+ from camel.models import BaseModelBackend
25
+ from camel.toolkits.base import BaseToolkit
26
+ from camel.toolkits.function_tool import FunctionTool
27
+ from camel.utils import sanitize_filename
28
+ from camel.utils.commons import dependencies_required
29
+
30
+ from .agent import PlaywrightLLMAgent
31
+ from .browser_session import NVBrowserSession
32
+
33
+ logger = get_logger(__name__)
34
+
35
+
36
+ class HybridBrowserToolkit(BaseToolkit):
37
+ r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
38
+ automation with visual, screenshot-based capabilities.
39
+
40
+ This toolkit exposes a set of actions as CAMEL FunctionTools for agents
41
+ to interact with web pages. It can operate in headless mode and supports
42
+ both programmatic control of browser actions (like clicking and typing)
43
+ and visual analysis of the page layout through screenshots with marked
44
+ interactive elements.
45
+ """
46
+
47
+ # Configuration constants
48
+ DEFAULT_SCREENSHOT_TIMEOUT = 60000 # 60 seconds for screenshots
49
+ PAGE_STABILITY_TIMEOUT = 3000 # 3 seconds for DOM stability
50
+ NETWORK_IDLE_TIMEOUT = 2000 # 2 seconds for network idle
51
+
52
+ # Default tool list - core browser functionality
53
+ DEFAULT_TOOLS: ClassVar[List[str]] = [
54
+ "open_browser",
55
+ "close_browser",
56
+ "visit_page",
57
+ "click",
58
+ "type",
59
+ ]
60
+
61
+ # All available tools
62
+ ALL_TOOLS: ClassVar[List[str]] = [
63
+ "open_browser",
64
+ "close_browser",
65
+ "visit_page",
66
+ "get_page_snapshot",
67
+ "get_som_screenshot",
68
+ "get_page_links",
69
+ "click",
70
+ "type",
71
+ "select",
72
+ "scroll",
73
+ "enter",
74
+ "wait_user",
75
+ "solve_task",
76
+ ]
77
+
78
def __init__(
    self,
    *,
    headless: bool = True,
    user_data_dir: Optional[str] = None,
    web_agent_model: Optional[BaseModelBackend] = None,
    cache_dir: str = "tmp/",
    enabled_tools: Optional[List[str]] = None,
) -> None:
    r"""Create the toolkit, its browser session and its tool selection.

    Args:
        headless (bool): Whether the browser runs without a visible
            window. Defaults to `True`.
        user_data_dir (Optional[str]): Directory handed to the browser
            session for persistent data such as cookies and local
            storage. Defaults to `None`.
        web_agent_model (Optional[BaseModelBackend]): Model backend used
            by the high-level `solve_task` agent. Only needed when that
            agent is used. Defaults to `None`.
        cache_dir (str): Directory for cached files such as screenshots.
            It is created if it does not exist. Defaults to `"tmp/"`.
        enabled_tools (Optional[List[str]]): Names of the tools to
            enable. When `None`, a copy of `DEFAULT_TOOLS` is used.
            Every name must appear in `ALL_TOOLS`. Defaults to `None`.

    Raises:
        ValueError: If `enabled_tools` contains a name that is not in
            `ALL_TOOLS`.
    """
    super().__init__()

    self._headless = headless
    self._user_data_dir = user_data_dir
    self.web_agent_model = web_agent_model
    self.cache_dir = cache_dir
    # The cache directory is created before the tool list is validated.
    os.makedirs(self.cache_dir, exist_ok=True)

    if enabled_tools is not None:
        unknown = [
            name for name in enabled_tools if name not in self.ALL_TOOLS
        ]
        if unknown:
            raise ValueError(
                f"Invalid tools specified: {unknown}. "
                f"Available tools: {self.ALL_TOOLS}"
            )
        # Copy so later caller-side mutation does not leak in.
        self.enabled_tools = enabled_tools.copy()
    else:
        self.enabled_tools = self.DEFAULT_TOOLS.copy()

    logger.info(f"Enabled tools: {self.enabled_tools}")

    # Browser session, lazily created agent, and the analyzer script.
    self._session = NVBrowserSession(
        headless=headless, user_data_dir=user_data_dir
    )
    self._agent: Optional[PlaywrightLLMAgent] = None
    self._unified_script = self._load_unified_analyzer()
139
+
140
def __del__(self):
    r"""Best-effort browser shutdown when the toolkit is collected.

    Nothing is attempted while the interpreter is finalizing, or when
    the current event loop is closed or already running. Every error is
    suppressed on purpose, because raising from a finalizer is unhelpful.
    """
    try:
        import sys

        finalizing = getattr(sys, "is_finalizing", lambda: False)
        if finalizing():
            return

        import asyncio

        try:
            loop = asyncio.get_event_loop()
            loop_is_idle = not (loop.is_closed() or loop.is_running())
            if loop_is_idle:
                loop.run_until_complete(self.close_browser())
        except (RuntimeError, ImportError):
            # No usable event loop: skip the cleanup.
            pass
    except Exception:
        # Never let garbage collection surface an error.
        pass
158
+
159
def _load_unified_analyzer(self) -> str:
    r"""Read `unified_analyzer.js` from the directory of this module.

    Returns:
        str: The text of the script.

    Raises:
        FileNotFoundError: If the script file does not exist.
        ValueError: If the script file contains only whitespace.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    script_path = os.path.join(module_dir, "unified_analyzer.js")

    try:
        with open(
            script_path, "r", encoding='utf-8', errors='replace'
        ) as handle:
            source = handle.read()
    except FileNotFoundError:
        raise FileNotFoundError(f"Script not found: {script_path}")

    if not source.strip():
        raise ValueError(f"Script is empty: {script_path}")

    logger.debug(f"Loaded unified analyzer ({len(source)} chars)")
    return source
180
+
181
+ def _validate_ref(self, ref: str, method_name: str) -> None:
182
+ r"""Validate ref parameter."""
183
+ if not ref or not isinstance(ref, str):
184
+ raise ValueError(
185
+ f"{method_name}: 'ref' must be a non-empty string"
186
+ )
187
+
188
+ async def _ensure_browser(self):
189
+ await self._session.ensure_browser()
190
+
191
+ async def _require_page(self):
192
+ await self._session.ensure_browser()
193
+ return await self._session.get_page()
194
+
195
async def _wait_for_page_stability(self):
    r"""Give the page a bounded amount of time to settle after an action.

    Waits for the `domcontentloaded` load state, then tries for
    `networkidle`, then sleeps for half a second. Failures and timeouts
    are only logged at debug level; this method never raises because of
    them.
    """
    import asyncio

    page = await self._require_page()

    try:
        await page.wait_for_load_state(
            'domcontentloaded', timeout=self.PAGE_STABILITY_TIMEOUT
        )
        logger.debug("DOM content loaded")

        # Network idle matters for AJAX/SPA pages but is optional here.
        try:
            await page.wait_for_load_state(
                'networkidle', timeout=self.NETWORK_IDLE_TIMEOUT
            )
        except Exception:
            logger.debug("Network idle timeout - continuing anyway")
        else:
            logger.debug("Network idle achieved")

        # Short grace period for scripts still running on the page.
        await asyncio.sleep(0.5)
        logger.debug("Page stability wait completed")
    except Exception as e:
        logger.debug(f"Page stability wait failed: {e} - continuing anyway")
226
+
227
async def _get_unified_analysis(self) -> Dict[str, Any]:
    r"""Evaluate the unified analyzer script on the current page.

    Returns:
        Dict[str, Any]: The dictionary produced by the script. If the
            script is not loaded, the result is not a dictionary, or
            evaluation raises, an empty analysis of the form
            `{"elements": {}, "metadata": {"elementCount": 0}}` is
            returned instead.
    """
    page = await self._require_page()

    def _empty_analysis() -> Dict[str, Any]:
        # A fresh dictionary per call so callers never share state.
        return {"elements": {}, "metadata": {"elementCount": 0}}

    try:
        script = self._unified_script
        if not script:
            logger.error("Unified analyzer script not loaded")
            return _empty_analysis()

        outcome = await page.evaluate(script)
        if isinstance(outcome, dict):
            return outcome

        logger.warning(f"Invalid result type: {type(outcome)}")
        return _empty_analysis()
    except Exception as e:
        logger.warning(f"Error in unified analysis: {e}")
        return _empty_analysis()
245
+
246
+ def _convert_analysis_to_rects(
247
+ self, analysis_data: Dict[str, Any]
248
+ ) -> Dict[str, Any]:
249
+ r"""Convert analysis data to rect format for visual marking."""
250
+ rects = {}
251
+ elements = analysis_data.get("elements", {})
252
+
253
+ for ref, element_data in elements.items():
254
+ coordinates = element_data.get("coordinates", [])
255
+ if coordinates:
256
+ rects[ref] = {
257
+ "role": element_data.get("role", "generic"),
258
+ "aria-name": element_data.get("name", ""),
259
+ "rects": [coordinates[0]],
260
+ }
261
+ return rects
262
+
263
def _add_set_of_mark(self, image, rects):
    r"""Draw an outline and a ref label for every rectangle in `rects`.

    Args:
        image: The image to annotate. It is copied, not modified.
        rects: Mapping of ref to a dictionary with `"role"` and
            `"rects"`, where each rect provides `x`, `y`, `width` and
            `height`.

    Returns:
        The annotated copy of `image`, or `image` itself when PIL cannot
        be imported.
    """
    try:
        from PIL import ImageDraw, ImageFont
    except ImportError:
        logger.warning("PIL not available, returning original image")
        return image

    canvas = image.copy()
    pen = ImageDraw.Draw(canvas)

    # Prefer Arial, fall back to PIL's default font, then to no font.
    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except (OSError, IOError):
        try:
            font = ImageFont.load_default()
        except (OSError, IOError):
            font = None

    # Outline colour per role; unknown roles use "default".
    palette = {
        "button": "#FF6B6B",
        "link": "#4ECDC4",
        "textbox": "#45B7D1",
        "select": "#96CEB4",
        "checkbox": "#FECA57",
        "radio": "#FF9FF3",
        "default": "#DDA0DD",
    }

    for ref, entry in rects.items():
        boxes = entry.get("rects", [])
        color = palette.get(entry.get("role", "generic"), palette["default"])

        for box in boxes:
            left = box.get("x", 0)
            top = box.get("y", 0)
            box_w = box.get("width", 0)
            box_h = box.get("height", 0)

            # Element outline.
            pen.rectangle(
                [left, top, left + box_w, top + box_h],
                outline=color,
                width=2,
            )

            # Size of the label, measured when a font is available and
            # estimated otherwise.
            caption = ref
            if font:
                extent = pen.textbbox((0, 0), caption, font=font)
                cap_w = extent[2] - extent[0]
                cap_h = extent[3] - extent[1]
            else:
                cap_w = len(caption) * 8
                cap_h = 16

            # The label sits just above the outline, clamped to the
            # image's top-left edge.
            cap_x = max(0, left - 2)
            cap_y = max(0, top - cap_h - 2)

            pen.rectangle(
                [cap_x, cap_y, cap_x + cap_w + 4, cap_y + cap_h + 2],
                fill=color,
            )
            pen.text(
                (cap_x + 2, cap_y + 1),
                caption,
                fill="white",
                font=font,
            )

    return canvas
339
+
340
+ def _format_snapshot_from_analysis(
341
+ self, analysis_data: Dict[str, Any]
342
+ ) -> str:
343
+ r"""Format analysis data into snapshot string."""
344
+ lines = []
345
+ elements = analysis_data.get("elements", {})
346
+
347
+ for ref, element_data in elements.items():
348
+ role = element_data.get("role", "generic")
349
+ name = element_data.get("name", "")
350
+
351
+ line = f"- {role}"
352
+ if name:
353
+ line += f' "{name}"'
354
+
355
+ # Add properties
356
+ props = []
357
+ for prop in ["disabled", "checked", "expanded"]:
358
+ value = element_data.get(prop)
359
+ if value is True:
360
+ props.append(prop)
361
+ elif value is not None and prop in ["checked", "expanded"]:
362
+ props.append(f"{prop}={value}")
363
+
364
+ if props:
365
+ line += f" {' '.join(props)}"
366
+
367
+ line += f" [ref={ref}]"
368
+ lines.append(line)
369
+
370
+ return "\n".join(lines)
371
+
372
async def _exec_with_snapshot(
    self, action: Dict[str, Any]
) -> Dict[str, str]:
    r"""Run an action through the session and report how the page changed.

    A snapshot is taken before and after the action. For `click`,
    `type`, `select` and `enter` the method also waits for the page to
    settle before the second snapshot.

    Args:
        action (Dict[str, Any]): Action description passed to the
            session's `exec_action`. Its `"type"` entry is used for
            logging and for deciding whether to wait for stability.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": What the session's `exec_action` returned.
            - "snapshot": The post-action snapshot, or the string
              "snapshot not changed" when it equals the pre-action
              snapshot.
    """
    kind = action.get("type", "unknown")
    logger.info(f"Executing action: {kind}")

    # Snapshot before the action.
    logger.info("Capturing pre-action snapshot...")
    tick = time.time()
    before_snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    elapsed = time.time() - tick
    logger.info(f"Pre-action snapshot captured in {elapsed:.2f}s")

    # The action itself.
    logger.info(f"Executing {kind} action...")
    tick = time.time()
    result = await self._session.exec_action(action)
    elapsed = time.time() - tick
    logger.info(f"Action {kind} completed in {elapsed:.2f}s")

    # These action types can trigger page updates, so wait for them.
    if kind in ("click", "type", "select", "enter"):
        logger.info(f"Waiting for page stability after {kind}...")
        tick = time.time()
        await self._wait_for_page_stability()
        elapsed = time.time() - tick
        logger.info(f"Page stability wait completed in {elapsed:.2f}s")

    # Snapshot after the action.
    logger.info("Capturing post-action snapshot...")
    tick = time.time()
    after_snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    elapsed = time.time() - tick
    logger.info(f"Post-action snapshot captured in {elapsed:.2f}s")

    if before_snapshot == after_snapshot:
        logger.debug("Page snapshot unchanged after action")
        return {"result": result, "snapshot": "snapshot not changed"}

    # The snapshot changed: log a note about its quality.
    if "<empty>" in after_snapshot:
        logger.warning(
            f"Action {kind} resulted in empty snapshot - "
            f"page may still be loading"
        )
    elif len(after_snapshot.strip()) < 50:
        logger.warning(
            f"Action {kind} resulted in very short snapshot: "
            f"{len(after_snapshot)} chars"
        )
    else:
        logger.debug(
            f"Action {kind} resulted in updated snapshot: "
            f"{len(after_snapshot)} chars"
        )

    return {"result": result, "snapshot": after_snapshot}
447
+
448
async def _extract_links_by_refs(
    self, snapshot: str, page, refs: List[str]
) -> List[Dict[str, str]]:
    r"""Collect text, ref and URL for the requested link refs.

    Args:
        snapshot (str): Snapshot text, scanned line by line for entries
            of the form `- link "<text>" [ref=<ref>]`.
        page: The page passed on to `_get_link_url_by_ref`.
        refs (List[str]): The reference IDs to look for.

    Returns:
        List[Dict[str, str]]: One dictionary with `"text"`, `"ref"` and
            `"url"` per matching snapshot line, in snapshot order. The
            URL is an empty string when it cannot be resolved.
    """
    import re

    link_line = re.compile(r'- link\s+"([^"]+)"\s+\[ref=([^\]]+)\]')
    wanted = set(refs)
    collected: List[Dict[str, str]] = []

    for line in snapshot.split('\n'):
        hit = link_line.search(line)
        if hit is None:
            continue
        text, found_ref = hit.groups()
        if found_ref not in wanted:
            continue

        resolved = ""
        try:
            resolved = (
                await self._get_link_url_by_ref(page, found_ref) or ""
            )
        except Exception as e:
            logger.warning(f"Failed to get URL for ref {found_ref}: {e}")
        collected.append({"text": text, "ref": found_ref, "url": resolved})

    return collected
478
+
479
async def _get_link_url_by_ref(self, page, ref: str) -> str:
    r"""Resolve the absolute URL of the element carrying `aria-ref=ref`.

    Args:
        page: The page to query. Its `url` is the base for relative
            links.
        ref (str): The reference ID to look up.

    Returns:
        str: The `href` joined onto the page URL, or an empty string
            when the element or its `href` is missing or the lookup
            raises.
    """
    try:
        from urllib.parse import urljoin

        node = await page.query_selector(f'[aria-ref="{ref}"]')
        if not node:
            return ""

        href = await node.get_attribute('href')
        if not href:
            return ""

        return urljoin(page.url, href)
    except Exception as e:
        logger.warning(f"Failed to get URL for ref {ref}: {e}")
        return ""
493
+
494
def _ensure_agent(self) -> PlaywrightLLMAgent:
    r"""Return the task-planning agent, building it on first use.

    Returns:
        PlaywrightLLMAgent: The cached agent instance.

    Raises:
        RuntimeError: If no `web_agent_model` was configured.
    """
    model = self.web_agent_model
    if model is None:
        raise RuntimeError(
            "web_agent_model required for high-level task planning"
        )

    agent = self._agent
    if agent is None:
        agent = PlaywrightLLMAgent(
            headless=self._headless,
            user_data_dir=self._user_data_dir,
            model_backend=model,
        )
        self._agent = agent
    return agent
508
+
509
+ # Public API Methods
510
+
511
async def open_browser(
    self, start_url: Optional[str] = None
) -> Dict[str, str]:
    r"""Starts a browser session so that web automation can begin.

    The underlying browser instance is initialized. When a `start_url`
    is given, the browser also navigates to it.

    Args:
        start_url (Optional[str]): The URL to open right after the
            browser has started. When omitted, the browser stays on its
            initial page.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A string confirming that the browser session
              has started, or the navigation result when `start_url`
              is given.
            - "snapshot": A textual representation of the current
              page's interactive elements, used to identify elements
              for subsequent actions.
    """
    logger.info("Starting browser session...")

    tick = time.time()
    await self._session.ensure_browser()
    elapsed = time.time() - tick
    logger.info(f"Browser session started in {elapsed:.2f}s")

    # With a start URL the navigation result is returned as-is.
    if start_url:
        logger.info(f"Auto-navigating to start URL: {start_url}")
        return await self.visit_page(start_url)

    logger.info("Capturing initial browser snapshot...")
    tick = time.time()
    snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    elapsed = time.time() - tick
    logger.info(f"Initial snapshot captured in {elapsed:.2f}s")

    return {"result": "Browser session started.", "snapshot": snapshot}
552
+
553
async def close_browser(self) -> str:
    r"""Closes the current browser session and releases all associated
    resources.

    This should be called at the end of a web automation task to ensure a
    clean shutdown of the browser instance. If a task-planning agent was
    created, it is closed first. A failure while closing the agent is
    logged and does not prevent the session from being closed.

    Returns:
        str: A confirmation message indicating the session is closed.
    """
    # Detach the agent first so it is dropped even if closing it fails.
    agent, self._agent = self._agent, None
    if agent is not None:
        try:
            await agent.close()
        except Exception as e:
            # Still best-effort, but no longer silent.
            logger.warning(f"Failed to close web agent cleanly: {e}")

    await self._session.close()
    return "Browser session closed."
572
+
573
async def visit_page(self, url: str) -> Dict[str, str]:
    r"""Loads the given URL in the current browser page.

    Args:
        url (str): The web address to load in the browser. Must be a
            non-empty string.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message indicating the outcome of the
              navigation, or an error message when `url` is invalid.
            - "snapshot": A new textual snapshot of the page's
              interactive elements after the page has loaded. Empty
              when `url` is invalid.
    """
    if not (url and isinstance(url, str)):
        return {
            "result": "Error: 'url' must be a non-empty string",
            "snapshot": "",
        }

    logger.info(f"Navigating to URL: {url}")

    tick = time.time()
    nav_result = await self._session.visit(url)
    elapsed = time.time() - tick
    logger.info(f"Page navigation completed in {elapsed:.2f}s")

    logger.info("Capturing page snapshot after navigation...")
    tick = time.time()
    snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    elapsed = time.time() - tick
    logger.info(f"Navigation snapshot captured in {elapsed:.2f}s")

    return {"result": nav_result, "snapshot": snapshot}
611
+
612
async def get_page_snapshot(self) -> str:
    r"""Captures a textual representation of the current page's content.

    The "snapshot" is a simplified view of the page that focuses on
    interactive elements such as links, buttons and input fields. Each
    element carries a reference ID (`ref`) that other actions like
    `click` or `type` accept.

    Use it to understand the page structure and to pick elements to
    interact with, without parsing raw HTML. A new snapshot is generated
    on each call.

    Returns:
        str: A formatted string representing the interactive elements on
            the page. For example:
            '- link "Sign In" [ref=1]'
            '- textbox "Username" [ref=2]'
    """
    logger.info("Capturing page snapshot")

    tick = time.time()
    analysis_data = await self._get_unified_analysis()
    elapsed = time.time() - tick
    logger.info(f"Page snapshot analysis completed in {elapsed:.2f}s")

    # Use the analyzer's own text when present; otherwise format it here.
    return analysis_data.get(
        "snapshotText", ""
    ) or self._format_snapshot_from_analysis(analysis_data)
645
+
646
@dependencies_required('PIL')
async def get_som_screenshot(self):
    r"""Captures a screenshot of the current webpage and visually marks all
    interactive elements. "SoM" stands for "Set of Marks".

    This method is essential for tasks requiring visual understanding of
    the page layout. It works by:
    1. Taking a screenshot of the page. `full_page` is not requested, so
       the capture follows Playwright's default for `page.screenshot`.
    2. Identifying all interactive elements (buttons, links, inputs, etc.).
    3. Drawing colored boxes and reference IDs (`ref`) over these elements
       on the screenshot.
    4. Saving the annotated image to the cache directory.
    5. Returning the image as a base64-encoded string along with a summary.

    Use this when the textual snapshot from `get_page_snapshot` is
    insufficient and visual context is needed to decide the next action.

    Returns:
        ToolResult: An object containing:
            - `text`: A summary string, e.g., "Visual webpage screenshot
              captured with 42 interactive elements".
            - `images`: A list containing a single base64-encoded PNG image
              as a data URL.
    """
    from PIL import Image

    from camel.utils.tool_result import ToolResult

    page = await self._require_page()

    # The two f-string parts are concatenated, so the space is required.
    logger.info(
        f"Starting screenshot capture "
        f"with timeout: {self.DEFAULT_SCREENSHOT_TIMEOUT}ms"
    )

    start_time = time.time()
    image_data = await page.screenshot(
        timeout=self.DEFAULT_SCREENSHOT_TIMEOUT
    )
    screenshot_time = time.time() - start_time

    logger.info(f"Screenshot capture completed in {screenshot_time:.2f}s")
    image = Image.open(io.BytesIO(image_data))

    # Analyse the page to find the elements to mark.
    logger.info("Starting unified page analysis...")
    analysis_start_time = time.time()
    analysis_data = await self._get_unified_analysis()
    analysis_time = time.time() - analysis_start_time
    logger.info(f"Unified page analysis completed in {analysis_time:.2f}s")

    # Draw the marks on a copy of the screenshot.
    logger.info("Processing visual marks on screenshot...")
    mark_start_time = time.time()
    rects = self._convert_analysis_to_rects(analysis_data)
    marked_image = self._add_set_of_mark(image, rects)
    mark_time = time.time() - mark_start_time
    logger.info(f"Visual marks processing completed in {mark_time:.2f}s")

    # Encode the PNG once and reuse the bytes for the file and the data URL.
    img_buffer = io.BytesIO()
    marked_image.save(img_buffer, format="PNG")
    png_bytes = img_buffer.getvalue()

    # The cache file name is built from the URL path and a timestamp.
    parsed_url = urllib.parse.urlparse(page.url)
    url_name = sanitize_filename(str(parsed_url.path), max_length=241)
    timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
    file_path = os.path.join(
        self.cache_dir, f"{url_name}_{timestamp}_som.png"
    )
    with open(file_path, "wb") as png_file:
        png_file.write(png_bytes)

    img_base64 = base64.b64encode(png_bytes).decode('utf-8')
    img_data_url = f"data:image/png;base64,{img_base64}"

    text_result = (
        f"Visual webpage screenshot "
        f"captured with {len(rects)} interactive elements"
    )

    return ToolResult(text=text_result, images=[img_data_url])
729
+
730
async def click(self, *, ref: str) -> Dict[str, str]:
    r"""Clicks on an interactive element on the page.

    Args:
        ref (str): The reference ID of the element to click. This ID is
            obtained from the page snapshot (see `get_page_snapshot` or
            `get_som_screenshot`).

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message confirming the click action, or an
              error message when `ref` is not present on the page.
            - "snapshot": A new textual snapshot of the page after the
              click, which may have changed as a result of the action. If
              the snapshot is unchanged, it will be the string "snapshot
              not changed". It is an empty string when `ref` is not
              found.

    Raises:
        ValueError: If `ref` is empty or not a string.
    """
    self._validate_ref(ref, "click")

    analysis = await self._get_unified_analysis()
    elements = analysis.get("elements", {})
    if ref not in elements:
        error_msg = (
            f"Error: Element reference '{ref}' not found. "
            f"Available refs: {list(elements.keys())}"
        )
        logger.error(error_msg)
        # Always include "snapshot" so callers can rely on both keys,
        # as the other actions' error results do.
        return {"result": error_msg, "snapshot": ""}

    action = {"type": "click", "ref": ref}
    return await self._exec_with_snapshot(action)
763
+
764
async def type(self, *, ref: str, text: str) -> Dict[str, str]:
    r"""Enters text into an input field, such as a textbox or search bar.

    Args:
        ref (str): The reference ID of the input element.
        text (str): The text to be typed into the element.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message confirming the type action.
            - "snapshot": A new textual snapshot of the page after the
              text has been entered.
    """
    self._validate_ref(ref, "type")
    # Run the analysis first so that aria-ref attributes are in place.
    await self._get_unified_analysis()

    return await self._exec_with_snapshot(
        {"type": "type", "ref": ref, "text": text}
    )
782
+
783
async def select(self, *, ref: str, value: str) -> Dict[str, str]:
    r"""Chooses an option in a dropdown (`<select>`) element.

    Args:
        ref (str): The reference ID of the `<select>` element.
        value (str): The value of the `<option>` to be selected. This
            should match the `value` attribute of the option, not the
            visible text.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message confirming the select action.
            - "snapshot": A new snapshot of the page after the selection.
    """
    self._validate_ref(ref, "select")
    # The analysis result is not used here; the call is made for its
    # effect on the page before the action runs.
    await self._get_unified_analysis()

    return await self._exec_with_snapshot(
        {"type": "select", "ref": ref, "value": value}
    )
802
+
803
async def scroll(self, *, direction: str, amount: int) -> Dict[str, str]:
    r"""Moves the page window up or down by a specified amount.

    Args:
        direction (str): The direction to scroll. Must be either 'up' or
            'down'.
        amount (int): The number of pixels to scroll.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A confirmation of the scroll action, or an
              error message when `direction` is invalid.
            - "snapshot": A new snapshot of the page after scrolling.
              Empty when `direction` is invalid.
    """
    if direction in ("up", "down"):
        return await self._exec_with_snapshot(
            {"type": "scroll", "direction": direction, "amount": amount}
        )

    return {
        "result": "Error: direction must be 'up' or 'down'",
        "snapshot": "",
    }
824
+
825
async def enter(self, *, ref: str) -> Dict[str, str]:
    r"""Presses the Enter key on a specific element.

    This is often used to submit forms after filling them out.

    Args:
        ref (str): The reference ID of the element to press Enter on.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A confirmation of the action.
            - "snapshot": A new page snapshot, as this action often
              triggers navigation or page updates.
    """
    self._validate_ref(ref, "enter")
    return await self._exec_with_snapshot({"type": "enter", "ref": ref})
842
+
843
async def wait_user(
    self, timeout_sec: Optional[float] = None
) -> Dict[str, str]:
    r"""Suspends the agent's execution until a human intervenes.

    This is useful for tasks that require manual steps, like solving a
    CAPTCHA. A message is logged and the agent waits until the user
    presses the Enter key on the console.

    Args:
        timeout_sec (Optional[float]): The maximum time to wait in
            seconds. If the timeout is reached, the agent will resume
            automatically. If `None`, it will wait indefinitely.

    Returns:
        Dict[str, str]: A dictionary containing:
            - "result": A message indicating how the wait ended (e.g.,
              "User resumed." or "Timeout... reached, auto-resumed.").
            - "snapshot": The current page snapshot after the wait.
    """
    import asyncio

    banner = (
        "🕑 Agent waiting for human input. "
        "Complete action in browser, then press Enter..."
    )
    logger.info(f"\n{banner}\n")

    async def _await_enter():
        # input() blocks, so it runs in a worker thread.
        await asyncio.to_thread(input, ">>> Press Enter to resume <<<\n")

    try:
        if timeout_sec is None:
            logger.info("Waiting for user input (no timeout)")
            began = time.time()
            await _await_enter()
        else:
            logger.info(
                f"Waiting for user input with timeout: {timeout_sec}s"
            )
            began = time.time()
            await asyncio.wait_for(_await_enter(), timeout=timeout_sec)

        elapsed = time.time() - began
        logger.info(f"User input received after {elapsed:.2f}s")
        result_msg = "User resumed."
    except asyncio.TimeoutError:
        waited = timeout_sec or 0.0
        logger.info(
            f"User input timeout reached after {waited}s, auto-resuming"
        )
        result_msg = f"Timeout {timeout_sec}s reached, auto-resumed."

    snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    return {"result": result_msg, "snapshot": snapshot}
903
+
904
async def get_page_links(self, *, ref: List[str]) -> Dict[str, Any]:
    r"""Resolves a list of link reference IDs to their full URLs.

    Handy for inspecting where a link leads before clicking it.

    Args:
        ref (List[str]): Reference IDs of link elements, as found in a
            page snapshot.

    Returns:
        Dict[str, Any]: A dictionary containing:
            - "links": The list returned by `_extract_links_by_refs`;
              each entry is expected to describe one found link with
              "text", "ref", and "url" keys. The list is empty when
              `ref` is empty, is not a list, or holds any entry that is
              empty or not a string.
    """
    # One invalid entry invalidates the whole request.
    refs_are_valid = (
        bool(ref)
        and isinstance(ref, list)
        and all(item and isinstance(item, str) for item in ref)
    )
    if not refs_are_valid:
        return {"links": []}

    page = await self._require_page()
    full_snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    return {
        "links": await self._extract_links_by_refs(full_snapshot, page, ref)
    }
934
+
935
async def solve_task(
    self, task_prompt: str, start_url: str, max_steps: int = 15
) -> str:
    r"""Hands a task over to a high-level LLM web agent.

    The agent returned by `_ensure_agent` first navigates to `start_url`
    and then processes `task_prompt`, taking at most `max_steps` steps.
    Intended for complex, multi-step tasks.

    Note: `web_agent_model` must be provided during the toolkit's
    initialization to use this function.

    Args:
        task_prompt (str): A natural language description of the task to
            be completed (e.g., "log into my account on example.com").
        start_url (str): The URL to start the task from.
        max_steps (int): The maximum number of steps the agent is allowed
            to take before stopping.

    Returns:
        str: A fixed summary message stating that task processing has
            finished; the detailed trace of the agent's actions is
            written to standard output rather than returned.
    """
    web_agent = self._ensure_agent()

    await web_agent.navigate(start_url)
    await web_agent.process_command(task_prompt, max_steps=max_steps)

    return "Task processing finished - see stdout for detailed trace."
963
+
964
def get_tools(self) -> List[FunctionTool]:
    r"""Builds the function tools selected by `self.enabled_tools`.

    Names are processed in the order they appear in `self.enabled_tools`.
    "solve_task" is skipped with a warning when `self.web_agent_model` is
    `None`, and names that do not match a known tool are skipped with a
    warning as well.

    Returns:
        List[FunctionTool]: One `FunctionTool` per enabled, known tool.
    """
    # Every tool name this toolkit can expose, keyed to its bound method.
    available = {
        "open_browser": self.open_browser,
        "close_browser": self.close_browser,
        "visit_page": self.visit_page,
        "get_page_snapshot": self.get_page_snapshot,
        "get_som_screenshot": self.get_som_screenshot,
        "get_page_links": self.get_page_links,
        "click": self.click,
        "type": self.type,
        "select": self.select,
        "scroll": self.scroll,
        "enter": self.enter,
        "wait_user": self.wait_user,
        "solve_task": self.solve_task,
    }

    tools: List[FunctionTool] = []
    for name in self.enabled_tools:
        # solve_task cannot work without a model for the web agent.
        if self.web_agent_model is None and name == "solve_task":
            logger.warning(
                f"Tool '{name}' is enabled but web_agent_model "
                f"is not provided. Skipping this tool."
            )
            continue

        method = available.get(name)
        if method is None:
            logger.warning(f"Unknown tool name: {name}")
            continue

        tools.append(FunctionTool(cast(Callable, method)))

    logger.info(f"Returning {len(tools)} enabled tools")
    return tools