camel-ai 0.2.60__py3-none-any.whl → 0.2.62__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (53)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +159 -8
  3. camel/agents/mcp_agent.py +5 -5
  4. camel/configs/anthropic_config.py +6 -5
  5. camel/{data_collector → data_collectors}/alpaca_collector.py +1 -1
  6. camel/{data_collector → data_collectors}/sharegpt_collector.py +1 -1
  7. camel/datagen/evol_instruct/scorer.py +22 -23
  8. camel/datagen/evol_instruct/templates.py +46 -46
  9. camel/datasets/static_dataset.py +144 -0
  10. camel/loaders/__init__.py +5 -2
  11. camel/loaders/chunkr_reader.py +117 -91
  12. camel/loaders/mistral_reader.py +148 -0
  13. camel/memories/blocks/chat_history_block.py +1 -2
  14. camel/models/model_manager.py +7 -3
  15. camel/retrievers/auto_retriever.py +20 -1
  16. camel/{runtime → runtimes}/daytona_runtime.py +1 -1
  17. camel/{runtime → runtimes}/docker_runtime.py +1 -1
  18. camel/{runtime → runtimes}/llm_guard_runtime.py +2 -2
  19. camel/{runtime → runtimes}/remote_http_runtime.py +1 -1
  20. camel/{runtime → runtimes}/ubuntu_docker_runtime.py +1 -1
  21. camel/societies/workforce/base.py +7 -3
  22. camel/societies/workforce/single_agent_worker.py +2 -1
  23. camel/societies/workforce/worker.py +5 -3
  24. camel/societies/workforce/workforce.py +65 -24
  25. camel/storages/__init__.py +2 -0
  26. camel/storages/vectordb_storages/__init__.py +2 -0
  27. camel/storages/vectordb_storages/faiss.py +712 -0
  28. camel/toolkits/__init__.py +4 -0
  29. camel/toolkits/async_browser_toolkit.py +75 -523
  30. camel/toolkits/bohrium_toolkit.py +318 -0
  31. camel/toolkits/browser_toolkit.py +215 -538
  32. camel/toolkits/browser_toolkit_commons.py +568 -0
  33. camel/toolkits/file_write_toolkit.py +80 -31
  34. camel/toolkits/mcp_toolkit.py +477 -665
  35. camel/toolkits/pptx_toolkit.py +777 -0
  36. camel/toolkits/wolfram_alpha_toolkit.py +5 -1
  37. camel/types/enums.py +13 -1
  38. camel/utils/__init__.py +2 -0
  39. camel/utils/commons.py +27 -0
  40. camel/utils/mcp_client.py +979 -0
  41. {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/METADATA +14 -1
  42. {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/RECORD +53 -47
  43. /camel/{data_collector → data_collectors}/__init__.py +0 -0
  44. /camel/{data_collector → data_collectors}/base.py +0 -0
  45. /camel/{runtime → runtimes}/__init__.py +0 -0
  46. /camel/{runtime → runtimes}/api.py +0 -0
  47. /camel/{runtime → runtimes}/base.py +0 -0
  48. /camel/{runtime → runtimes}/configs.py +0 -0
  49. /camel/{runtime → runtimes}/utils/__init__.py +0 -0
  50. /camel/{runtime → runtimes}/utils/function_risk_toolkit.py +0 -0
  51. /camel/{runtime → runtimes}/utils/ignore_risk_toolkit.py +0 -0
  52. {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/WHEEL +0 -0
  53. {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/licenses/LICENSE +0 -0
@@ -17,9 +17,7 @@ from __future__ import annotations
17
17
 
18
18
  import datetime
19
19
  import io
20
- import json
21
20
  import os
22
- import random
23
21
  import re
24
22
  import shutil
25
23
  import time
@@ -28,18 +26,16 @@ from copy import deepcopy
28
26
  from typing import (
29
27
  TYPE_CHECKING,
30
28
  Any,
31
- BinaryIO,
32
29
  Dict,
33
30
  List,
34
31
  Literal,
35
32
  Optional,
36
33
  Tuple,
37
- TypedDict,
38
34
  Union,
39
35
  cast,
40
36
  )
41
37
 
42
- from PIL import Image, ImageDraw, ImageFont
38
+ from PIL import Image
43
39
 
44
40
  from camel.logger import get_logger
45
41
  from camel.messages import BaseMessage
@@ -54,7 +50,34 @@ from camel.utils import (
54
50
  sanitize_filename,
55
51
  )
56
52
 
53
+ # Import shared components from browser_toolkit_commons
54
+ from .browser_toolkit_commons import (
55
+ ACTION_WITH_FEEDBACK_LIST,
56
+ AVAILABLE_ACTIONS_PROMPT,
57
+ GET_FINAL_ANSWER_PROMPT_TEMPLATE,
58
+ OBSERVE_PROMPT_TEMPLATE,
59
+ PLANNING_AGENT_SYSTEM_PROMPT,
60
+ TASK_PLANNING_PROMPT_TEMPLATE,
61
+ TASK_REPLANNING_PROMPT_TEMPLATE,
62
+ WEB_AGENT_SYSTEM_PROMPT,
63
+ InteractiveRegion,
64
+ VisualViewport,
65
+ _add_set_of_mark,
66
+ _parse_json_output,
67
+ _reload_image,
68
+ interactive_region_from_dict,
69
+ visual_viewport_from_dict,
70
+ )
71
+
57
72
  if TYPE_CHECKING:
73
+ from playwright.sync_api import (
74
+ Browser,
75
+ BrowserContext,
76
+ FloatRect,
77
+ Page,
78
+ Playwright,
79
+ )
80
+
58
81
  from camel.agents import ChatAgent
59
82
 
60
83
  logger = get_logger(__name__)
@@ -62,82 +85,6 @@ logger = get_logger(__name__)
62
85
  TOP_NO_LABEL_ZONE = 20
63
86
 
64
87
 
65
- AVAILABLE_ACTIONS_PROMPT = """
66
- 1. `fill_input_id(identifier: Union[str, int], text: str)`: Fill an input
67
- field (e.g. search box) with the given text and press Enter.
68
- 2. `click_id(identifier: Union[str, int])`: Click an element with the given ID.
69
- 3. `hover_id(identifier: Union[str, int])`: Hover over an element with the
70
- given ID.
71
- 4. `download_file_id(identifier: Union[str, int])`: Download a file with the
72
- given ID. It returns the path to the downloaded file. If the file is
73
- successfully downloaded, you can stop the simulation and report the path to
74
- the downloaded file for further processing.
75
- 5. `scroll_to_bottom()`: Scroll to the bottom of the page.
76
- 6. `scroll_to_top()`: Scroll to the top of the page.
77
- 7. `scroll_up()`: Scroll up the page. It is suitable when you want to see the
78
- elements above the current viewport.
79
- 8. `scroll_down()`: Scroll down the page. It is suitable when you want to see
80
- the elements below the current viewport. If the webpage does not change, It
81
- means that the webpage has scrolled to the bottom.
82
- 9. `back()`: Navigate back to the previous page. This is useful when you want
83
- to go back to the previous page, as current page is not useful.
84
- 10. `stop()`: Stop the action process, because the task is completed or failed
85
- (impossible to find the answer). In this situation, you should provide your
86
- answer in your output.
87
- 11. `get_url()`: Get the current URL of the current page.
88
- 12. `find_text_on_page(search_text: str)`: Find the next given text on the
89
- current whole page, and scroll the page to the targeted text. It is equivalent
90
- to pressing Ctrl + F and searching for the text, and is powerful when you want
91
- to fast-check whether the current page contains some specific text.
92
- 13. `visit_page(url: str)`: Go to the specific url page.
93
- 14. `click_blank_area()`: Click a blank area of the page to unfocus the
94
- current element. It is useful when you have clicked an element but it cannot
95
- unfocus itself (e.g. Menu bar) to automatically render the updated webpage.
96
- 15. `ask_question_about_video(question: str)`: Ask a question about the
97
- current webpage which contains video, e.g. youtube websites.
98
- """
99
-
100
- ACTION_WITH_FEEDBACK_LIST = [
101
- 'ask_question_about_video',
102
- 'download_file_id',
103
- 'find_text_on_page',
104
- ]
105
-
106
-
107
- # Code from magentic-one
108
- class DOMRectangle(TypedDict):
109
- x: Union[int, float]
110
- y: Union[int, float]
111
- width: Union[int, float]
112
- height: Union[int, float]
113
- top: Union[int, float]
114
- right: Union[int, float]
115
- bottom: Union[int, float]
116
- left: Union[int, float]
117
-
118
-
119
- class VisualViewport(TypedDict):
120
- height: Union[int, float]
121
- width: Union[int, float]
122
- offsetLeft: Union[int, float]
123
- offsetTop: Union[int, float]
124
- pageLeft: Union[int, float]
125
- pageTop: Union[int, float]
126
- scale: Union[int, float]
127
- clientWidth: Union[int, float]
128
- clientHeight: Union[int, float]
129
- scrollWidth: Union[int, float]
130
- scrollHeight: Union[int, float]
131
-
132
-
133
- class InteractiveRegion(TypedDict):
134
- tag_name: str
135
- role: str
136
- aria_name: str
137
- v_scrollable: bool
138
- rects: List[DOMRectangle]
139
-
140
-
141
88
  def _get_str(d: Any, k: str) -> str:
142
89
  r"""Safely retrieve a string value from a dictionary."""
143
90
  if k not in d:
@@ -171,270 +118,6 @@ def _get_bool(d: Any, k: str) -> bool:
171
118
  )
172
119
 
173
120
 
174
- def _parse_json_output(text: str) -> Dict[str, Any]:
175
- r"""Extract JSON output from a string."""
176
-
177
- markdown_pattern = r'```(?:json)?\s*(.*?)\s*```'
178
- markdown_match = re.search(markdown_pattern, text, re.DOTALL)
179
- if markdown_match:
180
- text = markdown_match.group(1).strip()
181
-
182
- triple_quotes_pattern = r'"""(?:json)?\s*(.*?)\s*"""'
183
- triple_quotes_match = re.search(triple_quotes_pattern, text, re.DOTALL)
184
- if triple_quotes_match:
185
- text = triple_quotes_match.group(1).strip()
186
-
187
- try:
188
- return json.loads(text)
189
- except json.JSONDecodeError:
190
- try:
191
- fixed_text = re.sub(
192
- r'`([^`]*?)`(?=\s*[:,\[\]{}]|$)', r'"\1"', text
193
- )
194
- return json.loads(fixed_text)
195
- except json.JSONDecodeError:
196
- result = {}
197
- try:
198
- bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
199
- for match in re.finditer(bool_pattern, text, re.IGNORECASE):
200
- key, value = match.groups()
201
- result[key] = value.lower() == "true"
202
-
203
- str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
204
- for match in re.finditer(str_pattern, text):
205
- key, value = match.groups()
206
- result[key] = value
207
-
208
- num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
209
- for match in re.finditer(num_pattern, text):
210
- key, value = match.groups()
211
- try:
212
- result[key] = int(value)
213
- except ValueError:
214
- result[key] = float(value)
215
-
216
- empty_str_pattern = r'"(\w+)"\s*:\s*""'
217
- for match in re.finditer(empty_str_pattern, text):
218
- key = match.group(1)
219
- result[key] = ""
220
-
221
- if result:
222
- return result
223
-
224
- logger.warning(f"Failed to parse JSON output: {text}")
225
- return {}
226
- except Exception as e:
227
- logger.warning(f"Error while extracting fields from JSON: {e}")
228
- return {}
229
-
230
-
231
- def _reload_image(image: Image.Image) -> Image.Image:
232
- buffer = io.BytesIO()
233
- image.save(buffer, format="PNG")
234
- buffer.seek(0)
235
- return Image.open(buffer)
236
-
237
-
238
- def dom_rectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
239
- r"""Create a DOMRectangle object from a dictionary."""
240
- return DOMRectangle(
241
- x=_get_number(rect, "x"),
242
- y=_get_number(rect, "y"),
243
- width=_get_number(rect, "width"),
244
- height=_get_number(rect, "height"),
245
- top=_get_number(rect, "top"),
246
- right=_get_number(rect, "right"),
247
- bottom=_get_number(rect, "bottom"),
248
- left=_get_number(rect, "left"),
249
- )
250
-
251
-
252
- def interactive_region_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
253
- r"""Create an :class:`InteractiveRegion` object from a dictionary."""
254
- typed_rects: List[DOMRectangle] = []
255
- for rect in region["rects"]:
256
- typed_rects.append(dom_rectangle_from_dict(rect))
257
-
258
- return InteractiveRegion(
259
- tag_name=_get_str(region, "tag_name"),
260
- role=_get_str(region, "role"),
261
- aria_name=_get_str(region, "aria-name"),
262
- v_scrollable=_get_bool(region, "v-scrollable"),
263
- rects=typed_rects,
264
- )
265
-
266
-
267
- def visual_viewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
268
- r"""Create a :class:`VisualViewport` object from a dictionary."""
269
- return VisualViewport(
270
- height=_get_number(viewport, "height"),
271
- width=_get_number(viewport, "width"),
272
- offsetLeft=_get_number(viewport, "offsetLeft"),
273
- offsetTop=_get_number(viewport, "offsetTop"),
274
- pageLeft=_get_number(viewport, "pageLeft"),
275
- pageTop=_get_number(viewport, "pageTop"),
276
- scale=_get_number(viewport, "scale"),
277
- clientWidth=_get_number(viewport, "clientWidth"),
278
- clientHeight=_get_number(viewport, "clientHeight"),
279
- scrollWidth=_get_number(viewport, "scrollWidth"),
280
- scrollHeight=_get_number(viewport, "scrollHeight"),
281
- )
282
-
283
-
284
- def add_set_of_mark(
285
- screenshot: Union[bytes, Image.Image, io.BufferedIOBase],
286
- ROIs: Dict[str, InteractiveRegion],
287
- ) -> Tuple[Image.Image, List[str], List[str], List[str]]:
288
- if isinstance(screenshot, Image.Image):
289
- return _add_set_of_mark(screenshot, ROIs)
290
-
291
- if isinstance(screenshot, bytes):
292
- screenshot = io.BytesIO(screenshot)
293
-
294
- image = Image.open(cast(BinaryIO, screenshot))
295
- comp, visible_rects, rects_above, rects_below = _add_set_of_mark(
296
- image, ROIs
297
- )
298
- image.close()
299
- return comp, visible_rects, rects_above, rects_below
300
-
301
-
302
- def _add_set_of_mark(
303
- screenshot: Image.Image, ROIs: Dict[str, InteractiveRegion]
304
- ) -> Tuple[Image.Image, List[str], List[str], List[str]]:
305
- r"""Add a set of marks to the screenshot.
306
-
307
- Args:
308
- screenshot (Image.Image): The screenshot to add marks to.
309
- ROIs (Dict[str, InteractiveRegion]): The regions to add marks to.
310
-
311
- Returns:
312
- Tuple[Image.Image, List[str], List[str], List[str]]: A tuple
313
- containing the screenshot with marked ROIs, ROIs fully within the
314
- images, ROIs located above the visible area, and ROIs located below
315
- the visible area.
316
- """
317
- visible_rects: List[str] = list()
318
- rects_above: List[str] = list() # Scroll up to see
319
- rects_below: List[str] = list() # Scroll down to see
320
-
321
- fnt = ImageFont.load_default(14)
322
- base = screenshot.convert("L").convert("RGBA")
323
- overlay = Image.new("RGBA", base.size)
324
-
325
- draw = ImageDraw.Draw(overlay)
326
- for r in ROIs:
327
- for rect in ROIs[r]["rects"]:
328
- # Empty rectangles
329
- if not rect or rect["width"] == 0 or rect["height"] == 0:
330
- continue
331
-
332
- # TODO: add scroll left and right?
333
- horizontal_center = (rect["right"] + rect["left"]) / 2.0
334
- vertical_center = (rect["top"] + rect["bottom"]) / 2.0
335
- is_within_horizon = 0 <= horizontal_center < base.size[0]
336
- is_above_viewport = vertical_center < 0
337
- is_below_viewport = vertical_center >= base.size[1]
338
-
339
- if is_within_horizon:
340
- if is_above_viewport:
341
- rects_above.append(r)
342
- elif is_below_viewport:
343
- rects_below.append(r)
344
- else: # Fully visible
345
- visible_rects.append(r)
346
- _draw_roi(draw, int(r), fnt, rect)
347
-
348
- comp = Image.alpha_composite(base, overlay)
349
- overlay.close()
350
- return comp, visible_rects, rects_above, rects_below
351
-
352
-
353
- def _draw_roi(
354
- draw: ImageDraw.ImageDraw,
355
- idx: int,
356
- font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
357
- rect: DOMRectangle,
358
- ) -> None:
359
- r"""Draw a ROI on the image.
360
-
361
- Args:
362
- draw (ImageDraw.ImageDraw): The draw object.
363
- idx (int): The index of the ROI.
364
- font (ImageFont.FreeTypeFont | ImageFont.ImageFont): The font.
365
- rect (DOMRectangle): The DOM rectangle.
366
- """
367
- color = _get_random_color(idx)
368
- text_color = _get_text_color(color)
369
-
370
- roi = ((rect["left"], rect["top"]), (rect["right"], rect["bottom"]))
371
-
372
- label_location = (rect["right"], rect["top"])
373
- label_anchor = "rb"
374
-
375
- if label_location[1] <= TOP_NO_LABEL_ZONE:
376
- label_location = (rect["right"], rect["bottom"])
377
- label_anchor = "rt"
378
-
379
- draw.rectangle(
380
- roi, outline=color, fill=(color[0], color[1], color[2], 48), width=2
381
- )
382
-
383
- bbox = draw.textbbox(
384
- label_location,
385
- str(idx),
386
- font=font,
387
- anchor=label_anchor,
388
- align="center",
389
- )
390
- bbox = (bbox[0] - 3, bbox[1] - 3, bbox[2] + 3, bbox[3] + 3)
391
- draw.rectangle(bbox, fill=color)
392
-
393
- draw.text(
394
- label_location,
395
- str(idx),
396
- fill=text_color,
397
- font=font,
398
- anchor=label_anchor,
399
- align="center",
400
- )
401
-
402
-
403
- def _get_text_color(
404
- bg_color: Tuple[int, int, int, int],
405
- ) -> Tuple[int, int, int, int]:
406
- r"""Determine the ideal text color (black or white) for contrast.
407
-
408
- Args:
409
- bg_color: The background color (R, G, B, A).
410
-
411
- Returns:
412
- A tuple representing black or white color for text.
413
- """
414
- luminance = bg_color[0] * 0.3 + bg_color[1] * 0.59 + bg_color[2] * 0.11
415
- return (0, 0, 0, 255) if luminance > 120 else (255, 255, 255, 255)
416
-
417
-
418
- def _get_random_color(identifier: int) -> Tuple[int, int, int, int]:
419
- r"""Generate a consistent random RGBA color based on the identifier.
420
-
421
- Args:
422
- identifier: The ID used as a seed to ensure color consistency.
423
-
424
- Returns:
425
- A tuple representing (R, G, B, A) values.
426
- """
427
- rnd = random.Random(int(identifier))
428
- r = rnd.randint(0, 255)
429
- g = rnd.randint(125, 255)
430
- b = rnd.randint(0, 50)
431
- color = [r, g, b]
432
- # TODO: check why shuffle is needed?
433
- rnd.shuffle(color)
434
- color.append(255)
435
- return cast(Tuple[int, int, int, int], tuple(color))
436
-
437
-
438
121
  class BaseBrowser:
439
122
  def __init__(
440
123
  self,
@@ -453,7 +136,8 @@ class BaseBrowser:
453
136
  "chromium".
454
137
  cookie_json_path (Optional[str]): Path to a JSON file containing
455
138
  authentication cookies and browser storage state. If provided
456
- and the file exists, the browser will load this state to maintain
139
+ and the file exists, the browser will load this state to
140
+ maintain
457
141
  authenticated sessions without requiring manual login.
458
142
 
459
143
  Returns:
@@ -463,12 +147,14 @@ class BaseBrowser:
463
147
  sync_playwright,
464
148
  )
465
149
 
466
- self.history: list = []
150
+ self.history: List[Any] = []
467
151
  self.headless = headless
468
152
  self.channel = channel
469
153
  self._ensure_browser_installed()
470
- self.playwright = sync_playwright().start()
471
- self.page_history: list = [] # stores the history of visited pages
154
+ self.playwright: Playwright = sync_playwright().start()
155
+ self.page_history: List[
156
+ str
157
+ ] = [] # stores the history of visited pages
472
158
  self.cookie_json_path = cookie_json_path
473
159
 
474
160
  # Set the cache directory
@@ -487,10 +173,18 @@ class BaseBrowser:
487
173
  raise FileNotFoundError(
488
174
  f"Page script file not found at path: {page_script_path}"
489
175
  )
176
+ self.browser: Optional[Browser] = None
177
+ self.context: Optional[BrowserContext] = None
178
+ self.page: Optional[Page] = None
179
+ self.page_url: Optional[str] = None
180
+ self.web_agent_model: Optional[BaseModelBackend] = (
181
+ None # Added for type hinting
182
+ )
490
183
 
491
184
  def init(self) -> None:
492
185
  r"""Initialize the browser."""
493
186
  # Launch the browser, if headless is False, the browser will display
187
+ assert self.playwright is not None
494
188
  self.browser = self.playwright.chromium.launch(
495
189
  headless=self.headless, channel=self.channel
496
190
  )
@@ -498,6 +192,7 @@ class BaseBrowser:
498
192
  # Check if cookie file exists before using it to maintain
499
193
  # authenticated sessions. This prevents errors when the cookie file
500
194
  # doesn't exist
195
+ assert self.browser is not None
501
196
  if self.cookie_json_path and os.path.exists(self.cookie_json_path):
502
197
  self.context = self.browser.new_context(
503
198
  accept_downloads=True, storage_state=self.cookie_json_path
@@ -507,6 +202,7 @@ class BaseBrowser:
507
202
  accept_downloads=True,
508
203
  )
509
204
  # Create a new page
205
+ assert self.context is not None
510
206
  self.page = self.context.new_page()
511
207
 
512
208
  def clean_cache(self) -> None:
@@ -517,7 +213,7 @@ class BaseBrowser:
517
213
  def _wait_for_load(self, timeout: int = 20) -> None:
518
214
  r"""Wait for a certain amount of time for the page to load."""
519
215
  timeout_ms = timeout * 1000
520
-
216
+ assert self.page is not None
521
217
  self.page.wait_for_load_state("load", timeout=timeout_ms)
522
218
 
523
219
  # TODO: check if this is needed
@@ -525,13 +221,14 @@ class BaseBrowser:
525
221
 
526
222
  def click_blank_area(self) -> None:
527
223
  r"""Click a blank area of the page to unfocus the current element."""
224
+ assert self.page is not None
528
225
  self.page.mouse.click(0, 0)
529
226
  self._wait_for_load()
530
227
 
531
228
  @retry_on_error()
532
229
  def visit_page(self, url: str) -> None:
533
230
  r"""Visit a page with the given URL."""
534
-
231
+ assert self.page is not None
535
232
  self.page.goto(url)
536
233
  self._wait_for_load()
537
234
  self.page_url = url
@@ -548,7 +245,8 @@ class BaseBrowser:
548
245
  """
549
246
  current_url = self.get_url()
550
247
 
551
- # Confirm with user before proceeding due to potential slow processing time
248
+ # Confirm with user before proceeding due to potential slow
249
+ # processing time
552
250
  confirmation_message = (
553
251
  f"Do you want to analyze the video on the current "
554
252
  f"page({current_url})? This operation may take a long time.(y/n): "
@@ -559,7 +257,10 @@ class BaseBrowser:
559
257
  return "User cancelled the video analysis."
560
258
 
561
259
  model = None
562
- if hasattr(self, 'web_agent_model'):
260
+ if (
261
+ hasattr(self, 'web_agent_model')
262
+ and self.web_agent_model is not None
263
+ ):
563
264
  model = self.web_agent_model
564
265
 
565
266
  video_analyzer = VideoAnalysisToolkit(model=model)
@@ -581,6 +282,7 @@ class BaseBrowser:
581
282
  image and the path to the image file if saved, otherwise
582
283
  :obj:`None`.
583
284
  """
285
+ assert self.page is not None
584
286
  image_data = self.page.screenshot(timeout=60000)
585
287
  image = Image.open(io.BytesIO(image_data))
586
288
 
@@ -588,6 +290,7 @@ class BaseBrowser:
588
290
  if save_image:
589
291
  # Get url name to form a file name
590
292
  # Use urlparser for a safer extraction the url name
293
+ assert self.page_url is not None
591
294
  parsed_url = urllib.parse.urlparse(self.page_url)
592
295
  # Max length is set to 241 as there are 10 characters for the
593
296
  # timestamp and 4 characters for the file extension:
@@ -615,17 +318,24 @@ class BaseBrowser:
615
318
  Returns:
616
319
  List[str]: A list of paths to the screenshot files.
617
320
  """
618
- screenshots = []
619
- scroll_height = self.page.evaluate("document.body.scrollHeight")
321
+ screenshots: List[str] = [] # Ensure screenshots is typed
322
+ assert self.page is not None
323
+ scroll_height_eval = self.page.evaluate("document.body.scrollHeight")
324
+ scroll_height = cast(
325
+ float, scroll_height_eval
326
+ ) # Ensure scroll_height is
327
+ # float
328
+
620
329
  assert self.page.viewport_size is not None
621
330
  viewport_height = self.page.viewport_size["height"]
622
- current_scroll = 0
623
- screenshot_index = 1
331
+ current_scroll_eval = self.page.evaluate("window.scrollY")
332
+ current_scroll = cast(float, current_scroll_eval)
333
+ # screenshot_index = 1 # This variable is not used
624
334
 
625
335
  max_height = scroll_height - viewport_height
626
336
  scroll_step = int(viewport_height * scroll_ratio)
627
337
 
628
- last_height = 0
338
+ last_height = 0.0 # Initialize last_height as float
629
339
 
630
340
  while True:
631
341
  logger.debug(
@@ -634,19 +344,22 @@ class BaseBrowser:
634
344
  )
635
345
 
636
346
  _, file_path = self.get_screenshot(save_image=True)
637
- screenshots.append(file_path)
347
+ if file_path is not None: # Ensure file_path is not None before
348
+ # appending
349
+ screenshots.append(file_path)
638
350
 
639
351
  self.page.evaluate(f"window.scrollBy(0, {scroll_step})")
640
352
  # Allow time for content to load
641
353
  time.sleep(0.5)
642
354
 
643
- current_scroll = self.page.evaluate("window.scrollY")
355
+ current_scroll_eval = self.page.evaluate("window.scrollY")
356
+ current_scroll = cast(float, current_scroll_eval)
644
357
  # Break if there is no significant scroll
645
358
  if abs(current_scroll - last_height) < viewport_height * 0.1:
646
359
  break
647
360
 
648
361
  last_height = current_scroll
649
- screenshot_index += 1
362
+ # screenshot_index += 1 # This variable is not used
650
363
 
651
364
  return screenshots
652
365
 
@@ -656,13 +369,17 @@ class BaseBrowser:
656
369
  Returns:
657
370
  VisualViewport: The visual viewport of the current page.
658
371
  """
372
+ assert self.page is not None
659
373
  try:
660
374
  self.page.evaluate(self.page_script)
661
375
  except Exception as e:
662
376
  logger.warning(f"Error evaluating page script: {e}")
663
377
 
378
+ visual_viewport_eval = self.page.evaluate(
379
+ "MultimodalWebSurfer.getVisualViewport();"
380
+ )
664
381
  return visual_viewport_from_dict(
665
- self.page.evaluate("MultimodalWebSurfer.getVisualViewport();")
382
+ cast(Dict[str, Any], visual_viewport_eval)
666
383
  )
667
384
 
668
385
  def get_interactive_elements(self) -> Dict[str, InteractiveRegion]:
@@ -671,6 +388,7 @@ class BaseBrowser:
671
388
  Returns:
672
389
  Dict[str, InteractiveRegion]: A dictionary of interactive elements.
673
390
  """
391
+ assert self.page is not None
674
392
  try:
675
393
  self.page.evaluate(self.page_script)
676
394
  except Exception as e:
@@ -685,7 +403,7 @@ class BaseBrowser:
685
403
  for k in result:
686
404
  typed_results[k] = interactive_region_from_dict(result[k])
687
405
 
688
- return typed_results # type: ignore[return-value]
406
+ return typed_results
689
407
 
690
408
  def get_som_screenshot(
691
409
  self,
@@ -699,7 +417,8 @@ class BaseBrowser:
699
417
  directory.
700
418
 
701
419
  Returns:
702
- Tuple[Image.Image, Union[str, None]]: A tuple containing the screenshot image
420
+ Tuple[Image.Image, Union[str, None]]: A tuple containing the
421
+ screenshot image
703
422
  and an optional path to the image file if saved, otherwise
704
423
  :obj:`None`.
705
424
  """
@@ -709,11 +428,12 @@ class BaseBrowser:
709
428
  rects = self.get_interactive_elements()
710
429
 
711
430
  file_path: str | None = None
712
- comp, _, _, _ = add_set_of_mark(
431
+ comp, _, _, _ = _add_set_of_mark(
713
432
  screenshot,
714
- rects, # type: ignore[arg-type]
433
+ rects,
715
434
  )
716
435
  if save_image:
436
+ assert self.page_url is not None
717
437
  parsed_url = urllib.parse.urlparse(self.page_url)
718
438
  # Max length is set to 241 as there are 10 characters for the
719
439
  # timestamp and 4 characters for the file extension:
@@ -730,25 +450,30 @@ class BaseBrowser:
730
450
 
731
451
  def scroll_up(self) -> None:
732
452
  r"""Scroll up the page."""
453
+ assert self.page is not None
733
454
  self.page.keyboard.press("PageUp")
734
455
 
735
456
  def scroll_down(self) -> None:
736
457
  r"""Scroll down the page."""
458
+ assert self.page is not None
737
459
  self.page.keyboard.press("PageDown")
738
460
 
739
461
  def get_url(self) -> str:
740
462
  r"""Get the URL of the current page."""
463
+ assert self.page is not None
741
464
  return self.page.url
742
465
 
743
466
  def click_id(self, identifier: Union[str, int]) -> None:
744
467
  r"""Click an element with the given identifier."""
468
+ assert self.page is not None
745
469
  if isinstance(identifier, int):
746
470
  identifier = str(identifier)
747
471
  target = self.page.locator(f"[__elementId='{identifier}']")
748
472
 
749
473
  try:
750
474
  target.wait_for(timeout=5000)
751
- except (TimeoutError, Exception) as e:
475
+ except Exception as e: # Consider using playwright specific
476
+ # TimeoutError
752
477
  logger.debug(f"Error during click operation: {e}")
753
478
  raise ValueError("No such element.") from None
754
479
 
@@ -757,7 +482,13 @@ class BaseBrowser:
757
482
  new_page = None
758
483
  try:
759
484
  with self.page.expect_event("popup", timeout=1000) as page_info:
760
- box = cast(Dict[str, Union[int, float]], target.bounding_box())
485
+ box: Optional[FloatRect] = target.bounding_box()
486
+ if box is None:
487
+ logger.warning(
488
+ f"Bounding box not found for element '{identifier}'. "
489
+ f"Cannot click."
490
+ )
491
+ return
761
492
  self.page.mouse.click(
762
493
  box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
763
494
  )
@@ -768,7 +499,8 @@ class BaseBrowser:
768
499
  self.page_history.append(deepcopy(self.page.url))
769
500
  self.page = new_page
770
501
 
771
- except (TimeoutError, Exception) as e:
502
+ except Exception as e: # Consider using playwright specific
503
+ # TimeoutError
772
504
  logger.debug(f"Error during click operation: {e}")
773
505
  pass
774
506
 
@@ -776,6 +508,7 @@ class BaseBrowser:
776
508
 
777
509
  def extract_url_content(self) -> str:
778
510
  r"""Extract the content of the current page."""
511
+ assert self.page is not None
779
512
  content = self.page.content()
780
513
  return content
781
514
 
@@ -784,17 +517,17 @@ class BaseBrowser:
784
517
 
785
518
  Args:
786
519
  identifier (str): The identifier of the file to download.
787
- file_path (str): The path to save the downloaded file.
788
520
 
789
521
  Returns:
790
522
  str: The result of the action.
791
523
  """
792
-
524
+ assert self.page is not None
793
525
  if isinstance(identifier, int):
794
526
  identifier = str(identifier)
795
527
  try:
796
528
  target = self.page.locator(f"[__elementId='{identifier}']")
797
- except (TimeoutError, Exception) as e:
529
+ except Exception as e: # Consider using playwright specific
530
+ # TimeoutError
798
531
  logger.debug(f"Error during download operation: {e}")
799
532
  logger.warning(
800
533
  f"Element with identifier '{identifier}' not found."
@@ -803,7 +536,7 @@ class BaseBrowser:
803
536
 
804
537
  target.scroll_into_view_if_needed()
805
538
 
806
- file_path = os.path.join(self.cache_dir)
539
+ file_path_val = os.path.join(self.cache_dir)
807
540
  self._wait_for_load()
808
541
 
809
542
  try:
@@ -812,12 +545,13 @@ class BaseBrowser:
812
545
  download = download_info.value
813
546
  file_name = download.suggested_filename
814
547
 
815
- file_path = os.path.join(file_path, file_name)
816
- download.save_as(file_path)
548
+ file_path_val = os.path.join(file_path_val, file_name)
549
+ download.save_as(file_path_val)
817
550
 
818
- return f"Downloaded file to path '{file_path}'."
551
+ return f"Downloaded file to path '{file_path_val}'."
819
552
 
820
- except (TimeoutError, Exception) as e:
553
+ except Exception as e: # Consider using playwright specific
554
+ # TimeoutError
821
555
  logger.debug(f"Error during download operation: {e}")
822
556
  return f"Failed to download file with identifier '{identifier}'."
823
557
 
@@ -831,12 +565,14 @@ class BaseBrowser:
831
565
  Returns:
832
566
  str: The result of the action.
833
567
  """
568
+ assert self.page is not None
834
569
  if isinstance(identifier, int):
835
570
  identifier = str(identifier)
836
571
 
837
572
  try:
838
573
  target = self.page.locator(f"[__elementId='{identifier}']")
839
- except (TimeoutError, Exception) as e:
574
+ except Exception as e: # Consider using playwright specific
575
+ # TimeoutError
840
576
  logger.debug(f"Error during fill operation: {e}")
841
577
  logger.warning(
842
578
  f"Element with identifier '{identifier}' not found."
@@ -847,7 +583,8 @@ class BaseBrowser:
847
583
  target.focus()
848
584
  try:
849
585
  target.fill(text)
850
- except (TimeoutError, Exception) as e:
586
+ except Exception as e: # Consider using playwright specific
587
+ # TimeoutError
851
588
  logger.debug(f"Error during fill operation: {e}")
852
589
  target.press_sequentially(text)
853
590
 
@@ -859,11 +596,13 @@ class BaseBrowser:
859
596
  )
860
597
 
861
598
  def scroll_to_bottom(self) -> str:
599
+ assert self.page is not None
862
600
  self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
863
601
  self._wait_for_load()
864
602
  return "Scrolled to the bottom of the page."
865
603
 
866
604
  def scroll_to_top(self) -> str:
605
+ assert self.page is not None
867
606
  self.page.evaluate("window.scrollTo(0, 0);")
868
607
  self._wait_for_load()
869
608
  return "Scrolled to the top of the page."
@@ -877,11 +616,13 @@ class BaseBrowser:
877
616
  Returns:
878
617
  str: The result of the action.
879
618
  """
619
+ assert self.page is not None
880
620
  if isinstance(identifier, int):
881
621
  identifier = str(identifier)
882
622
  try:
883
623
  target = self.page.locator(f"[__elementId='{identifier}']")
884
- except (TimeoutError, Exception) as e:
624
+ except Exception as e: # Consider using playwright specific
625
+ # TimeoutError
885
626
  logger.debug(f"Error during hover operation: {e}")
886
627
  logger.warning(
887
628
  f"Element with identifier '{identifier}' not found."
@@ -899,15 +640,18 @@ class BaseBrowser:
899
640
  the text.
900
641
  """
901
642
  # ruff: noqa: E501
643
+ assert self.page is not None
902
644
  script = f"""
903
- (function() {{
645
+ (function() {{
904
646
  let text = "{search_text}";
905
647
  let found = window.find(text);
906
648
  if (!found) {{
907
- let elements = document.querySelectorAll("*:not(script):not(style)");
649
+ let elements = document.querySelectorAll("*:not(script):not(
650
+ style)");
908
651
  for (let el of elements) {{
909
652
  if (el.innerText && el.innerText.includes(text)) {{
910
- el.scrollIntoView({{behavior: "smooth", block: "center"}});
653
+ el.scrollIntoView({{behavior: "smooth", block:
654
+ "center"}});
911
655
  el.style.backgroundColor = "yellow";
912
656
  el.style.border = '2px solid red';
913
657
  return true;
@@ -918,7 +662,8 @@ class BaseBrowser:
918
662
  return true;
919
663
  }})();
920
664
  """
921
- found = self.page.evaluate(script)
665
+ found_eval = self.page.evaluate(script)
666
+ found = cast(bool, found_eval) # Ensure found is bool
922
667
  self._wait_for_load()
923
668
  if found:
924
669
  return f"Found text '{search_text}' on the page."
@@ -927,7 +672,7 @@ class BaseBrowser:
927
672
 
928
673
  def back(self):
929
674
  r"""Navigate back to the previous page."""
930
-
675
+ assert self.page is not None
931
676
  page_url_before = self.page.url
932
677
  self.page.go_back()
933
678
 
@@ -945,15 +690,21 @@ class BaseBrowser:
945
690
  self._wait_for_load()
946
691
 
947
692
  def close(self):
693
+ assert self.browser is not None
948
694
  self.browser.close()
695
+ if self.playwright:
696
+ self.playwright.stop() # Stop playwright instance
949
697
 
950
698
  # ruff: noqa: E501
951
699
  def show_interactive_elements(self):
952
700
  r"""Show simple interactive elements on the current page."""
701
+ assert self.page is not None
953
702
  self.page.evaluate(self.page_script)
954
703
  self.page.evaluate("""
955
704
  () => {
956
- document.querySelectorAll('a, button, input, select, textarea, [tabindex]:not([tabindex="-1"]), [contenteditable="true"]').forEach(el => {
705
+ document.querySelectorAll('a, button, input, select, textarea,
706
+ [tabindex]:not([tabindex="-1"]),
707
+ [contenteditable="true"]').forEach(el => {
957
708
  el.style.border = '2px solid red';
958
709
  });
959
710
  }
@@ -963,6 +714,7 @@ class BaseBrowser:
963
714
  def get_webpage_content(self) -> str:
964
715
  from html2text import html2text
965
716
 
717
+ assert self.page is not None
966
718
  self._wait_for_load()
967
719
  html_content = self.page.content()
968
720
 
@@ -1048,25 +800,32 @@ class BrowserToolkit(BaseToolkit):
1048
800
  (default: :obj:`"en`")
1049
801
  cookie_json_path (Optional[str]): Path to a JSON file containing
1050
802
  authentication cookies and browser storage state. If provided
1051
- and the file exists, the browser will load this state to maintain
803
+ and the file exists, the browser will load this state to
804
+ maintain
1052
805
  authenticated sessions without requiring manual login.
1053
806
  (default: :obj:`None`)
1054
807
  """
1055
-
808
+ super().__init__() # Call to super().__init__() added
1056
809
  self.browser = BaseBrowser(
1057
810
  headless=headless,
1058
811
  cache_dir=cache_dir,
1059
812
  channel=channel,
1060
813
  cookie_json_path=cookie_json_path,
1061
814
  )
815
+ self.browser.web_agent_model = web_agent_model # Pass model to
816
+ # BaseBrowser instance
1062
817
 
1063
818
  self.history_window = history_window
1064
819
  self.web_agent_model = web_agent_model
1065
820
  self.planning_agent_model = planning_agent_model
1066
821
  self.output_language = output_language
1067
822
 
1068
- self.history: list = []
1069
- self.web_agent, self.planning_agent = self._initialize_agent()
823
+ self.history: List[Dict[str, Any]] = [] # Typed history list
824
+ self.web_agent: ChatAgent
825
+ self.planning_agent: ChatAgent
826
+ self.web_agent, self.planning_agent = self._initialize_agent(
827
+ web_agent_model, planning_agent_model
828
+ )
1070
829
 
1071
830
  def _reset(self):
1072
831
  self.web_agent.reset()
@@ -1074,43 +833,40 @@ class BrowserToolkit(BaseToolkit):
1074
833
  self.history = []
1075
834
  os.makedirs(self.browser.cache_dir, exist_ok=True)
1076
835
 
1077
- def _initialize_agent(self) -> Tuple["ChatAgent", "ChatAgent"]:
836
+ def _initialize_agent(
837
+ self,
838
+ web_agent_model_backend: Optional[BaseModelBackend],
839
+ planning_agent_model_backend: Optional[BaseModelBackend],
840
+ ) -> Tuple[ChatAgent, ChatAgent]:
1078
841
  r"""Initialize the agent."""
1079
842
  from camel.agents import ChatAgent
1080
843
 
1081
- if self.web_agent_model is None:
1082
- web_agent_model = ModelFactory.create(
844
+ if web_agent_model_backend is None:
845
+ web_agent_model_instance = ModelFactory.create(
1083
846
  model_platform=ModelPlatformType.OPENAI,
1084
847
  model_type=ModelType.GPT_4_1,
1085
848
  model_config_dict={"temperature": 0, "top_p": 1},
1086
849
  )
1087
850
  else:
1088
- web_agent_model = self.web_agent_model
851
+ web_agent_model_instance = web_agent_model_backend
1089
852
 
1090
- if self.planning_agent_model is None:
853
+ if planning_agent_model_backend is None:
1091
854
  planning_model = ModelFactory.create(
1092
855
  model_platform=ModelPlatformType.OPENAI,
1093
856
  model_type=ModelType.O3_MINI,
1094
857
  )
1095
858
  else:
1096
- planning_model = self.planning_agent_model
859
+ planning_model = planning_agent_model_backend
1097
860
 
1098
- system_prompt = """
1099
- You are a helpful web agent that can assist users in browsing the web.
1100
- Given a high-level task, you can leverage predefined browser tools to help
1101
- users achieve their goals.
1102
- """
861
+ system_prompt = WEB_AGENT_SYSTEM_PROMPT
1103
862
 
1104
863
  web_agent = ChatAgent(
1105
864
  system_message=system_prompt,
1106
- model=web_agent_model,
865
+ model=web_agent_model_instance,
1107
866
  output_language=self.output_language,
1108
867
  )
1109
868
 
1110
- planning_system_prompt = """
1111
- You are a helpful planning agent that can assist users in planning complex
1112
- tasks which need multi-step browser interaction.
1113
- """
869
+ planning_system_prompt = PLANNING_AGENT_SYSTEM_PROMPT
1114
870
 
1115
871
  planning_agent = ChatAgent(
1116
872
  system_message=planning_system_prompt,
@@ -1123,96 +879,24 @@ tasks which need multi-step browser interaction.
1123
879
  def _observe(
1124
880
  self, task_prompt: str, detailed_plan: Optional[str] = None
1125
881
  ) -> Tuple[str, str, str]:
1126
- r"""Let agent observe the current environment, and get the next action."""
882
+ r"""Let agent observe the current environment, and get the next
883
+ action."""
1127
884
 
1128
- detailed_plan_prompt = ""
885
+ detailed_plan_prompt_str = ""
1129
886
 
1130
887
  if detailed_plan is not None:
1131
- detailed_plan_prompt = f"""
888
+ detailed_plan_prompt_str = f"""
1132
889
  Here is a plan about how to solve the task step-by-step which you must follow:
1133
890
  <detailed_plan>{detailed_plan}<detailed_plan>
1134
891
  """
1135
892
 
1136
- observe_prompt = f"""
1137
- Please act as a web agent to help me complete the following high-level task:
1138
- <task>{task_prompt}</task>
1139
- Now, I have made screenshot (only the current viewport, not the full webpage)
1140
- based on the current browser state, and marked interactive elements in the
1141
- webpage.
1142
- Please carefully examine the requirements of the task, and current state of
1143
- the browser, and provide the next appropriate action to take.
1144
-
1145
- {detailed_plan_prompt}
1146
-
1147
- Here are the current available browser functions you can use:
1148
- {AVAILABLE_ACTIONS_PROMPT}
1149
-
1150
- Here are the latest {self.history_window} trajectory (at most) you have taken:
1151
- <history>
1152
- {self.history[-self.history_window :]}
1153
- </history>
1154
-
1155
- Your output should be in json format, including the following fields:
1156
- - `observation`: The detailed image description about the current viewport. Do
1157
- not over-confident about the correctness of the history actions. You should
1158
- always check the current viewport to make sure the correctness of the next
1159
- action.
1160
- - `reasoning`: The reasoning about the next action you want to take, and the
1161
- possible obstacles you may encounter, and how to solve them. Do not forget to
1162
- check the history actions to avoid the same mistakes.
1163
- - `action_code`: The action code you want to take. It is only one step action
1164
- code, without any other texts (such as annotation)
1165
-
1166
- Here is two example of the output:
1167
- ```json
1168
- {{
1169
- "observation": [IMAGE_DESCRIPTION],
1170
- "reasoning": [YOUR_REASONING],
1171
- "action_code": "fill_input_id([ID], [TEXT])"
1172
- }}
1173
-
1174
- {{
1175
- "observation": "The current page is a CAPTCHA verification page on Amazon. It asks the user to ..",
1176
- "reasoning": "To proceed with the task of searching for products, I need to complete..",
1177
- "action_code": "fill_input_id(3, 'AUXPMR')"
1178
- }}
1179
-
1180
- Here are some tips for you:
1181
- - Never forget the overall question: **{task_prompt}**
1182
- - Maybe after a certain operation (e.g. click_id), the page content has not
1183
- changed. You can check whether the action step is successful by looking at the
1184
- `success` of the action step in the history. If successful, it means that the
1185
- page content is indeed the same after the click. You need to try other methods.
1186
- - If using one way to solve the problem is not successful, try other ways.
1187
- Make sure your provided ID is correct!
1188
- - Some cases are very complex and need to be achieve by an iterative process.
1189
- You can use the `back()` function to go back to the previous page to try other
1190
- methods.
1191
- - There are many links on the page, which may be useful for solving the
1192
- problem. You can use the `click_id()` function to click on the link to see if
1193
- it is useful.
1194
- - Always keep in mind that your action must be based on the ID shown in the
1195
- current image or viewport, not the ID shown in the history.
1196
- - Do not use `stop()` lightly. Always remind yourself that the image only
1197
- shows a part of the full page. If you cannot find the answer, try to use
1198
- functions like `scroll_up()` and `scroll_down()` to check the full content of
1199
- the webpage before doing anything else, because the answer or next key step
1200
- may be hidden in the content below.
1201
- - If the webpage needs human verification, you must avoid processing it.
1202
- Please use `back()` to go back to the previous page, and try other ways.
1203
- - If you have tried everything and still cannot resolve the issue, please stop
1204
- the simulation, and report issues you have encountered.
1205
- - Check the history actions carefully, detect whether you have repeatedly made
1206
- the same actions or not.
1207
- - When dealing with wikipedia revision history related tasks, you need to
1208
- think about the solution flexibly. First, adjust the browsing history
1209
- displayed on a single page to the maximum, and then make use of the
1210
- find_text_on_page function. This is extremely useful which can quickly locate
1211
- the text you want to find and skip massive amount of useless information.
1212
- - Flexibly use interactive elements like slide down selection bar to filter
1213
- out the information you need. Sometimes they are extremely useful.
1214
- ```
1215
- """
893
+ observe_prompt = OBSERVE_PROMPT_TEMPLATE.format(
894
+ task_prompt=task_prompt,
895
+ detailed_plan_prompt=detailed_plan_prompt_str,
896
+ AVAILABLE_ACTIONS_PROMPT=AVAILABLE_ACTIONS_PROMPT,
897
+ history_window=self.history_window,
898
+ history=self.history[-self.history_window :],
899
+ )
1216
900
 
1217
901
  # get current state
1218
902
  som_screenshot, _ = self.browser.get_som_screenshot(save_image=True)
@@ -1226,7 +910,8 @@ out the information you need. Sometimes they are extremely useful.
1226
910
 
1227
911
  resp_content = resp.msgs[0].content
1228
912
 
1229
- resp_dict = _parse_json_output(resp_content)
913
+ resp_dict = _parse_json_output(resp_content, logger) # Pass logger to
914
+ # _parse_json_output
1230
915
  observation_result: str = resp_dict.get("observation", "")
1231
916
  reasoning_result: str = resp_dict.get("reasoning", "")
1232
917
  action_code: str = resp_dict.get("action_code", "")
@@ -1247,7 +932,10 @@ out the information you need. Sometimes they are extremely useful.
1247
932
  id_part = (
1248
933
  parts[0].replace("fill_input_id(", "").strip()
1249
934
  )
1250
- action_code = f"fill_input_id({id_part}, 'Please fill the text here.')"
935
+ action_code = (
936
+ f"fill_input_id({id_part}, 'Please "
937
+ f"fill the text here.')"
938
+ )
1251
939
 
1252
940
  action_code = action_code.replace("`", "").strip()
1253
941
 
@@ -1349,43 +1037,36 @@ out the information you need. Sometimes they are extremely useful.
1349
1037
  )
1350
1038
 
1351
1039
  def _get_final_answer(self, task_prompt: str) -> str:
1352
- r"""Get the final answer based on the task prompt and current browser state.
1353
- It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
1040
+ r"""Get the final answer based on the task prompt and current
1041
+ browser state.
1042
+ It is used when the agent thinks that the task can be completed
1043
+ without any further action, and answer can be directly found in the
1044
+ current viewport.
1354
1045
  """
1355
1046
 
1356
- prompt = f"""
1357
- We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
1358
- Here are all trajectory we have taken:
1359
- <history>{self.history}</history>
1360
- Please find the final answer, or give valuable insights and founds (e.g. if previous actions contain downloading files, your output should include the path of the downloaded file) about the overall task: <task>{task_prompt}</task>
1361
- """
1047
+ prompt = GET_FINAL_ANSWER_PROMPT_TEMPLATE.format(
1048
+ history=self.history, task_prompt=task_prompt
1049
+ )
1362
1050
 
1363
1051
  message = BaseMessage.make_user_message(
1364
1052
  role_name='user',
1365
1053
  content=prompt,
1366
1054
  )
1367
-
1055
+ self.web_agent.reset() # Reset before step
1368
1056
  resp = self.web_agent.step(message)
1369
1057
  return resp.msgs[0].content
1370
1058
 
1371
1059
  def _task_planning(self, task_prompt: str, start_url: str) -> str:
1372
1060
  r"""Plan the task based on the given task prompt."""
1373
1061
 
1374
- # Here are the available browser functions we can use: {AVAILABLE_ACTIONS_PROMPT}
1375
-
1376
- planning_prompt = f"""
1377
- <task>{task_prompt}</task>
1378
- According to the problem above, if we use browser interaction, what is the general process of the interaction after visiting the webpage `{start_url}`?
1379
-
1380
- Please note that it can be viewed as Partially Observable MDP. Do not over-confident about your plan.
1381
- Please first restate the task in detail, and then provide a detailed plan to solve the task.
1382
- """
1383
- # Here are some tips for you: Please note that we can only see a part of the full page because of the limited viewport after an action. Thus, do not forget to use methods like `scroll_up()` and `scroll_down()` to check the full content of the webpage, because the answer or next key step may be hidden in the content below.
1062
+ planning_prompt = TASK_PLANNING_PROMPT_TEMPLATE.format(
1063
+ task_prompt=task_prompt, start_url=start_url
1064
+ )
1384
1065
 
1385
1066
  message = BaseMessage.make_user_message(
1386
1067
  role_name='user', content=planning_prompt
1387
1068
  )
1388
-
1069
+ self.planning_agent.reset() # Reset before step
1389
1070
  resp = self.planning_agent.step(message)
1390
1071
  return resp.msgs[0].content
1391
1072
 
@@ -1399,35 +1080,26 @@ Please first restate the task in detail, and then provide a detailed plan to sol
1399
1080
  detailed_plan (str): The detailed plan to replan.
1400
1081
 
1401
1082
  Returns:
1402
- Tuple[bool, str]: A tuple containing a boolean indicating whether the task needs to be replanned, and the replanned schema.
1083
+ Tuple[bool, str]: A tuple containing a boolean indicating
1084
+ whether the task needs to be replanned, and the replanned schema.
1403
1085
  """
1404
1086
 
1405
- # Here are the available browser functions we can use: {AVAILABLE_ACTIONS_PROMPT}
1406
- replanning_prompt = f"""
1407
- We are using browser interaction to solve a complex task which needs multi-step actions.
1408
- Here are the overall task:
1409
- <overall_task>{task_prompt}</overall_task>
1410
-
1411
- In order to solve the task, we made a detailed plan previously. Here is the detailed plan:
1412
- <detailed plan>{detailed_plan}</detailed plan>
1413
-
1414
- According to the task above, we have made a series of observations, reasonings, and actions. Here are the latest {self.history_window} trajectory (at most) we have taken:
1415
- <history>{self.history[-self.history_window :]}</history>
1416
-
1417
- However, the task is not completed yet. As the task is partially observable, we may need to replan the task based on the current state of the browser if necessary.
1418
- Now please carefully examine the current task planning schema, and our history actions, and then judge whether the task needs to be fundamentally replanned. If so, please provide a detailed replanned schema (including the restated overall task).
1419
-
1420
- Your output should be in json format, including the following fields:
1421
- - `if_need_replan`: bool, A boolean value indicating whether the task needs to be fundamentally replanned.
1422
- - `replanned_schema`: str, The replanned schema for the task, which should not be changed too much compared with the original one. If the task does not need to be replanned, the value should be an empty string.
1423
- """
1087
+ replanning_prompt = TASK_REPLANNING_PROMPT_TEMPLATE.format(
1088
+ task_prompt=task_prompt,
1089
+ detailed_plan=detailed_plan,
1090
+ history_window=self.history_window,
1091
+ history=self.history[-self.history_window :],
1092
+ )
1424
1093
  # Reset the history message of planning_agent.
1425
1094
  self.planning_agent.reset()
1426
1095
  resp = self.planning_agent.step(replanning_prompt)
1427
- resp_dict = _parse_json_output(resp.msgs[0].content)
1096
+ resp_dict = _parse_json_output(
1097
+ resp.msgs[0].content, logger
1098
+ ) # Pass logger
1428
1099
 
1429
- if_need_replan = resp_dict.get("if_need_replan", False)
1430
- replanned_schema = resp_dict.get("replanned_schema", "")
1100
+ if_need_replan_eval = resp_dict.get("if_need_replan", False)
1101
+ if_need_replan = cast(bool, if_need_replan_eval) # Ensure bool
1102
+ replanned_schema: str = resp_dict.get("replanned_schema", "")
1431
1103
 
1432
1104
  if if_need_replan:
1433
1105
  return True, replanned_schema
@@ -1466,10 +1138,10 @@ Your output should be in json format, including the following fields:
1466
1138
  logger.debug(f"Observation: {observation}")
1467
1139
  logger.debug(f"Reasoning: {reasoning}")
1468
1140
  logger.debug(f"Action code: {action_code}")
1469
-
1141
+ trajectory_info: Dict[str, Any]
1470
1142
  if "stop" in action_code:
1471
1143
  task_completed = True
1472
- trajectory_info = {
1144
+ trajectory_info = { # Typed trajectory_info
1473
1145
  "round": i,
1474
1146
  "observation": observation,
1475
1147
  "thought": reasoning,
@@ -1486,7 +1158,7 @@ Your output should be in json format, including the following fields:
1486
1158
  if not success:
1487
1159
  logger.warning(f"Error while executing the action: {info}")
1488
1160
 
1489
- trajectory_info = {
1161
+ trajectory_info = { # Typed trajectory_info
1490
1162
  "round": i,
1491
1163
  "observation": observation,
1492
1164
  "thought": reasoning,
@@ -1505,15 +1177,20 @@ Your output should be in json format, including the following fields:
1505
1177
  detailed_plan = replanned_schema
1506
1178
  logger.debug(f"Replanned schema: {replanned_schema}")
1507
1179
 
1180
+ simulation_result: str
1508
1181
  if not task_completed:
1509
1182
  simulation_result = f"""
1510
- The task is not completed within the round limit. Please check the last round {self.history_window} information to see if there is any useful information:
1183
+ The task is not completed within the round limit. Please
1184
+ check the last round {self.history_window} information to
1185
+ see if there is any useful information:
1511
1186
  <history>{self.history[-self.history_window :]}</history>
1512
1187
  """
1513
1188
 
1514
1189
  else:
1515
1190
  simulation_result = self._get_final_answer(task_prompt)
1516
1191
 
1192
+ self.browser.close() # Close browser after task completion or limit
1193
+ # reached
1517
1194
  return simulation_result
1518
1195
 
1519
1196
  def get_tools(self) -> List[FunctionTool]: