camel-ai 0.2.59__py3-none-any.whl → 0.2.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (55)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +158 -7
  3. camel/configs/anthropic_config.py +6 -5
  4. camel/configs/cohere_config.py +1 -1
  5. camel/configs/mistral_config.py +1 -1
  6. camel/configs/openai_config.py +3 -0
  7. camel/configs/reka_config.py +1 -1
  8. camel/configs/samba_config.py +2 -2
  9. camel/datagen/cot_datagen.py +29 -34
  10. camel/datagen/evol_instruct/scorer.py +22 -23
  11. camel/datagen/evol_instruct/templates.py +46 -46
  12. camel/datasets/static_dataset.py +144 -0
  13. camel/embeddings/jina_embedding.py +8 -1
  14. camel/embeddings/sentence_transformers_embeddings.py +2 -2
  15. camel/embeddings/vlm_embedding.py +9 -2
  16. camel/loaders/__init__.py +5 -2
  17. camel/loaders/chunkr_reader.py +117 -91
  18. camel/loaders/mistral_reader.py +148 -0
  19. camel/memories/blocks/chat_history_block.py +1 -2
  20. camel/memories/records.py +3 -0
  21. camel/messages/base.py +15 -3
  22. camel/models/azure_openai_model.py +1 -0
  23. camel/models/model_factory.py +2 -2
  24. camel/models/model_manager.py +7 -3
  25. camel/retrievers/bm25_retriever.py +1 -2
  26. camel/retrievers/hybrid_retrival.py +2 -2
  27. camel/societies/workforce/workforce.py +65 -24
  28. camel/storages/__init__.py +2 -0
  29. camel/storages/vectordb_storages/__init__.py +2 -0
  30. camel/storages/vectordb_storages/faiss.py +712 -0
  31. camel/storages/vectordb_storages/oceanbase.py +1 -2
  32. camel/toolkits/__init__.py +2 -0
  33. camel/toolkits/async_browser_toolkit.py +80 -524
  34. camel/toolkits/bohrium_toolkit.py +318 -0
  35. camel/toolkits/browser_toolkit.py +221 -541
  36. camel/toolkits/browser_toolkit_commons.py +568 -0
  37. camel/toolkits/dalle_toolkit.py +4 -0
  38. camel/toolkits/excel_toolkit.py +8 -2
  39. camel/toolkits/file_write_toolkit.py +76 -29
  40. camel/toolkits/github_toolkit.py +43 -25
  41. camel/toolkits/image_analysis_toolkit.py +3 -0
  42. camel/toolkits/jina_reranker_toolkit.py +194 -77
  43. camel/toolkits/mcp_toolkit.py +134 -16
  44. camel/toolkits/page_script.js +40 -28
  45. camel/toolkits/twitter_toolkit.py +6 -1
  46. camel/toolkits/video_analysis_toolkit.py +3 -0
  47. camel/toolkits/video_download_toolkit.py +3 -0
  48. camel/toolkits/wolfram_alpha_toolkit.py +51 -23
  49. camel/types/enums.py +27 -6
  50. camel/utils/__init__.py +2 -0
  51. camel/utils/commons.py +27 -0
  52. {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/METADATA +17 -9
  53. {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/RECORD +55 -51
  54. {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/WHEEL +0 -0
  55. {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/licenses/LICENSE +0 -0
@@ -12,11 +12,12 @@
12
12
  # limitations under the License.
13
13
  # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
14
 
15
+ # Enables postponed evaluation of annotations (for string-based type hints)
16
+ from __future__ import annotations
17
+
15
18
  import datetime
16
19
  import io
17
- import json
18
20
  import os
19
- import random
20
21
  import re
21
22
  import shutil
22
23
  import time
@@ -25,21 +26,17 @@ from copy import deepcopy
25
26
  from typing import (
26
27
  TYPE_CHECKING,
27
28
  Any,
28
- BinaryIO,
29
29
  Dict,
30
30
  List,
31
31
  Literal,
32
32
  Optional,
33
33
  Tuple,
34
- TypedDict,
35
34
  Union,
36
35
  cast,
37
36
  )
38
37
 
39
- from PIL import Image, ImageDraw, ImageFont
38
+ from PIL import Image
40
39
 
41
- if TYPE_CHECKING:
42
- from camel.agents import ChatAgent
43
40
  from camel.logger import get_logger
44
41
  from camel.messages import BaseMessage
45
42
  from camel.models import BaseModelBackend, ModelFactory
@@ -53,85 +50,39 @@ from camel.utils import (
53
50
  sanitize_filename,
54
51
  )
55
52
 
56
- logger = get_logger(__name__)
53
+ # Import shared components from browser_toolkit_commons
54
+ from .browser_toolkit_commons import (
55
+ ACTION_WITH_FEEDBACK_LIST,
56
+ AVAILABLE_ACTIONS_PROMPT,
57
+ GET_FINAL_ANSWER_PROMPT_TEMPLATE,
58
+ OBSERVE_PROMPT_TEMPLATE,
59
+ PLANNING_AGENT_SYSTEM_PROMPT,
60
+ TASK_PLANNING_PROMPT_TEMPLATE,
61
+ TASK_REPLANNING_PROMPT_TEMPLATE,
62
+ WEB_AGENT_SYSTEM_PROMPT,
63
+ InteractiveRegion,
64
+ VisualViewport,
65
+ _add_set_of_mark,
66
+ _parse_json_output,
67
+ _reload_image,
68
+ interactive_region_from_dict,
69
+ visual_viewport_from_dict,
70
+ )
57
71
 
58
- TOP_NO_LABEL_ZONE = 20
72
+ if TYPE_CHECKING:
73
+ from playwright.sync_api import (
74
+ Browser,
75
+ BrowserContext,
76
+ FloatRect,
77
+ Page,
78
+ Playwright,
79
+ )
59
80
 
81
+ from camel.agents import ChatAgent
82
+
83
+ logger = get_logger(__name__)
60
84
 
61
- AVAILABLE_ACTIONS_PROMPT = """
62
- 1. `fill_input_id(identifier: Union[str, int], text: str)`: Fill an input
63
- field (e.g. search box) with the given text and press Enter.
64
- 2. `click_id(identifier: Union[str, int])`: Click an element with the given ID.
65
- 3. `hover_id(identifier: Union[str, int])`: Hover over an element with the
66
- given ID.
67
- 4. `download_file_id(identifier: Union[str, int])`: Download a file with the
68
- given ID. It returns the path to the downloaded file. If the file is
69
- successfully downloaded, you can stop the simulation and report the path to
70
- the downloaded file for further processing.
71
- 5. `scroll_to_bottom()`: Scroll to the bottom of the page.
72
- 6. `scroll_to_top()`: Scroll to the top of the page.
73
- 7. `scroll_up()`: Scroll up the page. It is suitable when you want to see the
74
- elements above the current viewport.
75
- 8. `scroll_down()`: Scroll down the page. It is suitable when you want to see
76
- the elements below the current viewport. If the webpage does not change, It
77
- means that the webpage has scrolled to the bottom.
78
- 9. `back()`: Navigate back to the previous page. This is useful when you want
79
- to go back to the previous page, as current page is not useful.
80
- 10. `stop()`: Stop the action process, because the task is completed or failed
81
- (impossible to find the answer). In this situation, you should provide your
82
- answer in your output.
83
- 11. `get_url()`: Get the current URL of the current page.
84
- 12. `find_text_on_page(search_text: str)`: Find the next given text on the
85
- current whole page, and scroll the page to the targeted text. It is equivalent
86
- to pressing Ctrl + F and searching for the text, and is powerful when you want
87
- to fast-check whether the current page contains some specific text.
88
- 13. `visit_page(url: str)`: Go to the specific url page.
89
- 14. `click_blank_area()`: Click a blank area of the page to unfocus the
90
- current element. It is useful when you have clicked an element but it cannot
91
- unfocus itself (e.g. Menu bar) to automatically render the updated webpage.
92
- 15. `ask_question_about_video(question: str)`: Ask a question about the
93
- current webpage which contains video, e.g. youtube websites.
94
- """
95
-
96
- ACTION_WITH_FEEDBACK_LIST = [
97
- 'ask_question_about_video',
98
- 'download_file_id',
99
- 'find_text_on_page',
100
- ]
101
-
102
-
103
- # Code from magentic-one
104
- class DOMRectangle(TypedDict):
105
- x: Union[int, float]
106
- y: Union[int, float]
107
- width: Union[int, float]
108
- height: Union[int, float]
109
- top: Union[int, float]
110
- right: Union[int, float]
111
- bottom: Union[int, float]
112
- left: Union[int, float]
113
-
114
-
115
- class VisualViewport(TypedDict):
116
- height: Union[int, float]
117
- width: Union[int, float]
118
- offsetLeft: Union[int, float]
119
- offsetTop: Union[int, float]
120
- pageLeft: Union[int, float]
121
- pageTop: Union[int, float]
122
- scale: Union[int, float]
123
- clientWidth: Union[int, float]
124
- clientHeight: Union[int, float]
125
- scrollWidth: Union[int, float]
126
- scrollHeight: Union[int, float]
127
-
128
-
129
- class InteractiveRegion(TypedDict):
130
- tag_name: str
131
- role: str
132
- aria_name: str
133
- v_scrollable: bool
134
- rects: List[DOMRectangle]
85
+ TOP_NO_LABEL_ZONE = 20
135
86
 
136
87
 
137
88
  def _get_str(d: Any, k: str) -> str:
@@ -167,270 +118,6 @@ def _get_bool(d: Any, k: str) -> bool:
167
118
  )
168
119
 
169
120
 
170
- def _parse_json_output(text: str) -> Dict[str, Any]:
171
- r"""Extract JSON output from a string."""
172
-
173
- markdown_pattern = r'```(?:json)?\s*(.*?)\s*```'
174
- markdown_match = re.search(markdown_pattern, text, re.DOTALL)
175
- if markdown_match:
176
- text = markdown_match.group(1).strip()
177
-
178
- triple_quotes_pattern = r'"""(?:json)?\s*(.*?)\s*"""'
179
- triple_quotes_match = re.search(triple_quotes_pattern, text, re.DOTALL)
180
- if triple_quotes_match:
181
- text = triple_quotes_match.group(1).strip()
182
-
183
- try:
184
- return json.loads(text)
185
- except json.JSONDecodeError:
186
- try:
187
- fixed_text = re.sub(
188
- r'`([^`]*?)`(?=\s*[:,\[\]{}]|$)', r'"\1"', text
189
- )
190
- return json.loads(fixed_text)
191
- except json.JSONDecodeError:
192
- result = {}
193
- try:
194
- bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
195
- for match in re.finditer(bool_pattern, text, re.IGNORECASE):
196
- key, value = match.groups()
197
- result[key] = value.lower() == "true"
198
-
199
- str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
200
- for match in re.finditer(str_pattern, text):
201
- key, value = match.groups()
202
- result[key] = value
203
-
204
- num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
205
- for match in re.finditer(num_pattern, text):
206
- key, value = match.groups()
207
- try:
208
- result[key] = int(value)
209
- except ValueError:
210
- result[key] = float(value)
211
-
212
- empty_str_pattern = r'"(\w+)"\s*:\s*""'
213
- for match in re.finditer(empty_str_pattern, text):
214
- key = match.group(1)
215
- result[key] = ""
216
-
217
- if result:
218
- return result
219
-
220
- logger.warning(f"Failed to parse JSON output: {text}")
221
- return {}
222
- except Exception as e:
223
- logger.warning(f"Error while extracting fields from JSON: {e}")
224
- return {}
225
-
226
-
227
- def _reload_image(image: Image.Image) -> Image.Image:
228
- buffer = io.BytesIO()
229
- image.save(buffer, format="PNG")
230
- buffer.seek(0)
231
- return Image.open(buffer)
232
-
233
-
234
- def dom_rectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
235
- r"""Create a DOMRectangle object from a dictionary."""
236
- return DOMRectangle(
237
- x=_get_number(rect, "x"),
238
- y=_get_number(rect, "y"),
239
- width=_get_number(rect, "width"),
240
- height=_get_number(rect, "height"),
241
- top=_get_number(rect, "top"),
242
- right=_get_number(rect, "right"),
243
- bottom=_get_number(rect, "bottom"),
244
- left=_get_number(rect, "left"),
245
- )
246
-
247
-
248
- def interactive_region_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
249
- r"""Create an :class:`InteractiveRegion` object from a dictionary."""
250
- typed_rects: List[DOMRectangle] = []
251
- for rect in region["rects"]:
252
- typed_rects.append(dom_rectangle_from_dict(rect))
253
-
254
- return InteractiveRegion(
255
- tag_name=_get_str(region, "tag_name"),
256
- role=_get_str(region, "role"),
257
- aria_name=_get_str(region, "aria-name"),
258
- v_scrollable=_get_bool(region, "v-scrollable"),
259
- rects=typed_rects,
260
- )
261
-
262
-
263
- def visual_viewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
264
- r"""Create a :class:`VisualViewport` object from a dictionary."""
265
- return VisualViewport(
266
- height=_get_number(viewport, "height"),
267
- width=_get_number(viewport, "width"),
268
- offsetLeft=_get_number(viewport, "offsetLeft"),
269
- offsetTop=_get_number(viewport, "offsetTop"),
270
- pageLeft=_get_number(viewport, "pageLeft"),
271
- pageTop=_get_number(viewport, "pageTop"),
272
- scale=_get_number(viewport, "scale"),
273
- clientWidth=_get_number(viewport, "clientWidth"),
274
- clientHeight=_get_number(viewport, "clientHeight"),
275
- scrollWidth=_get_number(viewport, "scrollWidth"),
276
- scrollHeight=_get_number(viewport, "scrollHeight"),
277
- )
278
-
279
-
280
- def add_set_of_mark(
281
- screenshot: Union[bytes, Image.Image, io.BufferedIOBase],
282
- ROIs: Dict[str, InteractiveRegion],
283
- ) -> Tuple[Image.Image, List[str], List[str], List[str]]:
284
- if isinstance(screenshot, Image.Image):
285
- return _add_set_of_mark(screenshot, ROIs)
286
-
287
- if isinstance(screenshot, bytes):
288
- screenshot = io.BytesIO(screenshot)
289
-
290
- image = Image.open(cast(BinaryIO, screenshot))
291
- comp, visible_rects, rects_above, rects_below = _add_set_of_mark(
292
- image, ROIs
293
- )
294
- image.close()
295
- return comp, visible_rects, rects_above, rects_below
296
-
297
-
298
- def _add_set_of_mark(
299
- screenshot: Image.Image, ROIs: Dict[str, InteractiveRegion]
300
- ) -> Tuple[Image.Image, List[str], List[str], List[str]]:
301
- r"""Add a set of marks to the screenshot.
302
-
303
- Args:
304
- screenshot (Image.Image): The screenshot to add marks to.
305
- ROIs (Dict[str, InteractiveRegion]): The regions to add marks to.
306
-
307
- Returns:
308
- Tuple[Image.Image, List[str], List[str], List[str]]: A tuple
309
- containing the screenshot with marked ROIs, ROIs fully within the
310
- images, ROIs located above the visible area, and ROIs located below
311
- the visible area.
312
- """
313
- visible_rects: List[str] = list()
314
- rects_above: List[str] = list() # Scroll up to see
315
- rects_below: List[str] = list() # Scroll down to see
316
-
317
- fnt = ImageFont.load_default(14)
318
- base = screenshot.convert("L").convert("RGBA")
319
- overlay = Image.new("RGBA", base.size)
320
-
321
- draw = ImageDraw.Draw(overlay)
322
- for r in ROIs:
323
- for rect in ROIs[r]["rects"]:
324
- # Empty rectangles
325
- if not rect or rect["width"] == 0 or rect["height"] == 0:
326
- continue
327
-
328
- # TODO: add scroll left and right?
329
- horizontal_center = (rect["right"] + rect["left"]) / 2.0
330
- vertical_center = (rect["top"] + rect["bottom"]) / 2.0
331
- is_within_horizon = 0 <= horizontal_center < base.size[0]
332
- is_above_viewport = vertical_center < 0
333
- is_below_viewport = vertical_center >= base.size[1]
334
-
335
- if is_within_horizon:
336
- if is_above_viewport:
337
- rects_above.append(r)
338
- elif is_below_viewport:
339
- rects_below.append(r)
340
- else: # Fully visible
341
- visible_rects.append(r)
342
- _draw_roi(draw, int(r), fnt, rect)
343
-
344
- comp = Image.alpha_composite(base, overlay)
345
- overlay.close()
346
- return comp, visible_rects, rects_above, rects_below
347
-
348
-
349
- def _draw_roi(
350
- draw: ImageDraw.ImageDraw,
351
- idx: int,
352
- font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
353
- rect: DOMRectangle,
354
- ) -> None:
355
- r"""Draw a ROI on the image.
356
-
357
- Args:
358
- draw (ImageDraw.ImageDraw): The draw object.
359
- idx (int): The index of the ROI.
360
- font (ImageFont.FreeTypeFont | ImageFont.ImageFont): The font.
361
- rect (DOMRectangle): The DOM rectangle.
362
- """
363
- color = _get_random_color(idx)
364
- text_color = _get_text_color(color)
365
-
366
- roi = ((rect["left"], rect["top"]), (rect["right"], rect["bottom"]))
367
-
368
- label_location = (rect["right"], rect["top"])
369
- label_anchor = "rb"
370
-
371
- if label_location[1] <= TOP_NO_LABEL_ZONE:
372
- label_location = (rect["right"], rect["bottom"])
373
- label_anchor = "rt"
374
-
375
- draw.rectangle(
376
- roi, outline=color, fill=(color[0], color[1], color[2], 48), width=2
377
- )
378
-
379
- bbox = draw.textbbox(
380
- label_location,
381
- str(idx),
382
- font=font,
383
- anchor=label_anchor,
384
- align="center",
385
- )
386
- bbox = (bbox[0] - 3, bbox[1] - 3, bbox[2] + 3, bbox[3] + 3)
387
- draw.rectangle(bbox, fill=color)
388
-
389
- draw.text(
390
- label_location,
391
- str(idx),
392
- fill=text_color,
393
- font=font,
394
- anchor=label_anchor,
395
- align="center",
396
- )
397
-
398
-
399
- def _get_text_color(
400
- bg_color: Tuple[int, int, int, int],
401
- ) -> Tuple[int, int, int, int]:
402
- r"""Determine the ideal text color (black or white) for contrast.
403
-
404
- Args:
405
- bg_color: The background color (R, G, B, A).
406
-
407
- Returns:
408
- A tuple representing black or white color for text.
409
- """
410
- luminance = bg_color[0] * 0.3 + bg_color[1] * 0.59 + bg_color[2] * 0.11
411
- return (0, 0, 0, 255) if luminance > 120 else (255, 255, 255, 255)
412
-
413
-
414
- def _get_random_color(identifier: int) -> Tuple[int, int, int, int]:
415
- r"""Generate a consistent random RGBA color based on the identifier.
416
-
417
- Args:
418
- identifier: The ID used as a seed to ensure color consistency.
419
-
420
- Returns:
421
- A tuple representing (R, G, B, A) values.
422
- """
423
- rnd = random.Random(int(identifier))
424
- r = rnd.randint(0, 255)
425
- g = rnd.randint(125, 255)
426
- b = rnd.randint(0, 50)
427
- color = [r, g, b]
428
- # TODO: check why shuffle is needed?
429
- rnd.shuffle(color)
430
- color.append(255)
431
- return cast(Tuple[int, int, int, int], tuple(color))
432
-
433
-
434
121
  class BaseBrowser:
435
122
  def __init__(
436
123
  self,
@@ -449,7 +136,8 @@ class BaseBrowser:
449
136
  "chromium".
450
137
  cookie_json_path (Optional[str]): Path to a JSON file containing
451
138
  authentication cookies and browser storage state. If provided
452
- and the file exists, the browser will load this state to maintain
139
+ and the file exists, the browser will load this state to
140
+ maintain
453
141
  authenticated sessions without requiring manual login.
454
142
 
455
143
  Returns:
@@ -459,12 +147,14 @@ class BaseBrowser:
459
147
  sync_playwright,
460
148
  )
461
149
 
462
- self.history: list = []
150
+ self.history: List[Any] = []
463
151
  self.headless = headless
464
152
  self.channel = channel
465
153
  self._ensure_browser_installed()
466
- self.playwright = sync_playwright().start()
467
- self.page_history: list = [] # stores the history of visited pages
154
+ self.playwright: Playwright = sync_playwright().start()
155
+ self.page_history: List[
156
+ str
157
+ ] = [] # stores the history of visited pages
468
158
  self.cookie_json_path = cookie_json_path
469
159
 
470
160
  # Set the cache directory
@@ -483,10 +173,18 @@ class BaseBrowser:
483
173
  raise FileNotFoundError(
484
174
  f"Page script file not found at path: {page_script_path}"
485
175
  )
176
+ self.browser: Optional[Browser] = None
177
+ self.context: Optional[BrowserContext] = None
178
+ self.page: Optional[Page] = None
179
+ self.page_url: Optional[str] = None
180
+ self.web_agent_model: Optional[BaseModelBackend] = (
181
+ None # Added for type hinting
182
+ )
486
183
 
487
184
  def init(self) -> None:
488
185
  r"""Initialize the browser."""
489
186
  # Launch the browser, if headless is False, the browser will display
187
+ assert self.playwright is not None
490
188
  self.browser = self.playwright.chromium.launch(
491
189
  headless=self.headless, channel=self.channel
492
190
  )
@@ -494,6 +192,7 @@ class BaseBrowser:
494
192
  # Check if cookie file exists before using it to maintain
495
193
  # authenticated sessions. This prevents errors when the cookie file
496
194
  # doesn't exist
195
+ assert self.browser is not None
497
196
  if self.cookie_json_path and os.path.exists(self.cookie_json_path):
498
197
  self.context = self.browser.new_context(
499
198
  accept_downloads=True, storage_state=self.cookie_json_path
@@ -503,6 +202,7 @@ class BaseBrowser:
503
202
  accept_downloads=True,
504
203
  )
505
204
  # Create a new page
205
+ assert self.context is not None
506
206
  self.page = self.context.new_page()
507
207
 
508
208
  def clean_cache(self) -> None:
@@ -513,7 +213,7 @@ class BaseBrowser:
513
213
  def _wait_for_load(self, timeout: int = 20) -> None:
514
214
  r"""Wait for a certain amount of time for the page to load."""
515
215
  timeout_ms = timeout * 1000
516
-
216
+ assert self.page is not None
517
217
  self.page.wait_for_load_state("load", timeout=timeout_ms)
518
218
 
519
219
  # TODO: check if this is needed
@@ -521,13 +221,14 @@ class BaseBrowser:
521
221
 
522
222
  def click_blank_area(self) -> None:
523
223
  r"""Click a blank area of the page to unfocus the current element."""
224
+ assert self.page is not None
524
225
  self.page.mouse.click(0, 0)
525
226
  self._wait_for_load()
526
227
 
527
228
  @retry_on_error()
528
229
  def visit_page(self, url: str) -> None:
529
230
  r"""Visit a page with the given URL."""
530
-
231
+ assert self.page is not None
531
232
  self.page.goto(url)
532
233
  self._wait_for_load()
533
234
  self.page_url = url
@@ -544,7 +245,8 @@ class BaseBrowser:
544
245
  """
545
246
  current_url = self.get_url()
546
247
 
547
- # Confirm with user before proceeding due to potential slow processing time
248
+ # Confirm with user before proceeding due to potential slow
249
+ # processing time
548
250
  confirmation_message = (
549
251
  f"Do you want to analyze the video on the current "
550
252
  f"page({current_url})? This operation may take a long time.(y/n): "
@@ -555,7 +257,10 @@ class BaseBrowser:
555
257
  return "User cancelled the video analysis."
556
258
 
557
259
  model = None
558
- if hasattr(self, 'web_agent_model'):
260
+ if (
261
+ hasattr(self, 'web_agent_model')
262
+ and self.web_agent_model is not None
263
+ ):
559
264
  model = self.web_agent_model
560
265
 
561
266
  video_analyzer = VideoAnalysisToolkit(model=model)
@@ -577,7 +282,7 @@ class BaseBrowser:
577
282
  image and the path to the image file if saved, otherwise
578
283
  :obj:`None`.
579
284
  """
580
-
285
+ assert self.page is not None
581
286
  image_data = self.page.screenshot(timeout=60000)
582
287
  image = Image.open(io.BytesIO(image_data))
583
288
 
@@ -585,6 +290,7 @@ class BaseBrowser:
585
290
  if save_image:
586
291
  # Get url name to form a file name
587
292
  # Use urlparser for a safer extraction the url name
293
+ assert self.page_url is not None
588
294
  parsed_url = urllib.parse.urlparse(self.page_url)
589
295
  # Max length is set to 241 as there are 10 characters for the
590
296
  # timestamp and 4 characters for the file extension:
@@ -612,17 +318,24 @@ class BaseBrowser:
612
318
  Returns:
613
319
  List[str]: A list of paths to the screenshot files.
614
320
  """
615
- screenshots = []
616
- scroll_height = self.page.evaluate("document.body.scrollHeight")
321
+ screenshots: List[str] = [] # Ensure screenshots is typed
322
+ assert self.page is not None
323
+ scroll_height_eval = self.page.evaluate("document.body.scrollHeight")
324
+ scroll_height = cast(
325
+ float, scroll_height_eval
326
+ ) # Ensure scroll_height is
327
+ # float
328
+
617
329
  assert self.page.viewport_size is not None
618
330
  viewport_height = self.page.viewport_size["height"]
619
- current_scroll = 0
620
- screenshot_index = 1
331
+ current_scroll_eval = self.page.evaluate("window.scrollY")
332
+ current_scroll = cast(float, current_scroll_eval)
333
+ # screenshot_index = 1 # This variable is not used
621
334
 
622
335
  max_height = scroll_height - viewport_height
623
336
  scroll_step = int(viewport_height * scroll_ratio)
624
337
 
625
- last_height = 0
338
+ last_height = 0.0 # Initialize last_height as float
626
339
 
627
340
  while True:
628
341
  logger.debug(
@@ -631,19 +344,22 @@ class BaseBrowser:
631
344
  )
632
345
 
633
346
  _, file_path = self.get_screenshot(save_image=True)
634
- screenshots.append(file_path)
347
+ if file_path is not None: # Ensure file_path is not None before
348
+ # appending
349
+ screenshots.append(file_path)
635
350
 
636
351
  self.page.evaluate(f"window.scrollBy(0, {scroll_step})")
637
352
  # Allow time for content to load
638
353
  time.sleep(0.5)
639
354
 
640
- current_scroll = self.page.evaluate("window.scrollY")
355
+ current_scroll_eval = self.page.evaluate("window.scrollY")
356
+ current_scroll = cast(float, current_scroll_eval)
641
357
  # Break if there is no significant scroll
642
358
  if abs(current_scroll - last_height) < viewport_height * 0.1:
643
359
  break
644
360
 
645
361
  last_height = current_scroll
646
- screenshot_index += 1
362
+ # screenshot_index += 1 # This variable is not used
647
363
 
648
364
  return screenshots
649
365
 
@@ -653,13 +369,17 @@ class BaseBrowser:
653
369
  Returns:
654
370
  VisualViewport: The visual viewport of the current page.
655
371
  """
372
+ assert self.page is not None
656
373
  try:
657
374
  self.page.evaluate(self.page_script)
658
375
  except Exception as e:
659
376
  logger.warning(f"Error evaluating page script: {e}")
660
377
 
378
+ visual_viewport_eval = self.page.evaluate(
379
+ "MultimodalWebSurfer.getVisualViewport();"
380
+ )
661
381
  return visual_viewport_from_dict(
662
- self.page.evaluate("MultimodalWebSurfer.getVisualViewport();")
382
+ cast(Dict[str, Any], visual_viewport_eval)
663
383
  )
664
384
 
665
385
  def get_interactive_elements(self) -> Dict[str, InteractiveRegion]:
@@ -668,6 +388,7 @@ class BaseBrowser:
668
388
  Returns:
669
389
  Dict[str, InteractiveRegion]: A dictionary of interactive elements.
670
390
  """
391
+ assert self.page is not None
671
392
  try:
672
393
  self.page.evaluate(self.page_script)
673
394
  except Exception as e:
@@ -682,7 +403,7 @@ class BaseBrowser:
682
403
  for k in result:
683
404
  typed_results[k] = interactive_region_from_dict(result[k])
684
405
 
685
- return typed_results # type: ignore[return-value]
406
+ return typed_results
686
407
 
687
408
  def get_som_screenshot(
688
409
  self,
@@ -696,7 +417,8 @@ class BaseBrowser:
696
417
  directory.
697
418
 
698
419
  Returns:
699
- Tuple[Image.Image, Union[str, None]]: A tuple containing the screenshot image
420
+ Tuple[Image.Image, Union[str, None]]: A tuple containing the
421
+ screenshot image
700
422
  and an optional path to the image file if saved, otherwise
701
423
  :obj:`None`.
702
424
  """
@@ -706,11 +428,12 @@ class BaseBrowser:
706
428
  rects = self.get_interactive_elements()
707
429
 
708
430
  file_path: str | None = None
709
- comp, _, _, _ = add_set_of_mark(
431
+ comp, _, _, _ = _add_set_of_mark(
710
432
  screenshot,
711
- rects, # type: ignore[arg-type]
433
+ rects,
712
434
  )
713
435
  if save_image:
436
+ assert self.page_url is not None
714
437
  parsed_url = urllib.parse.urlparse(self.page_url)
715
438
  # Max length is set to 241 as there are 10 characters for the
716
439
  # timestamp and 4 characters for the file extension:
@@ -727,25 +450,30 @@ class BaseBrowser:
727
450
 
728
451
  def scroll_up(self) -> None:
729
452
  r"""Scroll up the page."""
453
+ assert self.page is not None
730
454
  self.page.keyboard.press("PageUp")
731
455
 
732
456
  def scroll_down(self) -> None:
733
457
  r"""Scroll down the page."""
458
+ assert self.page is not None
734
459
  self.page.keyboard.press("PageDown")
735
460
 
736
461
  def get_url(self) -> str:
737
462
  r"""Get the URL of the current page."""
463
+ assert self.page is not None
738
464
  return self.page.url
739
465
 
740
466
  def click_id(self, identifier: Union[str, int]) -> None:
741
467
  r"""Click an element with the given identifier."""
468
+ assert self.page is not None
742
469
  if isinstance(identifier, int):
743
470
  identifier = str(identifier)
744
471
  target = self.page.locator(f"[__elementId='{identifier}']")
745
472
 
746
473
  try:
747
474
  target.wait_for(timeout=5000)
748
- except (TimeoutError, Exception) as e:
475
+ except Exception as e: # Consider using playwright specific
476
+ # TimeoutError
749
477
  logger.debug(f"Error during click operation: {e}")
750
478
  raise ValueError("No such element.") from None
751
479
 
@@ -754,7 +482,13 @@ class BaseBrowser:
754
482
  new_page = None
755
483
  try:
756
484
  with self.page.expect_event("popup", timeout=1000) as page_info:
757
- box = cast(Dict[str, Union[int, float]], target.bounding_box())
485
+ box: Optional[FloatRect] = target.bounding_box()
486
+ if box is None:
487
+ logger.warning(
488
+ f"Bounding box not found for element '{identifier}'. "
489
+ f"Cannot click."
490
+ )
491
+ return
758
492
  self.page.mouse.click(
759
493
  box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
760
494
  )
@@ -765,7 +499,8 @@ class BaseBrowser:
765
499
  self.page_history.append(deepcopy(self.page.url))
766
500
  self.page = new_page
767
501
 
768
- except (TimeoutError, Exception) as e:
502
+ except Exception as e: # Consider using playwright specific
503
+ # TimeoutError
769
504
  logger.debug(f"Error during click operation: {e}")
770
505
  pass
771
506
 
@@ -773,6 +508,7 @@ class BaseBrowser:
773
508
 
774
509
  def extract_url_content(self) -> str:
775
510
  r"""Extract the content of the current page."""
511
+ assert self.page is not None
776
512
  content = self.page.content()
777
513
  return content
778
514
 
@@ -781,17 +517,17 @@ class BaseBrowser:
781
517
 
782
518
  Args:
783
519
  identifier (str): The identifier of the file to download.
784
- file_path (str): The path to save the downloaded file.
785
520
 
786
521
  Returns:
787
522
  str: The result of the action.
788
523
  """
789
-
524
+ assert self.page is not None
790
525
  if isinstance(identifier, int):
791
526
  identifier = str(identifier)
792
527
  try:
793
528
  target = self.page.locator(f"[__elementId='{identifier}']")
794
- except (TimeoutError, Exception) as e:
529
+ except Exception as e: # Consider using playwright specific
530
+ # TimeoutError
795
531
  logger.debug(f"Error during download operation: {e}")
796
532
  logger.warning(
797
533
  f"Element with identifier '{identifier}' not found."
@@ -800,7 +536,7 @@ class BaseBrowser:
800
536
 
801
537
  target.scroll_into_view_if_needed()
802
538
 
803
- file_path = os.path.join(self.cache_dir)
539
+ file_path_val = os.path.join(self.cache_dir)
804
540
  self._wait_for_load()
805
541
 
806
542
  try:
@@ -809,12 +545,13 @@ class BaseBrowser:
809
545
  download = download_info.value
810
546
  file_name = download.suggested_filename
811
547
 
812
- file_path = os.path.join(file_path, file_name)
813
- download.save_as(file_path)
548
+ file_path_val = os.path.join(file_path_val, file_name)
549
+ download.save_as(file_path_val)
814
550
 
815
- return f"Downloaded file to path '{file_path}'."
551
+ return f"Downloaded file to path '{file_path_val}'."
816
552
 
817
- except (TimeoutError, Exception) as e:
553
+ except Exception as e: # Consider using playwright specific
554
+ # TimeoutError
818
555
  logger.debug(f"Error during download operation: {e}")
819
556
  return f"Failed to download file with identifier '{identifier}'."
820
557
 
@@ -828,12 +565,14 @@ class BaseBrowser:
828
565
  Returns:
829
566
  str: The result of the action.
830
567
  """
568
+ assert self.page is not None
831
569
  if isinstance(identifier, int):
832
570
  identifier = str(identifier)
833
571
 
834
572
  try:
835
573
  target = self.page.locator(f"[__elementId='{identifier}']")
836
- except (TimeoutError, Exception) as e:
574
+ except Exception as e: # Consider using playwright specific
575
+ # TimeoutError
837
576
  logger.debug(f"Error during fill operation: {e}")
838
577
  logger.warning(
839
578
  f"Element with identifier '{identifier}' not found."
@@ -844,7 +583,8 @@ class BaseBrowser:
844
583
  target.focus()
845
584
  try:
846
585
  target.fill(text)
847
- except (TimeoutError, Exception) as e:
586
+ except Exception as e: # Consider using playwright specific
587
+ # TimeoutError
848
588
  logger.debug(f"Error during fill operation: {e}")
849
589
  target.press_sequentially(text)
850
590
 
@@ -856,11 +596,13 @@ class BaseBrowser:
856
596
  )
857
597
 
858
598
  def scroll_to_bottom(self) -> str:
599
+ assert self.page is not None
859
600
  self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
860
601
  self._wait_for_load()
861
602
  return "Scrolled to the bottom of the page."
862
603
 
863
604
  def scroll_to_top(self) -> str:
605
+ assert self.page is not None
864
606
  self.page.evaluate("window.scrollTo(0, 0);")
865
607
  self._wait_for_load()
866
608
  return "Scrolled to the top of the page."
@@ -874,11 +616,13 @@ class BaseBrowser:
874
616
  Returns:
875
617
  str: The result of the action.
876
618
  """
619
+ assert self.page is not None
877
620
  if isinstance(identifier, int):
878
621
  identifier = str(identifier)
879
622
  try:
880
623
  target = self.page.locator(f"[__elementId='{identifier}']")
881
- except (TimeoutError, Exception) as e:
624
+ except Exception as e: # Consider using playwright specific
625
+ # TimeoutError
882
626
  logger.debug(f"Error during hover operation: {e}")
883
627
  logger.warning(
884
628
  f"Element with identifier '{identifier}' not found."
@@ -896,15 +640,18 @@ class BaseBrowser:
896
640
  the text.
897
641
  """
898
642
  # ruff: noqa: E501
643
+ assert self.page is not None
899
644
  script = f"""
900
- (function() {{
645
+ (function() {{
901
646
  let text = "{search_text}";
902
647
  let found = window.find(text);
903
648
  if (!found) {{
904
- let elements = document.querySelectorAll("*:not(script):not(style)");
649
+ let elements = document.querySelectorAll("*:not(script):not(
650
+ style)");
905
651
  for (let el of elements) {{
906
652
  if (el.innerText && el.innerText.includes(text)) {{
907
- el.scrollIntoView({{behavior: "smooth", block: "center"}});
653
+ el.scrollIntoView({{behavior: "smooth", block:
654
+ "center"}});
908
655
  el.style.backgroundColor = "yellow";
909
656
  el.style.border = '2px solid red';
910
657
  return true;
@@ -915,7 +662,8 @@ class BaseBrowser:
915
662
  return true;
916
663
  }})();
917
664
  """
918
- found = self.page.evaluate(script)
665
+ found_eval = self.page.evaluate(script)
666
+ found = cast(bool, found_eval) # Ensure found is bool
919
667
  self._wait_for_load()
920
668
  if found:
921
669
  return f"Found text '{search_text}' on the page."
@@ -924,7 +672,7 @@ class BaseBrowser:
924
672
 
925
673
  def back(self):
926
674
  r"""Navigate back to the previous page."""
927
-
675
+ assert self.page is not None
928
676
  page_url_before = self.page.url
929
677
  self.page.go_back()
930
678
 
@@ -942,15 +690,21 @@ class BaseBrowser:
942
690
  self._wait_for_load()
943
691
 
944
692
  def close(self):
693
+ assert self.browser is not None
945
694
  self.browser.close()
695
+ if self.playwright:
696
+ self.playwright.stop() # Stop playwright instance
946
697
 
947
698
  # ruff: noqa: E501
948
699
  def show_interactive_elements(self):
949
700
  r"""Show simple interactive elements on the current page."""
701
+ assert self.page is not None
950
702
  self.page.evaluate(self.page_script)
951
703
  self.page.evaluate("""
952
704
  () => {
953
- document.querySelectorAll('a, button, input, select, textarea, [tabindex]:not([tabindex="-1"]), [contenteditable="true"]').forEach(el => {
705
+ document.querySelectorAll('a, button, input, select, textarea,
706
+ [tabindex]:not([tabindex="-1"]),
707
+ [contenteditable="true"]').forEach(el => {
954
708
  el.style.border = '2px solid red';
955
709
  });
956
710
  }
@@ -960,6 +714,7 @@ class BaseBrowser:
960
714
  def get_webpage_content(self) -> str:
961
715
  from html2text import html2text
962
716
 
717
+ assert self.page is not None
963
718
  self._wait_for_load()
964
719
  html_content = self.page.content()
965
720
 
@@ -1045,25 +800,32 @@ class BrowserToolkit(BaseToolkit):
1045
800
  (default: :obj:`"en`")
1046
801
  cookie_json_path (Optional[str]): Path to a JSON file containing
1047
802
  authentication cookies and browser storage state. If provided
1048
- and the file exists, the browser will load this state to maintain
803
+ and the file exists, the browser will load this state to
804
+ maintain
1049
805
  authenticated sessions without requiring manual login.
1050
806
  (default: :obj:`None`)
1051
807
  """
1052
-
808
+ super().__init__() # Call to super().__init__() added
1053
809
  self.browser = BaseBrowser(
1054
810
  headless=headless,
1055
811
  cache_dir=cache_dir,
1056
812
  channel=channel,
1057
813
  cookie_json_path=cookie_json_path,
1058
814
  )
815
+ self.browser.web_agent_model = web_agent_model # Pass model to
816
+ # BaseBrowser instance
1059
817
 
1060
818
  self.history_window = history_window
1061
819
  self.web_agent_model = web_agent_model
1062
820
  self.planning_agent_model = planning_agent_model
1063
821
  self.output_language = output_language
1064
822
 
1065
- self.history: list = []
1066
- self.web_agent, self.planning_agent = self._initialize_agent()
823
+ self.history: List[Dict[str, Any]] = [] # Typed history list
824
+ self.web_agent: ChatAgent
825
+ self.planning_agent: ChatAgent
826
+ self.web_agent, self.planning_agent = self._initialize_agent(
827
+ web_agent_model, planning_agent_model
828
+ )
1067
829
 
1068
830
  def _reset(self):
1069
831
  self.web_agent.reset()
@@ -1071,43 +833,40 @@ class BrowserToolkit(BaseToolkit):
1071
833
  self.history = []
1072
834
  os.makedirs(self.browser.cache_dir, exist_ok=True)
1073
835
 
1074
- def _initialize_agent(self) -> Tuple["ChatAgent", "ChatAgent"]:
836
+ def _initialize_agent(
837
+ self,
838
+ web_agent_model_backend: Optional[BaseModelBackend],
839
+ planning_agent_model_backend: Optional[BaseModelBackend],
840
+ ) -> Tuple[ChatAgent, ChatAgent]:
1075
841
  r"""Initialize the agent."""
1076
842
  from camel.agents import ChatAgent
1077
843
 
1078
- if self.web_agent_model is None:
1079
- web_agent_model = ModelFactory.create(
844
+ if web_agent_model_backend is None:
845
+ web_agent_model_instance = ModelFactory.create(
1080
846
  model_platform=ModelPlatformType.OPENAI,
1081
847
  model_type=ModelType.GPT_4_1,
1082
848
  model_config_dict={"temperature": 0, "top_p": 1},
1083
849
  )
1084
850
  else:
1085
- web_agent_model = self.web_agent_model
851
+ web_agent_model_instance = web_agent_model_backend
1086
852
 
1087
- if self.planning_agent_model is None:
853
+ if planning_agent_model_backend is None:
1088
854
  planning_model = ModelFactory.create(
1089
855
  model_platform=ModelPlatformType.OPENAI,
1090
856
  model_type=ModelType.O3_MINI,
1091
857
  )
1092
858
  else:
1093
- planning_model = self.planning_agent_model
859
+ planning_model = planning_agent_model_backend
1094
860
 
1095
- system_prompt = """
1096
- You are a helpful web agent that can assist users in browsing the web.
1097
- Given a high-level task, you can leverage predefined browser tools to help
1098
- users achieve their goals.
1099
- """
861
+ system_prompt = WEB_AGENT_SYSTEM_PROMPT
1100
862
 
1101
863
  web_agent = ChatAgent(
1102
864
  system_message=system_prompt,
1103
- model=web_agent_model,
865
+ model=web_agent_model_instance,
1104
866
  output_language=self.output_language,
1105
867
  )
1106
868
 
1107
- planning_system_prompt = """
1108
- You are a helpful planning agent that can assist users in planning complex
1109
- tasks which need multi-step browser interaction.
1110
- """
869
+ planning_system_prompt = PLANNING_AGENT_SYSTEM_PROMPT
1111
870
 
1112
871
  planning_agent = ChatAgent(
1113
872
  system_message=planning_system_prompt,
@@ -1120,96 +879,24 @@ tasks which need multi-step browser interaction.
1120
879
  def _observe(
1121
880
  self, task_prompt: str, detailed_plan: Optional[str] = None
1122
881
  ) -> Tuple[str, str, str]:
1123
- r"""Let agent observe the current environment, and get the next action."""
882
+ r"""Let agent observe the current environment, and get the next
883
+ action."""
1124
884
 
1125
- detailed_plan_prompt = ""
885
+ detailed_plan_prompt_str = ""
1126
886
 
1127
887
  if detailed_plan is not None:
1128
- detailed_plan_prompt = f"""
888
+ detailed_plan_prompt_str = f"""
1129
889
  Here is a plan about how to solve the task step-by-step which you must follow:
1130
890
  <detailed_plan>{detailed_plan}<detailed_plan>
1131
891
  """
1132
892
 
1133
- observe_prompt = f"""
1134
- Please act as a web agent to help me complete the following high-level task:
1135
- <task>{task_prompt}</task>
1136
- Now, I have made screenshot (only the current viewport, not the full webpage)
1137
- based on the current browser state, and marked interactive elements in the
1138
- webpage.
1139
- Please carefully examine the requirements of the task, and current state of
1140
- the browser, and provide the next appropriate action to take.
1141
-
1142
- {detailed_plan_prompt}
1143
-
1144
- Here are the current available browser functions you can use:
1145
- {AVAILABLE_ACTIONS_PROMPT}
1146
-
1147
- Here are the latest {self.history_window} trajectory (at most) you have taken:
1148
- <history>
1149
- {self.history[-self.history_window :]}
1150
- </history>
1151
-
1152
- Your output should be in json format, including the following fields:
1153
- - `observation`: The detailed image description about the current viewport. Do
1154
- not over-confident about the correctness of the history actions. You should
1155
- always check the current viewport to make sure the correctness of the next
1156
- action.
1157
- - `reasoning`: The reasoning about the next action you want to take, and the
1158
- possible obstacles you may encounter, and how to solve them. Do not forget to
1159
- check the history actions to avoid the same mistakes.
1160
- - `action_code`: The action code you want to take. It is only one step action
1161
- code, without any other texts (such as annotation)
1162
-
1163
- Here is two example of the output:
1164
- ```json
1165
- {{
1166
- "observation": [IMAGE_DESCRIPTION],
1167
- "reasoning": [YOUR_REASONING],
1168
- "action_code": "fill_input_id([ID], [TEXT])"
1169
- }}
1170
-
1171
- {{
1172
- "observation": "The current page is a CAPTCHA verification page on Amazon. It asks the user to ..",
1173
- "reasoning": "To proceed with the task of searching for products, I need to complete..",
1174
- "action_code": "fill_input_id(3, 'AUXPMR')"
1175
- }}
1176
-
1177
- Here are some tips for you:
1178
- - Never forget the overall question: **{task_prompt}**
1179
- - Maybe after a certain operation (e.g. click_id), the page content has not
1180
- changed. You can check whether the action step is successful by looking at the
1181
- `success` of the action step in the history. If successful, it means that the
1182
- page content is indeed the same after the click. You need to try other methods.
1183
- - If using one way to solve the problem is not successful, try other ways.
1184
- Make sure your provided ID is correct!
1185
- - Some cases are very complex and need to be achieve by an iterative process.
1186
- You can use the `back()` function to go back to the previous page to try other
1187
- methods.
1188
- - There are many links on the page, which may be useful for solving the
1189
- problem. You can use the `click_id()` function to click on the link to see if
1190
- it is useful.
1191
- - Always keep in mind that your action must be based on the ID shown in the
1192
- current image or viewport, not the ID shown in the history.
1193
- - Do not use `stop()` lightly. Always remind yourself that the image only
1194
- shows a part of the full page. If you cannot find the answer, try to use
1195
- functions like `scroll_up()` and `scroll_down()` to check the full content of
1196
- the webpage before doing anything else, because the answer or next key step
1197
- may be hidden in the content below.
1198
- - If the webpage needs human verification, you must avoid processing it.
1199
- Please use `back()` to go back to the previous page, and try other ways.
1200
- - If you have tried everything and still cannot resolve the issue, please stop
1201
- the simulation, and report issues you have encountered.
1202
- - Check the history actions carefully, detect whether you have repeatedly made
1203
- the same actions or not.
1204
- - When dealing with wikipedia revision history related tasks, you need to
1205
- think about the solution flexibly. First, adjust the browsing history
1206
- displayed on a single page to the maximum, and then make use of the
1207
- find_text_on_page function. This is extremely useful which can quickly locate
1208
- the text you want to find and skip massive amount of useless information.
1209
- - Flexibly use interactive elements like slide down selection bar to filter
1210
- out the information you need. Sometimes they are extremely useful.
1211
- ```
1212
- """
893
+ observe_prompt = OBSERVE_PROMPT_TEMPLATE.format(
894
+ task_prompt=task_prompt,
895
+ detailed_plan_prompt=detailed_plan_prompt_str,
896
+ AVAILABLE_ACTIONS_PROMPT=AVAILABLE_ACTIONS_PROMPT,
897
+ history_window=self.history_window,
898
+ history=self.history[-self.history_window :],
899
+ )
1213
900
 
1214
901
  # get current state
1215
902
  som_screenshot, _ = self.browser.get_som_screenshot(save_image=True)
@@ -1223,7 +910,8 @@ out the information you need. Sometimes they are extremely useful.
1223
910
 
1224
911
  resp_content = resp.msgs[0].content
1225
912
 
1226
- resp_dict = _parse_json_output(resp_content)
913
+ resp_dict = _parse_json_output(resp_content, logger) # Pass logger to
914
+ # _parse_json_output
1227
915
  observation_result: str = resp_dict.get("observation", "")
1228
916
  reasoning_result: str = resp_dict.get("reasoning", "")
1229
917
  action_code: str = resp_dict.get("action_code", "")
@@ -1244,7 +932,10 @@ out the information you need. Sometimes they are extremely useful.
1244
932
  id_part = (
1245
933
  parts[0].replace("fill_input_id(", "").strip()
1246
934
  )
1247
- action_code = f"fill_input_id({id_part}, 'Please fill the text here.')"
935
+ action_code = (
936
+ f"fill_input_id({id_part}, 'Please "
937
+ f"fill the text here.')"
938
+ )
1248
939
 
1249
940
  action_code = action_code.replace("`", "").strip()
1250
941
 
@@ -1346,43 +1037,36 @@ out the information you need. Sometimes they are extremely useful.
1346
1037
  )
1347
1038
 
1348
1039
  def _get_final_answer(self, task_prompt: str) -> str:
1349
- r"""Get the final answer based on the task prompt and current browser state.
1350
- It is used when the agent thinks that the task can be completed without any further action, and answer can be directly found in the current viewport.
1040
+ r"""Get the final answer based on the task prompt and current
1041
+ browser state.
1042
+ It is used when the agent thinks that the task can be completed
1043
+ without any further action, and answer can be directly found in the
1044
+ current viewport.
1351
1045
  """
1352
1046
 
1353
- prompt = f"""
1354
- We are solving a complex web task which needs multi-step browser interaction. After the multi-step observation, reasoning and acting with web browser, we think that the task is currently solved.
1355
- Here are all trajectory we have taken:
1356
- <history>{self.history}</history>
1357
- Please find the final answer, or give valuable insights and founds (e.g. if previous actions contain downloading files, your output should include the path of the downloaded file) about the overall task: <task>{task_prompt}</task>
1358
- """
1047
+ prompt = GET_FINAL_ANSWER_PROMPT_TEMPLATE.format(
1048
+ history=self.history, task_prompt=task_prompt
1049
+ )
1359
1050
 
1360
1051
  message = BaseMessage.make_user_message(
1361
1052
  role_name='user',
1362
1053
  content=prompt,
1363
1054
  )
1364
-
1055
+ self.web_agent.reset() # Reset before step
1365
1056
  resp = self.web_agent.step(message)
1366
1057
  return resp.msgs[0].content
1367
1058
 
1368
1059
  def _task_planning(self, task_prompt: str, start_url: str) -> str:
1369
1060
  r"""Plan the task based on the given task prompt."""
1370
1061
 
1371
- # Here are the available browser functions we can use: {AVAILABLE_ACTIONS_PROMPT}
1372
-
1373
- planning_prompt = f"""
1374
- <task>{task_prompt}</task>
1375
- According to the problem above, if we use browser interaction, what is the general process of the interaction after visiting the webpage `{start_url}`?
1376
-
1377
- Please note that it can be viewed as Partially Observable MDP. Do not over-confident about your plan.
1378
- Please first restate the task in detail, and then provide a detailed plan to solve the task.
1379
- """
1380
- # Here are some tips for you: Please note that we can only see a part of the full page because of the limited viewport after an action. Thus, do not forget to use methods like `scroll_up()` and `scroll_down()` to check the full content of the webpage, because the answer or next key step may be hidden in the content below.
1062
+ planning_prompt = TASK_PLANNING_PROMPT_TEMPLATE.format(
1063
+ task_prompt=task_prompt, start_url=start_url
1064
+ )
1381
1065
 
1382
1066
  message = BaseMessage.make_user_message(
1383
1067
  role_name='user', content=planning_prompt
1384
1068
  )
1385
-
1069
+ self.planning_agent.reset() # Reset before step
1386
1070
  resp = self.planning_agent.step(message)
1387
1071
  return resp.msgs[0].content
1388
1072
 
@@ -1396,35 +1080,26 @@ Please first restate the task in detail, and then provide a detailed plan to sol
1396
1080
  detailed_plan (str): The detailed plan to replan.
1397
1081
 
1398
1082
  Returns:
1399
- Tuple[bool, str]: A tuple containing a boolean indicating whether the task needs to be replanned, and the replanned schema.
1083
+ Tuple[bool, str]: A tuple containing a boolean indicating
1084
+ whether the task needs to be replanned, and the replanned schema.
1400
1085
  """
1401
1086
 
1402
- # Here are the available browser functions we can use: {AVAILABLE_ACTIONS_PROMPT}
1403
- replanning_prompt = f"""
1404
- We are using browser interaction to solve a complex task which needs multi-step actions.
1405
- Here are the overall task:
1406
- <overall_task>{task_prompt}</overall_task>
1407
-
1408
- In order to solve the task, we made a detailed plan previously. Here is the detailed plan:
1409
- <detailed plan>{detailed_plan}</detailed plan>
1410
-
1411
- According to the task above, we have made a series of observations, reasonings, and actions. Here are the latest {self.history_window} trajectory (at most) we have taken:
1412
- <history>{self.history[-self.history_window :]}</history>
1413
-
1414
- However, the task is not completed yet. As the task is partially observable, we may need to replan the task based on the current state of the browser if necessary.
1415
- Now please carefully examine the current task planning schema, and our history actions, and then judge whether the task needs to be fundamentally replanned. If so, please provide a detailed replanned schema (including the restated overall task).
1416
-
1417
- Your output should be in json format, including the following fields:
1418
- - `if_need_replan`: bool, A boolean value indicating whether the task needs to be fundamentally replanned.
1419
- - `replanned_schema`: str, The replanned schema for the task, which should not be changed too much compared with the original one. If the task does not need to be replanned, the value should be an empty string.
1420
- """
1087
+ replanning_prompt = TASK_REPLANNING_PROMPT_TEMPLATE.format(
1088
+ task_prompt=task_prompt,
1089
+ detailed_plan=detailed_plan,
1090
+ history_window=self.history_window,
1091
+ history=self.history[-self.history_window :],
1092
+ )
1421
1093
  # Reset the history message of planning_agent.
1422
1094
  self.planning_agent.reset()
1423
1095
  resp = self.planning_agent.step(replanning_prompt)
1424
- resp_dict = _parse_json_output(resp.msgs[0].content)
1096
+ resp_dict = _parse_json_output(
1097
+ resp.msgs[0].content, logger
1098
+ ) # Pass logger
1425
1099
 
1426
- if_need_replan = resp_dict.get("if_need_replan", False)
1427
- replanned_schema = resp_dict.get("replanned_schema", "")
1100
+ if_need_replan_eval = resp_dict.get("if_need_replan", False)
1101
+ if_need_replan = cast(bool, if_need_replan_eval) # Ensure bool
1102
+ replanned_schema: str = resp_dict.get("replanned_schema", "")
1428
1103
 
1429
1104
  if if_need_replan:
1430
1105
  return True, replanned_schema
@@ -1463,10 +1138,10 @@ Your output should be in json format, including the following fields:
1463
1138
  logger.debug(f"Observation: {observation}")
1464
1139
  logger.debug(f"Reasoning: {reasoning}")
1465
1140
  logger.debug(f"Action code: {action_code}")
1466
-
1141
+ trajectory_info: Dict[str, Any]
1467
1142
  if "stop" in action_code:
1468
1143
  task_completed = True
1469
- trajectory_info = {
1144
+ trajectory_info = { # Typed trajectory_info
1470
1145
  "round": i,
1471
1146
  "observation": observation,
1472
1147
  "thought": reasoning,
@@ -1483,7 +1158,7 @@ Your output should be in json format, including the following fields:
1483
1158
  if not success:
1484
1159
  logger.warning(f"Error while executing the action: {info}")
1485
1160
 
1486
- trajectory_info = {
1161
+ trajectory_info = { # Typed trajectory_info
1487
1162
  "round": i,
1488
1163
  "observation": observation,
1489
1164
  "thought": reasoning,
@@ -1502,15 +1177,20 @@ Your output should be in json format, including the following fields:
1502
1177
  detailed_plan = replanned_schema
1503
1178
  logger.debug(f"Replanned schema: {replanned_schema}")
1504
1179
 
1180
+ simulation_result: str
1505
1181
  if not task_completed:
1506
1182
  simulation_result = f"""
1507
- The task is not completed within the round limit. Please check the last round {self.history_window} information to see if there is any useful information:
1183
+ The task is not completed within the round limit. Please
1184
+ check the last round {self.history_window} information to
1185
+ see if there is any useful information:
1508
1186
  <history>{self.history[-self.history_window :]}</history>
1509
1187
  """
1510
1188
 
1511
1189
  else:
1512
1190
  simulation_result = self._get_final_answer(task_prompt)
1513
1191
 
1192
+ self.browser.close() # Close browser after task completion or limit
1193
+ # reached
1514
1194
  return simulation_result
1515
1195
 
1516
1196
  def get_tools(self) -> List[FunctionTool]: