camel-ai 0.2.25__py3-none-any.whl → 0.2.27__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

@@ -92,7 +92,7 @@ ACTION_WITH_FEEDBACK_LIST = [
92
92
  ]
93
93
 
94
94
 
95
- # codes from magentic-one
95
+ # Code from magentic-one
96
96
  class DOMRectangle(TypedDict):
97
97
  x: Union[int, float]
98
98
  y: Union[int, float]
@@ -127,21 +127,36 @@ class InteractiveRegion(TypedDict):
127
127
 
128
128
 
129
129
  def _get_str(d: Any, k: str) -> str:
130
+ r"""Safely retrieve a string value from a dictionary."""
131
+ if k not in d:
132
+ raise KeyError(f"Missing required key: '{k}'")
130
133
  val = d[k]
131
- assert isinstance(val, str)
132
- return val
134
+ if isinstance(val, str):
135
+ return val
136
+ raise TypeError(
137
+ f"Expected a string for key '{k}', " f"but got {type(val).__name__}"
138
+ )
133
139
 
134
140
 
135
141
  def _get_number(d: Any, k: str) -> Union[int, float]:
142
+ r"""Safely retrieve a number (int or float) from a dictionary"""
136
143
  val = d[k]
137
- assert isinstance(val, int) or isinstance(val, float)
138
- return val
144
+ if isinstance(val, (int, float)):
145
+ return val
146
+ raise TypeError(
147
+ f"Expected a number (int/float) for key "
148
+ f"'{k}', but got {type(val).__name__}"
149
+ )
139
150
 
140
151
 
141
152
  def _get_bool(d: Any, k: str) -> bool:
153
+ r"""Safely retrieve a boolean value from a dictionary."""
142
154
  val = d[k]
143
- assert isinstance(val, bool)
144
- return val
155
+ if isinstance(val, bool):
156
+ return val
157
+ raise TypeError(
158
+ f"Expected a boolean for key '{k}', " f"but got {type(val).__name__}"
159
+ )
145
160
 
146
161
 
147
162
  def _parse_json_output(text: str) -> Dict[str, Any]:
@@ -208,7 +223,8 @@ def _reload_image(image: Image.Image):
208
223
  return Image.open(buffer)
209
224
 
210
225
 
211
- def domrectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
226
+ def dom_rectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
227
+ r"""Create a DOMRectangle object from a dictionary."""
212
228
  return DOMRectangle(
213
229
  x=_get_number(rect, "x"),
214
230
  y=_get_number(rect, "y"),
@@ -221,10 +237,11 @@ def domrectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
221
237
  )
222
238
 
223
239
 
224
- def interactiveregion_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
240
+ def interactive_region_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
241
+ r"""Create an :class:`InteractiveRegion` object from a dictionary."""
225
242
  typed_rects: List[DOMRectangle] = []
226
243
  for rect in region["rects"]:
227
- typed_rects.append(domrectangle_from_dict(rect))
244
+ typed_rects.append(dom_rectangle_from_dict(rect))
228
245
 
229
246
  return InteractiveRegion(
230
247
  tag_name=_get_str(region, "tag_name"),
@@ -235,7 +252,8 @@ def interactiveregion_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
235
252
  )
236
253
 
237
254
 
238
- def visualviewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
255
+ def visual_viewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
256
+ r"""Create a :class:`VisualViewport` object from a dictionary."""
239
257
  return VisualViewport(
240
258
  height=_get_number(viewport, "height"),
241
259
  width=_get_number(viewport, "width"),
@@ -252,7 +270,7 @@ def visualviewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
252
270
 
253
271
 
254
272
  def add_set_of_mark(
255
- screenshot: bytes | Image.Image | io.BufferedIOBase,
273
+ screenshot: Union[bytes, Image.Image, io.BufferedIOBase],
256
274
  ROIs: Dict[str, InteractiveRegion],
257
275
  ) -> Tuple[Image.Image, List[str], List[str], List[str]]:
258
276
  if isinstance(screenshot, Image.Image):
@@ -272,6 +290,18 @@ def add_set_of_mark(
272
290
  def _add_set_of_mark(
273
291
  screenshot: Image.Image, ROIs: Dict[str, InteractiveRegion]
274
292
  ) -> Tuple[Image.Image, List[str], List[str], List[str]]:
293
+ r"""Add a set of marks to the screenshot.
294
+
295
+ Args:
296
+ screenshot (Image.Image): The screenshot to add marks to.
297
+ ROIs (Dict[str, InteractiveRegion]): The regions to add marks to.
298
+
299
+ Returns:
300
+ Tuple[Image.Image, List[str], List[str], List[str]]: A tuple
301
+ containing the screenshot with marked ROIs, ROIs fully within the
302
+ images, ROIs located above the visible area, and ROIs located below
303
+ the visible area.
304
+ """
275
305
  visible_rects: List[str] = list()
276
306
  rects_above: List[str] = list() # Scroll up to see
277
307
  rects_below: List[str] = list() # Scroll down to see
@@ -284,22 +314,22 @@ def _add_set_of_mark(
284
314
  for r in ROIs:
285
315
  for rect in ROIs[r]["rects"]:
286
316
  # Empty rectangles
287
- if not rect:
288
- continue
289
- if rect["width"] * rect["height"] == 0:
317
+ if not rect or rect["width"] == 0 or rect["height"] == 0:
290
318
  continue
291
319
 
292
- mid = (
293
- (rect["right"] + rect["left"]) / 2.0,
294
- (rect["top"] + rect["bottom"]) / 2.0,
295
- )
320
+ # TODO: add scroll left and right?
321
+ horizontal_center = (rect["right"] + rect["left"]) / 2.0
322
+ vertical_center = (rect["top"] + rect["bottom"]) / 2.0
323
+ is_within_horizon = 0 <= horizontal_center < base.size[0]
324
+ is_above_viewport = vertical_center < 0
325
+ is_below_viewport = vertical_center >= base.size[1]
296
326
 
297
- if 0 <= mid[0] and mid[0] < base.size[0]:
298
- if mid[1] < 0:
327
+ if is_within_horizon:
328
+ if is_above_viewport:
299
329
  rects_above.append(r)
300
- elif mid[1] >= base.size[1]:
330
+ elif is_below_viewport:
301
331
  rects_below.append(r)
302
- else:
332
+ else: # Fully visible
303
333
  visible_rects.append(r)
304
334
  _draw_roi(draw, int(r), fnt, rect)
305
335
 
@@ -314,9 +344,16 @@ def _draw_roi(
314
344
  font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
315
345
  rect: DOMRectangle,
316
346
  ) -> None:
317
- color = _color(idx)
318
- luminance = color[0] * 0.3 + color[1] * 0.59 + color[2] * 0.11
319
- text_color = (0, 0, 0, 255) if luminance > 90 else (255, 255, 255, 255)
347
+ r"""Draw a ROI on the image.
348
+
349
+ Args:
350
+ draw (ImageDraw.ImageDraw): The draw object.
351
+ idx (int): The index of the ROI.
352
+ font (ImageFont.FreeTypeFont | ImageFont.ImageFont): The font.
353
+ rect (DOMRectangle): The DOM rectangle.
354
+ """
355
+ color = _get_random_color(idx)
356
+ text_color = _get_text_color(color)
320
357
 
321
358
  roi = ((rect["left"], rect["top"]), (rect["right"], rect["bottom"]))
322
359
 
@@ -351,9 +388,36 @@ def _draw_roi(
351
388
  )
352
389
 
353
390
 
354
- def _color(identifier: int) -> Tuple[int, int, int, int]:
391
+ def _get_text_color(
392
+ bg_color: Tuple[int, int, int, int],
393
+ ) -> Tuple[int, int, int, int]:
394
+ r"""Determine the ideal text color (black or white) for contrast.
395
+
396
+ Args:
397
+ bg_color: The background color (R, G, B, A).
398
+
399
+ Returns:
400
+ A tuple representing black or white color for text.
401
+ """
402
+ luminance = bg_color[0] * 0.3 + bg_color[1] * 0.59 + bg_color[2] * 0.11
403
+ return (0, 0, 0, 255) if luminance > 120 else (255, 255, 255, 255)
404
+
405
+
406
+ def _get_random_color(identifier: int) -> Tuple[int, int, int, int]:
407
+ r"""Generate a consistent random RGBA color based on the identifier.
408
+
409
+ Args:
410
+ identifier: The ID used as a seed to ensure color consistency.
411
+
412
+ Returns:
413
+ A tuple representing (R, G, B, A) values.
414
+ """
355
415
  rnd = random.Random(int(identifier))
356
- color = [rnd.randint(0, 255), rnd.randint(125, 255), rnd.randint(0, 50)]
416
+ r = rnd.randint(0, 255)
417
+ g = rnd.randint(125, 255)
418
+ b = rnd.randint(0, 50)
419
+ color = [r, g, b]
420
+ # TODO: check why shuffle is needed?
357
421
  rnd.shuffle(color)
358
422
  color.append(255)
359
423
  return cast(Tuple[int, int, int, int], tuple(color))
@@ -379,13 +443,11 @@ class BaseBrowser:
379
443
  self.playwright = sync_playwright().start()
380
444
  self.page_history: list = [] # stores the history of visited pages
381
445
 
382
- # set the cache directory
383
- self.cache_dir = "tmp/"
446
+ # Set the cache directory
447
+ self.cache_dir = "tmp/" if cache_dir is None else cache_dir
384
448
  os.makedirs(self.cache_dir, exist_ok=True)
385
- if cache_dir is not None:
386
- self.cache_dir = cache_dir
387
449
 
388
- # load the page script
450
+ # Load the page script
389
451
  abs_dir_path = os.path.dirname(os.path.abspath(__file__))
390
452
  page_script_path = os.path.join(abs_dir_path, "page_script.js")
391
453
 
@@ -398,34 +460,35 @@ class BaseBrowser:
398
460
  f"Page script file not found at path: {page_script_path}"
399
461
  )
400
462
 
401
- def init(self):
463
+ def init(self) -> None:
402
464
  r"""Initialize the browser."""
403
- self.browser = self.playwright.chromium.launch(
404
- headless=self.headless
405
- ) # Launch the browser, if headless is False, the browser will display
406
- self.context = self.browser.new_context(
407
- accept_downloads=True
408
- ) # create a new context
409
- self.page = self.context.new_page() # create a new page
410
-
411
- def clean_cache(self):
412
- r"""delete the cache directory and its contents."""
465
+ # Launch the browser, if headless is False, the browser will display
466
+ self.browser = self.playwright.chromium.launch(headless=self.headless)
467
+ # Create a new context
468
+ self.context = self.browser.new_context(accept_downloads=True)
469
+ # Create a new page
470
+ self.page = self.context.new_page()
471
+
472
+ def clean_cache(self) -> None:
473
+ r"""Delete the cache directory and its contents."""
413
474
  if os.path.exists(self.cache_dir):
414
475
  shutil.rmtree(self.cache_dir)
415
476
 
416
- def _wait_for_load(self, timeout: int = 20):
477
+ def _wait_for_load(self, timeout: int = 20) -> None:
417
478
  r"""Wait for a certain amount of time for the page to load."""
418
479
  timeout_ms = timeout * 1000
419
480
 
420
481
  self.page.wait_for_load_state("load", timeout=timeout_ms)
482
+
483
+ # TODO: check if this is needed
421
484
  time.sleep(2)
422
485
 
423
- def click_blank_area(self):
486
+ def click_blank_area(self) -> None:
424
487
  r"""Click a blank area of the page to unfocus the current element."""
425
488
  self.page.mouse.click(0, 0)
426
489
  self._wait_for_load()
427
490
 
428
- def visit_page(self, url: str):
491
+ def visit_page(self, url: str) -> None:
429
492
  r"""Visit a page with the given URL."""
430
493
 
431
494
  self.page.goto(url)
@@ -433,8 +496,8 @@ class BaseBrowser:
433
496
  self.page_url = url
434
497
 
435
498
  def ask_question_about_video(self, question: str) -> str:
436
- r"""Ask a question about the video on the current page. It is suitable
437
- to process youtube video.
499
+ r"""Ask a question about the video on the current page,
500
+ such as YouTube video.
438
501
 
439
502
  Args:
440
503
  question (str): The question to ask.
@@ -459,8 +522,9 @@ class BaseBrowser:
459
522
  directory.
460
523
 
461
524
  Returns:
462
- Tuple[Image.Image, str]: A tuple containing the screenshot image
463
- and the path to the image file.
525
+ Tuple[Image.Image, str]: A tuple containing the screenshot
526
+ image and the path to the image file if saved, otherwise
527
+ :obj:`None`.
464
528
  """
465
529
 
466
530
  image_data = self.page.screenshot(timeout=60000)
@@ -468,12 +532,13 @@ class BaseBrowser:
468
532
 
469
533
  file_path = None
470
534
  if save_image:
471
- # get url name to form a file name
535
+ # Get url name to form a file name
536
+ # TODO: Use a safer way for the url name
472
537
  url_name = self.page_url.split("/")[-1]
473
538
  for char in ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '.']:
474
539
  url_name = url_name.replace(char, "_")
475
540
 
476
- # get formatted time: mmddhhmmss
541
+ # Get formatted time: mmddhhmmss
477
542
  timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
478
543
  file_path = os.path.join(
479
544
  self.cache_dir, f"{url_name}_{timestamp}.png"
@@ -492,23 +557,18 @@ class BaseBrowser:
492
557
 
493
558
  Args:
494
559
  scroll_ratio (float): The ratio of viewport height to scroll each
495
- step (default: 0.7).
560
+ step (default: 0.8).
496
561
 
497
562
  Returns:
498
563
  List[str]: A list of paths to the screenshot files.
499
564
  """
500
565
  screenshots = []
501
566
  scroll_height = self.page.evaluate("document.body.scrollHeight")
567
+ assert self.page.viewport_size is not None
502
568
  viewport_height = self.page.viewport_size["height"]
503
569
  current_scroll = 0
504
570
  screenshot_index = 1
505
571
 
506
- url_name = self.page.url.split("/")[-1].replace(".", "_")
507
- timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
508
- base_file_path = os.path.join(
509
- self.cache_dir, f"{url_name}_{timestamp}"
510
- )
511
-
512
572
  max_height = scroll_height - viewport_height
513
573
  scroll_step = int(viewport_height * scroll_ratio)
514
574
 
@@ -520,14 +580,15 @@ class BaseBrowser:
520
580
  f"{max_height}, step: {scroll_step}"
521
581
  )
522
582
 
523
- file_path = f"{base_file_path}_{screenshot_index}.png"
524
583
  _, file_path = self.get_screenshot(save_image=True)
525
584
  screenshots.append(file_path)
526
585
 
527
586
  self.page.evaluate(f"window.scrollBy(0, {scroll_step})")
587
+ # Allow time for content to load
528
588
  time.sleep(0.5)
529
589
 
530
590
  current_scroll = self.page.evaluate("window.scrollY")
591
+ # Break if there is no significant scroll
531
592
  if abs(current_scroll - last_height) < viewport_height * 0.1:
532
593
  break
533
594
 
@@ -547,12 +608,16 @@ class BaseBrowser:
547
608
  except Exception as e:
548
609
  logger.warning(f"Error evaluating page script: {e}")
549
610
 
550
- return visualviewport_from_dict(
611
+ return visual_viewport_from_dict(
551
612
  self.page.evaluate("MultimodalWebSurfer.getVisualViewport();")
552
613
  )
553
614
 
554
- def get_interactive_elements(self) -> List[Dict[str, Any]]:
555
- # codes from magentic-one
615
+ def get_interactive_elements(self) -> Dict[str, InteractiveRegion]:
616
+ r"""Get the interactive elements of the current page.
617
+
618
+ Returns:
619
+ Dict[str, InteractiveRegion]: A dictionary of interactive elements.
620
+ """
556
621
  try:
557
622
  self.page.evaluate(self.page_script)
558
623
  except Exception as e:
@@ -565,12 +630,13 @@ class BaseBrowser:
565
630
 
566
631
  typed_results: Dict[str, InteractiveRegion] = {}
567
632
  for k in result:
568
- typed_results[k] = interactiveregion_from_dict(result[k])
633
+ typed_results[k] = interactive_region_from_dict(result[k])
569
634
 
570
635
  return typed_results # type: ignore[return-value]
571
636
 
572
637
  def get_som_screenshot(
573
- self, save_image: bool = False
638
+ self,
639
+ save_image: bool = False,
574
640
  ) -> Tuple[Image.Image, Union[str, None]]:
575
641
  r"""Get a screenshot of the current viewport with interactive elements
576
642
  marked.
@@ -608,15 +674,19 @@ class BaseBrowser:
608
674
  return comp, file_path
609
675
 
610
676
  def scroll_up(self) -> None:
677
+ r"""Scroll up the page."""
611
678
  self.page.keyboard.press("PageUp")
612
679
 
613
680
  def scroll_down(self) -> None:
681
+ r"""Scroll down the page."""
614
682
  self.page.keyboard.press("PageDown")
615
683
 
616
684
  def get_url(self) -> str:
685
+ r"""Get the URL of the current page."""
617
686
  return self.page.url
618
687
 
619
- def click_id(self, identifier: Union[str, int]):
688
+ def click_id(self, identifier: Union[str, int]) -> None:
689
+ r"""Click an element with the given identifier."""
620
690
  if isinstance(identifier, int):
621
691
  identifier = str(identifier)
622
692
  target = self.page.locator(f"[__elementId='{identifier}']")
@@ -649,7 +719,7 @@ class BaseBrowser:
649
719
 
650
720
  self._wait_for_load()
651
721
 
652
- def extract_url_content(self):
722
+ def extract_url_content(self) -> str:
653
723
  r"""Extract the content of the current page."""
654
724
  content = self.page.content()
655
725
  return content
@@ -821,7 +891,6 @@ class BaseBrowser:
821
891
 
822
892
  def close(self):
823
893
  self.browser.close()
824
- self.playwright.stop()
825
894
 
826
895
  # ruff: noqa: E501
827
896
  def show_interactive_elements(self):
@@ -846,7 +915,7 @@ class BaseBrowser:
846
915
  return markdown_content
847
916
 
848
917
 
849
- class WebToolkit(BaseToolkit):
918
+ class BrowserToolkit(BaseToolkit):
850
919
  r"""A class for browsing the web and interacting with web pages.
851
920
 
852
921
  This class provides methods for browsing the web and interacting with web
@@ -862,7 +931,7 @@ class WebToolkit(BaseToolkit):
862
931
  planning_agent_model: Optional[BaseModelBackend] = None,
863
932
  output_language: str = "en",
864
933
  ):
865
- r"""Initialize the WebToolkit instance.
934
+ r"""Initialize the BrowserToolkit instance.
866
935
 
867
936
  Args:
868
937
  headless (bool): Whether to run the browser in headless mode.
@@ -1026,9 +1095,7 @@ out the information you need. Sometimes they are extremely useful.
1026
1095
  """
1027
1096
 
1028
1097
  # get current state
1029
- som_screenshot, som_screenshot_path = self.browser.get_som_screenshot(
1030
- save_image=True
1031
- )
1098
+ som_screenshot, _ = self.browser.get_som_screenshot(save_image=True)
1032
1099
  img = _reload_image(som_screenshot)
1033
1100
  message = BaseMessage.make_user_message(
1034
1101
  role_name='user', content=observe_prompt, image_list=[img]
@@ -1222,7 +1289,7 @@ Your output should be in json format, including the following fields:
1222
1289
  return False, replanned_schema
1223
1290
 
1224
1291
  @dependencies_required("playwright")
1225
- def browser_simulation(
1292
+ def browse_url(
1226
1293
  self, task_prompt: str, start_url: str, round_limit: int = 12
1227
1294
  ) -> str:
1228
1295
  r"""A powerful toolkit which can simulate the browser interaction to solve the task which needs multi-step actions.
@@ -1303,4 +1370,4 @@ Your output should be in json format, including the following fields:
1303
1370
  return simulation_result
1304
1371
 
1305
1372
  def get_tools(self) -> List[FunctionTool]:
1306
- return [FunctionTool(self.browser_simulation)]
1373
+ return [FunctionTool(self.browse_url)]