vision-agent 0.2.42__tar.gz → 0.2.44__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {vision_agent-0.2.42 → vision_agent-0.2.44}/PKG-INFO +2 -2
  2. {vision_agent-0.2.42 → vision_agent-0.2.44}/pyproject.toml +2 -2
  3. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/vision_agent.py +4 -0
  4. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/tools/__init__.py +1 -0
  5. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/tools/tools.py +78 -8
  6. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/utils/execute.py +24 -0
  7. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/utils/video.py +35 -0
  8. {vision_agent-0.2.42 → vision_agent-0.2.44}/LICENSE +0 -0
  9. {vision_agent-0.2.42 → vision_agent-0.2.44}/README.md +0 -0
  10. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/__init__.py +0 -0
  11. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/__init__.py +0 -0
  12. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/agent.py +0 -0
  13. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/agent_coder.py +0 -0
  14. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/agent_coder_prompts.py +0 -0
  15. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/data_interpreter.py +0 -0
  16. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/data_interpreter_prompts.py +0 -0
  17. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/easytool.py +0 -0
  18. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/easytool_prompts.py +0 -0
  19. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/easytool_v2.py +0 -0
  20. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/easytool_v2_prompts.py +0 -0
  21. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/reflexion.py +0 -0
  22. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/reflexion_prompts.py +0 -0
  23. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/agent/vision_agent_prompts.py +0 -0
  24. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/fonts/__init__.py +0 -0
  25. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  26. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/llm/__init__.py +0 -0
  27. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/llm/llm.py +0 -0
  28. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/lmm/__init__.py +0 -0
  29. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/lmm/lmm.py +0 -0
  30. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/tools/easytool_tools.py +0 -0
  31. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/tools/prompts.py +0 -0
  32. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/tools/tool_utils.py +0 -0
  33. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/utils/__init__.py +0 -0
  34. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/utils/image_utils.py +0 -0
  35. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/utils/sim.py +0 -0
  36. {vision_agent-0.2.42 → vision_agent-0.2.44}/vision_agent/utils/type_defs.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.42
3
+ Version: 0.2.44
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -18,7 +18,7 @@ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
18
18
  Requires-Dist: nbformat (>=5.10.4,<6.0.0)
19
19
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
20
20
  Requires-Dist: openai (>=1.0.0,<2.0.0)
21
- Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
21
+ Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
22
22
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
23
23
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
24
24
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.42"
7
+ version = "0.2.44"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -25,7 +25,7 @@ pandas = "2.*"
25
25
  openai = "1.*"
26
26
  typing_extensions = "4.*"
27
27
  moviepy = "1.*"
28
- opencv-python-headless = "4.*"
28
+ opencv-python = "4.*"
29
29
  tabulate = "^0.9.0"
30
30
  pydantic-settings = "^2.2.1"
31
31
  scipy = "1.13.*"
@@ -28,6 +28,7 @@ from vision_agent.utils import CodeInterpreterFactory, Execution
28
28
  from vision_agent.utils.execute import CodeInterpreter
29
29
  from vision_agent.utils.image_utils import b64_to_pil
30
30
  from vision_agent.utils.sim import Sim
31
+ from vision_agent.utils.video import play_video
31
32
 
32
33
  logging.basicConfig(stream=sys.stdout)
33
34
  _LOGGER = logging.getLogger(__name__)
@@ -522,6 +523,9 @@ class VisionAgent(Agent):
522
523
  for res in execution_result.results:
523
524
  if res.png:
524
525
  b64_to_pil(res.png).show()
526
+ if res.mp4:
527
+ play_video(res.mp4)
528
+
525
529
  return {
526
530
  "code": code,
527
531
  "test": test,
@@ -22,6 +22,7 @@ from .tools import (
22
22
  overlay_segmentation_masks,
23
23
  save_image,
24
24
  save_json,
25
+ save_video_to_result,
25
26
  visual_prompt_counting,
26
27
  zero_shot_counting,
27
28
  )
@@ -7,14 +7,15 @@ from importlib import resources
7
7
  from pathlib import Path
8
8
  from typing import Any, Callable, Dict, List, Tuple, Union, cast
9
9
 
10
+ import cv2
10
11
  import numpy as np
11
12
  import pandas as pd
12
13
  import requests
13
14
  from PIL import Image, ImageDraw, ImageFont
14
- from scipy.spatial import distance # type: ignore
15
15
 
16
16
  from vision_agent.tools.tool_utils import _send_inference_request
17
17
  from vision_agent.utils import extract_frames_from_video
18
+ from vision_agent.utils.execute import FileSerializer, MimeType
18
19
  from vision_agent.utils.image_utils import (
19
20
  b64_to_pil,
20
21
  convert_to_b64,
@@ -421,10 +422,39 @@ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
421
422
 
422
423
  mask1 = np.clip(mask1, 0, 1)
423
424
  mask2 = np.clip(mask2, 0, 1)
424
- mask1_points = np.transpose(np.nonzero(mask1))
425
- mask2_points = np.transpose(np.nonzero(mask2))
426
- dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
427
- return cast(float, np.min(dist_matrix))
425
+ contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
426
+ contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
427
+ largest_contour1 = max(contours1, key=cv2.contourArea)
428
+ largest_contour2 = max(contours2, key=cv2.contourArea)
429
+ polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
430
+ polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
431
+ min_distance = np.inf
432
+
433
+ small_polygon, larger_contour = (
434
+ (polygon1, largest_contour2)
435
+ if len(largest_contour1) < len(largest_contour2)
436
+ else (polygon2, largest_contour1)
437
+ )
438
+
439
+ # For each point in the first polygon
440
+ for point in small_polygon:
441
+ # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
442
+
443
+ distance = (
444
+ cv2.pointPolygonTest(
445
+ larger_contour, (point[0, 0].item(), point[0, 1].item()), True
446
+ )
447
+ * -1
448
+ )
449
+
450
+ # If the distance is negative, the point is inside the polygon, so the distance is 0
451
+ if distance < 0:
452
+ continue
453
+ else:
454
+ # Update the minimum distance if the point is outside the polygon
455
+ min_distance = min(min_distance, distance)
456
+
457
+ return min_distance if min_distance != np.inf else 0.0
428
458
 
429
459
 
430
460
  def closest_box_distance(
@@ -521,6 +551,29 @@ def save_image(image: np.ndarray) -> str:
521
551
  return f.name
522
552
 
523
553
 
554
def save_video_to_result(video_uri: str) -> None:
    """'save_video_to_result' a utility function that saves a video into the result of the code execution (as an intermediate output).
    This function is required to run if user wants to visualize the video generated by the code.

    Parameters:
        video_uri (str): The URI to the video file. Currently only local file paths are supported.

    Example
    -------
        >>> save_video_to_result("path/to/video.mp4")
    """
    # NOTE(review): the docstring above is kept verbatim because this function is
    # passed to get_tool_documentation (see UTILITIES_DOCSTRING), which presumably
    # reads it at runtime -- confirm before rewording.

    # Imported inside the function, as in the other display helpers of this module.
    from IPython.display import display

    video_file = FileSerializer(video_uri)
    # Raw mime bundle: the base64-encoded video plus a plain-text fallback
    # (the serializer's string form).
    mime_bundle = {
        MimeType.VIDEO_MP4_B64: video_file.base64(),
        MimeType.TEXT_PLAIN: str(video_file),
    }
    display(mime_bundle, raw=True)
575
+
576
+
524
577
  def overlay_bounding_boxes(
525
578
  image: np.ndarray, bboxes: List[Dict[str, Any]]
526
579
  ) -> np.ndarray:
@@ -541,6 +594,8 @@ def overlay_bounding_boxes(
541
594
  image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
542
595
  )
543
596
  """
597
+ from IPython.display import display
598
+
544
599
  pil_image = Image.fromarray(image.astype(np.uint8))
545
600
 
546
601
  if len(set([box["label"] for box in bboxes])) > len(COLORS):
@@ -577,7 +632,10 @@ def overlay_bounding_boxes(
577
632
  text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
578
633
  draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
579
634
  draw.text((box[0], box[1]), text, fill="black", font=font)
580
- return np.array(pil_image.convert("RGB"))
635
+
636
+ pil_image = pil_image.convert("RGB")
637
+ display(pil_image)
638
+ return np.array(pil_image)
581
639
 
582
640
 
583
641
  def overlay_segmentation_masks(
@@ -608,6 +666,8 @@ def overlay_segmentation_masks(
608
666
  }],
609
667
  )
610
668
  """
669
+ from IPython.display import display
670
+
611
671
  pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
612
672
 
613
673
  if len(set([mask["label"] for mask in masks])) > len(COLORS):
@@ -627,7 +687,10 @@ def overlay_segmentation_masks(
627
687
  np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
628
688
  mask_img = Image.fromarray(np_mask.astype(np.uint8))
629
689
  pil_image = Image.alpha_composite(pil_image, mask_img)
630
- return np.array(pil_image.convert("RGB"))
690
+
691
+ pil_image = pil_image.convert("RGB")
692
+ display(pil_image)
693
+ return np.array(pil_image)
631
694
 
632
695
 
633
696
  def overlay_heat_map(
@@ -657,6 +720,8 @@ def overlay_heat_map(
657
720
  },
658
721
  )
659
722
  """
723
+ from IPython.display import display
724
+
660
725
  pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
661
726
 
662
727
  if "heat_map" not in heat_map or len(heat_map["heat_map"]) == 0:
@@ -672,7 +737,10 @@ def overlay_heat_map(
672
737
  combined = Image.alpha_composite(
673
738
  pil_image.convert("RGBA"), overlay.resize(pil_image.size)
674
739
  )
675
- return np.array(combined.convert("RGB"))
740
+
741
+ pil_image = combined.convert("RGB")
742
+ display(pil_image)
743
+ return np.array(pil_image)
676
744
 
677
745
 
678
746
  def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
@@ -734,6 +802,7 @@ TOOLS = [
734
802
  save_json,
735
803
  load_image,
736
804
  save_image,
805
+ save_video_to_result,
737
806
  overlay_bounding_boxes,
738
807
  overlay_segmentation_masks,
739
808
  overlay_heat_map,
@@ -746,6 +815,7 @@ UTILITIES_DOCSTRING = get_tool_documentation(
746
815
  save_json,
747
816
  load_image,
748
817
  save_image,
818
+ save_video_to_result,
749
819
  overlay_bounding_boxes,
750
820
  overlay_segmentation_masks,
751
821
  overlay_heat_map,
@@ -1,5 +1,6 @@
1
1
  import abc
2
2
  import atexit
3
+ import base64
3
4
  import copy
4
5
  import logging
5
6
  import os
@@ -45,12 +46,31 @@ class MimeType(str, Enum):
45
46
  IMAGE_SVG = "image/svg+xml"
46
47
  IMAGE_PNG = "image/png"
47
48
  IMAGE_JPEG = "image/jpeg"
49
+ VIDEO_MP4_B64 = "video/mp4/base64"
48
50
  APPLICATION_PDF = "application/pdf"
49
51
  TEXT_LATEX = "text/latex"
50
52
  APPLICATION_JSON = "application/json"
51
53
  APPLICATION_JAVASCRIPT = "application/javascript"
52
54
 
53
55
 
56
class FileSerializer:
    """Adaptor class that allows IPython.display.display() to serialize a file to a base64 string representation.

    Parameters:
        file_uri (str): Path to an existing local file.

    Raises:
        AssertionError: If ``file_uri`` does not point to an existing regular file.
    """

    def __init__(self, file_uri: str):
        # Raise explicitly instead of using `assert`: assert statements are removed
        # when Python runs with -O, which would silently disable this validation.
        # The exception type is kept as AssertionError so existing callers still
        # see the same error. os.path.isfile() is already False for paths that do
        # not exist, so no separate existence check is needed.
        if not os.path.isfile(file_uri):
            raise AssertionError(f"Only support local files currently: {file_uri}")
        # The attribute keeps its historical name even though any file type is accepted.
        self.video_uri = file_uri

    def __repr__(self) -> str:
        return f"FileSerializer({self.video_uri})"

    def base64(self) -> str:
        """Return the file's contents encoded as a base64 string."""
        # Inside this method `base64` resolves to the module-level import,
        # not to this method.
        with open(self.video_uri, "rb") as file:
            return base64.b64encode(file.read()).decode("utf-8")
72
+
73
+
54
74
  class Result:
55
75
  """
56
76
  Represents the data to be displayed as a result of executing a cell in a Jupyter notebook.
@@ -70,6 +90,7 @@ class Result:
70
90
  png: Optional[str] = None
71
91
  jpeg: Optional[str] = None
72
92
  pdf: Optional[str] = None
93
+ mp4: Optional[str] = None
73
94
  latex: Optional[str] = None
74
95
  json: Optional[Dict[str, Any]] = None
75
96
  javascript: Optional[str] = None
@@ -93,6 +114,7 @@ class Result:
93
114
  self.png = data.pop(MimeType.IMAGE_PNG, None)
94
115
  self.jpeg = data.pop(MimeType.IMAGE_JPEG, None)
95
116
  self.pdf = data.pop(MimeType.APPLICATION_PDF, None)
117
+ self.mp4 = data.pop(MimeType.VIDEO_MP4_B64, None)
96
118
  self.latex = data.pop(MimeType.TEXT_LATEX, None)
97
119
  self.json = data.pop(MimeType.APPLICATION_JSON, None)
98
120
  self.javascript = data.pop(MimeType.APPLICATION_JAVASCRIPT, None)
@@ -190,6 +212,8 @@ class Result:
190
212
  formats.append("json")
191
213
  if self.javascript:
192
214
  formats.append("javascript")
215
+ if self.mp4:
216
+ formats.append("mp4")
193
217
  if self.extra:
194
218
  formats.extend(iter(self.extra))
195
219
  return formats
@@ -1,7 +1,9 @@
1
+ import base64
1
2
  import logging
2
3
  import math
3
4
  import os
4
5
  from concurrent.futures import ProcessPoolExecutor, as_completed
6
+ import tempfile
5
7
  from typing import List, Tuple, cast
6
8
 
7
9
  import cv2
@@ -14,6 +16,39 @@ _LOGGER = logging.getLogger(__name__)
14
16
  _CLIP_LENGTH = 30.0
15
17
 
16
18
 
19
def _show_frames(cap: cv2.VideoCapture, video_path: str) -> None:
    """Show the frames of an opened capture in a window, then close the window."""
    try:
        # Display the first frame and wait for any key press to start the video
        ret, frame = cap.read()
        if ret:
            # Frames from VideoCapture are BGR, which is what cv2.imshow expects,
            # so they are displayed without any colour conversion.
            cv2.imshow("Video Player", frame)
            _LOGGER.info("Press any key to start playing the video: %s", video_path)
            cv2.waitKey(0)  # Wait for any key press

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            cv2.imshow("Video Player", frame)
            # Fixed 200 ms pause between frames; press 'q' to exit the video
            if cv2.waitKey(200) & 0xFF == ord("q"):
                break
    finally:
        cv2.destroyAllWindows()


def play_video(video_base64: str) -> None:
    """Decode a base64-encoded video and play it in an OpenCV window.

    The decoded bytes are written to a temporary ``.mp4`` file that is opened with
    ``cv2.VideoCapture``. The first frame is shown until any key is pressed; the
    remaining frames are then shown one by one, and pressing ``q`` stops playback.
    The capture is released and the temporary file is removed before returning,
    including when the video cannot be opened or playback raises.

    Parameters:
        video_base64 (str): The base64-encoded contents of the video file.
    """
    video_data = base64.b64decode(video_base64)
    # delete=False: the file is reopened by name after this handle is closed,
    # so it is removed manually once playback is over.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
        temp_video.write(video_data)
        temp_video_path = temp_video.name

    try:
        cap = cv2.VideoCapture(temp_video_path)
        try:
            if not cap.isOpened():
                _LOGGER.error("Error: Could not open video.")
                return
            _show_frames(cap, temp_video_path)
        finally:
            # Release before deleting: an open capture can keep the file locked.
            cap.release()
    finally:
        try:
            os.remove(temp_video_path)
        except OSError:
            _LOGGER.warning(
                "Could not remove temporary video file: %s", temp_video_path
            )
50
+
51
+
17
52
  def extract_frames_from_video(
18
53
  video_uri: str, fps: float = 0.5, motion_detection_threshold: float = 0.0
19
54
  ) -> List[Tuple[np.ndarray, float]]:
File without changes
File without changes