vision-agent 0.2.43__py3-none-any.whl → 0.2.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,11 @@ import copy
  import json
  import logging
  import sys
+ import tempfile
  from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, Union, cast
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast

+ from PIL import Image
  from rich.console import Console
  from rich.style import Style
  from rich.syntax import Syntax
@@ -28,6 +30,7 @@ from vision_agent.utils import CodeInterpreterFactory, Execution
  from vision_agent.utils.execute import CodeInterpreter
  from vision_agent.utils.image_utils import b64_to_pil
  from vision_agent.utils.sim import Sim
+ from vision_agent.utils.video import play_video

  logging.basicConfig(stream=sys.stdout)
  _LOGGER = logging.getLogger(__name__)
@@ -77,12 +80,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
      return json_dict # type: ignore


+ def extract_image(
+     media: Optional[Sequence[Union[str, Path]]]
+ ) -> Optional[Sequence[Union[str, Path]]]:
+     if media is None:
+         return None
+
+     new_media = []
+     for m in media:
+         m = Path(m)
+         extension = m.suffix
+         if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
+             new_media.append(m)
+         elif extension in [".mp4", ".mov"]:
+             frames = T.extract_frames(m)
+             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                 if len(frames) > 0:
+                     Image.fromarray(frames[0][0]).save(tmp.name)
+                     new_media.append(Path(tmp.name))
+     if len(new_media) == 0:
+         return None
+     return new_media
+
+
  def write_plan(
      chat: List[Dict[str, str]],
      tool_desc: str,
      working_memory: str,
      model: Union[LLM, LMM],
-     media: Optional[List[Union[str, Path]]] = None,
+     media: Optional[Sequence[Union[str, Path]]] = None,
  ) -> List[Dict[str, str]]:
      chat = copy.deepcopy(chat)
      if chat[-1]["role"] != "user":
@@ -93,6 +119,7 @@ def write_plan(
      prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
      chat[-1]["content"] = prompt
      if isinstance(model, OpenAILMM):
+         media = extract_image(media)
          return extract_json(model.chat(chat, images=media))["plan"] # type: ignore
      else:
          return extract_json(model.chat(chat))["plan"] # type: ignore
@@ -102,7 +129,7 @@ def reflect(
      chat: List[Dict[str, str]],
      plan: str,
      code: str,
-     model: LLM,
+     model: Union[LLM, LMM],
  ) -> Dict[str, Union[str, bool]]:
      chat = copy.deepcopy(chat)
      if chat[-1]["role"] != "user":
@@ -308,7 +335,7 @@ class VisionAgent(Agent):

      def __init__(
          self,
-         planner: Optional[LLM] = None,
+         planner: Optional[Union[LLM, LMM]] = None,
          coder: Optional[LLM] = None,
          tester: Optional[LLM] = None,
          debugger: Optional[LLM] = None,
@@ -522,6 +549,9 @@ class VisionAgent(Agent):
              for res in execution_result.results:
                  if res.png:
                      b64_to_pil(res.png).show()
+                 if res.mp4:
+                     play_video(res.mp4)
+
          return {
              "code": code,
              "test": test,
@@ -29,14 +29,17 @@ PLAN = """
  {feedback}

  **Instructions**:
- Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
+ 1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
+ 2. Go over the user's request step by step and ensure each step is represented as a clear subtask in your plan.
+
+ Output a list of jsons in the following format

  ```json
  {{
      "plan":
          [
              {{
-                 "instructions": str # what you should do in this task, one short phrase or sentence
+                 "instructions": str # what you should do in this task associated with a tool
              }}
          ]
  }}
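
For orientation, here is a minimal usage sketch (not part of the diff) of the `extract_image` helper added to the planner path above: it reduces mixed image/video inputs to image-only paths before an `OpenAILMM` planner call. The import path follows the RECORD listing at the end of this diff; the media file names are hypothetical.

```python
from vision_agent.agent.vision_agent import extract_image  # module path per RECORD below

# Hypothetical local files; extract_image assumes they exist on disk.
media = ["examples/cat.png", "examples/traffic.mp4"]
images = extract_image(media)
# The .png passes through unchanged; for the .mp4 the first extracted frame is
# saved to a temporary .png and that temp path is returned in its place, so the
# planner only ever receives image inputs.
print(images)
```
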
@@ -22,6 +22,7 @@ from .tools import (
      overlay_segmentation_masks,
      save_image,
      save_json,
+     save_video_to_result,
      visual_prompt_counting,
      zero_shot_counting,
  )
@@ -15,6 +15,7 @@ from PIL import Image, ImageDraw, ImageFont

  from vision_agent.tools.tool_utils import _send_inference_request
  from vision_agent.utils import extract_frames_from_video
+ from vision_agent.utils.execute import FileSerializer, MimeType
  from vision_agent.utils.image_utils import (
      b64_to_pil,
      convert_to_b64,
@@ -198,14 +199,15 @@ def extract_frames(

  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
      """'ocr' extracts text from an image. It returns a list of detected text, bounding
-     boxes, and confidence scores. The results are sorted from top-left to bottom right
+     boxes with normalized coordinates, and confidence scores. The results are sorted
+     from top-left to bottom right.

      Parameters:
          image (np.ndarray): The image to extract text from.

      Returns:
-         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
-         and confidence score.
+         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+         with normalized coordinates, and confidence score.

      Example
      -------
@@ -550,6 +552,29 @@ def save_image(image: np.ndarray) -> str:
      return f.name


+ def save_video_to_result(video_uri: str) -> None:
+     """'save_video_to_result' is a utility function that saves a video into the result of the code execution (as an intermediate output).
+     This function is required to run if the user wants to visualize the video generated by the code.
+
+     Parameters:
+         video_uri (str): The URI to the video file. Currently only local file paths are supported.
+
+     Example
+     -------
+     >>> save_video_to_result("path/to/video.mp4")
+     """
+     from IPython.display import display
+
+     serializer = FileSerializer(video_uri)
+     display(
+         {
+             MimeType.VIDEO_MP4_B64: serializer.base64(),
+             MimeType.TEXT_PLAIN: str(serializer),
+         },
+         raw=True,
+     )
+
+
  def overlay_bounding_boxes(
      image: np.ndarray, bboxes: List[Dict[str, Any]]
  ) -> np.ndarray:
@@ -570,6 +595,8 @@ def overlay_bounding_boxes(
          image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
      )
      """
+     from IPython.display import display
+
      pil_image = Image.fromarray(image.astype(np.uint8))

      if len(set([box["label"] for box in bboxes])) > len(COLORS):
@@ -606,7 +633,10 @@ def overlay_bounding_boxes(
          text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
          draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
          draw.text((box[0], box[1]), text, fill="black", font=font)
-     return np.array(pil_image.convert("RGB"))
+
+     pil_image = pil_image.convert("RGB")
+     display(pil_image)
+     return np.array(pil_image)


  def overlay_segmentation_masks(
@@ -637,6 +667,8 @@ def overlay_segmentation_masks(
          }],
      )
      """
+     from IPython.display import display
+
      pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")

      if len(set([mask["label"] for mask in masks])) > len(COLORS):
@@ -656,7 +688,10 @@ def overlay_segmentation_masks(
          np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
          mask_img = Image.fromarray(np_mask.astype(np.uint8))
          pil_image = Image.alpha_composite(pil_image, mask_img)
-     return np.array(pil_image.convert("RGB"))
+
+     pil_image = pil_image.convert("RGB")
+     display(pil_image)
+     return np.array(pil_image)


  def overlay_heat_map(
@@ -686,6 +721,8 @@ def overlay_heat_map(
          },
      )
      """
+     from IPython.display import display
+
      pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")

      if "heat_map" not in heat_map or len(heat_map["heat_map"]) == 0:
@@ -701,7 +738,10 @@ def overlay_heat_map(
      combined = Image.alpha_composite(
          pil_image.convert("RGBA"), overlay.resize(pil_image.size)
      )
-     return np.array(combined.convert("RGB"))
+
+     pil_image = combined.convert("RGB")
+     display(pil_image)
+     return np.array(pil_image)


  def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
@@ -763,6 +803,7 @@ TOOLS = [
      save_json,
      load_image,
      save_image,
+     save_video_to_result,
      overlay_bounding_boxes,
      overlay_segmentation_masks,
      overlay_heat_map,
@@ -775,6 +816,7 @@ UTILITIES_DOCSTRING = get_tool_documentation(
      save_json,
      load_image,
      save_image,
+     save_video_to_result,
      overlay_bounding_boxes,
      overlay_segmentation_masks,
      overlay_heat_map,
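
Below is a sketch (not part of the diff) of how generated code is expected to surface a video through the new `save_video_to_result` tool so that it reaches the execution results; the export comes from the `tools/__init__.py` hunk above, and the output path is hypothetical.

```python
from vision_agent.tools import save_video_to_result

# Hypothetical path assumed to have been written by an earlier step of the generated code.
output_path = "outputs/annotated.mp4"
save_video_to_result(output_path)
# Inside the notebook kernel this calls IPython's display(..., raw=True) with the
# base64-encoded bytes under the "video/mp4/base64" MIME type, which the
# executor later maps onto Result.mp4.
```
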
@@ -1,5 +1,6 @@
  import abc
  import atexit
+ import base64
  import copy
  import logging
  import os
@@ -45,12 +46,31 @@ class MimeType(str, Enum):
      IMAGE_SVG = "image/svg+xml"
      IMAGE_PNG = "image/png"
      IMAGE_JPEG = "image/jpeg"
+     VIDEO_MP4_B64 = "video/mp4/base64"
      APPLICATION_PDF = "application/pdf"
      TEXT_LATEX = "text/latex"
      APPLICATION_JSON = "application/json"
      APPLICATION_JAVASCRIPT = "application/javascript"


+ class FileSerializer:
+     """Adaptor class that allows IPython.display.display() to serialize a file to a base64 string representation."""
+
+     def __init__(self, file_uri: str):
+         self.video_uri = file_uri
+         assert os.path.isfile(
+             file_uri
+         ), f"Only support local files currently: {file_uri}"
+         assert Path(file_uri).exists(), f"File not found: {file_uri}"
+
+     def __repr__(self) -> str:
+         return f"FileSerializer({self.video_uri})"
+
+     def base64(self) -> str:
+         with open(self.video_uri, "rb") as file:
+             return base64.b64encode(file.read()).decode("utf-8")
+
+
  class Result:
      """
      Represents the data to be displayed as a result of executing a cell in a Jupyter notebook.
@@ -70,6 +90,7 @@ class Result:
      png: Optional[str] = None
      jpeg: Optional[str] = None
      pdf: Optional[str] = None
+     mp4: Optional[str] = None
      latex: Optional[str] = None
      json: Optional[Dict[str, Any]] = None
      javascript: Optional[str] = None
@@ -93,6 +114,7 @@ class Result:
          self.png = data.pop(MimeType.IMAGE_PNG, None)
          self.jpeg = data.pop(MimeType.IMAGE_JPEG, None)
          self.pdf = data.pop(MimeType.APPLICATION_PDF, None)
+         self.mp4 = data.pop(MimeType.VIDEO_MP4_B64, None)
          self.latex = data.pop(MimeType.TEXT_LATEX, None)
          self.json = data.pop(MimeType.APPLICATION_JSON, None)
          self.javascript = data.pop(MimeType.APPLICATION_JAVASCRIPT, None)
@@ -190,6 +212,8 @@ class Result:
              formats.append("json")
          if self.javascript:
              formats.append("javascript")
+         if self.mp4:
+             formats.append("mp4")
          if self.extra:
              formats.extend(iter(self.extra))
          return formats
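
To make the executor-side plumbing concrete, here is a small sketch (not part of the diff) of the payload `FileSerializer` builds and how it maps onto the new `Result.mp4` field; the file path is hypothetical and must exist locally for the asserts to pass.

```python
from vision_agent.utils.execute import FileSerializer, MimeType

ser = FileSerializer("outputs/annotated.mp4")  # hypothetical local file
payload = {
    MimeType.VIDEO_MP4_B64: ser.base64(),  # base64-encoded file bytes
    MimeType.TEXT_PLAIN: str(ser),         # "FileSerializer(outputs/annotated.mp4)"
}
# Passing this payload to IPython.display.display(..., raw=True) round-trips it
# through the kernel, and Result.__init__ pops MimeType.VIDEO_MP4_B64 into self.mp4.
```
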
@@ -1,7 +1,9 @@
+ import base64
  import logging
  import math
  import os
  from concurrent.futures import ProcessPoolExecutor, as_completed
+ import tempfile
  from typing import List, Tuple, cast

  import cv2
@@ -14,6 +16,39 @@ _LOGGER = logging.getLogger(__name__)
  _CLIP_LENGTH = 30.0


+ def play_video(video_base64: str) -> None:
+     """Play a video file"""
+     video_data = base64.b64decode(video_base64)
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
+         temp_video.write(video_data)
+         temp_video_path = temp_video.name
+
+     cap = cv2.VideoCapture(temp_video_path)
+     if not cap.isOpened():
+         _LOGGER.error("Error: Could not open video.")
+         return
+
+     # Display the first frame and wait for any key press to start the video
+     ret, frame = cap.read()
+     if ret:
+         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         cv2.imshow("Video Player", frame)
+         _LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
+         cv2.waitKey(0) # Wait for any key press
+
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         cv2.imshow("Video Player", frame)
+         # Press 'q' to exit the video
+         if cv2.waitKey(200) & 0xFF == ord("q"):
+             break
+     cap.release()
+     cv2.destroyAllWindows()
+
+
  def extract_frames_from_video(
      video_uri: str, fps: float = 0.5, motion_detection_threshold: float = 0.0
  ) -> List[Tuple[np.ndarray, float]]:
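
Finally, a usage sketch (not part of the diff) for `play_video`, which takes a base64-encoded MP4 string rather than a path; the clip path is hypothetical, and a GUI-capable OpenCV build is needed for `cv2.imshow` (hence the `opencv-python` dependency change in the METADATA hunk below).

```python
import base64

from vision_agent.utils.video import play_video

# Hypothetical local clip; its bytes are base64-encoded before being handed to play_video.
with open("examples/traffic.mp4", "rb") as f:
    play_video(base64.b64encode(f.read()).decode("utf-8"))
```
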
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.43
+ Version: 0.2.45
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -18,7 +18,7 @@ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
  Requires-Dist: nbformat (>=5.10.4,<6.0.0)
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
  Requires-Dist: openai (>=1.0.0,<2.0.0)
- Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
+ Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
@@ -11,26 +11,26 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
  vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
  vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
- vision_agent/agent/vision_agent.py,sha256=Sv9aC9AO1LxsSBG8qGmS6C2ViIFg85p9uWLOdlbTu9g,18624
- vision_agent/agent/vision_agent_prompts.py,sha256=FnIYF2Fe3joRvFnOJD9ZyWXMihMyL606nXxWJ0adTZ8,8314
+ vision_agent/agent/vision_agent.py,sha256=c3jJd1uiCtmVC2xazUvW9rwc7usi-EOYW7NZnMFOdt8,19586
+ vision_agent/agent/vision_agent_prompts.py,sha256=bIcqutsyM2bEhWE2XGw01PuZ9f-jePSwapbvkOOrFZ4,8384
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
  vision_agent/llm/llm.py,sha256=UZ73GqQHE-NKOJWsrOTWfmdHYsbCBkJ5rZ7dhcSCHHw,5951
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
  vision_agent/lmm/lmm.py,sha256=NwcZYLTzi95LSMAk0sTtw7G_zBLa9lU-DHM5GUUCiK4,10622
- vision_agent/tools/__init__.py,sha256=oZa_sslb1UqEgpdWROChDcz5JHdB475ejJX78FMLYvE,1512
+ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1w,1538
  vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
- vision_agent/tools/tools.py,sha256=h3TlucPuk3wsQguddtnCf6_ehEuELPrbT6-GI9YZe3E,24764
+ vision_agent/tools/tools.py,sha256=66pFXUIVvnOa1fk0PY5u_75kblIbAVqkRP2U9qLixrY,25951
  vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
- vision_agent/utils/execute.py,sha256=losZeWbhNVlBr4xYsy5dKAslarjiKwuPsKgTmLV6zgE,19497
+ vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
  vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
  vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
  vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
- vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
- vision_agent-0.2.43.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.43.dist-info/METADATA,sha256=7z0t0gus3S4eVTl3yik6RfX9lvNGwGROSaqdbXCJeRc,6826
- vision_agent-0.2.43.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.43.dist-info/RECORD,,
+ vision_agent/utils/video.py,sha256=EuJJ7Owi3pIV-q3WcZ-LaaTrGAmmZ8YAA22rmEkY7GI,8885
+ vision_agent-0.2.45.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.45.dist-info/METADATA,sha256=G9Cy7cUPEWi42cuVP8V7u_ZjNXEnYp26_kb_u9rXSQk,6817
+ vision_agent-0.2.45.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.45.dist-info/RECORD,,