vision-agent 0.2.43__py3-none-any.whl → 0.2.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,11 @@ import copy
  import json
  import logging
  import sys
+ import tempfile
  from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, Union, cast
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast

+ from PIL import Image
  from rich.console import Console
  from rich.style import Style
  from rich.syntax import Syntax
@@ -28,6 +30,7 @@ from vision_agent.utils import CodeInterpreterFactory, Execution
  from vision_agent.utils.execute import CodeInterpreter
  from vision_agent.utils.image_utils import b64_to_pil
  from vision_agent.utils.sim import Sim
+ from vision_agent.utils.video import play_video

  logging.basicConfig(stream=sys.stdout)
  _LOGGER = logging.getLogger(__name__)
@@ -77,12 +80,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
      return json_dict # type: ignore


+ def extract_image(
+     media: Optional[Sequence[Union[str, Path]]]
+ ) -> Optional[Sequence[Union[str, Path]]]:
+     if media is None:
+         return None
+
+     new_media = []
+     for m in media:
+         m = Path(m)
+         extension = m.suffix
+         if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
+             new_media.append(m)
+         elif extension in [".mp4", ".mov"]:
+             frames = T.extract_frames(m)
+             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                 if len(frames) > 0:
+                     Image.fromarray(frames[0][0]).save(tmp.name)
+                     new_media.append(Path(tmp.name))
+     if len(new_media) == 0:
+         return None
+     return new_media
+
+
  def write_plan(
      chat: List[Dict[str, str]],
      tool_desc: str,
      working_memory: str,
      model: Union[LLM, LMM],
-     media: Optional[List[Union[str, Path]]] = None,
+     media: Optional[Sequence[Union[str, Path]]] = None,
  ) -> List[Dict[str, str]]:
      chat = copy.deepcopy(chat)
      if chat[-1]["role"] != "user":
@@ -93,6 +119,7 @@ def write_plan(
      prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
      chat[-1]["content"] = prompt
      if isinstance(model, OpenAILMM):
+         media = extract_image(media)
          return extract_json(model.chat(chat, images=media))["plan"] # type: ignore
      else:
          return extract_json(model.chat(chat))["plan"] # type: ignore
@@ -102,7 +129,7 @@ def reflect(
      chat: List[Dict[str, str]],
      plan: str,
      code: str,
-     model: LLM,
+     model: Union[LLM, LMM],
  ) -> Dict[str, Union[str, bool]]:
      chat = copy.deepcopy(chat)
      if chat[-1]["role"] != "user":
@@ -308,7 +335,7 @@ class VisionAgent(Agent):

      def __init__(
          self,
-         planner: Optional[LLM] = None,
+         planner: Optional[Union[LLM, LMM]] = None,
          coder: Optional[LLM] = None,
          tester: Optional[LLM] = None,
          debugger: Optional[LLM] = None,
@@ -522,6 +549,9 @@ class VisionAgent(Agent):
              for res in execution_result.results:
                  if res.png:
                      b64_to_pil(res.png).show()
+                 if res.mp4:
+                     play_video(res.mp4)
+
          return {
              "code": code,
              "test": test,
@@ -29,14 +29,17 @@ PLAN = """
  {feedback}

  **Instructions**:
- Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
+ 1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
+ 2. Go over the user's request step by step and ensure each step is represented as a clear subtask in your plan.
+
+ Output a list of jsons in the following format

  ```json
  {{
      "plan":
          [
              {{
-                 "instructions": str # what you should do in this task, one short phrase or sentence
+                 "instructions": str # what you should do in this task associated with a tool
              }}
          ]
  }}
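
For orientation, here is a minimal usage sketch (not part of the diff) of the `extract_image` helper added to the planner path above: it reduces mixed image/video inputs to image-only paths before an `OpenAILMM` planner call. The import path follows the RECORD listing at the end of this diff; the media file names are hypothetical.

```python
from vision_agent.agent.vision_agent import extract_image  # module path per RECORD below

# Hypothetical local files; extract_image assumes they exist on disk.
media = ["examples/cat.png", "examples/traffic.mp4"]
images = extract_image(media)
# The .png passes through unchanged; for the .mp4 the first extracted frame is
# saved to a temporary .png and that temp path is returned in its place, so the
# planner only ever receives image inputs.
print(images)
```
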
@@ -22,6 +22,7 @@ from .tools import (
      overlay_segmentation_masks,
      save_image,
      save_json,
+     save_video_to_result,
      visual_prompt_counting,
      zero_shot_counting,
  )
@@ -15,6 +15,7 @@ from PIL import Image, ImageDraw, ImageFont

  from vision_agent.tools.tool_utils import _send_inference_request
  from vision_agent.utils import extract_frames_from_video
+ from vision_agent.utils.execute import FileSerializer, MimeType
  from vision_agent.utils.image_utils import (
      b64_to_pil,
      convert_to_b64,
@@ -198,14 +199,15 @@ def extract_frames(

  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
      """'ocr' extracts text from an image. It returns a list of detected text, bounding
-     boxes, and confidence scores. The results are sorted from top-left to bottom right
+     boxes with normalized coordinates, and confidence scores. The results are sorted
+     from top-left to bottom right.

      Parameters:
          image (np.ndarray): The image to extract text from.

      Returns:
-         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
-         and confidence score.
+         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+         with normalized coordinates, and confidence score.

      Example
      -------
@@ -550,6 +552,29 @@ def save_image(image: np.ndarray) -> str:
      return f.name


+ def save_video_to_result(video_uri: str) -> None:
+     """'save_video_to_result' is a utility function that saves a video into the result of the code execution (as an intermediate output).
+     This function is required to run if the user wants to visualize the video generated by the code.
+
+     Parameters:
+         video_uri (str): The URI to the video file. Currently only local file paths are supported.
+
+     Example
+     -------
+     >>> save_video_to_result("path/to/video.mp4")
+     """
+     from IPython.display import display
+
+     serializer = FileSerializer(video_uri)
+     display(
+         {
+             MimeType.VIDEO_MP4_B64: serializer.base64(),
+             MimeType.TEXT_PLAIN: str(serializer),
+         },
+         raw=True,
+     )
+
+
  def overlay_bounding_boxes(
      image: np.ndarray, bboxes: List[Dict[str, Any]]
  ) -> np.ndarray:
@@ -570,6 +595,8 @@ def overlay_bounding_boxes(
          image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
      )
      """
+     from IPython.display import display
+
      pil_image = Image.fromarray(image.astype(np.uint8))

      if len(set([box["label"] for box in bboxes])) > len(COLORS):
@@ -606,7 +633,10 @@ def overlay_bounding_boxes(
          text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
          draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
          draw.text((box[0], box[1]), text, fill="black", font=font)
-     return np.array(pil_image.convert("RGB"))
+
+     pil_image = pil_image.convert("RGB")
+     display(pil_image)
+     return np.array(pil_image)


  def overlay_segmentation_masks(
@@ -637,6 +667,8 @@ def overlay_segmentation_masks(
          }],
      )
      """
+     from IPython.display import display
+
      pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")

      if len(set([mask["label"] for mask in masks])) > len(COLORS):
@@ -656,7 +688,10 @@ def overlay_segmentation_masks(
          np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
          mask_img = Image.fromarray(np_mask.astype(np.uint8))
          pil_image = Image.alpha_composite(pil_image, mask_img)
-     return np.array(pil_image.convert("RGB"))
+
+     pil_image = pil_image.convert("RGB")
+     display(pil_image)
+     return np.array(pil_image)


  def overlay_heat_map(
@@ -686,6 +721,8 @@ def overlay_heat_map(
          },
      )
      """
+     from IPython.display import display
+
      pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")

      if "heat_map" not in heat_map or len(heat_map["heat_map"]) == 0:
@@ -701,7 +738,10 @@ def overlay_heat_map(
      combined = Image.alpha_composite(
          pil_image.convert("RGBA"), overlay.resize(pil_image.size)
      )
-     return np.array(combined.convert("RGB"))
+
+     pil_image = combined.convert("RGB")
+     display(pil_image)
+     return np.array(pil_image)


  def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
@@ -763,6 +803,7 @@ TOOLS = [
      save_json,
      load_image,
      save_image,
+     save_video_to_result,
      overlay_bounding_boxes,
      overlay_segmentation_masks,
      overlay_heat_map,
@@ -775,6 +816,7 @@ UTILITIES_DOCSTRING = get_tool_documentation(
      save_json,
      load_image,
      save_image,
+     save_video_to_result,
      overlay_bounding_boxes,
      overlay_segmentation_masks,
      overlay_heat_map,
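
Below is a sketch (not part of the diff) of how generated code is expected to surface a video through the new `save_video_to_result` tool so that it reaches the execution results; the export comes from the `tools/__init__.py` hunk above, and the output path is hypothetical.

```python
from vision_agent.tools import save_video_to_result

# Hypothetical path assumed to have been written by an earlier step of the generated code.
output_path = "outputs/annotated.mp4"
save_video_to_result(output_path)
# Inside the notebook kernel this calls IPython's display(..., raw=True) with the
# base64-encoded bytes under the "video/mp4/base64" MIME type, which the
# executor later maps onto Result.mp4.
```
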
@@ -1,5 +1,6 @@
  import abc
  import atexit
+ import base64
  import copy
  import logging
  import os
@@ -45,12 +46,31 @@ class MimeType(str, Enum):
      IMAGE_SVG = "image/svg+xml"
      IMAGE_PNG = "image/png"
      IMAGE_JPEG = "image/jpeg"
+     VIDEO_MP4_B64 = "video/mp4/base64"
      APPLICATION_PDF = "application/pdf"
      TEXT_LATEX = "text/latex"
      APPLICATION_JSON = "application/json"
      APPLICATION_JAVASCRIPT = "application/javascript"


+ class FileSerializer:
+     """Adaptor class that allows IPython.display.display() to serialize a file to a base64 string representation."""
+
+     def __init__(self, file_uri: str):
+         self.video_uri = file_uri
+         assert os.path.isfile(
+             file_uri
+         ), f"Only support local files currently: {file_uri}"
+         assert Path(file_uri).exists(), f"File not found: {file_uri}"
+
+     def __repr__(self) -> str:
+         return f"FileSerializer({self.video_uri})"
+
+     def base64(self) -> str:
+         with open(self.video_uri, "rb") as file:
+             return base64.b64encode(file.read()).decode("utf-8")
+
+
  class Result:
      """
      Represents the data to be displayed as a result of executing a cell in a Jupyter notebook.
@@ -70,6 +90,7 @@ class Result:
      png: Optional[str] = None
      jpeg: Optional[str] = None
      pdf: Optional[str] = None
+     mp4: Optional[str] = None
      latex: Optional[str] = None
      json: Optional[Dict[str, Any]] = None
      javascript: Optional[str] = None
@@ -93,6 +114,7 @@ class Result:
          self.png = data.pop(MimeType.IMAGE_PNG, None)
          self.jpeg = data.pop(MimeType.IMAGE_JPEG, None)
          self.pdf = data.pop(MimeType.APPLICATION_PDF, None)
+         self.mp4 = data.pop(MimeType.VIDEO_MP4_B64, None)
          self.latex = data.pop(MimeType.TEXT_LATEX, None)
          self.json = data.pop(MimeType.APPLICATION_JSON, None)
          self.javascript = data.pop(MimeType.APPLICATION_JAVASCRIPT, None)
@@ -190,6 +212,8 @@ class Result:
              formats.append("json")
          if self.javascript:
              formats.append("javascript")
+         if self.mp4:
+             formats.append("mp4")
          if self.extra:
              formats.extend(iter(self.extra))
          return formats
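
To make the executor-side plumbing concrete, here is a small sketch (not part of the diff) of the payload `FileSerializer` builds and how it maps onto the new `Result.mp4` field; the file path is hypothetical and must exist locally for the asserts to pass.

```python
from vision_agent.utils.execute import FileSerializer, MimeType

ser = FileSerializer("outputs/annotated.mp4")  # hypothetical local file
payload = {
    MimeType.VIDEO_MP4_B64: ser.base64(),  # base64-encoded file bytes
    MimeType.TEXT_PLAIN: str(ser),         # "FileSerializer(outputs/annotated.mp4)"
}
# Passing this payload to IPython.display.display(..., raw=True) round-trips it
# through the kernel, and Result.__init__ pops MimeType.VIDEO_MP4_B64 into self.mp4.
```
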
@@ -1,7 +1,9 @@
+ import base64
  import logging
  import math
  import os
  from concurrent.futures import ProcessPoolExecutor, as_completed
+ import tempfile
  from typing import List, Tuple, cast

  import cv2
@@ -14,6 +16,39 @@ _LOGGER = logging.getLogger(__name__)
  _CLIP_LENGTH = 30.0


+ def play_video(video_base64: str) -> None:
+     """Play a video file"""
+     video_data = base64.b64decode(video_base64)
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
+         temp_video.write(video_data)
+         temp_video_path = temp_video.name
+
+     cap = cv2.VideoCapture(temp_video_path)
+     if not cap.isOpened():
+         _LOGGER.error("Error: Could not open video.")
+         return
+
+     # Display the first frame and wait for any key press to start the video
+     ret, frame = cap.read()
+     if ret:
+         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         cv2.imshow("Video Player", frame)
+         _LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
+         cv2.waitKey(0) # Wait for any key press
+
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         cv2.imshow("Video Player", frame)
+         # Press 'q' to exit the video
+         if cv2.waitKey(200) & 0xFF == ord("q"):
+             break
+     cap.release()
+     cv2.destroyAllWindows()
+
+
  def extract_frames_from_video(
      video_uri: str, fps: float = 0.5, motion_detection_threshold: float = 0.0
  ) -> List[Tuple[np.ndarray, float]]:
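
Finally, a usage sketch (not part of the diff) for `play_video`, which takes a base64-encoded MP4 string rather than a path; the clip path is hypothetical, and a GUI-capable OpenCV build is needed for `cv2.imshow` (hence the `opencv-python` dependency change in the METADATA hunk below).

```python
import base64

from vision_agent.utils.video import play_video

# Hypothetical local clip; its bytes are base64-encoded before being handed to play_video.
with open("examples/traffic.mp4", "rb") as f:
    play_video(base64.b64encode(f.read()).decode("utf-8"))
```
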
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.43
+ Version: 0.2.45
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -18,7 +18,7 @@ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
  Requires-Dist: nbformat (>=5.10.4,<6.0.0)
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
  Requires-Dist: openai (>=1.0.0,<2.0.0)
- Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
+ Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
@@ -11,26 +11,26 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
  vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
  vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
- vision_agent/agent/vision_agent.py,sha256=Sv9aC9AO1LxsSBG8qGmS6C2ViIFg85p9uWLOdlbTu9g,18624
- vision_agent/agent/vision_agent_prompts.py,sha256=FnIYF2Fe3joRvFnOJD9ZyWXMihMyL606nXxWJ0adTZ8,8314
+ vision_agent/agent/vision_agent.py,sha256=c3jJd1uiCtmVC2xazUvW9rwc7usi-EOYW7NZnMFOdt8,19586
+ vision_agent/agent/vision_agent_prompts.py,sha256=bIcqutsyM2bEhWE2XGw01PuZ9f-jePSwapbvkOOrFZ4,8384
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
  vision_agent/llm/llm.py,sha256=UZ73GqQHE-NKOJWsrOTWfmdHYsbCBkJ5rZ7dhcSCHHw,5951
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
  vision_agent/lmm/lmm.py,sha256=NwcZYLTzi95LSMAk0sTtw7G_zBLa9lU-DHM5GUUCiK4,10622
- vision_agent/tools/__init__.py,sha256=oZa_sslb1UqEgpdWROChDcz5JHdB475ejJX78FMLYvE,1512
+ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1w,1538
  vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
- vision_agent/tools/tools.py,sha256=h3TlucPuk3wsQguddtnCf6_ehEuELPrbT6-GI9YZe3E,24764
+ vision_agent/tools/tools.py,sha256=66pFXUIVvnOa1fk0PY5u_75kblIbAVqkRP2U9qLixrY,25951
  vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
- vision_agent/utils/execute.py,sha256=losZeWbhNVlBr4xYsy5dKAslarjiKwuPsKgTmLV6zgE,19497
+ vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
  vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
  vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
  vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
- vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
- vision_agent-0.2.43.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.43.dist-info/METADATA,sha256=7z0t0gus3S4eVTl3yik6RfX9lvNGwGROSaqdbXCJeRc,6826
- vision_agent-0.2.43.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.43.dist-info/RECORD,,
+ vision_agent/utils/video.py,sha256=EuJJ7Owi3pIV-q3WcZ-LaaTrGAmmZ8YAA22rmEkY7GI,8885
+ vision_agent-0.2.45.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.45.dist-info/METADATA,sha256=G9Cy7cUPEWi42cuVP8V7u_ZjNXEnYp26_kb_u9rXSQk,6817
+ vision_agent-0.2.45.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.45.dist-info/RECORD,,