vision-agent 0.2.126__py3-none-any.whl → 0.2.128__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder_prompts.py +41 -7
- vision_agent/tools/__init__.py +3 -2
- vision_agent/tools/tools.py +128 -39
- vision_agent/utils/video.py +24 -5
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/METADATA +2 -1
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/RECORD +8 -8
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/WHEEL +0 -0
 
vision_agent/agent/vision_agent_coder_prompts.py CHANGED

@@ -70,30 +70,64 @@ This is the documentation for the functions you have access to. You may call any
     2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
     3. Your test case MUST run only on the given images which are {media}
     4. Print this final dictionary.
+    5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 
 **Example**:
+--- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '…
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 plan2:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '…
+- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
 - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image, …
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
 image = load_image("image.jpg")
-owl_v2_out = …
+owl_v2_out = owl_v2_image("person", image)
 
-…
-…
+f2s2_out = florence2_sam2_image("person", image)
+# strip out the masks from the output because they don't provide useful information when printed
+f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
 cgd_out = countgd_counting(image)
 
-final_out = {{"…
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
+print(final_out)
+
+--- EXAMPLE2 ---
+plan1:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
+plan2:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+plan3:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
+
+
+```python
+from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
+
+# sample at 1 FPS and use the first 10 frames to reduce processing time
+frames = extract_frames("video.mp4", 1)
+frames = [f[0] for f in frames][:10]
+
+# plan1
+owl_v2_out = [owl_v2_image("person", f) for f in frames]
+
+# plan2
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+
+# plan3
+countgd_out = [countgd_counting(f) for f in frames]
+
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "countgd_counting": countgd_out}}
 print(final_out)
 ```
 """
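Note: guideline 2 above asks the generated test to drop numpy arrays before printing, which the prompt's example does by filtering out the "mask" key. A more general sketch of the same idea (the strip_arrays helper below is hypothetical, not part of the package):

```python
import numpy as np

def strip_arrays(obj):
    """Recursively replace numpy arrays with a short placeholder so the
    printed dictionary stays readable."""
    if isinstance(obj, np.ndarray):
        return f"<ndarray shape={obj.shape}>"
    if isinstance(obj, dict):
        return {k: strip_arrays(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [strip_arrays(v) for v in obj]
    return obj

tool_outputs = {"florence2_sam2_image": [{"label": "person", "mask": np.zeros((4, 4))}]}
print(strip_arrays(tool_outputs))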
    
vision_agent/tools/__init__.py CHANGED

@@ -27,7 +27,7 @@ from .tools import (
     florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,

@@ -46,7 +46,8 @@ from .tools import (
     overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
-    owl_v2,
+    owl_v2_image,
+    owl_v2_video,
     save_image,
     save_json,
     save_video,
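For callers upgrading from 0.2.126, these are straight renames plus one new export; a sketch of the updated imports, assuming nothing else about the call sites:

```python
# before (0.2.126)
# from vision_agent.tools import owl_v2, florence2_sam2_video

# after (0.2.128)
from vision_agent.tools import (
    florence2_sam2_video_tracking,  # was florence2_sam2_video
    owl_v2_image,                   # was owl_v2
    owl_v2_video,                   # new: frame-by-frame detection
)
```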
    
vision_agent/tools/tools.py CHANGED

@@ -145,15 +145,15 @@ def grounding_dino(
     return return_data
 
 
-def owl_v2(
+def owl_v2_image(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
-    """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in
-    prompt are separated by commas. It returns a list of bounding boxes with
-    coordinates, label names and associated probability scores.
+    """'owl_v2_image' is a tool that can detect and count multiple objects given a text
+    prompt such as category names or referring expressions on images. The categories in
+    text prompt are separated by commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.

@@ -170,32 +170,103 @@ def owl_v2(
 
     Example
     -------
-        >>> owl_v2("car, dinosaur", image)
+        >>> owl_v2_image("car, dinosaur", image)
         [
             {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
         ]
     """
     image_size = image.shape[:2]
-    …
-    …
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
-        "…
-        "…
-        "function_name": "owl_v2",
+        "model": "owlv2",
+        "function_name": "owl_v2_image",
     }
-    …
-    …
+    resp_data = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes = resp_data[0]
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes
+    ]
+    filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+    return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
+def owl_v2_video(
+    prompt: str,
+    frames: List[np.ndarray],
+    box_threshold: float = 0.10,
+) -> List[List[Dict[str, Any]]]:
+    """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
+    objects per frame given a text prompt such as a category name or referring
+    expression. The categories in text prompt are separated by commas. It returns a list
+    of lists where each inner list contains the score, label, and bounding box of the
+    detections for that frame.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.10.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+            score, label, and bounding box of the detected objects with normalized
+            coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
+            coordinates of the top-left and xmax and ymax are the coordinates of the
+            bottom-right of the bounding box.
+
+    Example
+    -------
+        >>> owl_v2_video("car, dinosaur", frames)
+        [
+            [
+                {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+                {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+            ],
+            ...
+        ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "owlv2",
+        "function_name": "owl_v2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes_formatted = []
     if data is not None:
-        for …
-            …
+        for frame_data in data:
+            bboxes_formatted_frame = []
+            for elt in frame_data:
+                bboxes_formatted_frame.append(
+                    ODResponseData(
+                        label=elt["label"],  # type: ignore
+                        bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
+                        score=round(elt["score"], 2),  # type: ignore
+                    )
+                )
+            bboxes_formatted.append(bboxes_formatted_frame)
+
+    filtered_bboxes = [
+        filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
+    ]
+    return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
 
 
 def grounding_sam(
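A minimal usage sketch of the renamed detector and its new video counterpart; the zero image is a stand-in for a real RGB frame (both calls go through the hosted inference endpoint, so API access is assumed):

```python
import numpy as np
from vision_agent.tools import owl_v2_image, owl_v2_video

image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real RGB frame

# single image: flat list of detections
detections = owl_v2_image("person, car", image, box_threshold=0.10)
for det in detections:
    print(det["label"], det["score"], det["bbox"])  # bbox is normalized [xmin, ymin, xmax, ymax]

# video: one inner list of detections per frame
frames = [image] * 3
per_frame = owl_v2_video("person", frames)
print([len(dets) for dets in per_frame])
```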
@@ -317,14 +388,14 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     return return_data
 
 
-def florence2_sam2_video(
+def florence2_sam2_video_tracking(
     prompt: str, frames: List[np.ndarray]
 ) -> List[List[Dict[str, Any]]]:
-    """'florence2_sam2_video' is a tool that can segment and track multiple entities
-    in a video given a text prompt such as category names or referring expressions. You
-    can optionally separate the categories in the text with commas. It only tracks
-    entities present in the first frame and only returns segmentation masks. It is
-    useful for tracking and counting without duplicating counts.
+    """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
+    entities in a video given a text prompt such as category names or referring
+    expressions. You can optionally separate the categories in the text with commas. It
+    only tracks entities present in the first frame and only returns segmentation
+    masks. It is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.

@@ -351,14 +422,15 @@ def florence2_sam2_video(
                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
             },
         ],
+        ...
     ]
     """
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompts": prompt.split(","),
-        "function_name": "florence2_sam2_video",
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_video_tracking",
     }
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
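Because the tracker only follows entities present in the first frame, the first inner list already gives a de-duplicated count. A consumption sketch, assuming the frames come from extract_frames on a local file:

```python
from vision_agent.tools import extract_frames, florence2_sam2_video_tracking

frames = [frame for frame, _ in extract_frames("video.mp4", 1)][:10]
tracks = florence2_sam2_video_tracking("person", frames)

# entities are seeded from frame 0 and re-identified in later frames, so
# counting the first frame avoids counting the same person twice
print("unique people:", len(tracks[0]))
```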
@@ -549,7 +621,14 @@ def countgd_counting(
         payload, "text-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = […
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
 

@@ -601,7 +680,14 @@ def countgd_example_based_counting(
         payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = […
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
 
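ODResponseData and filter_bboxes_by_threshold are internal helpers of tools.py; with plain dicts, the formatting-plus-filtering pattern both countgd functions now share amounts to something like this sketch (not the package's code, values illustrative):

```python
raw = [
    {"label": "person", "bounding_box": [0.1234, 0.2, 0.5, 0.9], "score": 0.87},
    {"label": "person", "bounding_box": [0.6, 0.1, 0.8, 0.7], "score": 0.04},
]
box_threshold = 0.23

formatted = [
    {
        "label": b["label"],
        "bbox": [round(x, 2) for x in b["bounding_box"]],
        "score": round(b["score"], 2),
    }
    for b in raw
]
# keep only detections at or above the threshold
filtered = [b for b in formatted if b["score"] >= box_threshold]
print(filtered)  # only the 0.87 detection survives
```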
@@ -1374,12 +1460,12 @@ def closest_box_distance(
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 1
 ) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path or youtube
-    link, returns a list of tuples (frame, timestamp), where timestamp is the
-    time in seconds where the frame was captured. The frame is a numpy array.
+    """'extract_frames' extracts frames from a video which can be a file path, url or
+    youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
+    relative time in seconds where the frame was captured. The frame is a numpy array.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file or youtube link
+        video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
             to 1.
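Usage is unchanged; only the docstring was broadened to mention plain URLs. Combined with the new coder-prompt guideline, a typical call looks like this (the path is illustrative):

```python
from vision_agent.tools import extract_frames

# sample at 1 FPS; each element is (frame, timestamp_in_seconds)
frames_with_ts = extract_frames("video.mp4", fps=1)
frames = [frame for frame, _ in frames_with_ts][:10]  # cap work at 10 frames
print(len(frames), frames_with_ts[0][1])
```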
@@ -1518,7 +1604,9 @@ def save_video(
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
     if output_video_path is None:
-        output_video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+        output_video_path = tempfile.NamedTemporaryFile(
+            delete=False, suffix=".mp4"
+        ).name
 
     output_video_path = video_writer(frames, fps, output_video_path)
     _save_video_to_result(output_video_path)
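This hunk only reflows the call, but the delete=False it wraps is what matters: the temporary file must outlive the handle so video_writer can reopen the path. A standalone sketch of the same pattern:

```python
import tempfile

# delete=False keeps the file on disk after the handle closes, so the path
# can be handed to another writer (also required on Windows, where an open
# NamedTemporaryFile cannot be reopened by its name)
output_video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
print(output_video_path)
```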
@@ -1818,7 +1906,8 @@ def overlay_counting_results(
 
 
 FUNCTION_TOOLS = [
-    owl_v2,
+    owl_v2_image,
+    owl_v2_video,
     ocr,
     clip,
     vit_image_classification,

@@ -1827,7 +1916,7 @@ FUNCTION_TOOLS = [
     florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
    
vision_agent/utils/video.py CHANGED

@@ -4,6 +4,7 @@ import tempfile
 from functools import lru_cache
 from typing import List, Optional, Tuple
 
+import av  # type: ignore
 import cv2
 import numpy as np
 from decord import VideoReader  # type: ignore

@@ -43,18 +44,36 @@ def play_video(video_base64: str) -> None:
         cv2.destroyAllWindows()
 
 
+def _resize_frame(frame: np.ndarray) -> np.ndarray:
+    height, width = frame.shape[:2]
+    new_width = width - (width % 2)
+    new_height = height - (height % 2)
+    return cv2.resize(frame, (new_width, new_height))
+
+
 def video_writer(
     frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
 ) -> str:
     if filename is None:
         filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-    …
-    …
+    container = av.open(filename, mode="w")
+    stream = container.add_stream("h264", rate=fps)
     height, width = frames[0].shape[:2]
-    …
+    stream.height = height - (height % 2)
+    stream.width = width - (width % 2)
+    stream.pix_fmt = "yuv420p"
     for frame in frames:
-        …
-    …
+        # Remove the alpha channel (convert RGBA to RGB)
+        frame_rgb = frame[:, :, :3]
+        # Resize the frame to make dimensions divisible by 2
+        frame_rgb = _resize_frame(frame_rgb)
+        av_frame = av.VideoFrame.from_ndarray(frame_rgb, format="rgb24")
+        for packet in stream.encode(av_frame):
+            container.mux(packet)
+
+    for packet in stream.encode():
+        container.mux(packet)
+    container.close()
     return filename
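The writer now encodes through PyAV with h264/yuv420p, which needs even frame dimensions; _resize_frame trims odd widths and heights before encoding. A quick usage sketch with synthetic frames, assuming PyAV's bundled ffmpeg provides the h264 encoder (the default wheels do):

```python
import numpy as np
from vision_agent.utils.video import video_writer

# 101x75 is deliberately odd-sized; the writer trims it to 100x74 so that
# yuv420p's 2x2 chroma subsampling gets whole blocks to work on
frames = [np.random.randint(0, 255, (75, 101, 3), dtype=np.uint8) for _ in range(24)]
path = video_writer(frames, fps=24)
print("wrote", path)
```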
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.126
+Version: 0.2.128
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
+Requires-Dist: av (>=11.0.0,<12.0.0)
 Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
 Requires-Dist: e2b-code-interpreter (==0.0.11a37)
 Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/RECORD CHANGED

@@ -4,7 +4,7 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
 vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
 vision_agent/agent/vision_agent_coder.py,sha256=_2QQd_nTGojkk2ZOiMevVCY6-eUA9q1QdCWH7-Noq4w,34237
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=…
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=nj4iRRSAWYHjKqyUSp12aTCV1D5iUVCHeezVXoozS4M,12687
 vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009

@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
 vision_agent/lmm/lmm.py,sha256=092oefI65_QSRvQm2znXkjTdzlZTh-Ni_38610kfbJg,16836
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=…
+vision_agent/tools/__init__.py,sha256=nx60_hujcnLz3d2wQlCbcerUmT6R2vxRy66IsQjdB3M,2364
 vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=62NVlojPMf9MuJ-3yJEcrB3mzmOxN2HrNQzzjVa-FZg,7527
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tools.py,sha256=p6QUo7V03UZOKBAGfabVWdPm9vUT9tyP_utCv0yKfcY,68659
 vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052

@@ -26,8 +26,8 @@ vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4
 vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=…
-vision_agent-0.2.126.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.126.dist-info/METADATA,sha256=…
-vision_agent-0.2.126.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.126.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=GmJqu_3WhBMEwP4HToMMp8EwgftliHSpv5nd-QEDOcs,4528
+vision_agent-0.2.128.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.128.dist-info/METADATA,sha256=4E1im4aLvJnSR-tKxWUtKyJ0ZbkHxYMYxfqGz_0Layw,12295
+vision_agent-0.2.128.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.128.dist-info/RECORD,,

{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/LICENSE

File without changes

{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/WHEEL

File without changes