vision-agent 0.2.211__py3-none-any.whl → 0.2.213__py3-none-any.whl
- vision_agent/tools/__init__.py +4 -1
- vision_agent/tools/tools.py +194 -0
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/METADATA +1 -1
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/RECORD +6 -6
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
@@ -26,9 +26,10 @@ from .tools import (
     claude35_text_extraction,
     closest_box_distance,
     closest_mask_distance,
+    countgd_example_based_counting,
     countgd_object_detection,
     countgd_sam2_object_detection,
-
+    countgd_sam2_video_tracking,
     depth_anything_v2,
     detr_segmentation,
     extract_frames_and_timestamps,
@@ -46,11 +47,13 @@ from .tools import (
     load_image,
     minimum_distance,
     ocr,
+    od_sam2_video_tracking,
     overlay_bounding_boxes,
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
     owl_v2_video,
+    owlv2_sam2_video_tracking,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     sam2,
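The lines above re-export the new tools from the package namespace. A minimal import sketch, assuming vision-agent 0.2.213 is installed (these are exactly the names added in this diff):

# Hedged sketch: the four names newly exported from vision_agent.tools in 0.2.213.
from vision_agent.tools import (
    countgd_example_based_counting,
    countgd_sam2_video_tracking,
    od_sam2_video_tracking,
    owlv2_sam2_video_tracking,
)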
vision_agent/tools/tools.py
CHANGED
@@ -6,6 +6,7 @@ import tempfile
 import urllib.request
 from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from enum import Enum
 from functools import lru_cache
 from importlib import resources
 from pathlib import Path
@@ -2394,6 +2395,197 @@ def _plot_counting(
     return image


+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def od_sam2_video_tracking(
+    od_model: ODModels,
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+
+    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+
+    if chunk_length is None:
+        step = 1  # Process every frame
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length  # Process frames with the specified step size
+
+    for idx in range(0, len(frames), step):
+        if od_model == ODModels.COUNTGD:
+            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+            function_name = "countgd_object_detection"
+        elif od_model == ODModels.OWLV2:
+            results[idx] = owl_v2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "owl_v2_image"
+        elif od_model == ODModels.FLORENCE2:
+            results[idx] = florence2_sam2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "florence2_sam2_image"
+        else:
+            raise NotImplementedError(
+                f"Object detection model '{od_model}' is not implemented."
+            )
+
+    image_size = frames[0].shape[:2]
+
+    def _transform_detections(
+        input_list: List[Optional[List[Dict[str, Any]]]]
+    ) -> List[Optional[Dict[str, Any]]]:
+        output_list: List[Optional[Dict[str, Any]]] = []
+
+        for idx, frame in enumerate(input_list):
+            if frame is not None:
+                labels = [detection["label"] for detection in frame]
+                bboxes = [
+                    denormalize_bbox(detection["bbox"], image_size)
+                    for detection in frame
+                ]
+
+                output_list.append(
+                    {
+                        "labels": labels,
+                        "bboxes": bboxes,
+                    }
+                )
+            else:
+                output_list.append(None)
+
+        return output_list
+
+    output = _transform_detections(results)
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+    metadata = {"function_name": function_name}
+
+    detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return_data = []
+    for frame in detections:
+        return_frame_data = []
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return_data = add_bboxes_from_masks(return_data)
+    return nms(return_data, iou_threshold=0.95)
+
+
+def countgd_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It
+    returns, for each frame, a list of bounding boxes, label names, masks and
+    associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames between detection
+            keyframes; None runs detection on every frame.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list, one entry per frame, of dictionaries
+        containing the score, label, bounding box, and mask of the detected
+        objects with normalized coordinates (xmin, ymin, xmax, ymax). xmin and
+        ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is a
+        binary 2D numpy array where 1 indicates the object and 0 indicates the
+        background.
+
+    Example
+    -------
+        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+    )
+
+
+def owlv2_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It
+    returns, for each frame, a list of bounding boxes, label names, masks and
+    associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames between detection
+            keyframes; None runs detection on every frame.
+        fine_tune_id (Optional[str]): If provided, the fine-tuned OWLv2 model
+            to use for detection.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list, one entry per frame, of dictionaries
+        containing the score, label, bounding box, and mask of the detected
+        objects with normalized coordinates (xmin, ymin, xmax, ymax). xmin and
+        ymin are the coordinates of the top-left and xmax and ymax are the
+        coordinates of the bottom-right of the bounding box. The mask is a
+        binary 2D numpy array where 1 indicates the object and 0 indicates the
+        background.
+
+    Example
+    -------
+        >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.OWLV2,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+
+
 FUNCTION_TOOLS = [
     owl_v2_image,
     owl_v2_video,
@@ -2416,6 +2608,8 @@ FUNCTION_TOOLS = [
     video_temporal_localization,
     flux_image_inpainting,
     siglip_classification,
+    owlv2_sam2_video_tracking,
+    countgd_sam2_video_tracking,
 ]

 UTIL_TOOLS = [
{vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/RECORD
CHANGED
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=…
+vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
 vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tools.py,sha256=ZcXEI0Pb54OGXnLWi690SFx22k7JlEmQ-N16LzRLHlk,90627
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
 vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
-vision_agent-0.2.211.dist-info/LICENSE,…
-vision_agent-0.2.211.dist-info/METADATA,…
-vision_agent-0.2.211.dist-info/WHEEL,…
-vision_agent-0.2.211.dist-info/RECORD,…
+vision_agent-0.2.213.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.213.dist-info/METADATA,sha256=iXy6vkFwSXz6UQW1LjuZMCj6YT8YwmjGklhmulFOoIc,19071
+vision_agent-0.2.213.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.213.dist-info/RECORD,,
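Each RECORD entry has the form `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64 SHA-256 of the file with the trailing `=` padding stripped (the RECORD file itself carries no hash, hence the trailing `,,`). A small sketch for recomputing one of these lines locally; the path is illustrative:

# Hedged sketch: recompute a RECORD-style hash line for an installed file.
import base64
import hashlib
from pathlib import Path

path = Path("vision_agent/tools/tools.py")  # illustrative path

digest = hashlib.sha256(path.read_bytes()).digest()
encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Should reproduce the RECORD line, e.g. ...tools.py,sha256=ZcXEI0...,90627
print(f"{path.as_posix()},sha256={encoded},{path.stat().st_size}")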
{vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/LICENSE
File without changes
{vision_agent-0.2.211.dist-info → vision_agent-0.2.213.dist-info}/WHEEL
File without changes