vision-agent 0.2.224__py3-none-any.whl → 0.2.225__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -6,7 +6,6 @@ import tempfile
 import urllib.request
 from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from enum import Enum
 from importlib import resources
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -54,6 +53,13 @@ from vision_agent.utils.video import (
     frames_to_bytes,
     video_writer,
 )
+from vision_agent.utils.video_tracking import (
+    ODModels,
+    merge_segments,
+    post_process,
+    process_segment,
+    split_frames_into_segments,
+)
 
 register_heif_opener()
 
@@ -224,12 +230,6 @@ def sam2(
     return ret["return_data"]  # type: ignore
 
 
-class ODModels(str, Enum):
-    COUNTGD = "countgd"
-    FLORENCE2 = "florence2"
-    OWLV2 = "owlv2"
-
-
 def od_sam2_video_tracking(
     od_model: ODModels,
     prompt: str,
@@ -237,105 +237,92 @@ def od_sam2_video_tracking(
     chunk_length: Optional[int] = 10,
     fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
-    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+    SEGMENT_SIZE = 50
+    OVERLAP = 1  # Number of overlapping frames between segments
 
-    if chunk_length is None:
-        step = 1  # Process every frame
-    elif chunk_length <= 0:
-        raise ValueError("chunk_length must be a positive integer or None.")
-    else:
-        step = chunk_length  # Process frames with the specified step size
+    image_size = frames[0].shape[:2]
+
+    # Split frames into segments with overlap
+    segments = split_frames_into_segments(frames, SEGMENT_SIZE, OVERLAP)
+
+    def _apply_object_detection(  # inner function to avoid circular import issues
+        od_model: ODModels,
+        prompt: str,
+        segment_index: int,
+        frame_number: int,
+        fine_tune_id: str,
+        segment_frames: list,
+    ) -> tuple:
+        """
+        Applies the specified object detection model to the given image.
+
+        Args:
+            od_model: The object detection model to use.
+            prompt: The prompt for the object detection model.
+            segment_index: The index of the current segment.
+            frame_number: The number of the current frame.
+            fine_tune_id: Optional fine-tune ID for the model.
+            segment_frames: List of frames for the current segment.
+
+        Returns:
+            A tuple containing the object detection results and the name of the function used.
+        """
 
-    for idx in range(0, len(frames), step):
         if od_model == ODModels.COUNTGD:
-            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+            segment_results = countgd_object_detection(
+                prompt=prompt, image=segment_frames[frame_number]
+            )
             function_name = "countgd_object_detection"
+
         elif od_model == ODModels.OWLV2:
-            results[idx] = owlv2_object_detection(
-                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            segment_results = owlv2_object_detection(
+                prompt=prompt,
+                image=segment_frames[frame_number],
+                fine_tune_id=fine_tune_id,
             )
             function_name = "owlv2_object_detection"
+
         elif od_model == ODModels.FLORENCE2:
-            results[idx] = florence2_object_detection(
-                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            segment_results = florence2_object_detection(
+                prompt=prompt,
+                image=segment_frames[frame_number],
+                fine_tune_id=fine_tune_id,
             )
             function_name = "florence2_object_detection"
+
         else:
             raise NotImplementedError(
                 f"Object detection model '{od_model}' is not implemented."
             )
 
-    image_size = frames[0].shape[:2]
-
-    def _transform_detections(
-        input_list: List[Optional[List[Dict[str, Any]]]],
-    ) -> List[Optional[Dict[str, Any]]]:
-        output_list: List[Optional[Dict[str, Any]]] = []
-
-        for _, frame in enumerate(input_list):
-            if frame is not None:
-                labels = [detection["label"] for detection in frame]
-                bboxes = [
-                    denormalize_bbox(detection["bbox"], image_size)
-                    for detection in frame
-                ]
-
-                output_list.append(
-                    {
-                        "labels": labels,
-                        "bboxes": bboxes,
-                    }
-                )
-            else:
-                output_list.append(None)
-
-        return output_list
+        return segment_results, function_name
+
+    # Process each segment and collect detections
+    detections_per_segment: List[Any] = []
+    for segment_index, segment in enumerate(segments):
+        segment_detections = process_segment(
+            segment_frames=segment,
+            od_model=od_model,
+            prompt=prompt,
+            fine_tune_id=fine_tune_id,
+            chunk_length=chunk_length,
+            image_size=image_size,
+            segment_index=segment_index,
+            object_detection_tool=_apply_object_detection,
+        )
+        detections_per_segment.append(segment_detections)
 
-    output = _transform_detections(results)
+    merged_detections = merge_segments(detections_per_segment)
+    post_processed = post_process(merged_detections, image_size)
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
-    payload = {"bboxes": json.dumps(output), "chunk_length_frames": chunk_length}
-    metadata = {"function_name": function_name}
-
-    detections = send_task_inference_request(
-        payload,
-        "sam2",
-        files=files,
-        metadata=metadata,
-    )
 
-    return_data = []
-    for frame in detections:
-        return_frame_data = []
-        for detection in frame:
-            mask = rle_decode_array(detection["mask"])
-            label = str(detection["id"]) + ": " + detection["label"]
-            return_frame_data.append(
-                {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
-            )
-        return_data.append(return_frame_data)
-    return_data = add_bboxes_from_masks(return_data)
-    return_data = nms(return_data, iou_threshold=0.95)
-
-    # We save the RLE for display purposes; re-calculating RLE can get very expensive.
-    # Deleted here because we are returning the numpy masks instead
-    display_data = []
-    for frame in return_data:
-        display_frame_data = []
-        for obj in frame:
-            display_frame_data.append(
-                {
-                    "label": obj["label"],
-                    "score": obj["score"],
-                    "bbox": denormalize_bbox(obj["bbox"], image_size),
-                    "mask": obj["rle"],
-                }
-            )
-            del obj["rle"]
-        display_data.append(display_frame_data)
-
-    return {"files": files, "return_data": return_data, "display_data": detections}
+    return {
+        "files": files,
+        "return_data": post_processed["return_data"],
+        "display_data": post_processed["display_data"],
+    }
 
 
 # Owl V2 Tools
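
For a concrete sense of the new code path, the sketch below mirrors the arithmetic of split_frames_into_segments with the defaults used above (SEGMENT_SIZE = 50, OVERLAP = 1); the 120-frame input is a hypothetical example, not part of this release:

    # Minimal sketch of the segment split performed by the new code path.
    # Each segment after the first starts one frame early (the overlap), so
    # the boundary frame is shared and object IDs can be stitched across it.
    import numpy as np

    frames = [np.zeros((720, 1280, 3), dtype=np.uint8) for _ in range(120)]  # hypothetical input

    segments = []
    start, segment_size, overlap = 0, 50, 1
    while start < len(frames):
        end = min(start + segment_size, len(frames))
        begin = start - overlap if start > 0 else 0
        segments.append(frames[begin:end])
        start += segment_size

    print([len(s) for s in segments])  # [50, 51, 21]

With this split, od_sam2_video_tracking runs detection and SAM2 tracking per segment instead of over the whole clip, so each sam2 inference request stays bounded in size.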
--- /dev/null
+++ b/vision_agent/utils/video_tracking.py
@@ -0,0 +1,305 @@
+import json
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from vision_agent.tools.tool_utils import (
+    add_bboxes_from_masks,
+    nms,
+    send_task_inference_request,
+)
+from vision_agent.utils.image_utils import denormalize_bbox, rle_decode_array
+from vision_agent.utils.video import frames_to_bytes
+
+
+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def split_frames_into_segments(
+    frames: List[np.ndarray], segment_size: int = 50, overlap: int = 1
+) -> List[List[np.ndarray]]:
+    """
+    Splits the list of frames into segments with a specified size and overlap.
+
+    Args:
+        frames (List[np.ndarray]): List of video frames.
+        segment_size (int, optional): Number of frames per segment. Defaults to 50.
+        overlap (int, optional): Number of overlapping frames between segments. Defaults to 1.
+
+    Returns:
+        List[List[np.ndarray]]: List of frame segments.
+    """
+    segments = []
+    start = 0
+    segment_count = 0
+    while start < len(frames):
+        end = start + segment_size
+        if end > len(frames):
+            end = len(frames)
+        if start != 0:
+            # Include the last frame of the previous segment
+            segment = frames[start - overlap : end]
+        else:
+            segment = frames[start:end]
+        segments.append(segment)
+        start += segment_size
+        segment_count += 1
+    return segments
+
+
+def process_segment(
+    segment_frames: List[np.ndarray],
+    od_model: ODModels,
+    prompt: str,
+    fine_tune_id: Optional[str],
+    chunk_length: Optional[int],
+    image_size: Tuple[int, ...],
+    segment_index: int,
+    object_detection_tool: Callable,
+) -> Any:
+    """
+    Processes a segment of frames with the specified object detection model.
+
+    Args:
+        segment_frames (List[np.ndarray]): Frames in the segment.
+        od_model (ODModels): Object detection model to use.
+        prompt (str): Prompt for the model.
+        fine_tune_id (Optional[str]): Fine-tune model ID.
+        chunk_length (Optional[int]): Chunk length for processing.
+        image_size (Tuple[int, int]): Size of the images.
+        segment_index (int): Index of the segment.
+        object_detection_tool (Callable): Object detection tool to use.
+
+    Returns:
+        Any: Detections for the segment.
+    """
+    segment_results: List[Optional[List[Dict[str, Any]]]] = [None] * len(segment_frames)
+
+    if chunk_length is None:
+        step = 1
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length
+
+    function_name = ""
+
+    for idx in range(0, len(segment_frames), step):
+        frame_number = idx
+        segment_results[idx], function_name = object_detection_tool(
+            od_model, prompt, segment_index, frame_number, fine_tune_id, segment_frames
+        )
+
+    transformed_detections = transform_detections(
+        segment_results, image_size, segment_index
+    )
+
+    buffer_bytes = frames_to_bytes(segment_frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "bboxes": json.dumps(transformed_detections),
+        "chunk_length_frames": chunk_length,
+    }
+    metadata = {"function_name": function_name}
+
+    segment_detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return segment_detections
+
+
+def transform_detections(
+    input_list: List[Optional[List[Dict[str, Any]]]],
+    image_size: Tuple[int, ...],
+    segment_index: int,
+) -> List[Optional[Dict[str, Any]]]:
+    """
+    Transforms raw detections into a standardized format.
+
+    Args:
+        input_list (List[Optional[List[Dict[str, Any]]]]): Raw detections.
+        image_size (Tuple[int, int]): Size of the images.
+        segment_index (int): Index of the segment.
+
+    Returns:
+        List[Optional[Dict[str, Any]]]: Transformed detections.
+    """
+    output_list: List[Optional[Dict[str, Any]]] = []
+    for frame_idx, frame in enumerate(input_list):
+        if frame is not None:
+            labels = [detection["label"] for detection in frame]
+            bboxes = [
+                denormalize_bbox(detection["bbox"], image_size) for detection in frame
+            ]
+
+            output_list.append(
+                {
+                    "labels": labels,
+                    "bboxes": bboxes,
+                }
+            )
+        else:
+            output_list.append(None)
+    return output_list
+
+
+def _calculate_mask_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
+    mask1 = mask1.astype(bool)
+    mask2 = mask2.astype(bool)
+
+    intersection = np.sum(np.logical_and(mask1, mask2))
+    union = np.sum(np.logical_or(mask1, mask2))
+
+    if union == 0:
+        iou = 0.0
+    else:
+        iou = intersection / union
+
+    return iou
+
+
+def _match_by_iou(
+    first_param: List[Dict],
+    second_param: List[Dict],
+    iou_threshold: float = 0.8,
+) -> Tuple[List[Dict], Dict[int, int]]:
+    max_id = max((item["id"] for item in first_param), default=0)
+
+    matched_new_item_indices = set()
+    id_mapping = {}
+
+    for new_index, new_item in enumerate(second_param):
+        matched_id = None
+
+        for existing_item in first_param:
+            iou = _calculate_mask_iou(
+                existing_item["decoded_mask"], new_item["decoded_mask"]
+            )
+            if iou > iou_threshold:
+                matched_id = existing_item["id"]
+                matched_new_item_indices.add(new_index)
+                id_mapping[new_item["id"]] = matched_id
+                break
+
+        if matched_id:
+            new_item["id"] = matched_id
+        else:
+            max_id += 1
+            id_mapping[new_item["id"]] = max_id
+            new_item["id"] = max_id
+
+    unmatched_items = [
+        item for i, item in enumerate(second_param) if i not in matched_new_item_indices
+    ]
+    combined_list = first_param + unmatched_items
+
+    return combined_list, id_mapping
+
+
+def _update_ids(detections: List[Dict], id_mapping: Dict[int, int]) -> None:
+    for inner_list in detections:
+        for detection in inner_list:
+            if detection["id"] in id_mapping:
+                detection["id"] = id_mapping[detection["id"]]
+            else:
+                max_new_id = max(id_mapping.values(), default=0)
+                detection["id"] = max_new_id + 1
+                id_mapping[detection["id"]] = detection["id"]
+
+
+def _convert_to_2d(detections_per_segment: List[Any]) -> List[Any]:
+    result = []
+    for i, segment in enumerate(detections_per_segment):
+        if i == 0:
+            result.extend(segment)
+        else:
+            result.extend(segment[1:])
+    return result
+
+
+def merge_segments(detections_per_segment: List[Any]) -> List[Any]:
+    """
+    Merges detections from all segments into a unified result.
+
+    Args:
+        detections_per_segment (List[Any]): List of detections per segment.
+
+    Returns:
+        List[Any]: Merged detections.
+    """
+    for segment in detections_per_segment:
+        for detection in segment:
+            for item in detection:
+                item["decoded_mask"] = rle_decode_array(item["mask"])
+
+    for segment_idx in range(len(detections_per_segment) - 1):
+        combined_detection, id_mapping = _match_by_iou(
+            detections_per_segment[segment_idx][-1],
+            detections_per_segment[segment_idx + 1][0],
+        )
+        _update_ids(detections_per_segment[segment_idx + 1], id_mapping)
+
+    merged_result = _convert_to_2d(detections_per_segment)
+
+    return merged_result
+
+
+def post_process(
+    merged_detections: List[Any],
+    image_size: Tuple[int, ...],
+) -> Dict[str, Any]:
+    """
+    Performs post-processing on merged detections, including NMS and preparing display data.
+
+    Args:
+        merged_detections (List[Any]): Merged detections from all segments.
+        image_size (Tuple[int, int]): Size of the images.
+
+    Returns:
+        Dict[str, Any]: Post-processed data including return_data and display_data.
+    """
+    return_data = []
+    for frame_idx, frame in enumerate(merged_detections):
+        return_frame_data = []
+        for detection in frame:
+            label = f"{detection['id']}: {detection['label']}"
+            return_frame_data.append(
+                {
+                    "label": label,
+                    "mask": detection["decoded_mask"],
+                    "rle": detection["mask"],
+                    "score": 1.0,
+                }
+            )
+            del detection["decoded_mask"]
+        return_data.append(return_frame_data)
+
+    return_data = add_bboxes_from_masks(return_data)
+    return_data = nms(return_data, iou_threshold=0.95)
+
+    # We save the RLE for display purposes; re-calculating RLE can get very expensive.
+    # Deleted here because we are returning the numpy masks instead
+    display_data = []
+    for frame in return_data:
+        display_frame_data = []
+        for obj in frame:
+            display_frame_data.append(
+                {
+                    "label": obj["label"],
+                    "bbox": denormalize_bbox(obj["bbox"], image_size),
+                    "mask": obj["rle"],
+                    "score": obj["score"],
+                }
+            )
+            del obj["rle"]
+        display_data.append(display_frame_data)
+
+    return {"return_data": return_data, "display_data": display_data}
--- a/vision_agent-0.2.224.dist-info/METADATA
+++ b/vision_agent-0.2.225.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.224
+Version: 0.2.225
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
--- a/vision_agent-0.2.224.dist-info/RECORD
+++ b/vision_agent-0.2.225.dist-info/RECORD
@@ -31,7 +31,7 @@ vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj
 vision_agent/tools/planner_tools.py,sha256=CvaJ2vGM8O_CYvsoSk1avxAMqpIu3tv4C2bY0p1X-X4,13519
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
-vision_agent/tools/tools.py,sha256=60S5ItFG9yKzVb8FU8oLFj_aouDg2-4vlieDbSgfPdQ,91306
+vision_agent/tools/tools.py,sha256=cQYO1TfWhm9C_KaU201aTYec-w0m9QoQMzqjxWvQWGU,90770
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
 vision_agent/utils/sim.py,sha256=znsInUDrsyBi3OlgAlV3rDn5UQQRfJAWXTXm7D7eJA8,9125
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent-0.2.224.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.224.dist-info/METADATA,sha256=wT49_byW9-Oz6-1eSlP3cW_AFGbWaxtKrYsGB4nT62o,20039
-vision_agent-0.2.224.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.224.dist-info/RECORD,,
+vision_agent/utils/video_tracking.py,sha256=EeOiSY8gjvvneuAnv-BO7yOyMBF_-1Irk_lLLOt3bDM,9452
+vision_agent-0.2.225.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.225.dist-info/METADATA,sha256=PzehPaQUIj_3TImCmj1YEFMI1rPkrd6FqcIlXhmWjLE,20039
+vision_agent-0.2.225.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.225.dist-info/RECORD,,