vision-agent 0.2.223__py3-none-any.whl → 0.2.225__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/tool_utils.py +5 -1
- vision_agent/tools/tools.py +85 -89
- vision_agent/utils/video_tracking.py +305 -0
- {vision_agent-0.2.223.dist-info → vision_agent-0.2.225.dist-info}/METADATA +1 -1
- {vision_agent-0.2.223.dist-info → vision_agent-0.2.225.dist-info}/RECORD +7 -6
- {vision_agent-0.2.223.dist-info → vision_agent-0.2.225.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.223.dist-info → vision_agent-0.2.225.dist-info}/WHEEL +0 -0
vision_agent/tools/tool_utils.py
CHANGED
@@ -25,6 +25,10 @@ _LND_API_URL = f"{_LND_BASE_URL}/v1/agent/model"
 _LND_API_URL_v2 = f"{_LND_BASE_URL}/v1/tools"
 
 
+def should_report_tool_traces() -> bool:
+    return bool(os.environ.get("REPORT_TOOL_TRACES", False))
+
+
 class ToolCallTrace(BaseModel):
     endpoint_url: str
     type: str
@@ -251,7 +255,7 @@ def _call_post(
         tool_call_trace.response = result
         return result
     finally:
-        if tool_call_trace is not None:
+        if tool_call_trace is not None and should_report_tool_traces():
            trace = tool_call_trace.model_dump()
            display({MimeType.APPLICATION_JSON: trace}, raw=True)
 
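Net effect of these two hunks: tool-call traces are now opt-in, gated on the REPORT_TOOL_TRACES environment variable. A minimal sketch of the gate's behavior (the variable name comes from the diff above; the truthiness caveat is our observation about Python, not package logic):

import os

os.environ["REPORT_TOOL_TRACES"] = "1"
print(bool(os.environ.get("REPORT_TOOL_TRACES", False)))  # True -> traces displayed

os.environ.pop("REPORT_TOOL_TRACES", None)
print(bool(os.environ.get("REPORT_TOOL_TRACES", False)))  # False -> traces suppressed

# Caveat: any non-empty string is truthy, so REPORT_TOOL_TRACES="0" or
# REPORT_TOOL_TRACES="false" would still enable trace reporting here.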
vision_agent/tools/tools.py
CHANGED
@@ -6,7 +6,6 @@ import tempfile
 import urllib.request
 from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from enum import Enum
 from importlib import resources
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -32,6 +31,7 @@ from vision_agent.tools.tool_utils import (
     nms,
     send_inference_request,
     send_task_inference_request,
+    should_report_tool_traces,
     single_nms,
 )
 from vision_agent.tools.tools_types import JobStatus
@@ -53,6 +53,13 @@ from vision_agent.utils.video import (
     frames_to_bytes,
     video_writer,
 )
+from vision_agent.utils.video_tracking import (
+    ODModels,
+    merge_segments,
+    post_process,
+    process_segment,
+    split_frames_into_segments,
+)
 
 register_heif_opener()
 
@@ -94,6 +101,9 @@ def _display_tool_trace(
     # such as video bytes, which can be slow. Since this is calculated inside the
     # function we can't capture it with a decorator without adding it as a return value
     # which would change the function signature and affect the agent.
+    if not should_report_tool_traces():
+        return
+
     files_in_b64: List[Tuple[str, str]]
     if isinstance(files, str):
         files_in_b64 = [("images", files)]
@@ -220,12 +230,6 @@ def sam2(
     return ret["return_data"]  # type: ignore
 
 
-class ODModels(str, Enum):
-    COUNTGD = "countgd"
-    FLORENCE2 = "florence2"
-    OWLV2 = "owlv2"
-
-
 def od_sam2_video_tracking(
     od_model: ODModels,
     prompt: str,
@@ -233,105 +237,92 @@ def od_sam2_video_tracking(
     chunk_length: Optional[int] = 10,
     fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
-
+    SEGMENT_SIZE = 50
+    OVERLAP = 1  # Number of overlapping frames between segments
 
-
-
-
-
-
-
+    image_size = frames[0].shape[:2]
+
+    # Split frames into segments with overlap
+    segments = split_frames_into_segments(frames, SEGMENT_SIZE, OVERLAP)
+
+    def _apply_object_detection(  # inner method to avoid circular importing issues.
+        od_model: ODModels,
+        prompt: str,
+        segment_index: int,
+        frame_number: int,
+        fine_tune_id: str,
+        segment_frames: list,
+    ) -> tuple:
+        """
+        Applies the specified object detection model to the given image.
+
+        Args:
+            od_model: The object detection model to use.
+            prompt: The prompt for the object detection model.
+            segment_index: The index of the current segment.
+            frame_number: The number of the current frame.
+            fine_tune_id: Optional fine-tune ID for the model.
+            segment_frames: List of frames for the current segment.
+
+        Returns:
+            A tuple containing the object detection results and the name of the function used.
+        """
 
-    for idx in range(0, len(frames), step):
         if od_model == ODModels.COUNTGD:
-
+            segment_results = countgd_object_detection(
+                prompt=prompt, image=segment_frames[frame_number]
+            )
             function_name = "countgd_object_detection"
+
         elif od_model == ODModels.OWLV2:
-
-                prompt=prompt,
+            segment_results = owlv2_object_detection(
+                prompt=prompt,
+                image=segment_frames[frame_number],
+                fine_tune_id=fine_tune_id,
             )
             function_name = "owlv2_object_detection"
+
         elif od_model == ODModels.FLORENCE2:
-
-                prompt=prompt,
+            segment_results = florence2_object_detection(
+                prompt=prompt,
+                image=segment_frames[frame_number],
+                fine_tune_id=fine_tune_id,
             )
             function_name = "florence2_object_detection"
+
         else:
             raise NotImplementedError(
                 f"Object detection model '{od_model}' is not implemented."
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    {
-                        "labels": labels,
-                        "bboxes": bboxes,
-                    }
-                )
-            else:
-                output_list.append(None)
-
-        return output_list
+        return segment_results, function_name
+
+    # Process each segment and collect detections
+    detections_per_segment: List[Any] = []
+    for segment_index, segment in enumerate(segments):
+        segment_detections = process_segment(
+            segment_frames=segment,
+            od_model=od_model,
+            prompt=prompt,
+            fine_tune_id=fine_tune_id,
+            chunk_length=chunk_length,
+            image_size=image_size,
+            segment_index=segment_index,
+            object_detection_tool=_apply_object_detection,
+        )
+        detections_per_segment.append(segment_detections)
 
-
+    merged_detections = merge_segments(detections_per_segment)
+    post_processed = post_process(merged_detections, image_size)
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
-    payload = {"bboxes": json.dumps(output), "chunk_length_frames": chunk_length}
-    metadata = {"function_name": function_name}
 
-
-
-        "
-
-    )
-
-    return_data = []
-    for frame in detections:
-        return_frame_data = []
-        for detection in frame:
-            mask = rle_decode_array(detection["mask"])
-            label = str(detection["id"]) + ": " + detection["label"]
-            return_frame_data.append(
-                {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
-            )
-        return_data.append(return_frame_data)
-    return_data = add_bboxes_from_masks(return_data)
-    return_data = nms(return_data, iou_threshold=0.95)
-
-    # We save the RLE for display purposes, re-calculting RLE can get very expensive.
-    # Deleted here because we are returning the numpy masks instead
-    display_data = []
-    for frame in return_data:
-        display_frame_data = []
-        for obj in frame:
-            display_frame_data.append(
-                {
-                    "label": obj["label"],
-                    "score": obj["score"],
-                    "bbox": denormalize_bbox(obj["bbox"], image_size),
-                    "mask": obj["rle"],
-                }
-            )
-            del obj["rle"]
-        display_data.append(display_frame_data)
-
-    return {"files": files, "return_data": return_data, "display_data": detections}
+    return {
+        "files": files,
+        "return_data": post_processed["return_data"],
+        "display_data": post_processed["display_data"],
+    }
 
 
 # Owl V2 Tools
@@ -2243,15 +2234,17 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     >>> save_image(image)
     """
     Path(file_path).parent.mkdir(parents=True, exist_ok=True)
-    from IPython.display import display
-
     if not isinstance(image, np.ndarray) or (
         image.shape[0] == 0 and image.shape[1] == 0
     ):
         raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")
 
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
-
+    if should_report_tool_traces():
+        from IPython.display import display
+
+        display(pil_image)
+
     pil_image.save(file_path)
 
 
@@ -2302,6 +2295,9 @@ def save_video(
 
 def _save_video_to_result(video_uri: str) -> None:
     """Saves a video into the result of the code execution (as an intermediate output)."""
+    if not should_report_tool_traces():
+        return
+
     from IPython.display import display
 
     serializer = FileSerializer(video_uri)
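The refactor leaves the public surface of od_sam2_video_tracking untouched: same signature, same return shape, and ODModels is still reachable from vision_agent.tools.tools because tools.py re-imports it from its new module. A hypothetical call for orientation (the dummy frames and the "person" prompt are illustrative; the detection and SAM2 steps need LandingAI API access, so the call itself is commented out):

import numpy as np

# New canonical import path; the old
# `from vision_agent.tools.tools import ODModels` also still resolves.
from vision_agent.utils.video_tracking import ODModels

# 120 dummy frames standing in for a decoded video.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(120)]

# Requires API access, hence commented out:
# from vision_agent.tools.tools import od_sam2_video_tracking
# result = od_sam2_video_tracking(ODModels.COUNTGD, "person", frames, chunk_length=10)
# result["return_data"][i] is a list per frame i of
# {"label": "<track id>: person", "bbox": [...], "mask": <np.ndarray>, "score": 1.0}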
vision_agent/utils/video_tracking.py
ADDED
@@ -0,0 +1,305 @@
+import json
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from vision_agent.tools.tool_utils import (
+    add_bboxes_from_masks,
+    nms,
+    send_task_inference_request,
+)
+from vision_agent.utils.image_utils import denormalize_bbox, rle_decode_array
+from vision_agent.utils.video import frames_to_bytes
+
+
+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def split_frames_into_segments(
+    frames: List[np.ndarray], segment_size: int = 50, overlap: int = 1
+) -> List[List[np.ndarray]]:
+    """
+    Splits the list of frames into segments with a specified size and overlap.
+
+    Args:
+        frames (List[np.ndarray]): List of video frames.
+        segment_size (int, optional): Number of frames per segment. Defaults to 50.
+        overlap (int, optional): Number of overlapping frames between segments. Defaults to 1.
+
+    Returns:
+        List[List[np.ndarray]]: List of frame segments.
+    """
+    segments = []
+    start = 0
+    segment_count = 0
+    while start < len(frames):
+        end = start + segment_size
+        if end > len(frames):
+            end = len(frames)
+        if start != 0:
+            # Include the last frame of the previous segment
+            segment = frames[start - overlap : end]
+        else:
+            segment = frames[start:end]
+        segments.append(segment)
+        start += segment_size
+        segment_count += 1
+    return segments
+
+
+def process_segment(
+    segment_frames: List[np.ndarray],
+    od_model: ODModels,
+    prompt: str,
+    fine_tune_id: Optional[str],
+    chunk_length: Optional[int],
+    image_size: Tuple[int, ...],
+    segment_index: int,
+    object_detection_tool: Callable,
+) -> Any:
+    """
+    Processes a segment of frames with the specified object detection model.
+
+    Args:
+        segment_frames (List[np.ndarray]): Frames in the segment.
+        od_model (ODModels): Object detection model to use.
+        prompt (str): Prompt for the model.
+        fine_tune_id (Optional[str]): Fine-tune model ID.
+        chunk_length (Optional[int]): Chunk length for processing.
+        image_size (Tuple[int, int]): Size of the images.
+        segment_index (int): Index of the segment.
+        object_detection_tool (Callable): Object detection tool to use.
+
+    Returns:
+        Any: Detections for the segment.
+    """
+    segment_results: List[Optional[List[Dict[str, Any]]]] = [None] * len(segment_frames)
+
+    if chunk_length is None:
+        step = 1
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length
+
+    function_name = ""
+
+    for idx in range(0, len(segment_frames), step):
+        frame_number = idx
+        segment_results[idx], function_name = object_detection_tool(
+            od_model, prompt, segment_index, frame_number, fine_tune_id, segment_frames
+        )
+
+    transformed_detections = transform_detections(
+        segment_results, image_size, segment_index
+    )
+
+    buffer_bytes = frames_to_bytes(segment_frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "bboxes": json.dumps(transformed_detections),
+        "chunk_length_frames": chunk_length,
+    }
+    metadata = {"function_name": function_name}
+
+    segment_detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return segment_detections
+
+
+def transform_detections(
+    input_list: List[Optional[List[Dict[str, Any]]]],
+    image_size: Tuple[int, ...],
+    segment_index: int,
+) -> List[Optional[Dict[str, Any]]]:
+    """
+    Transforms raw detections into a standardized format.
+
+    Args:
+        input_list (List[Optional[List[Dict[str, Any]]]]): Raw detections.
+        image_size (Tuple[int, int]): Size of the images.
+        segment_index (int): Index of the segment.
+
+    Returns:
+        List[Optional[Dict[str, Any]]]: Transformed detections.
+    """
+    output_list: List[Optional[Dict[str, Any]]] = []
+    for frame_idx, frame in enumerate(input_list):
+        if frame is not None:
+            labels = [detection["label"] for detection in frame]
+            bboxes = [
+                denormalize_bbox(detection["bbox"], image_size) for detection in frame
+            ]
+
+            output_list.append(
+                {
+                    "labels": labels,
+                    "bboxes": bboxes,
+                }
+            )
+        else:
+            output_list.append(None)
+    return output_list
+
+
+def _calculate_mask_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
+    mask1 = mask1.astype(bool)
+    mask2 = mask2.astype(bool)
+
+    intersection = np.sum(np.logical_and(mask1, mask2))
+    union = np.sum(np.logical_or(mask1, mask2))
+
+    if union == 0:
+        iou = 0.0
+    else:
+        iou = intersection / union
+
+    return iou
+
+
+def _match_by_iou(
+    first_param: List[Dict],
+    second_param: List[Dict],
+    iou_threshold: float = 0.8,
+) -> Tuple[List[Dict], Dict[int, int]]:
+    max_id = max((item["id"] for item in first_param), default=0)
+
+    matched_new_item_indices = set()
+    id_mapping = {}
+
+    for new_index, new_item in enumerate(second_param):
+        matched_id = None
+
+        for existing_item in first_param:
+            iou = _calculate_mask_iou(
+                existing_item["decoded_mask"], new_item["decoded_mask"]
+            )
+            if iou > iou_threshold:
+                matched_id = existing_item["id"]
+                matched_new_item_indices.add(new_index)
+                id_mapping[new_item["id"]] = matched_id
+                break
+
+        if matched_id:
+            new_item["id"] = matched_id
+        else:
+            max_id += 1
+            id_mapping[new_item["id"]] = max_id
+            new_item["id"] = max_id
+
+    unmatched_items = [
+        item for i, item in enumerate(second_param) if i not in matched_new_item_indices
+    ]
+    combined_list = first_param + unmatched_items
+
+    return combined_list, id_mapping
+
+
+def _update_ids(detections: List[Dict], id_mapping: Dict[int, int]) -> None:
+    for inner_list in detections:
+        for detection in inner_list:
+            if detection["id"] in id_mapping:
+                detection["id"] = id_mapping[detection["id"]]
+            else:
+                max_new_id = max(id_mapping.values(), default=0)
+                detection["id"] = max_new_id + 1
+                id_mapping[detection["id"]] = detection["id"]
+
+
+def _convert_to_2d(detections_per_segment: List[Any]) -> List[Any]:
+    result = []
+    for i, segment in enumerate(detections_per_segment):
+        if i == 0:
+            result.extend(segment)
+        else:
+            result.extend(segment[1:])
+    return result
+
+
+def merge_segments(detections_per_segment: List[Any]) -> List[Any]:
+    """
+    Merges detections from all segments into a unified result.
+
+    Args:
+        detections_per_segment (List[Any]): List of detections per segment.
+
+    Returns:
+        List[Any]: Merged detections.
+    """
+    for segment in detections_per_segment:
+        for detection in segment:
+            for item in detection:
+                item["decoded_mask"] = rle_decode_array(item["mask"])
+
+    for segment_idx in range(len(detections_per_segment) - 1):
+        combined_detection, id_mapping = _match_by_iou(
+            detections_per_segment[segment_idx][-1],
+            detections_per_segment[segment_idx + 1][0],
+        )
+        _update_ids(detections_per_segment[segment_idx + 1], id_mapping)
+
+    merged_result = _convert_to_2d(detections_per_segment)
+
+    return merged_result
+
+
+def post_process(
+    merged_detections: List[Any],
+    image_size: Tuple[int, ...],
+) -> Dict[str, Any]:
+    """
+    Performs post-processing on merged detections, including NMS and preparing display data.
+
+    Args:
+        merged_detections (List[Any]): Merged detections from all segments.
+        image_size (Tuple[int, int]): Size of the images.
+
+    Returns:
+        Dict[str, Any]: Post-processed data including return_data and display_data.
+    """
+    return_data = []
+    for frame_idx, frame in enumerate(merged_detections):
+        return_frame_data = []
+        for detection in frame:
+            label = f"{detection['id']}: {detection['label']}"
+            return_frame_data.append(
+                {
+                    "label": label,
+                    "mask": detection["decoded_mask"],
+                    "rle": detection["mask"],
+                    "score": 1.0,
+                }
+            )
+            del detection["decoded_mask"]
+        return_data.append(return_frame_data)
+
+    return_data = add_bboxes_from_masks(return_data)
+    return_data = nms(return_data, iou_threshold=0.95)
+
+    # We save the RLE for display purposes, re-calculating RLE can get very expensive.
+    # Deleted here because we are returning the numpy masks instead
+    display_data = []
+    for frame in return_data:
+        display_frame_data = []
+        for obj in frame:
+            display_frame_data.append(
+                {
+                    "label": obj["label"],
+                    "bbox": denormalize_bbox(obj["bbox"], image_size),
+                    "mask": obj["rle"],
+                    "score": obj["score"],
+                }
+            )
+            del obj["rle"]
+        display_data.append(display_frame_data)
+
+    return {"return_data": return_data, "display_data": display_data}
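The part of this new module most worth internalizing is the overlap bookkeeping: every segment after the first starts one frame early (on the last frame of the previous segment), merge_segments matches track IDs across that shared boundary frame by mask IoU (threshold 0.8 in _match_by_iou), and _convert_to_2d then drops the duplicated frame so each input frame appears exactly once in the merged output. A quick sanity check of the pure-Python splitting arithmetic, assuming only numpy and this package are installed (no API access needed):

import numpy as np

from vision_agent.utils.video_tracking import split_frames_into_segments

# 120 dummy frames split into 50-frame segments with a 1-frame overlap.
frames = [np.zeros((48, 64, 3), dtype=np.uint8) for _ in range(120)]
segments = split_frames_into_segments(frames, segment_size=50, overlap=1)

# Segments cover frames[0:50], frames[49:100], frames[99:120].
assert [len(s) for s in segments] == [50, 51, 21]

# After per-segment tracking, _convert_to_2d drops each later segment's
# first (duplicated) frame: 50 + (51 - 1) + (21 - 1) == 120 frames again.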
{vision_agent-0.2.223.dist-info → vision_agent-0.2.225.dist-info}/RECORD
CHANGED
@@ -30,8 +30,8 @@ vision_agent/tools/__init__.py,sha256=15O7eQVn0bitmzUO5OxKdA618PoiLt6Z02gmKsSNMF
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
 vision_agent/tools/planner_tools.py,sha256=CvaJ2vGM8O_CYvsoSk1avxAMqpIu3tv4C2bY0p1X-X4,13519
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
+vision_agent/tools/tools.py,sha256=cQYO1TfWhm9C_KaU201aTYec-w0m9QoQMzqjxWvQWGU,90770
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
 vision_agent/utils/sim.py,sha256=znsInUDrsyBi3OlgAlV3rDn5UQQRfJAWXTXm7D7eJA8,9125
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent/utils/video_tracking.py,sha256=EeOiSY8gjvvneuAnv-BO7yOyMBF_-1Irk_lLLOt3bDM,9452
+vision_agent-0.2.225.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.225.dist-info/METADATA,sha256=PzehPaQUIj_3TImCmj1YEFMI1rPkrd6FqcIlXhmWjLE,20039
+vision_agent-0.2.225.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.225.dist-info/RECORD,,

{vision_agent-0.2.223.dist-info → vision_agent-0.2.225.dist-info}/LICENSE
File without changes

{vision_agent-0.2.223.dist-info → vision_agent-0.2.225.dist-info}/WHEEL
File without changes