vision-agent 0.2.234__tar.gz → 0.2.236__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {vision_agent-0.2.234 → vision_agent-0.2.236}/PKG-INFO +1 -1
  2. {vision_agent-0.2.234 → vision_agent-0.2.236}/pyproject.toml +1 -1
  3. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_coder_prompts.py +1 -1
  4. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
  5. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
  6. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_planner_v2.py +2 -0
  7. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/tool_utils.py +14 -9
  8. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/tools.py +58 -21
  9. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/video_tracking.py +59 -58
  10. {vision_agent-0.2.234 → vision_agent-0.2.236}/LICENSE +0 -0
  11. {vision_agent-0.2.234 → vision_agent-0.2.236}/README.md +0 -0
  12. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/.sim_tools/df.csv +0 -0
  13. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/.sim_tools/embs.npy +0 -0
  14. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/__init__.py +0 -0
  15. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/README.md +0 -0
  16. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/__init__.py +0 -0
  17. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/agent.py +0 -0
  18. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/agent_utils.py +0 -0
  19. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/types.py +0 -0
  20. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent.py +0 -0
  21. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_coder.py +0 -0
  22. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  23. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_planner.py +0 -0
  24. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  25. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_prompts.py +0 -0
  26. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  27. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_v2.py +0 -0
  28. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/clients/__init__.py +0 -0
  29. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/clients/http.py +0 -0
  30. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/clients/landing_public_api.py +0 -0
  31. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/configs/__init__.py +0 -0
  32. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/configs/anthropic_config.py +0 -0
  33. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/configs/anthropic_openai_config.py +0 -0
  34. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/configs/config.py +0 -0
  35. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/configs/openai_config.py +0 -0
  36. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/fonts/__init__.py +0 -0
  37. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  38. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/lmm/__init__.py +0 -0
  39. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/lmm/lmm.py +0 -0
  40. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/lmm/types.py +0 -0
  41. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/__init__.py +0 -0
  42. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/meta_tools.py +0 -0
  43. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/planner_tools.py +0 -0
  44. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/prompts.py +0 -0
  45. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/tools_types.py +0 -0
  46. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/__init__.py +0 -0
  47. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/exceptions.py +0 -0
  48. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/execute.py +0 -0
  49. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/image_utils.py +0 -0
  50. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/sim.py +0 -0
  51. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/type_defs.py +0 -0
  52. {vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.234 → vision_agent-0.2.236}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.234
+Version: 0.2.236
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
{vision_agent-0.2.234 → vision_agent-0.2.236}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.234"
+version = "0.2.236"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
{vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_coder_prompts.py
@@ -230,7 +230,7 @@ This is the documentation for the functions you have access to. You may call any
 
 
 FIX_BUG = """
-**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages.
+**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting but do not run !pip install to install new packages.
 
 **Documentation**:
 This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
{vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_coder_prompts_v2.py
@@ -77,7 +77,7 @@ This is the documentation for the functions you have access to. You may call any
 
 
 FIX_BUG = """
-**Role**: As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages.
+**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting but do not run !pip install to install new packages.
 
 **Task**: A previous agent has written some code and some testing code according to a plan given to it. It has introduced a bug into it's code while trying to implement the plan. You are given the plan, code, test code and error. Your job is to fix the error in the code or test code.
 
{vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_planner_prompts_v2.py
@@ -1,7 +1,7 @@
 PLAN = """
 **Role**: You are an expert planning agent that can understand the user request and search for a plan to accomplish it.
 
-**Task**: As a planning agent you are required to understand the user's request and search for a plan to accomplish it. Use Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Esnure your response is clear, concise, and helpful. You can use an interactive Pyton (Jupyter Notebok) environment, executing code with <execute_python>, each execution is a new cell so old code and outputs are saved.
+**Task**: As a planning agent you are required to understand the user's request and search for a plan to accomplish it. Use Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Esnure your response is clear, concise, and helpful. You can use an interactive Pyton (Jupyter Notebok) environment but do not !pip install packages, execute code with <execute_python>, each execution is a new cell so old code and outputs are saved.
 
 **Documentation**: this is the documentation for the functions you can use to accomplish the task:
 {tool_desc}
{vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/agent/vision_agent_planner_v2.py
@@ -21,6 +21,7 @@ from vision_agent.agent.agent_utils import (
     extract_tag,
     print_code,
     print_table,
+    remove_installs_from_code,
 )
 from vision_agent.agent.types import AgentMessage, InteractionContext, PlanContext
 from vision_agent.agent.vision_agent_planner_prompts_v2 import (
@@ -180,6 +181,7 @@ def run_critic(
 
 
 def code_safeguards(code: str) -> str:
+    code = remove_installs_from_code(code)
     if "get_tool_for_task" in code:
         lines = code.split("\n")
         new_lines = []
{vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/tool_utils.py
@@ -270,17 +270,22 @@ def add_bboxes_from_masks(
 ) -> List[List[Dict[str, Any]]]:
     for frame_preds in all_preds:
         for preds in frame_preds:
-            if np.sum(preds["mask"]) == 0:
+            mask = preds["mask"]
+            if mask.sum() == 0:
                 preds["bbox"] = []
             else:
-                rows, cols = np.where(preds["mask"])
-                bbox = [
-                    float(np.min(cols)),
-                    float(np.min(rows)),
-                    float(np.max(cols)),
-                    float(np.max(rows)),
-                ]
-                bbox = normalize_bbox(bbox, preds["mask"].shape)
+                # Get indices where mask is True using axis operations
+                rows = np.any(mask, axis=1)
+                cols = np.any(mask, axis=0)
+
+                # Find boundaries using argmax/argmin
+                y_min = np.argmax(rows)
+                y_max = len(rows) - np.argmax(rows[::-1])
+                x_min = np.argmax(cols)
+                x_max = len(cols) - np.argmax(cols[::-1])
+
+                bbox = [float(x_min), float(y_min), float(x_max), float(y_max)]
+                bbox = normalize_bbox(bbox, mask.shape)
             preds["bbox"] = bbox
 
     return all_preds
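The rewrite above computes the box from two boolean axis reductions plus argmax scans instead of np.where, which materializes an index array for every foreground pixel. A minimal standalone sketch of the same technique (the function name and toy mask are illustrative, not part of the package):

import numpy as np

def mask_to_bbox(mask: np.ndarray) -> list:
    # Collapse the 2D mask into per-row / per-column "any foreground" vectors.
    rows = np.any(mask, axis=1)
    cols = np.any(mask, axis=0)
    # argmax returns the first True; scanning the reversed vector gives the last.
    y_min = int(np.argmax(rows))
    y_max = len(rows) - int(np.argmax(rows[::-1]))
    x_min = int(np.argmax(cols))
    x_max = len(cols) - int(np.argmax(cols[::-1]))
    return [x_min, y_min, x_max, y_max]

mask = np.zeros((8, 8), dtype=bool)
mask[2:5, 3:7] = True
assert mask_to_bbox(mask) == [3, 2, 7, 5]

Note the max edges are exclusive (7 and 5 here), matching the len - argmax arithmetic in the diff; an all-False mask never reaches this path because the mask.sum() == 0 branch returns an empty bbox first.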
{vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/tools/tools.py
@@ -234,16 +234,24 @@ def od_sam2_video_tracking(
     od_model: ODModels,
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
+    chunk_length: Optional[int] = 50,
     fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
-    SEGMENT_SIZE = 50
-    OVERLAP = 1  # Number of overlapping frames between segments
+    chunk_length = 50 if chunk_length is None else chunk_length
+    segment_size = chunk_length
+    # Number of overlapping frames between segments
+    overlap = 1
+    # chunk_length needs to be segment_size + 1 or else on the last segment it will
+    # run the OD model again and merging will not work
+    chunk_length = chunk_length + 1
+
+    if len(frames) == 0 or not isinstance(frames, List):
+        return {"files": [], "return_data": [], "display_data": []}
 
     image_size = frames[0].shape[:2]
 
     # Split frames into segments with overlap
-    segments = split_frames_into_segments(frames, segment_size, overlap)
+    segments = split_frames_into_segments(frames, segment_size, overlap)
 
     def _apply_object_detection(  # inner method to avoid circular importing issues.
         od_model: ODModels,
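chunk_length now doubles as the segment size for tracking, with a one-frame overlap carried between consecutive segments so tracks can be re-associated at the seam (see merge_segments in video_tracking.py below). The package's split_frames_into_segments is not shown in this diff; the standalone sketch below is an assumption about its windowing, for illustration only:

from typing import List
import numpy as np

def split_with_overlap(
    frames: List[np.ndarray], segment_size: int, overlap: int
) -> List[List[np.ndarray]]:
    # Each new segment starts where the previous one ended, so the boundary
    # frame is shared by consecutive segments.
    segments = []
    start = 0
    while start < len(frames):
        segments.append(frames[start : start + segment_size + overlap])
        start += segment_size
    return segments

frames = [np.zeros((2, 2), dtype=np.uint8) for _ in range(120)]
segments = split_with_overlap(frames, segment_size=50, overlap=1)
print([len(s) for s in segments])  # [51, 51, 20]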
@@ -538,7 +546,7 @@ def owlv2_sam2_instance_segmentation(
 def owlv2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
+    chunk_length: Optional[int] = 25,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
@@ -771,7 +779,7 @@ def florence2_sam2_instance_segmentation(
 def florence2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
+    chunk_length: Optional[int] = 25,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
@@ -1110,7 +1118,7 @@ def countgd_sam2_instance_segmentation(
 def countgd_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
+    chunk_length: Optional[int] = 25,
 ) -> List[List[Dict[str, Any]]]:
     """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
@@ -1322,7 +1330,7 @@ def custom_object_detection(
 def custom_od_sam2_video_tracking(
     deployment_id: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
+    chunk_length: Optional[int] = 25,
 ) -> List[List[Dict[str, Any]]]:
     """'custom_od_sam2_video_tracking' is a tool that can segment multiple objects given a
     custom model with predefined category names.
@@ -2366,7 +2374,7 @@ def agentic_sam2_instance_segmentation(
 def agentic_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
+    chunk_length: Optional[int] = 25,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
@@ -2791,7 +2799,15 @@ def overlay_bounding_boxes(
             "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
         )
 
-    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
+    use_tracking_label = False
+    if all([":" in label for label in labels]):
+        unique_labels = set([label.split(":")[1].strip() for label in labels])
+        use_tracking_label = True
+        colors = {
+            label: COLORS[i % len(COLORS)] for i, label in enumerate(unique_labels)
+        }
+    else:
+        colors = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
 
     frame_out = []
     for i, frame in enumerate(medias_int):
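Tracked detections carry labels of the form "<id>: <class>" (e.g. "3: person"), so keying the palette on the raw label would give every tracked instance its own color. Splitting on ":" keys colors by class instead. A small sketch of the idea with a made-up palette (COLORS here is a stand-in, not the package's list):

# Stand-in palette for illustration.
COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]

labels = ["1: person", "2: person", "1: car"]
if all(":" in label for label in labels):
    # Strip the "<id>: " prefix and key colors by the class name.
    unique_labels = set(label.split(":")[1].strip() for label in labels)
    colors = {lbl: COLORS[i % len(COLORS)] for i, lbl in enumerate(unique_labels)}
else:
    colors = {lbl: COLORS[i % len(COLORS)] for i, lbl in enumerate(labels)}

print(colors)  # both person tracks share one color, e.g. {'person': ..., 'car': ...}

Iterating a set makes the exact color order nondeterministic across runs; only the class-to-color grouping matters here.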
@@ -2802,7 +2818,7 @@
 
         # if more than 50 boxes use small boxes to indicate objects else use regular boxes
         if len(bboxes) > 50:
-            pil_image = _plot_counting(pil_image, bboxes, color)
+            pil_image = _plot_counting(pil_image, bboxes, colors, use_tracking_label)
         else:
             width, height = pil_image.size
             fontsize = max(12, int(min(width, height) / 40))
@@ -2817,18 +2833,20 @@
             )
 
             for elt in bboxes:
+                if use_tracking_label:
+                    color = colors[elt["label"].split(":")[1].strip()]
+                else:
+                    color = colors[elt["label"]]
                 label = elt["label"]
                 box = elt["bbox"]
                 scores = elt["score"]
 
                 # denormalize the box if it is normalized
                 box = denormalize_bbox(box, (height, width))
-                draw.rectangle(box, outline=color[label], width=4)
+                draw.rectangle(box, outline=color, width=4)
                 text = f"{label}: {scores:.2f}"
                 text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
-                draw.rectangle(
-                    (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
-                )
+                draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color)
                 draw.text((box[0], box[1]), text, fill="black", font=font)
 
         frame_out.append(np.array(pil_image))
@@ -2911,7 +2929,16 @@ def overlay_segmentation_masks(
     for mask_i in masks_int:
         for mask_j in mask_i:
            labels.add(mask_j["label"])
-    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
+
+    use_tracking_label = False
+    if all([":" in label for label in labels]):
+        use_tracking_label = True
+        unique_labels = set([label.split(":")[1].strip() for label in labels])
+        colors = {
+            label: COLORS[i % len(COLORS)] for i, label in enumerate(unique_labels)
+        }
+    else:
+        colors = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
 
     width, height = Image.fromarray(medias_int[0]).size
     fontsize = max(12, int(min(width, height) / 40))
@@ -2925,12 +2952,16 @@
         pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
         for elt in masks_int[i]:
             mask = elt["mask"]
+            if use_tracking_label:
+                color = colors[elt["label"].split(":")[1].strip()]
+            else:
+                color = colors[elt["label"]]
             label = elt["label"]
             tracking_lbl = elt.get(secondary_label_key, None)
 
             # Create semi-transparent mask overlay
             np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
-            np_mask[mask > 0, :] = color[label] + (255 * 0.7,)
+            np_mask[mask > 0, :] = color + (255 * 0.7,)
             mask_img = Image.fromarray(np_mask.astype(np.uint8))
             pil_image = Image.alpha_composite(pil_image, mask_img)
 
@@ -2942,7 +2973,7 @@
             border_mask = np.zeros(
                 (pil_image.size[1], pil_image.size[0], 4), dtype=np.uint8
             )
-            cv2.drawContours(border_mask, contours, -1, color[label] + (255,), 8)
+            cv2.drawContours(border_mask, contours, -1, color + (255,), 8)
             border_img = Image.fromarray(border_mask)
             pil_image = Image.alpha_composite(pil_image, border_img)
 
@@ -2957,7 +2988,7 @@
             )
             if x != 0 and y != 0:
                 text_box = draw.textbbox((x, y), text=text, font=font)
-                draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
+                draw.rectangle((x, y, text_box[2], text_box[3]), fill=color)
                 draw.text((x, y), text, fill="black", font=font)
         frame_out.append(np.array(pil_image))
     return_frame = frame_out[0] if len(frame_out) == 1 else frame_out
@@ -3014,6 +3045,7 @@ def _plot_counting(
     image: Image.Image,
     bboxes: List[Dict[str, Any]],
     colors: Dict[str, Tuple[int, int, int]],
+    use_tracking_label: bool = False,
 ) -> Image.Image:
     width, height = image.size
     fontsize = max(12, int(min(width, height) / 40))
@@ -3023,7 +3055,12 @@
         fontsize,
     )
     for i, elt in enumerate(bboxes, 1):
-        label = f"{i}"
+        if use_tracking_label:
+            label = elt["label"].split(":")[0]
+            color = colors[elt["label"].split(":")[1].strip()]
+        else:
+            label = f"{i}"
+            color = colors[elt["label"]]
         box = elt["bbox"]
 
         # denormalize the box if it is normalized
@@ -3044,7 +3081,7 @@
         text_y1 = cy + text_height / 2
 
         # Draw the rectangle encapsulating the text
-        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]])
+        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)
 
         # Draw the text at the center of the bounding box
         draw.text(
{vision_agent-0.2.234 → vision_agent-0.2.236}/vision_agent/utils/video_tracking.py
@@ -3,10 +3,10 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import numpy as np
+from scipy.optimize import linear_sum_assignment  # type: ignore
 
 from vision_agent.tools.tool_utils import (
     add_bboxes_from_masks,
-    nms,
     send_task_inference_request,
 )
 from vision_agent.utils.image_utils import denormalize_bbox, rle_decode_array
@@ -171,63 +171,45 @@ def _calculate_mask_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
 def _match_by_iou(
     first_param: List[Dict],
     second_param: List[Dict],
-    iou_threshold: float = 0.8,
-) -> Tuple[List[Dict], Dict[int, int]]:
-    max_id = max((item["id"] for item in first_param), default=0)
-
-    matched_new_item_indices = set()
-    id_mapping = {}
-
-    for new_index, new_item in enumerate(second_param):
-        matched_id = None
-
-        for existing_item in first_param:
+    max_id: int,
+    iou_threshold: float = 0.05,
+) -> Tuple[Dict[int, int], int]:
+    max_first_id = max((item["id"] for item in first_param), default=0)
+    max_second_id = max((item["id"] for item in second_param), default=0)
+
+    cost_matrix = np.ones((max_first_id + 1, max_second_id + 1))
+    for first_item in first_param:
+        for second_item in second_param:
             iou = _calculate_mask_iou(
-                existing_item["decoded_mask"], new_item["decoded_mask"]
+                first_item["decoded_mask"], second_item["decoded_mask"]
             )
-            if iou > iou_threshold:
-                matched_id = existing_item["id"]
-                matched_new_item_indices.add(new_index)
-                id_mapping[new_item["id"]] = matched_id
-                break
-
-        if matched_id:
-            new_item["id"] = matched_id
-        else:
-            max_id += 1
-            id_mapping[new_item["id"]] = max_id
-            new_item["id"] = max_id
-
-    unmatched_items = [
-        item for i, item in enumerate(second_param) if i not in matched_new_item_indices
-    ]
-    combined_list = first_param + unmatched_items
-
-    return combined_list, id_mapping
+            cost_matrix[first_item["id"], second_item["id"]] = 1 - iou
 
+    row_ind, col_ind = linear_sum_assignment(cost_matrix)
+    id_mapping = {second_id: first_id for first_id, second_id in zip(row_ind, col_ind)}
+    first_id_to_label = {item["id"]: item["label"] for item in first_param}
 
-def _update_ids(detections: List[Dict], id_mapping: Dict[int, int]) -> None:
-    for inner_list in detections:
-        for detection in inner_list:
-            if detection["id"] in id_mapping:
-                detection["id"] = id_mapping[detection["id"]]
+    cleaned_mapping = {}
+    for elt in second_param:
+        second_id = elt["id"]
+        # if the id is not in the mapping, give it a new id
+        if second_id not in id_mapping:
+            max_id += 1
+            cleaned_mapping[second_id] = max_id
+        else:
+            first_id = id_mapping[second_id]
+            iou = 1 - cost_matrix[first_id, second_id]
+            # only map if the iou is above the threshold and the labels match
+            if iou > iou_threshold and first_id_to_label[first_id] == elt["label"]:
+                cleaned_mapping[second_id] = first_id
             else:
-                max_new_id = max(id_mapping.values(), default=0)
-                detection["id"] = max_new_id + 1
-                id_mapping[detection["id"]] = detection["id"]
+                max_id += 1
+                cleaned_mapping[second_id] = max_id
 
+    return cleaned_mapping, max_id
 
-def _convert_to_2d(detections_per_segment: List[Any]) -> List[Any]:
-    result = []
-    for i, segment in enumerate(detections_per_segment):
-        if i == 0:
-            result.extend(segment)
-        else:
-            result.extend(segment[1:])
-    return result
 
-
-def merge_segments(detections_per_segment: List[Any]) -> List[Any]:
+def merge_segments(detections_per_segment: List[Any], overlap: int = 1) -> List[Any]:
     """
     Merges detections from all segments into a unified result.
 
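The greedy first-match loop is replaced with a global assignment: scipy.optimize.linear_sum_assignment picks the one-to-one pairing of old and new ids that minimizes the total cost, where cost is 1 - IoU. A toy example of the call (the matrix values are made up):

import numpy as np
from scipy.optimize import linear_sum_assignment

# cost[i, j] = 1 - IoU between old id i and new id j (values made up).
cost_matrix = np.array(
    [
        [0.1, 0.9, 1.0],
        [0.8, 0.2, 1.0],
    ]
)
row_ind, col_ind = linear_sum_assignment(cost_matrix)
id_mapping = {int(new): int(old) for old, new in zip(row_ind, col_ind)}
print(id_mapping)  # {0: 0, 1: 1}

New id 2 receives no assignment, so the cleanup pass above hands it a fresh id; likewise a matched pair is kept only when its IoU clears the 0.05 threshold and the labels agree.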
@@ -242,16 +224,20 @@ def merge_segments(detections_per_segment: List[Any]) -> List[Any]:
         for item in detection:
             item["decoded_mask"] = rle_decode_array(item["mask"])
 
+    merged_result = detections_per_segment[0]
+    max_id = max((item["id"] for item in merged_result[-1]), default=0)
     for segment_idx in range(len(detections_per_segment) - 1):
-        combined_detection, id_mapping = _match_by_iou(
+        id_mapping, max_id = _match_by_iou(
             detections_per_segment[segment_idx][-1],
             detections_per_segment[segment_idx + 1][0],
+            max_id,
         )
-        _update_ids(detections_per_segment[segment_idx + 1], id_mapping)
-
-    merged_result = _convert_to_2d(detections_per_segment)
+        for frame in detections_per_segment[segment_idx + 1][overlap:]:
+            for detection in frame:
+                detection["id"] = id_mapping[detection["id"]]
+        merged_result.extend(detections_per_segment[segment_idx + 1][overlap:])
 
-    return merged_result
+    return merged_result  # type: ignore
 
 
 def post_process(
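With overlap=1, the first frame of each subsequent segment duplicates the last frame of the previous one: it exists only so the IoU matching has a shared frame to align ids on, and the [overlap:] slice then drops it so no frame is emitted twice. A toy walk-through with hand-written detections and a hand-written id_mapping (both illustrative, not real model output):

# Two segments of per-frame detections; the boundary frame appears in both.
seg0 = [[{"id": 1, "label": "person"}], [{"id": 1, "label": "person"}]]
seg1 = [[{"id": 7, "label": "person"}], [{"id": 7, "label": "person"}]]

overlap = 1
id_mapping = {7: 1}  # produced by the IoU matching on the shared frame

merged = seg0
for frame in seg1[overlap:]:
    for det in frame:
        det["id"] = id_mapping[det["id"]]
merged.extend(seg1[overlap:])

print(len(merged))          # 3 frames; the duplicated boundary frame is dropped
print(merged[-1][0]["id"])  # 1; the track keeps its id across the seam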
@@ -269,10 +255,26 @@ def post_process(
         Dict[str, Any]: Post-processed data including return_data and display_data.
     """
     return_data = []
-    for frame_idx, frame in enumerate(merged_detections):
+    label_remapping = {}
+    for _, frame in enumerate(merged_detections):
         return_frame_data = []
         for detection in frame:
-            label = f"{detection['id']}: {detection['label']}"
+            label = detection["label"]
+            id = detection["id"]
+
+            # Remap label IDs so for each label the IDs restart at 1. This makes it
+            # easier to count the number of instances per label.
+            if label not in label_remapping:
+                label_remapping[label] = {"max": 1, "remap": {id: 1}}
+            elif label in label_remapping and id not in label_remapping[label]["remap"]:  # type: ignore
+                max_id = label_remapping[label]["max"]
+                max_id += 1  # type: ignore
+                label_remapping[label]["remap"][id] = max_id  # type: ignore
+                label_remapping[label]["max"] = max_id
+
+            new_id = label_remapping[label]["remap"][id]  # type: ignore
+
+            label = f"{new_id}: {detection['label']}"
             return_frame_data.append(
                 {
                     "label": label,
@@ -285,7 +287,6 @@
         return_data.append(return_frame_data)
 
     return_data = add_bboxes_from_masks(return_data)
-    return_data = nms(return_data, iou_threshold=0.95)
 
     # We save the RLE for display purposes, re-calculting RLE can get very expensive.
     # Deleted here because we are returning the numpy masks instead
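The new remapping in post_process restarts ids at 1 within each class, so rendered labels read "1: person", "2: person", "1: car" regardless of the global ids the tracker assigned, which makes per-class counting straightforward. A compact sketch of the same bookkeeping on made-up detections:

detections = [
    {"id": 4, "label": "person"},
    {"id": 9, "label": "person"},
    {"id": 2, "label": "car"},
    {"id": 4, "label": "person"},  # the same track seen in a later frame
]

label_remapping = {}
for det in detections:
    label, track_id = det["label"], det["id"]
    if label not in label_remapping:
        # First instance of this class: start its ids at 1.
        label_remapping[label] = {"max": 1, "remap": {track_id: 1}}
    elif track_id not in label_remapping[label]["remap"]:
        # New track for a known class: hand out the next per-class id.
        new_max = label_remapping[label]["max"] + 1
        label_remapping[label]["remap"][track_id] = new_max
        label_remapping[label]["max"] = new_max
    print(f'{label_remapping[label]["remap"][track_id]}: {label}')
# 1: person / 2: person / 1: car / 1: person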