vision-agent 0.2.227__py3-none-any.whl → 0.2.229__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +3 -0
- vision_agent/tools/tool_utils.py +1 -2
- vision_agent/tools/tools.py +244 -0
- vision_agent/utils/video_tracking.py +1 -0
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/METADATA +1 -1
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/RECORD +8 -8
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tool_utils.py
CHANGED
@@ -72,8 +72,7 @@ def send_inference_request(
 
     response = _call_post(url, payload, session, files, function_name, is_form)
 
-
-    return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
+    return response["data"]
 
 
 def send_task_inference_request(
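The net effect of this hunk: send_inference_request no longer returns the raw response envelope when TOOL_ENDPOINT_AUTH is set in the environment; it now unwraps the "data" field unconditionally. A minimal before/after sketch, with a hypothetical _call_post_stub standing in for the real transport (the {"data": ...} envelope shape is inferred from the indexing above):

import os
from typing import Any, Dict

def _call_post_stub(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    # Hypothetical stand-in for _call_post: the tool endpoint wraps its
    # result in a {"data": ...} envelope, as the indexing in the diff implies.
    return {"data": [{"label": "car", "score": 0.99}]}

response = _call_post_stub("https://example.invalid/tool", {})

# 0.2.227 behavior: the raw envelope leaked through when TOOL_ENDPOINT_AUTH was set.
os.environ["TOOL_ENDPOINT_AUTH"] = "token"
old = response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
assert old == {"data": [{"label": "car", "score": 0.99}]}

# 0.2.229 behavior: the envelope is always unwrapped.
new = response["data"]
assert new == [{"label": "car", "score": 0.99}]

Any caller that relied on receiving the raw envelope under TOOL_ENDPOINT_AUTH now gets only the payload.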
vision_agent/tools/tools.py
CHANGED
@@ -290,6 +290,14 @@ def od_sam2_video_tracking(
         )
         function_name = "florence2_object_detection"
 
+    elif od_model == ODModels.AGENTIC:
+        segment_results = agentic_object_detection(
+            prompt=prompt,
+            image=segment_frames[frame_number],
+            fine_tune_id=fine_tune_id,
+        )
+        function_name = "agentic_object_detection"
+
     elif od_model == ODModels.CUSTOM:
         segment_results = custom_object_detection(
             deployment_id=fine_tune_id,
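This branch wires the new agentic detector into od_sam2_video_tracking's per-chunk detection dispatch, alongside the existing florence2 and custom branches. A sketch of how the branch is reached, which is essentially what the agentic_sam2_video_tracking wrapper added further down does; the ODModels import path is inferred from the one-line change to vision_agent/utils/video_tracking.py, and the frames are placeholders:

import numpy as np

from vision_agent.tools.tools import od_sam2_video_tracking
from vision_agent.utils.video_tracking import ODModels  # inferred location of the enum

# Placeholder frames; real callers would decode a video into HxWx3 arrays.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

ret = od_sam2_video_tracking(
    ODModels.AGENTIC,   # selects the agentic_object_detection branch above
    prompt="car, truck",
    frames=frames,
    chunk_length=10,    # re-run detection every 10 frames to pick up new objects
)
per_frame_detections = ret["return_data"]  # dict shape taken from the wrapper below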
@@ -2140,6 +2148,242 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
     return response
 
 
+# agentic od tools
+
+
+def _agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
+    fine_tune_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
+
+    files = [("image", image_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "agentic",
+    }
+    metadata = {"function_name": "agentic_object_detection"}
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        # we can only execute fine-tuned models with florence2
+        payload = {
+            "prompts": payload["prompts"],
+            "jobId": fine_tune_id,
+            "model": "florence2",
+        }
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-object-detection",
+        files=files,
+        metadata=metadata,
+    )
+
+    # get the first frame
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    return {
+        "files": files,
+        "return_data": bboxes_formatted,
+        "display_data": display_data,
+    }
+
+
+def agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    fine_tune_id: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """'agentic_object_detection' is a tool that can detect and count multiple objects
+    given a text prompt such as category names or referring expressions on images. The
+    categories in the text prompt are separated by commas. It returns a list of bounding
+    boxes with normalized coordinates, label names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+        >>> agentic_object_detection("car", image)
+        [
+            {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+        ]
+    """
+
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    ret = _agentic_object_detection(
+        prompt, image, image_size, fine_tune_id=fine_tune_id
+    )
+
+    _display_tool_trace(
+        agentic_object_detection.__name__,
+        {"prompts": prompt},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_instance_segmentation(
+    prompt: str, image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
+    instances of objects given a text prompt such as category names or referring
+    expressions on images. The categories in the text prompt are separated by commas. It
+    returns a list of bounding boxes with normalized coordinates, label names, masks
+    and associated probability scores.
+
+    Parameters:
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is a binary 2D numpy array where 1 indicates the object and 0
+            indicates the background.
+
+    Example
+    -------
+        >>> agentic_sam2_instance_segmentation("flower", image)
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+
+    od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
+    seg_ret = _sam2(
+        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+    )
+
+    _display_tool_trace(
+        agentic_sam2_instance_segmentation.__name__,
+        {
+            "prompts": prompt,
+        },
+        seg_ret["display_data"],
+        seg_ret["files"],
+    )
+
+    return seg_ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run agentic object
+            detection to find new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is a binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID to represent the total count.
+
+    Example
+    -------
+        >>> agentic_sam2_video_tracking("dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.AGENTIC,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+    _display_tool_trace(
+        agentic_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def minimum_distance(
     det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
 ) -> float:
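Taken together, this hunk adds one private helper (_agentic_object_detection, which posts a "text-to-object-detection" request with model "agentic", switching to "florence2" for fine-tuned jobs) and three public tools; the three-line addition to vision_agent/tools/__init__.py presumably exports exactly these. A hedged usage sketch based on the docstrings above; it assumes the package-level imports resolve as suggested, that access to the hosted inference endpoints is configured, and the zero image is a stand-in for real data:

import numpy as np

from vision_agent.tools import (
    agentic_object_detection,
    agentic_sam2_instance_segmentation,
    agentic_sam2_video_tracking,
)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real HxWx3 frame

# Boxes only: label, score, and normalized (xmin, ymin, xmax, ymax).
for det in agentic_object_detection("car, truck", image):
    print(det["label"], det["score"], det["bbox"])

# Boxes plus binary masks: the agentic detections are passed through SAM2.
segments = agentic_sam2_instance_segmentation("flower", image)
if segments:
    print("first mask covers", int(segments[0]["mask"].sum()), "pixels")

# Video tracking: one list of detections per frame; labels carry an
# instance-ID prefix such as '0: dinosaur', so distinct IDs give a
# de-duplicated count across frames.
frames = [image] * 20
tracks = agentic_sam2_video_tracking("dinosaur", frames, chunk_length=10)
unique_ids = {det["label"] for frame_dets in tracks for det in frame_dets}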
{vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/RECORD
CHANGED
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=8VpAC8zEk8OwcMLcTn7gEAfw6ihqlsEfzjEaW5yd5-4,2897
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
 vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=kXB0F-HwmiChpQgKk7tMo-Acsl3UXxjaJV9mYo_q6n4,10076
+vision_agent/tools/tools.py,sha256=M_kk17Yr5c6ODKet26GcxZAlGDwl0AwMMD4wCrBhR6Y,105157
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
 vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent/utils/video_tracking.py,sha256=
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
+vision_agent-0.2.229.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.229.dist-info/METADATA,sha256=ver5sB_NI_dkek1GxY9GsvktACS1Rl6-tgrr_B5p1Zc,20039
+vision_agent-0.2.229.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.229.dist-info/RECORD,,
{vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/LICENSE
File without changes
{vision_agent-0.2.227.dist-info → vision_agent-0.2.229.dist-info}/WHEEL
File without changes