vision-agent 0.2.221__py3-none-any.whl → 0.2.222__py3-none-any.whl
- vision_agent/.sim_tools/df.csv +253 -244
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +28 -23
- vision_agent/tools/__init__.py +6 -10
- vision_agent/tools/tools.py +639 -787
- vision_agent/utils/sim.py +24 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/METADATA +1 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/RECORD +10 -10
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/WHEEL +0 -0
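
Most of the churn in tools.py below is a rename of the detection tools (for example owl_v2_image becomes owlv2_object_detection, florence2_sam2_image becomes florence2_sam2_instance_segmentation, and countgd_example_based_counting becomes countgd_visual_prompt_object_detection), plus relocated ocr, document_extraction and document_qa definitions. A minimal sketch of calling the renamed tools under 0.2.222, assuming they are re-exported from vision_agent.tools (the __init__.py export list also changed in this release) and using a placeholder image purely for illustration:

    import numpy as np
    from vision_agent.tools import (
        owlv2_object_detection,
        owlv2_sam2_instance_segmentation,
    )

    # Placeholder image for illustration; load a real image in practice.
    image = np.zeros((480, 640, 3), dtype=np.uint8)

    # Detection returns dicts with 'score', 'label' and a normalized 'bbox'.
    detections = owlv2_object_detection("car, dinosaur", image, box_threshold=0.10)

    # The SAM2-backed variant additionally returns a binary 'mask' per detection.
    segments = owlv2_sam2_instance_segmentation("car", image)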
vision_agent/tools/tools.py
CHANGED
@@ -21,7 +21,7 @@ from pillow_heif import register_heif_opener # type: ignore
 from pytube import YouTube  # type: ignore

 from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.lmm.lmm import AnthropicLMM
+from vision_agent.lmm.lmm import AnthropicLMM
 from vision_agent.tools.tool_utils import (
     ToolCallTrace,
     add_bboxes_from_masks,
@@ -112,6 +112,114 @@ def _display_tool_trace(
     display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)


+def _sam2(
+    image: np.ndarray,
+    detections: List[Dict[str, Any]],
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
+
+    files = [("images", image_bytes)]
+    payload = {
+        "model": "sam2",
+        "bboxes": json.dumps(
+            [
+                {
+                    "labels": [d["label"] for d in detections],
+                    "bboxes": [
+                        denormalize_bbox(d["bbox"], image_size) for d in detections
+                    ],
+                }
+            ]
+        ),
+    }
+
+    metadata = {"function_name": "sam2"}
+    pred_detections = send_task_inference_request(
+        payload, "sam2", files=files, metadata=metadata
+    )
+    frame = pred_detections[0]
+    return_data = []
+    display_data = []
+    for inp_detection, detection in zip(detections, frame):
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
+        return_data.append(
+            {
+                "label": label,
+                "bbox": bbox,
+                "mask": mask,
+                "score": inp_detection["score"],
+            }
+        )
+        display_data.append(
+            {
+                "label": label,
+                "bbox": detection["bounding_box"],
+                "mask": detection["mask"],
+                "score": inp_detection["score"],
+            }
+        )
+    return {"files": files, "return_data": return_data, "display_data": display_data}
+
+
+def sam2(
+    image: np.ndarray,
+    detections: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """'sam2' is a tool that can segment multiple objects given an input bounding box,
+    label and score. It returns a set of masks along with the corresponding bounding
+    boxes and labels.
+
+    Parameters:
+        image (np.ndarray): The image that contains multiple instances of the object.
+        detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
+            label, and bounding box of the detected objects with normalized coordinates
+            between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+            of the top-left and xmax and ymax are the coordinates of the bottom-right of
+            the bounding box.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> sam2(image, [
+                {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            ])
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+    image_size = image.shape[:2]
+    ret = _sam2(image, detections, image_size)
+    _display_tool_trace(
+        sam2.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+
+    return ret["return_data"]  # type: ignore
+
+
 class ODModels(str, Enum):
     COUNTGD = "countgd"
     FLORENCE2 = "florence2"
@@ -139,15 +247,15 @@ def od_sam2_video_tracking(
             results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
             function_name = "countgd_object_detection"
         elif od_model == ODModels.OWLV2:
-            results[idx] =
+            results[idx] = owlv2_object_detection(
                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
             )
-            function_name = "
+            function_name = "owlv2_object_detection"
         elif od_model == ODModels.FLORENCE2:
-            results[idx] =
+            results[idx] = florence2_object_detection(
                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
             )
-            function_name = "
+            function_name = "florence2_object_detection"
         else:
             raise NotImplementedError(
                 f"Object detection model '{od_model}' is not implemented."
@@ -183,7 +291,7 @@ def od_sam2_video_tracking(

     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
-    payload = {"bboxes": json.dumps(output), "
+    payload = {"bboxes": json.dumps(output), "chunk_length_frames": chunk_length}
     metadata = {"function_name": function_name}

     detections = send_task_inference_request(
@@ -226,53 +334,27 @@ def od_sam2_video_tracking(
     return {"files": files, "return_data": return_data, "display_data": detections}


-
+# Owl V2 Tools
+
+
+def _owlv2_object_detection(
     prompt: str,
     image: np.ndarray,
-    box_threshold: float
+    box_threshold: float,
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
     fine_tune_id: Optional[str] = None,
-) ->
-
-
-    text prompt are separated by commas. It returns a list of bounding boxes with
-    normalized coordinates, label names and associated probability scores.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-        box_threshold (float, optional): The threshold for the box detection. Defaults
-            to 0.10.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box.
-
-    Example
-    -------
-        >>> owl_v2_image("car, dinosaur", image)
-        [
-            {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
-        ]
-    """
-
-    image_size = image.shape[:2]
-    if image_size[0] < 1 or image_size[1] < 1:
-        return []
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)

-
-    files = [("image", buffer_bytes)]
+    files = [("image", image_bytes)]
     payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
         "confidence": box_threshold,
         "model": "owlv2",
     }
-    metadata = {"function_name": "
+    metadata = {"function_name": "owlv2_object_detection"}

     if fine_tune_id is not None:
         landing_api = LandingPublicAPI()
@@ -302,127 +384,96 @@ def owl_v2_image(
         {
             "label": bbox["label"],
             "bbox": normalize_bbox(bbox["bounding_box"], image_size),
-            "score":
+            "score": bbox["score"],
         }
         for bbox in bboxes
     ]
-
-
-
-
-
-
-
-
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    return {
+        "files": files,
+        "return_data": bboxes_formatted,
+        "display_data": display_data,
+    }


-def
+def owlv2_object_detection(
     prompt: str,
-
+    image: np.ndarray,
     box_threshold: float = 0.10,
     fine_tune_id: Optional[str] = None,
-) -> List[
-    """'
-
-
-    list contains the score, label, and bounding box of the detections for that frame.
+) -> List[Dict[str, Any]]:
+    """'owlv2_object_detection' is a tool that can detect and count multiple objects
+    given a text prompt such as category names or referring expressions on images. The
+    categories in text prompt are separated by commas. It returns a list of bounding
+    boxes with normalized coordinates, label names and associated probability scores.

     Parameters:
-        prompt (str): The prompt to ground to the
-
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
-            to 0.
+            to 0.10.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
             fine-tuned model ID here to use it.

     Returns:
-        List[
-
-
-
-
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.

     Example
     -------
-        >>>
+        >>> owlv2_object_detection("car, dinosaur", image)
         [
-            [
-
-                {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
-            ],
-            ...
+            {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
         ]
     """
-    if len(frames) == 0 or not isinstance(frames, List):
-        raise ValueError("Must provide a list of numpy arrays for frames")
-
-    image_size = frames[0].shape[:2]
-    buffer_bytes = frames_to_bytes(frames)
-    files = [("video", buffer_bytes)]
-    payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "confidence": box_threshold,
-        "model": "owlv2",
-    }
-    metadata = {"function_name": "owl_v2_video"}
-
-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )

-
-
-
-            "jobId": fine_tune_id,
-            "model": "florence2",
-        }
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []

-
-
-        "text-to-object-detection",
-        files=files,
-        metadata=metadata,
+    ret = _owlv2_object_detection(
+        prompt, image, box_threshold, image_size, fine_tune_id=fine_tune_id
     )

-    bboxes_formatted = []
-    for frame_data in detections:
-        bboxes_formatted_per_frame = [
-            {
-                "label": bbox["label"],
-                "bbox": normalize_bbox(bbox["bounding_box"], image_size),
-                "score": round(bbox["score"], 2),
-            }
-            for bbox in frame_data
-        ]
-        bboxes_formatted.append(bboxes_formatted_per_frame)
     _display_tool_trace(
-
-
-
-
+        owlv2_object_detection.__name__,
+        {
+            "prompts": prompt,
+            "confidence": box_threshold,
+        },
+        ret["display_data"],
+        ret["files"],
     )
-    return
+    return ret["return_data"]  # type: ignore


-def
+def owlv2_sam2_instance_segmentation(
     prompt: str,
-
-
-
-
-
-
-
-
+    image: np.ndarray,
+    box_threshold: float = 0.10,
+) -> List[Dict[str, Any]]:
+    """'owlv2_sam2_instance_segmentation' is a tool that can detect and count multiple
+    instances of objects given a text prompt such as category names or referring
+    expressions on images. The categories in text prompt are separated by commas. It
+    returns a list of bounding boxes with normalized coordinates, label names, masks
+    and associated probability scores.

     Parameters:
-        prompt (str): The
-        image (np.ndarray): The image
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.10.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -434,16 +485,76 @@ def owlv2_sam2_video_tracking(

     Example
     -------
-        >>>
+        >>> owlv2_sam2_instance_segmentation("flower", image)
         [
-
-
-
-
-
-
-
-
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+
+    od_ret = _owlv2_object_detection(prompt, image, box_threshold, image.shape[:2])
+    seg_ret = _sam2(
+        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+    )
+
+    _display_tool_trace(
+        countgd_sam2_instance_segmentation.__name__,
+        {
+            "prompts": prompt,
+            "confidence": box_threshold,
+        },
+        seg_ret["display_data"],
+        seg_ret["files"],
+    )
+
+    return seg_ret["return_data"]  # type: ignore
+
+
+def owlv2_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
                         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                 },
             ],
@@ -467,13 +578,96 @@ def owlv2_sam2_video_tracking(
     return ret["return_data"]  # type: ignore


-
+# Florence2 Tools
+
+
+def florence2_object_detection(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'
-
-
-
+    """'florence2_object_detection' is a tool that can detect multiple
+    objects given a text prompt which can be object names or caption. You
+    can optionally separate the object names in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
+    confidence scores of 1.0.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to used to detect objects
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded
+
+    Example
+    -------
+        >>> florence2_object_detection('person looking at a coyote', image)
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        ]
+    """
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "florence2",
+    }
+    metadata = {"function_name": "florence2_object_detection"}
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        payload["jobId"] = fine_tune_id
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-object-detection",
+        files=files,
+        metadata=metadata,
+    )
+
+    # get the first frame
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": round(bbox["score"], 2),
+        }
+        for bbox in bboxes
+    ]
+
+    _display_tool_trace(
+        florence2_object_detection.__name__,
+        payload,
+        detections[0],
+        files,
+    )
+    return [bbox for bbox in bboxes_formatted]
+
+
+def florence2_sam2_instance_segmentation(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
+    """'florence2_sam2_instance_segmentation' is a tool that can segment multiple
+    objects given a text prompt such as category names or referring expressions. The
+    categories in the text prompt are separated by commas. It returns a list of
+    bounding boxes, label names, mask file names and associated probability scores of
+    1.0.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -491,7 +685,7 @@ def florence2_sam2_image(

     Example
     -------
-        >>>
+        >>> florence2_sam2_instance_segmentation("car, dinosaur", image)
         [
             {
                 'score': 1.0,
@@ -514,7 +708,7 @@ def florence2_sam2_image(
         "prompt": prompt,
         "model": "florence2sam2",
     }
-    metadata = {"function_name": "
+    metadata = {"function_name": "florence2_sam2_instance_segmentation"}

     if fine_tune_id is not None:
         landing_api = LandingPublicAPI()
@@ -543,7 +737,7 @@ def florence2_sam2_image(
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})

     _display_tool_trace(
-
+        florence2_sam2_instance_segmentation.__name__,
         payload,
         detections[0],
         files,
@@ -580,7 +774,7 @@ def florence2_sam2_video_tracking(

     Example
     -------
-        >>>
+        >>> florence2_sam2_video_tracking("car, dinosaur", frames)
         [
             [
                 {
@@ -665,10 +859,11 @@ def florence2_sam2_video_tracking(
     return return_data


-def
-    """'
-
-
+def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_ocr' is a tool that can detect text and text regions in an image.
+    Each text region contains one line of text. It returns a list of detected text,
+    the text region as a bounding box with normalized coordinates, and confidence
+    scores. The results are sorted from top-left to bottom right.

     Parameters:
         image (np.ndarray): The image to extract text from.
@@ -679,173 +874,59 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:

     Example
     -------
-        >>>
+        >>> florence2_ocr(image)
         [
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
     """

-
-    image_size = pil_image.size[::-1]
+    image_size = image.shape[:2]
     if image_size[0] < 1 or image_size[1] < 1:
         return []
-
-
-
-
-
-
-        _OCR_URL,
-        files={"images": buffer_bytes},
-        data={"language": "en"},
-        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
-    )
-
-    if res.status_code != 200:
-        raise ValueError(f"OCR request failed with status code {res.status_code}")
-
-    data = res.json()
-    output = []
-    for det in data[0]:
-        label = det["text"]
-        box = [
-            det["location"][0]["x"],
-            det["location"][0]["y"],
-            det["location"][2]["x"],
-            det["location"][2]["y"],
-        ]
-        box = normalize_bbox(box, image_size)
-        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "task": "<OCR_WITH_REGION>",
+        "function_name": "florence2_ocr",
+    }

+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<OCR_WITH_REGION>"]
+    return_data = []
+    for i in range(len(detections["quad_boxes"])):
+        return_data.append(
+            {
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(
+                    convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
+                ),
+                "score": 1.0,
+            }
+        )
     _display_tool_trace(
-
+        florence2_ocr.__name__,
         {},
-
-
+        detections,
+        image_b64,
     )
-    return
+    return return_data


-
+# CountGD Tools
+
+
+def _countgd_object_detection(
+    prompt: str,
     image: np.ndarray,
-
+    box_threshold: float,
     image_size: Tuple[int, ...],
     image_bytes: Optional[bytes] = None,
 ) -> Dict[str, Any]:
     if image_bytes is None:
         image_bytes = numpy_to_bytes(image)

-    files = [("
-
-        "model": "sam2",
-        "bboxes": json.dumps(
-            [
-                {
-                    "labels": [d["label"] for d in detections],
-                    "bboxes": [
-                        denormalize_bbox(d["bbox"], image_size) for d in detections
-                    ],
-                }
-            ]
-        ),
-    }
-
-    metadata = {"function_name": "sam2"}
-    pred_detections = send_task_inference_request(
-        payload, "sam2", files=files, metadata=metadata
-    )
-    frame = pred_detections[0]
-    return_data = []
-    display_data = []
-    for inp_detection, detection in zip(detections, frame):
-        mask = rle_decode_array(detection["mask"])
-        label = detection["label"]
-        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
-        return_data.append(
-            {
-                "label": label,
-                "bbox": bbox,
-                "mask": mask,
-                "score": inp_detection["score"],
-            }
-        )
-        display_data.append(
-            {
-                "label": label,
-                "bbox": detection["bounding_box"],
-                "mask": detection["mask"],
-                "score": inp_detection["score"],
-            }
-        )
-    return {"files": files, "return_data": return_data, "display_data": display_data}
-
-
-def sam2(
-    image: np.ndarray,
-    detections: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """'sam2' is a tool that can segment multiple objects given an input bounding box,
-    label and score. It returns a set of masks along with the corresponding bounding
-    boxes and labels.
-
-    Parameters:
-        image (np.ndarray): The image that contains multiple instances of the object.
-        detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
-            label, and bounding box of the detected objects with normalized coordinates
-            between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
-            of the top-left and xmax and ymax are the coordinates of the bottom-right of
-            the bounding box.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-            bounding box, and mask of the detected objects with normalized coordinates
-            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-            the background.
-
-    Example
-    -------
-        >>> sam2(image, [
-                {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            ])
-        [
-            {
-                'score': 0.49,
-                'label': 'flower',
-                'bbox': [0.1, 0.11, 0.35, 0.4],
-                'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0],
-                    ...,
-                    [0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-            },
-        ]
-    """
-    image_size = image.shape[:2]
-    ret = _sam2(image, detections, image_size)
-    _display_tool_trace(
-        sam2.__name__,
-        {},
-        ret["display_data"],
-        ret["files"],
-    )
-
-    return ret["return_data"]  # type: ignore
-
-
-def _countgd_object_detection(
-    prompt: str,
-    image: np.ndarray,
-    box_threshold: float,
-    image_size: Tuple[int, ...],
-    image_bytes: Optional[bytes] = None,
-) -> Dict[str, Any]:
-    if image_bytes is None:
-        image_bytes = numpy_to_bytes(image)
-
-    files = [("image", image_bytes)]
-    prompts = [p.strip() for p in prompt.split(", ")]
+    files = [("image", image_bytes)]
+    prompts = [p.strip() for p in prompt.split(", ")]

     def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
         payload = {
@@ -853,7 +934,7 @@ def _countgd_object_detection(
             "confidence": box_threshold,  # still not being used in the API
             "model": "countgd",
         }
-        metadata = {"function_name": "
+        metadata = {"function_name": "countgd_object_detection"}

         detections = send_task_inference_request(
             payload, "text-to-object-detection", files=files, metadata=metadata
@@ -939,16 +1020,16 @@ def countgd_object_detection(
     return ret["return_data"]  # type: ignore


-def
+def countgd_sam2_instance_segmentation(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.23,
 ) -> List[Dict[str, Any]]:
-    """'
-    an object given a text prompt. It is particularly useful when trying
-    count a large number of objects. You can optionally separate object
-    prompt with commas. It returns a list of bounding boxes with
-    label names, masks associated confidence scores.
+    """'countgd_sam2_instance_segmentation' is a tool that can detect multiple
+    instances of an object given a text prompt. It is particularly useful when trying
+    to detect and count a large number of objects. You can optionally separate object
+    names in the prompt with commas. It returns a list of bounding boxes with
+    normalized coordinates, label names, masks associated confidence scores.

     Parameters:
         prompt (str): The object that needs to be counted.
@@ -966,7 +1047,7 @@ def countgd_sam2_object_detection(

     Example
     -------
-        >>>
+        >>> countgd_sam2_instance_segmentation("flower", image)
         [
             {
                 'score': 0.49,
@@ -987,7 +1068,7 @@ def countgd_sam2_object_detection(
     )

     _display_tool_trace(
-
+        countgd_sam2_instance_segmentation.__name__,
         {
             "prompts": prompt,
             "confidence": box_threshold,
@@ -1012,6 +1093,8 @@ def countgd_sam2_video_tracking(
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -1052,14 +1135,15 @@ def countgd_sam2_video_tracking(
     return ret["return_data"]  # type: ignore


-def
+def countgd_visual_prompt_object_detection(
     visual_prompts: List[List[float]],
     image: np.ndarray,
     box_threshold: float = 0.23,
 ) -> List[Dict[str, Any]]:
-    """'
-    instances of an object given few visual example prompts. It returns a list
-    boxes with normalized coordinates, label names and associated
+    """'countgd_visual_prompt_object_detection' is a tool that can precisely count
+    multiple instances of an object given few visual example prompts. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
+    confidence scores.

     Parameters:
         visual_prompts (List[List[float]]): Bounding boxes of the object in format
@@ -1077,7 +1161,7 @@ def countgd_example_based_counting(

     Example
     -------
-        >>>
+        >>> countgd_visual_object_detection(
                 visual_prompts=[[0.1, 0.1, 0.4, 0.42], [0.2, 0.3, 0.25, 0.35]],
                 image=image
             )
@@ -1098,7 +1182,7 @@ def countgd_example_based_counting(
         denormalize_bbox(bbox, image.shape[:2]) for bbox in visual_prompts
     ]
     payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
-    metadata = {"function_name": "
+    metadata = {"function_name": "countgd_visual_prompt_object_detection"}

     detections = send_task_inference_request(
         payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
@@ -1115,7 +1199,7 @@ def countgd_example_based_counting(
         for bbox in bboxes_per_frame
     ]
     _display_tool_trace(
-
+        countgd_visual_prompt_object_detection.__name__,
         payload,
         [
             {
@@ -1214,6 +1298,67 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data)


+def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'ocr' extracts text from an image. It returns a list of detected text, bounding
+    boxes with normalized coordinates, and confidence scores. The results are sorted
+    from top-left to bottom right.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with normalized coordinates, and confidence score.
+
+    Example
+    -------
+        >>> ocr(image)
+        [
+            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+        ]
+    """
+
+    pil_image = Image.fromarray(image).convert("RGB")
+    image_size = pil_image.size[::-1]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+    image_buffer = io.BytesIO()
+    pil_image.save(image_buffer, format="PNG")
+    buffer_bytes = image_buffer.getvalue()
+    image_buffer.close()
+
+    res = requests.post(
+        _OCR_URL,
+        files={"images": buffer_bytes},
+        data={"language": "en"},
+        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
+    )
+
+    if res.status_code != 200:
+        raise ValueError(f"OCR request failed with status code {res.status_code}")
+
+    data = res.json()
+    output = []
+    for det in data[0]:
+        label = det["text"]
+        box = [
+            det["location"][0]["x"],
+            det["location"][0]["y"],
+            det["location"][2]["x"],
+            det["location"][2]["y"],
+        ]
+        box = normalize_bbox(box, image_size)
+        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+
+    _display_tool_trace(
+        ocr.__name__,
+        {},
+        data,
+        cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
+    )
+    return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
+
+
 def claude35_text_extraction(image: np.ndarray) -> str:
     """'claude35_text_extraction' is a tool that can extract text from an image. It
     returns the extracted text as a string and can be used as an alternative to OCR if
@@ -1238,67 +1383,146 @@ def claude35_text_extraction(image: np.ndarray) -> str:
     return cast(str, text)


-def
-    """'
-
-
+def document_extraction(image: np.ndarray) -> Dict[str, Any]:
+    """'document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, pictures, charts, and other
+    information.

     Parameters:
-
-        image (np.ndarray): The reference image used for the question
+        image (np.ndarray): The document image to analyze

     Returns:
-        str: A
+        Dict[str, Any]: A dictionary containing the extracted information.

     Example
     -------
-        >>>
-        '
+        >>> document_analysis(image)
+        {'pages':
+            [{'bbox': [0, 0, 1.0, 1.0],
+              'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
+                          'label': 'page_header',
+                          'order': 75
+                          'caption': 'Annual Report 2024',
+                          'summary': 'This annual report summarizes ...' },
+                         {'bbox': [0.2, 0.9, 0.9, 1.0],
+                          'label': 'table',
+                          'order': 1119,
+                          'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                          'summary': 'This table illustrates a trend of ...'},
+            ],
     """

-
-    buffer = io.BytesIO()
-    Image.fromarray(image).save(buffer, format="PNG")
-    image_bytes = buffer.getvalue()
-    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
-    resp = lmm.generate(prompt, [image_b64])
-    return cast(str, resp)
+    image_file = numpy_to_bytes(image)

+    files = [("image", image_file)]

-
-
-
-
+    payload = {
+        "model": "document-analysis",
+    }
+
+    data: Dict[str, Any] = send_inference_request(
+        payload=payload,
+        endpoint_name="document-analysis",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "document_analysis"},
+    )
+
+    # don't display normalized bboxes
+    _display_tool_trace(
+        document_extraction.__name__,
+        payload,
+        data,
+        files,
+    )
+
+    def normalize(data: Any) -> Dict[str, Any]:
+        if isinstance(data, Dict):
+            if "bbox" in data:
+                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+            for key in data:
+                data[key] = normalize(data[key])
+        elif isinstance(data, List):
+            for i in range(len(data)):
+                data[i] = normalize(data[i])
+        return data  # type: ignore
+
+    data = normalize(data)
+
+    return data
+
+
+def document_qa(
+    prompt: str,
+    image: np.ndarray,
+) -> str:
+    """'document_qa' is a tool that can answer any questions about arbitrary documents,
+    presentations, or tables. It's very useful for document QA tasks, you can ask it a
+    specific question or ask it to return a JSON object answering multiple questions
+    about the document.

     Parameters:
-        prompt (str): The question about the
-
+        prompt (str): The question to be answered about the document image.
+        image (np.ndarray): The document image to analyze.

     Returns:
-        str:
+        str: The answer to the question based on the document's context.

     Example
     -------
-        >>>
-        '
+        >>> document_qa(image, question)
+        'The answer to the question ...'
     """

-
+    image_file = numpy_to_bytes(image)

-
-    step = len(frames) / 10
-    frames = [frames[int(i * step)] for i in range(10)]
+    files = [("image", image_file)]

-
-
-
-        Image.fromarray(frame).save(buffer, format="PNG")
-        image_bytes = buffer.getvalue()
-        image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
-        frames_b64.append(image_b64)
+    payload = {
+        "model": "document-analysis",
+    }

-
-
+    data: Dict[str, Any] = send_inference_request(
+        payload=payload,
+        endpoint_name="document-analysis",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "document_qa"},
+    )
+
+    def normalize(data: Any) -> Dict[str, Any]:
+        if isinstance(data, Dict):
+            if "bbox" in data:
+                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+            for key in data:
+                data[key] = normalize(data[key])
+        elif isinstance(data, List):
+            for i in range(len(data)):
+                data[i] = normalize(data[i])
+        return data  # type: ignore
+
+    data = normalize(data)
+
+    prompt = f"""
+    Document Context:
+    {data}\n
+    Question: {prompt}\n
+    Answer the question directly using only the information from the document, do not answer with any additional text besides the answer. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
+    """
+
+    lmm = AnthropicLMM()
+    llm_output = lmm.generate(prompt=prompt)
+    llm_output = cast(str, llm_output)
+
+    _display_tool_trace(
+        document_qa.__name__,
+        payload,
+        llm_output,
+        files,
+    )
+
+    return llm_output


 def video_temporal_localization(
@@ -1360,284 +1584,70 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
|
|
1360
1584
|
Parameters:
|
1361
1585
|
image (np.ndarray): The image to classify or tag
|
1362
1586
|
|
1363
|
-
Returns:
|
1364
|
-
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
1365
|
-
contains a list of labels and other a list of scores.
|
1366
|
-
|
1367
|
-
Example
|
1368
|
-
-------
|
1369
|
-
>>> vit_image_classification(image)
|
1370
|
-
{"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
|
1371
|
-
"""
|
1372
|
-
if image.shape[0] < 1 or image.shape[1] < 1:
|
1373
|
-
return {"labels": [], "scores": []}
|
1374
|
-
|
1375
|
-
image_b64 = convert_to_b64(image)
|
1376
|
-
data = {
|
1377
|
-
"image": image_b64,
|
1378
|
-
"tool": "image_classification",
|
1379
|
-
"function_name": "vit_image_classification",
|
1380
|
-
}
|
1381
|
-
resp_data: dict[str, Any] = send_inference_request(data, "tools")
|
1382
|
-
resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
|
1383
|
-
_display_tool_trace(
|
1384
|
-
vit_image_classification.__name__,
|
1385
|
-
data,
|
1386
|
-
resp_data,
|
1387
|
-
image_b64,
|
1388
|
-
)
|
1389
|
-
return resp_data
|
1390
|
-
|
1391
|
-
|
1392
|
-
def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
|
1393
|
-
"""'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
|
1394
|
-
It returns the predicted label and their probability scores based on image content.
|
1395
|
-
|
1396
|
-
Parameters:
|
1397
|
-
image (np.ndarray): The image to classify or tag
|
1398
|
-
|
1399
|
-
Returns:
|
1400
|
-
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
1401
|
-
contains a list of labels and other a list of scores.
|
1402
|
-
|
1403
|
-
Example
|
1404
|
-
-------
|
1405
|
-
>>> vit_nsfw_classification(image)
|
1406
|
-
{"label": "normal", "scores": 0.68},
|
1407
|
-
"""
|
1408
|
-
if image.shape[0] < 1 or image.shape[1] < 1:
|
1409
|
-
raise ValueError(f"Image is empty, image shape: {image.shape}")
|
1410
|
-
|
1411
|
-
image_b64 = convert_to_b64(image)
|
1412
|
-
data = {
|
1413
|
-
"image": image_b64,
|
1414
|
-
"function_name": "vit_nsfw_classification",
|
1415
|
-
}
|
1416
|
-
resp_data: dict[str, Any] = send_inference_request(
|
1417
|
-
data, "nsfw-classification", v2=True
|
1418
|
-
)
|
1419
|
-
resp_data["score"] = round(resp_data["score"], 4)
|
1420
|
-
_display_tool_trace(
|
1421
|
-
vit_nsfw_classification.__name__,
|
1422
|
-
data,
|
1423
|
-
resp_data,
|
1424
|
-
image_b64,
|
1425
|
-
)
|
1426
|
-
return resp_data
|
1427
|
-
|
1428
|
-
|
1429
|
-
def florence2_phrase_grounding(
|
1430
|
-
prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
|
1431
|
-
) -> List[Dict[str, Any]]:
|
1432
|
-
"""'florence2_phrase_grounding' is a tool that can detect multiple
|
1433
|
-
objects given a text prompt which can be object names or caption. You
|
1434
|
-
can optionally separate the object names in the text with commas. It returns a list
|
1435
|
-
of bounding boxes with normalized coordinates, label names and associated
|
1436
|
-
confidence scores of 1.0.
|
1437
|
-
|
1438
|
-
Parameters:
|
1439
|
-
prompt (str): The prompt to ground to the image.
|
1440
|
-
image (np.ndarray): The image to used to detect objects
|
1441
|
-
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
1442
|
-
fine-tuned model ID here to use it.
|
1443
|
-
|
1444
|
-
Returns:
|
1445
|
-
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
1446
|
-
bounding box of the detected objects with normalized coordinates between 0
|
1447
|
-
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
1448
|
-
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
1449
|
-
bounding box. The scores are always 1.0 and cannot be thresholded
|
1450
|
-
|
1451
|
-
Example
|
1452
|
-
-------
|
1453
|
-
>>> florence2_phrase_grounding('person looking at a coyote', image)
|
1454
|
-
[
|
1455
|
-
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
1456
|
-
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
1457
|
-
]
|
1458
|
-
"""
|
1459
|
-
image_size = image.shape[:2]
|
1460
|
-
if image_size[0] < 1 or image_size[1] < 1:
|
1461
|
-
return []
|
1462
|
-
|
1463
|
-
buffer_bytes = numpy_to_bytes(image)
|
1464
|
-
files = [("image", buffer_bytes)]
|
1465
|
-
payload = {
|
1466
|
-
"prompts": [s.strip() for s in prompt.split(",")],
|
1467
|
-
"model": "florence2",
|
1468
|
-
}
|
1469
|
-
metadata = {"function_name": "florence2_phrase_grounding"}
|
1470
|
-
|
1471
|
-
if fine_tune_id is not None:
|
1472
|
-
landing_api = LandingPublicAPI()
|
1473
|
-
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
|
1474
|
-
if status is not JobStatus.SUCCEEDED:
|
1475
|
-
raise FineTuneModelIsNotReady(
|
1476
|
-
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
1477
|
-
)
|
1478
|
-
|
1479
|
-
payload["jobId"] = fine_tune_id
|
1480
|
-
|
1481
|
-
detections = send_task_inference_request(
|
1482
|
-
payload,
|
1483
|
-
"text-to-object-detection",
|
1484
|
-
files=files,
|
1485
|
-
metadata=metadata,
|
1486
|
-
)
|
1487
|
-
|
1488
|
-
# get the first frame
|
1489
|
-
bboxes = detections[0]
|
1490
|
-
bboxes_formatted = [
|
1491
|
-
{
|
1492
|
-
"label": bbox["label"],
|
1493
|
-
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
1494
|
-
"score": round(bbox["score"], 2),
|
1495
|
-
}
|
1496
|
-
for bbox in bboxes
|
1497
|
-
]
|
1498
|
-
|
1499
|
-
_display_tool_trace(
|
1500
|
-
florence2_phrase_grounding.__name__,
|
1501
|
-
payload,
|
1502
|
-
detections[0],
|
1503
|
-
files,
|
1504
|
-
)
|
1505
|
-
return [bbox for bbox in bboxes_formatted]
|
1506
|
-
|
1507
|
-
|
1508
|
-
def florence2_phrase_grounding_video(
|
1509
|
-
prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
|
1510
|
-
) -> List[List[Dict[str, Any]]]:
|
1511
|
-
"""'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
|
1512
|
-
It can detect multiple objects given a text prompt which can be object names or
|
1513
|
-
caption. You can optionally separate the object names in the text with commas.
|
1514
|
-
It returns a list of lists where each inner list contains bounding boxes with
|
1515
|
-
normalized coordinates, label names and associated probability scores of 1.0.
|
1516
|
-
|
1517
|
-
Parameters:
|
1518
|
-
prompt (str): The prompt to ground to the video.
|
1519
|
-
frames (List[np.ndarray]): The list of frames to detect objects.
|
1520
|
-
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
1521
|
-
fine-tuned model ID here to use it.
|
1522
|
-
|
1523
|
-
Returns:
|
1524
|
-
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
|
1525
|
-
label, and bounding box of the detected objects with normalized coordinates
|
1526
|
-
between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
|
1527
|
-
of the top-left and xmax and ymax are the coordinates of the bottom-right of
|
1528
|
-
the bounding box. The scores are always 1.0 and cannot be thresholded.
|
1529
|
-
|
1530
|
-
Example
|
1531
|
-
-------
|
1532
|
-
>>> florence2_phrase_grounding_video('person looking at a coyote', frames)
|
1533
|
-
[
|
1534
|
-
[
|
1535
|
-
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
1536
|
-
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
1537
|
-
],
|
1538
|
-
...
|
1539
|
-
]
|
1540
|
-
"""
|
1541
|
-
if len(frames) == 0:
|
1542
|
-
raise ValueError("No frames provided")
|
1543
|
-
|
1544
|
-
image_size = frames[0].shape[:2]
|
1545
|
-
buffer_bytes = frames_to_bytes(frames)
|
1546
|
-
files = [("video", buffer_bytes)]
|
1547
|
-
payload = {
|
1548
|
-
"prompts": [s.strip() for s in prompt.split(",")],
|
1549
|
-
"model": "florence2",
|
1550
|
-
}
|
1551
|
-
metadata = {"function_name": "florence2_phrase_grounding_video"}
|
1552
|
-
|
1553
|
-
if fine_tune_id is not None:
|
1554
|
-
landing_api = LandingPublicAPI()
|
1555
|
-
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
|
1556
|
-
if status is not JobStatus.SUCCEEDED:
|
1557
|
-
raise FineTuneModelIsNotReady(
|
1558
|
-
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
1559
|
-
)
|
1560
|
-
|
1561
|
-
payload["jobId"] = fine_tune_id
|
1562
|
-
|
1563
|
-
detections = send_task_inference_request(
|
1564
|
-
payload,
|
1565
|
-
"text-to-object-detection",
|
1566
|
-
files=files,
|
1567
|
-
metadata=metadata,
|
1568
|
-
)
|
1569
|
-
|
1570
|
-
bboxes_formatted = []
|
1571
|
-
for frame_data in detections:
|
1572
|
-
bboxes_formatted_per_frame = [
|
1573
|
-
{
|
1574
|
-
"label": bbox["label"],
|
1575
|
-
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
1576
|
-
"score": round(bbox["score"], 2),
|
1577
|
-
}
|
1578
|
-
for bbox in frame_data
|
1579
|
-
]
|
1580
|
-
bboxes_formatted.append(bboxes_formatted_per_frame)
|
1587
|
+
Returns:
|
1588
|
+
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
1589
|
+
contains a list of labels and other a list of scores.
|
1590
|
+
|
1591
|
+
Example
|
1592
|
+
-------
|
1593
|
+
>>> vit_image_classification(image)
|
1594
|
+
{"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
|
1595
|
+
"""
|
1596
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
1597
|
+
return {"labels": [], "scores": []}
|
1598
|
+
|
1599
|
+
image_b64 = convert_to_b64(image)
|
1600
|
+
data = {
|
1601
|
+
"image": image_b64,
|
1602
|
+
"tool": "image_classification",
|
1603
|
+
"function_name": "vit_image_classification",
|
1604
|
+
}
|
1605
|
+
resp_data: dict[str, Any] = send_inference_request(data, "tools")
|
1606
|
+
resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
|
1581
1607
|
_display_tool_trace(
|
1582
|
-
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1608
|
+
vit_image_classification.__name__,
|
1609
|
+
data,
|
1610
|
+
resp_data,
|
1611
|
+
image_b64,
|
1586
1612
|
)
|
1587
|
-
return
|
1613
|
+
return resp_data
 
 
-def
-    """'
-
-    the text region as a bounding box with normalized coordinates, and confidence
-    scores. The results are sorted from top-left to bottom right.
+def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
+    """'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
+    It returns the predicted label and their probability scores based on image content.
 
     Parameters:
-        image (np.ndarray): The image to
+        image (np.ndarray): The image to classify or tag
 
     Returns:
-
-
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+            contains a list of labels and other a list of scores.
 
     Example
     -------
-        >>>
-
-        {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-        ]
+        >>> vit_nsfw_classification(image)
+        {"label": "normal", "scores": 0.68},
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        raise ValueError(f"Image is empty, image shape: {image.shape}")
 
-    image_size = image.shape[:2]
-    if image_size[0] < 1 or image_size[1] < 1:
-        return []
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "
-        "function_name": "florence2_ocr",
+        "function_name": "vit_nsfw_classification",
     }
-
-
-
-
-    for i in range(len(detections["quad_boxes"])):
-        return_data.append(
-            {
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(
-                    convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
-                ),
-                "score": 1.0,
-            }
-        )
+    resp_data: dict[str, Any] = send_inference_request(
+        data, "nsfw-classification", v2=True
+    )
+    resp_data["score"] = round(resp_data["score"], 4)
     _display_tool_trace(
-
-
-
+        vit_nsfw_classification.__name__,
+        data,
+        resp_data,
         image_b64,
     )
-    return
+    return resp_data
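
The new vit_nsfw_classification returns a single label with a rounded score (the implementation keys the result on "score", while the docstring example shows "scores"). A minimal sketch of using it as a content gate, assuming the tool is importable from vision_agent.tools; the threshold and helper name are assumptions:

import numpy as np

from vision_agent.tools import vit_nsfw_classification  # import path assumed


def is_safe(image: np.ndarray, threshold: float = 0.5) -> bool:
    # Treat the image as safe only when the classifier predicts "normal"
    # with at least `threshold` confidence.
    result = vit_nsfw_classification(image)
    return result["label"] == "normal" and result["score"] >= threshold
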
 
 
 def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -2097,164 +2107,6 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
 
 
-def document_extraction(image: np.ndarray) -> Dict[str, Any]:
-    """'document_extraction' is a tool that can extract structured information out of
-    documents with different layouts. It returns the extracted data in a structured
-    hierarchical format containing text, tables, pictures, charts, and other
-    information.
-
-    Parameters:
-        image (np.ndarray): The document image to analyze
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the extracted information.
-
-    Example
-    -------
-        >>> document_analysis(image)
-        {'pages':
-            [{'bbox': [0, 0, 1.0, 1.0],
-                'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
-                    'label': 'page_header',
-                    'order': 75
-                    'caption': 'Annual Report 2024',
-                    'summary': 'This annual report summarizes ...' },
-                {'bbox': [0.2, 0.9, 0.9, 1.0],
-                    'label': table',
-                    'order': 1119,
-                    'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
-                    'summary': 'This table illustrates a trend of ...'},
-            ],
-    """
-
-    image_file = numpy_to_bytes(image)
-
-    files = [("image", image_file)]
-
-    payload = {
-        "model": "document-analysis",
-    }
-
-    data: Dict[str, Any] = send_inference_request(
-        payload=payload,
-        endpoint_name="document-analysis",
-        files=files,
-        v2=True,
-        metadata_payload={"function_name": "document_analysis"},
-    )
-
-    # don't display normalized bboxes
-    _display_tool_trace(
-        document_extraction.__name__,
-        payload,
-        data,
-        files,
-    )
-
-    def normalize(data: Any) -> Dict[str, Any]:
-        if isinstance(data, Dict):
-            if "bbox" in data:
-                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
-            for key in data:
-                data[key] = normalize(data[key])
-        elif isinstance(data, List):
-            for i in range(len(data)):
-                data[i] = normalize(data[i])
-        return data  # type: ignore
-
-    data = normalize(data)
-
-    return data
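
The removed document_extraction returned the nested pages-to-chunks structure shown in its docstring, with bounding boxes normalized by the recursive helper above. A minimal sketch of walking that structure to collect chunks by label; the helper name and any keys beyond the docstring example are assumptions:

from typing import Any, Dict, List


def chunks_with_label(doc: Dict[str, Any], label: str) -> List[Dict[str, Any]]:
    # Walk pages -> chunks and keep chunks whose 'label' matches,
    # e.g. 'table' or 'page_header' as in the docstring example above.
    matches = []
    for page in doc.get("pages", []):
        for chunk in page.get("chunks", []):
            if chunk.get("label") == label:
                matches.append(chunk)
    return matches
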
-
-
-def document_qa(
-    prompt: str,
-    image: np.ndarray,
-) -> str:
-    """'document_qa' is a tool that can answer any questions about arbitrary documents,
-    presentations, or tables. It's very useful for document QA tasks, you can ask it a
-    specific question or ask it to return a JSON object answering multiple questions
-    about the document.
-
-    Parameters:
-        prompt (str): The question to be answered about the document image.
-        image (np.ndarray): The document image to analyze.
-
-    Returns:
-        str: The answer to the question based on the document's context.
-
-    Example
-    -------
-        >>> document_qa(image, question)
-        'The answer to the question ...'
-    """
-
-    image_file = numpy_to_bytes(image)
-
-    files = [("image", image_file)]
-
-    payload = {
-        "model": "document-analysis",
-    }
-
-    data: Dict[str, Any] = send_inference_request(
-        payload=payload,
-        endpoint_name="document-analysis",
-        files=files,
-        v2=True,
-        metadata_payload={"function_name": "document_qa"},
-    )
-
-    def normalize(data: Any) -> Dict[str, Any]:
-        if isinstance(data, Dict):
-            if "bbox" in data:
-                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
-            for key in data:
-                data[key] = normalize(data[key])
-        elif isinstance(data, List):
-            for i in range(len(data)):
-                data[i] = normalize(data[i])
-        return data  # type: ignore
-
-    data = normalize(data)
-
-    prompt = f"""
-    Document Context:
-    {data}\n
-    Question: {prompt}\n
-    Answer the question directly using only the information from the document, do not answer with any additional text besides the answer. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
-    """
-
-    lmm = AnthropicLMM()
-    llm_output = lmm.generate(prompt=prompt)
-    llm_output = cast(str, llm_output)
-
-    _display_tool_trace(
-        document_qa.__name__,
-        payload,
-        llm_output,
-        files,
-    )
-
-    return llm_output
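
The removed document_qa implemented a two-step pattern: run document-analysis on the image, then have an LMM answer over the extracted context. A rough sketch of that pattern, assuming it sits next to the same module-level helpers the removed code used (numpy_to_bytes, send_inference_request, AnthropicLMM, cast); the standalone helper name is hypothetical:

def ask_document(question: str, image: np.ndarray) -> str:
    # Step 1: extract the document structure, exactly as the removed code did.
    files = [("image", numpy_to_bytes(image))]
    context: Dict[str, Any] = send_inference_request(
        payload={"model": "document-analysis"},
        endpoint_name="document-analysis",
        files=files,
        v2=True,
        metadata_payload={"function_name": "document_qa"},
    )
    # Step 2: let the LMM answer using only the extracted context.
    answer = AnthropicLMM().generate(
        prompt=f"Document Context:\n{context}\n\nQuestion: {question}\n"
        "Answer using only the information from the document."
    )
    return cast(str, answer)
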
-
-
-def stella_embeddings(prompts: List[str]) -> List[np.ndarray]:
-    payload = {
-        "input": prompts,
-        "model": "stella1.5b",
-    }
-
-    data: Dict[str, Any] = send_inference_request(
-        payload=payload,
-        endpoint_name="embeddings",
-        v2=True,
-        metadata_payload={"function_name": "get_embeddings"},
-        is_form=True,
-    )
-    return [d["embedding"] for d in data]  # type: ignore
-
-
 # Utility and visualization functions
 
 
@@ -2772,31 +2624,31 @@ def _plot_counting(
 
 
 FUNCTION_TOOLS = [
-
-
-
-    vit_image_classification,
-    vit_nsfw_classification,
+    owlv2_object_detection,
+    owlv2_sam2_instance_segmentation,
+    owlv2_sam2_video_tracking,
     countgd_object_detection,
-
+    countgd_sam2_instance_segmentation,
+    countgd_sam2_video_tracking,
     florence2_ocr,
-
+    florence2_sam2_instance_segmentation,
     florence2_sam2_video_tracking,
-
+    florence2_object_detection,
     claude35_text_extraction,
+    document_extraction,
+    document_qa,
+    ocr,
+    qwen2_vl_images_vqa,
+    qwen2_vl_video_vqa,
     detr_segmentation,
     depth_anything_v2,
     generate_pose_image,
-
-
-    qwen2_vl_video_vqa,
-    document_extraction,
-    document_qa,
+    vit_image_classification,
+    vit_nsfw_classification,
     video_temporal_localization,
     flux_image_inpainting,
     siglip_classification,
-
-    countgd_sam2_video_tracking,
+    minimum_distance,
 ]
 
 UTIL_TOOLS = [
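
FUNCTION_TOOLS and UTIL_TOOLS are plain lists of tool functions, and the reordering above is what changes between the two releases. A minimal sketch of iterating such a registry, assuming only that each entry is a function with a docstring; the combined list and helper below are assumptions, not the package's actual API:

# Illustrative only: index the registered tools by name using the first line
# of each docstring, e.g. tool_summaries(FUNCTION_TOOLS + UTIL_TOOLS).
def tool_summaries(tools: list) -> dict:
    summaries = {}
    for tool in tools:
        doc = (tool.__doc__ or "").strip()
        summaries[tool.__name__] = doc.splitlines()[0] if doc else ""
    return summaries
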