vision-agent 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +1 -0
- vision_agent/agent/agent_coder.py +33 -7
- vision_agent/agent/vision_agent.py +16 -14
- vision_agent/agent/vision_agent_v2.py +300 -0
- vision_agent/agent/vision_agent_v2_prompt.py +170 -0
- vision_agent/llm/llm.py +11 -3
- vision_agent/tools/__init__.py +3 -3
- vision_agent/tools/tool_utils.py +1 -1
- vision_agent/tools/tools.py +62 -41
- vision_agent/tools/tools_v2.py +278 -17
- vision_agent/utils/__init__.py +3 -0
- vision_agent/utils/execute.py +104 -0
- vision_agent/utils/sim.py +70 -0
- {vision_agent-0.2.14.dist-info → vision_agent-0.2.16.dist-info}/METADATA +4 -1
- vision_agent-0.2.16.dist-info/RECORD +34 -0
- vision_agent/agent/execution.py +0 -287
- vision_agent-0.2.14.dist-info/RECORD +0 -30
- /vision_agent/{image_utils.py → utils/image_utils.py} +0 -0
- /vision_agent/{type_defs.py → utils/type_defs.py} +0 -0
- /vision_agent/{tools → utils}/video.py +0 -0
- {vision_agent-0.2.14.dist-info → vision_agent-0.2.16.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.14.dist-info → vision_agent-0.2.16.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -11,7 +11,10 @@ from PIL import Image
 from PIL.Image import Image as ImageType
 from scipy.spatial import distance  # type: ignore

-from vision_agent.image_utils import (
+from vision_agent.lmm import OpenAILMM
+from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import (
     b64_to_pil,
     convert_to_b64,
     denormalize_bbox,
@@ -19,9 +22,6 @@ from vision_agent.image_utils import (
     normalize_bbox,
     rle_decode,
 )
-from vision_agent.lmm import OpenAILMM
-from vision_agent.tools.tool_utils import _send_inference_request
-from vision_agent.tools.video import extract_frames_from_video

 _LOGGER = logging.getLogger(__name__)

@@ -174,15 +174,15 @@ class GroundingDINO(Tool):
     """

     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
+    description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -209,7 +209,7 @@ class GroundingDINO(Tool):
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -221,7 +221,7 @@ class GroundingDINO(Tool):
         prompt: str,
         image: Union[str, Path, ImageType],
         box_threshold: float = 0.20,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.20,
     ) -> Dict:
         """Invoke the Grounding DINO model.

@@ -249,7 +249,7 @@ class GroundingDINO(Tool):
             data["scores"] = [round(score, 2) for score in data["scores"]]
         if "labels" in data:
             data["labels"] = list(data["labels"])
-        data["
+        data["image_size"] = image_size
         return data


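GroundingDINO (and GroundingSAM below) now attach the original image size to their output, so a caller can convert the normalized boxes back to pixels. A minimal sketch of that conversion; the helper and the sample values are illustrative, not part of the package:

    # Hypothetical helper showing what the returned image_size enables;
    # the bbox values are made up, not real model output.
    def to_pixels(bbox, image_size):
        height, width = image_size
        x1, y1, x2, y2 = bbox
        return [x1 * width, y1 * height, x2 * width, y2 * height]

    result = {"bboxes": [[0.1, 0.2, 0.4, 0.5]], "image_size": (480, 640)}
    print(to_pixels(result["bboxes"][0], result["image_size"]))
    # [64.0, 96.0, 256.0, 240.0]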
@@ -277,15 +277,15 @@ class GroundingSAM(Tool):
     """

     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
+    description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -312,7 +312,7 @@ class GroundingSAM(Tool):
                     "prompt": "red shirt, green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -324,7 +324,7 @@ class GroundingSAM(Tool):
         prompt: str,
         image: Union[str, ImageType],
         box_threshold: float = 0.2,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.2,
     ) -> Dict:
         """Invoke the Grounding SAM model.

@@ -353,6 +353,7 @@ class GroundingSAM(Tool):
             rle_decode(mask_rle=mask, shape=data["mask_shape"])
             for mask in data["masks"]
         ]
+        data["image_size"] = image_size
         data.pop("mask_shape", None)
         return data

@@ -422,7 +423,6 @@ class DINOv(Tool):
         request_data = {
             "prompt": prompt,
             "image": image_b64,
-            "tool": "dinov",
         }
         data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
         if "bboxes" in data:
@@ -435,6 +435,8 @@ class DINOv(Tool):
                 for mask in data["masks"]
             ]
         data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
+        mask_shape = data.pop("mask_shape", None)
+        data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
         return data


@@ -790,33 +792,49 @@ class Crop(Tool):
         return {"image": tmp.name}


-class
-    r"""
+class BboxStats(Tool):
+    r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""

-    name = "
-    description = "'
+    name = "bbox_stats_"
+    description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
     usage = {
-        "required_parameters": [
+        "required_parameters": [
+            {"name": "bboxes", "type": "List[int]"},
+            {"name": "image_size", "type": "Tuple[int]"},
+        ],
         "examples": [
             {
-                "scenario": "
-                "parameters": {
-
+                "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (500, 1200),
+                },
+            },
+            {
+                "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (640, 480),
+                },
+            },
         ],
     }

-    def __call__(
+    def __call__(
+        self, bboxes: List[List[int]], image_size: Tuple[int, int]
+    ) -> List[Dict]:
         areas = []
-
-
-
-
-
-
-
-
-
-
+        height, width = image_size
+        for bbox in bboxes:
+            x1, y1, x2, y2 = bbox
+            areas.append(
+                {
+                    "width": round((x2 - x1) * width, 2),
+                    "height": round((y2 - y1) * height, 2),
+                    "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
+                }
+            )
+
         return areas


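As a quick hand check of the new BboxStats arithmetic, running the values from the first usage example above (normalized box, image_size as (height, width)) gives:

    # Sketch of the bbox_stats_ arithmetic from the diff above, run by hand.
    bbox = [0.2, 0.21, 0.34, 0.42]
    height, width = (500, 1200)
    x1, y1, x2, y2 = bbox
    stats = {
        "width": round((x2 - x1) * width, 2),    # 0.14 * 1200 = 168.0
        "height": round((y2 - y1) * height, 2),  # 0.21 * 500  = 105.0
        "area": round((x2 - x1) * (y2 - y1) * width * height, 2),  # 17640.0
    }
    print(stats)  # {'width': 168.0, 'height': 105.0, 'area': 17640.0}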
@@ -1055,22 +1073,25 @@ class ExtractFrames(Tool):
     r"""Extract frames from a video."""

     name = "extract_frames_"
-    description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+    description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
     usage = {
         "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "optional_parameters": [{"name": "frames_every", "type": "float"}],
         "examples": [
             {
                 "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                 "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
             },
             {
-                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                "parameters": {"video_uri": "tests/data/test.mp4"},
+                "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
             },
         ],
     }

-    def __call__(
+    def __call__(
+        self, video_uri: str, frames_every: float = 2
+    ) -> List[Tuple[str, float]]:
         """Extract frames from a video.


@@ -1080,7 +1101,7 @@ class ExtractFrames(Tool):
         Returns:
             a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
         """
-        frames = extract_frames_from_video(video_uri)
+        frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
         result = []
         _LOGGER.info(
             f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
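The new frames_every parameter (seconds between sampled frames) is passed to extract_frames_from_video as its reciprocal, i.e. as a frame rate. A quick check of that conversion:

    # frames_every (seconds between frames) -> fps passed to extract_frames_from_video
    for frames_every in (0.5, 1, 2, 4):
        fps = round(1 / frames_every, 2)
        print(f"one frame every {frames_every}s -> fps={fps}")
    # one frame every 0.5s -> fps=2.0
    # one frame every 1s -> fps=1.0
    # one frame every 2s -> fps=0.5
    # one frame every 4s -> fps=0.25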
@@ -1183,7 +1204,7 @@ TOOLS = {
             AgentDINOv,
             ExtractFrames,
             Crop,
-
+            BboxStats,
             SegArea,
             ObjectDistance,
             BboxContains,
vision_agent/tools/tools_v2.py
CHANGED
@@ -1,13 +1,19 @@
 import inspect
+import io
+import logging
 import tempfile
 from importlib import resources
-from
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Tuple, Union

 import numpy as np
+import pandas as pd
+import requests
 from PIL import Image, ImageDraw, ImageFont

-from vision_agent.image_utils import convert_to_b64, normalize_bbox
 from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode

 COLORS = [
     (158, 218, 229),
@@ -31,6 +37,10 @@ COLORS = [
     (255, 127, 14),
     (31, 119, 180),
 ]
+_API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+_OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)


 def grounding_dino(
@@ -39,23 +49,30 @@ def grounding_dino(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.75,
 ) -> List[Dict[str, Any]]:
-    """'grounding_dino' is a tool that can detect
-    category names or referring expressions.
+    """'grounding_dino' is a tool that can detect and count objects given a text prompt
+    such as category names or referring expressions. It returns a list and count of
+    bounding boxes, label names and associated probability scores.

     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
-        box_threshold (float, optional): The threshold for the box detection. Defaults
-
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-        bounding box of the detected objects with normalized coordinates
+        bounding box of the detected objects with normalized coordinates
+        (x1, y1, x2, y2).

     Example
     -------
     >>> grounding_dino("car. dinosaur", image)
-    [
+    [
+        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+    ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(Image.fromarray(image))
@@ -78,6 +95,147 @@ def grounding_dino(
     return return_data


+def grounding_sam(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.20,
+    iou_threshold: float = 0.75,
+) -> List[Dict[str, Any]]:
+    """'grounding_sam' is a tool that can detect and segment objects given a text
+    prompt such as category names or referring expressions. It returns a list of
+    bounding boxes, label names and masks file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (x1, y1, x2, y2).
+
+    Example
+    -------
+    >>> grounding_sam("car. dinosaur", image)
+    [
+        {
+            'score': 0.99,
+            'label': 'dinosaur',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(Image.fromarray(image))
+    request_data = {
+        "prompt": prompt,
+        "image": image_b64,
+        "tool": "visual_grounding_segment",
+        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+    }
+    data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+    return_data = []
+    for i in range(len(data["bboxes"])):
+        return_data.append(
+            {
+                "score": round(data["scores"][i], 2),
+                "label": data["labels"][i],
+                "bbox": normalize_bbox(data["bboxes"][i], image_size),
+                "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
+            }
+        )
+    return return_data
+
+
+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 0.5
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video, returns a list of tuples (frame,
+    timestamp), where timestamp is the relative time in seconds where the frame was
+    captured. The frame is a local image file path.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file.
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 0.5.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+        and the timestamp in seconds.
+
+    Example
+    -------
+    >>> extract_frames("path/to/video.mp4")
+    [(frame1, 0.0), (frame2, 0.5), ...]
+    """
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
+def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'ocr' extracts text from an image. It returns a list of detected text, bounding
+    boxes, and confidence scores.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
+        and confidence score.
+
+    Example
+    -------
+    >>> ocr(image)
+    [
+        {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+    ]
+    """
+
+    pil_image = Image.fromarray(image).convert("RGB")
+    image_size = pil_image.size[::-1]
+    image_buffer = io.BytesIO()
+    pil_image.save(image_buffer, format="PNG")
+    buffer_bytes = image_buffer.getvalue()
+    image_buffer.close()
+
+    res = requests.post(
+        _OCR_URL,
+        files={"images": buffer_bytes},
+        data={"language": "en"},
+        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
+    )
+
+    if res.status_code != 200:
+        raise ValueError(f"OCR request failed with status code {res.status_code}")
+
+    data = res.json()
+    output = []
+    for det in data[0]:
+        label = det["text"]
+        box = [
+            det["location"][0]["x"],
+            det["location"][0]["y"],
+            det["location"][2]["x"],
+            det["location"][2]["y"],
+        ]
+        box = normalize_bbox(box, image_size)
+        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+
+    return output
+
+
+# Utility and visualization functions
+
+
 def load_image(image_path: str) -> np.ndarray:
     """'load_image' is a utility function that loads an image from the given path.

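Taken together, the new functional tools can be chained without the Tool-class plumbing of tools.py. A minimal sketch, assuming a placeholder image path and network access to the hosted inference and OCR endpoints:

    # Hypothetical end-to-end use of the new functional tools in tools_v2.
    # "workers.png" is a made-up path; replace with a real image to run this.
    from vision_agent.tools.tools_v2 import (
        grounding_sam,
        load_image,
        ocr,
        overlay_segmentation_masks,
        save_image,
    )

    image = load_image("workers.png")                       # np.ndarray
    detections = grounding_sam("person. hard hat", image)   # boxes, labels, masks
    text_regions = ocr(image)                                # detected text + bboxes

    viz = overlay_segmentation_masks(image, detections)
    print(save_image(viz))                                   # path of the saved overlay
    print(text_regions[:3])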
@@ -117,24 +275,33 @@ def save_image(image: np.ndarray) -> str:
     return f.name


-def
+def overlay_bounding_boxes(
     image: np.ndarray, bboxes: List[Dict[str, Any]]
 ) -> np.ndarray:
-    """'display_bounding_boxes' is a utility function that displays bounding boxes on
+    """'display_bounding_boxes' is a utility function that displays bounding boxes on
+    an image.

     Parameters:
         image (np.ndarray): The image to display the bounding boxes on.
-        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+            boxes.

     Returns:
-        np.ndarray: The image with the bounding boxes displayed.
+        np.ndarray: The image with the bounding boxes, labels and scores displayed.

     Example
     -------
-    >>> image_with_bboxes = display_bounding_boxes(
+    >>> image_with_bboxes = display_bounding_boxes(
+        image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+    )
     """
     pil_image = Image.fromarray(image.astype(np.uint8))

+    if len(set([box["label"] for box in bboxes])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
     color = {
         label: COLORS[i % len(COLORS)]
         for i, label in enumerate(set([box["label"] for box in bboxes]))
@@ -167,15 +334,109 @@ def display_bounding_boxes(
     return np.array(pil_image.convert("RGB"))


-def
+def overlay_segmentation_masks(
+    image: np.ndarray, masks: List[Dict[str, Any]]
+) -> np.ndarray:
+    """'display_segmentation_masks' is a utility function that displays segmentation
+    masks.
+
+    Parameters:
+        image (np.ndarray): The image to display the masks on.
+        masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+
+    Returns:
+        np.ndarray: The image with the masks displayed.
+
+    Example
+    -------
+    >>> image_with_masks = display_segmentation_masks(
+        image,
+        [{
+            'score': 0.99,
+            'label': 'dinosaur',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        }],
+    )
+    """
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
+
+    if len(set([mask["label"] for mask in masks])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
+    color = {
+        label: COLORS[i % len(COLORS)]
+        for i, label in enumerate(set([mask["label"] for mask in masks]))
+    }
+
+    for elt in masks:
+        mask = elt["mask"]
+        label = elt["label"]
+        np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+        np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+        mask_img = Image.fromarray(np_mask.astype(np.uint8))
+        pil_image = Image.alpha_composite(pil_image, mask_img)
+    return np.array(pil_image.convert("RGB"))
+
+
+def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
     docstrings = ""
     for func in funcs:
-        docstrings += f"{func.__name__}
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"

     return docstrings


-
+def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+    descriptions = ""
+    for func in funcs:
+        description = func.__doc__
+        if description is None:
+            description = ""
+
+        description = (
+            description[: description.find("Parameters:")].replace("\n", " ").strip()
+        )
+        description = " ".join(description.split())
+        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+    return descriptions
+
+
+def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+    data: Dict[str, List[str]] = {"desc": [], "doc": []}
+
+    for func in funcs:
+        desc = func.__doc__
+        if desc is None:
+            desc = ""
+        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+        desc = " ".join(desc.split())
+
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        data["desc"].append(desc)
+        data["doc"].append(doc)
+
+    return pd.DataFrame(data)  # type: ignore
+
+
+TOOLS = [
+    grounding_dino,
+    grounding_sam,
+    extract_frames,
+    ocr,
+    load_image,
+    save_image,
+    overlay_bounding_boxes,
+    overlay_segmentation_masks,
+]
+TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
+TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
+TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
-    [load_image, save_image,
+    [load_image, save_image, overlay_bounding_boxes]
 )
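The description helpers slice each docstring at "Parameters:" and collapse whitespace, so the one-line summaries in TOOL_DESCRIPTIONS and TOOLS_DF come straight from the first docstring paragraph. A small illustration of that trimming with a toy function that is not part of the package:

    # Toy illustration of the trimming done by get_tool_descriptions / get_tools_df:
    # everything before "Parameters:" is kept and whitespace-collapsed.
    import inspect

    def fake_tool(x: int) -> int:
        """'fake_tool' doubles a number.

        Parameters:
            x (int): the number to double.
        """
        return 2 * x

    doc = fake_tool.__doc__ or ""
    summary = doc[: doc.find("Parameters:")].replace("\n", " ").strip()
    summary = " ".join(summary.split())
    print(f"- fake_tool{inspect.signature(fake_tool)}: {summary}")
    # - fake_tool(x: int) -> int: 'fake_tool' doubles a number.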