vision-agent 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +1 -0
- vision_agent/agent/agent_coder.py +33 -7
- vision_agent/agent/vision_agent.py +15 -13
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/agent/vision_agent_v2.py +300 -0
- vision_agent/agent/vision_agent_v2_prompt.py +170 -0
- vision_agent/llm/llm.py +11 -3
- vision_agent/tools/__init__.py +2 -2
- vision_agent/tools/tool_utils.py +1 -1
- vision_agent/tools/tools.py +4 -5
- vision_agent/tools/tools_v2.py +278 -17
- vision_agent/utils/__init__.py +3 -0
- vision_agent/utils/execute.py +104 -0
- vision_agent/utils/sim.py +70 -0
- {vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/METADATA +4 -2
- vision_agent-0.2.15.dist-info/RECORD +34 -0
- vision_agent/agent/execution.py +0 -287
- vision_agent-0.2.13.dist-info/RECORD +0 -30
- /vision_agent/{image_utils.py → utils/image_utils.py} +0 -0
- /vision_agent/{type_defs.py → utils/type_defs.py} +0 -0
- /vision_agent/{tools → utils}/video.py +0 -0
- {vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/WHEEL +0 -0
vision_agent/tools/tool_utils.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -11,7 +11,10 @@ from PIL import Image
 from PIL.Image import Image as ImageType
 from scipy.spatial import distance  # type: ignore
 
-from vision_agent.image_utils import (
+from vision_agent.lmm import OpenAILMM
+from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import (
     b64_to_pil,
     convert_to_b64,
     denormalize_bbox,
@@ -19,9 +22,6 @@ from vision_agent.image_utils import (
     normalize_bbox,
     rle_decode,
 )
-from vision_agent.lmm import OpenAILMM
-from vision_agent.tools.tool_utils import _send_inference_request
-from vision_agent.tools.video import extract_frames_from_video
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -422,7 +422,6 @@ class DINOv(Tool):
         request_data = {
             "prompt": prompt,
             "image": image_b64,
-            "tool": "dinov",
         }
         data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
         if "bboxes" in data:
vision_agent/tools/tools_v2.py
CHANGED
@@ -1,13 +1,19 @@
 import inspect
+import io
+import logging
 import tempfile
 from importlib import resources
-from typing import Any, Callable, Dict, List, Union
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Tuple, Union
 
 import numpy as np
+import pandas as pd
+import requests
 from PIL import Image, ImageDraw, ImageFont
 
-from vision_agent.image_utils import convert_to_b64, normalize_bbox
 from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode
 
 COLORS = [
     (158, 218, 229),
@@ -31,6 +37,10 @@ COLORS = [
     (255, 127, 14),
     (31, 119, 180),
 ]
+_API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+_OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
 
 
 def grounding_dino(
@@ -39,23 +49,30 @@ def grounding_dino(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.75,
 ) -> List[Dict[str, Any]]:
-    """'grounding_dino' is a tool that can detect objects given a text prompt such as
-    category names or referring expressions.
+    """'grounding_dino' is a tool that can detect and count objects given a text prompt
+    such as category names or referring expressions. It returns a list and count of
+    bounding boxes, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
-        box_threshold (float, optional): The threshold for the box detection. Defaults
-            to 0.20.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates
+            bounding box of the detected objects with normalized coordinates
+            (x1, y1, x2, y2).
 
     Example
     -------
     >>> grounding_dino("car. dinosaur", image)
-    [
+    [
+        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+    ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(Image.fromarray(image))
@@ -78,6 +95,147 @@ def grounding_dino(
     return return_data
 
 
+def grounding_sam(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.20,
+    iou_threshold: float = 0.75,
+) -> List[Dict[str, Any]]:
+    """'grounding_sam' is a tool that can detect and segment objects given a text
+    prompt such as category names or referring expressions. It returns a list of
+    bounding boxes, label names and masks file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (x1, y1, x2, y2).
+
+    Example
+    -------
+    >>> grounding_sam("car. dinosaur", image)
+    [
+        {
+            'score': 0.99,
+            'label': 'dinosaur',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(Image.fromarray(image))
+    request_data = {
+        "prompt": prompt,
+        "image": image_b64,
+        "tool": "visual_grounding_segment",
+        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+    }
+    data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+    return_data = []
+    for i in range(len(data["bboxes"])):
+        return_data.append(
+            {
+                "score": round(data["scores"][i], 2),
+                "label": data["labels"][i],
+                "bbox": normalize_bbox(data["bboxes"][i], image_size),
+                "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
+            }
+        )
+    return return_data
+
+
+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 0.5
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video, returns a list of tuples (frame,
+    timestamp), where timestamp is the relative time in seconds where the frame was
+    captured. The frame is a local image file path.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file.
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 0.5.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+            and the timestamp in seconds.
+
+    Example
+    -------
+    >>> extract_frames("path/to/video.mp4")
+    [(frame1, 0.0), (frame2, 0.5), ...]
+    """
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
+def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'ocr' extracts text from an image. It returns a list of detected text, bounding
+    boxes, and confidence scores.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
+            and confidence score.
+
+    Example
+    -------
+    >>> ocr(image)
+    [
+        {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+    ]
+    """
+
+    pil_image = Image.fromarray(image).convert("RGB")
+    image_size = pil_image.size[::-1]
+    image_buffer = io.BytesIO()
+    pil_image.save(image_buffer, format="PNG")
+    buffer_bytes = image_buffer.getvalue()
+    image_buffer.close()
+
+    res = requests.post(
+        _OCR_URL,
+        files={"images": buffer_bytes},
+        data={"language": "en"},
+        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
+    )
+
+    if res.status_code != 200:
+        raise ValueError(f"OCR request failed with status code {res.status_code}")
+
+    data = res.json()
+    output = []
+    for det in data[0]:
+        label = det["text"]
+        box = [
+            det["location"][0]["x"],
+            det["location"][0]["y"],
+            det["location"][2]["x"],
+            det["location"][2]["y"],
+        ]
+        box = normalize_bbox(box, image_size)
+        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+
+    return output
+
+
+# Utility and visualization functions
+
+
 def load_image(image_path: str) -> np.ndarray:
     """'load_image' is a utility function that loads an image from the given path.
 
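Taken together, the new functional tools compose into a simple detect-and-read pipeline. A usage sketch, assuming the package is installed, the image path is hypothetical, and network access to the inference endpoints behind `_send_inference_request` and `_OCR_URL` is available:

```python
from vision_agent.tools.tools_v2 import grounding_dino, grounding_sam, load_image, ocr

image = load_image("dinosaur.jpg")  # hypothetical local file

# Detection: returns dicts with normalized (x1, y1, x2, y2) boxes.
for det in grounding_dino("car. dinosaur", image, box_threshold=0.20):
    print(det["label"], det["score"], det["bbox"])

# Segmentation: same request shape, plus a decoded uint8 mask per detection.
masks = grounding_sam("dinosaur", image)

# OCR: POSTs the PNG-encoded image to _OCR_URL and normalizes the returned quads.
for word in ocr(image):
    print(word["label"], word["score"], word["bbox"])
```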
@@ -117,24 +275,33 @@ def save_image(image: np.ndarray) -> str:
     return f.name
 
 
-def display_bounding_boxes(
+def overlay_bounding_boxes(
     image: np.ndarray, bboxes: List[Dict[str, Any]]
 ) -> np.ndarray:
-    """'display_bounding_boxes' is a utility function that displays bounding boxes on an image.
+    """'display_bounding_boxes' is a utility function that displays bounding boxes on
+    an image.
 
     Parameters:
         image (np.ndarray): The image to display the bounding boxes on.
-        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding boxes.
+        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+            boxes.
 
     Returns:
-        np.ndarray: The image with the bounding boxes displayed.
+        np.ndarray: The image with the bounding boxes, labels and scores displayed.
 
     Example
     -------
-    >>> image_with_bboxes = display_bounding_boxes(
+    >>> image_with_bboxes = display_bounding_boxes(
+        image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+    )
     """
     pil_image = Image.fromarray(image.astype(np.uint8))
 
+    if len(set([box["label"] for box in bboxes])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
     color = {
         label: COLORS[i % len(COLORS)]
         for i, label in enumerate(set([box["label"] for box in bboxes]))
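The new warning above exists because labels are mapped onto the fixed palette with `COLORS[i % len(COLORS)]`, so colors repeat once there are more unique labels than palette entries. A short sketch of the intended call pattern (hypothetical image path, network required for the detector):

```python
from vision_agent.tools.tools_v2 import (
    grounding_dino,
    load_image,
    overlay_bounding_boxes,
    save_image,
)

image = load_image("dinosaur.jpg")  # hypothetical local file
bboxes = grounding_dino("car. dinosaur", image)

annotated = overlay_bounding_boxes(image, bboxes)  # draws boxes, labels and scores
print(save_image(annotated))  # returns the temporary file name (see `return f.name` above)
```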
@@ -167,15 +334,109 @@ def display_bounding_boxes(
     return np.array(pil_image.convert("RGB"))
 
 
-def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
+def overlay_segmentation_masks(
+    image: np.ndarray, masks: List[Dict[str, Any]]
+) -> np.ndarray:
+    """'display_segmentation_masks' is a utility function that displays segmentation
+    masks.
+
+    Parameters:
+        image (np.ndarray): The image to display the masks on.
+        masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+
+    Returns:
+        np.ndarray: The image with the masks displayed.
+
+    Example
+    -------
+    >>> image_with_masks = display_segmentation_masks(
+        image,
+        [{
+            'score': 0.99,
+            'label': 'dinosaur',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        }],
+    )
+    """
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
+
+    if len(set([mask["label"] for mask in masks])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
+    color = {
+        label: COLORS[i % len(COLORS)]
+        for i, label in enumerate(set([mask["label"] for mask in masks]))
+    }
+
+    for elt in masks:
+        mask = elt["mask"]
+        label = elt["label"]
+        np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+        np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+        mask_img = Image.fromarray(np_mask.astype(np.uint8))
+        pil_image = Image.alpha_composite(pil_image, mask_img)
+    return np.array(pil_image.convert("RGB"))
+
+
+def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
     docstrings = ""
     for func in funcs:
-        docstrings += f"{func.__name__}
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
 
     return docstrings
 
 
-
+def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+    descriptions = ""
+    for func in funcs:
+        description = func.__doc__
+        if description is None:
+            description = ""
+
+        description = (
+            description[: description.find("Parameters:")].replace("\n", " ").strip()
+        )
+        description = " ".join(description.split())
+        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+    return descriptions
+
+
+def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+    data: Dict[str, List[str]] = {"desc": [], "doc": []}
+
+    for func in funcs:
+        desc = func.__doc__
+        if desc is None:
+            desc = ""
+        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+        desc = " ".join(desc.split())
+
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        data["desc"].append(desc)
+        data["doc"].append(doc)
+
+    return pd.DataFrame(data)  # type: ignore
+
+
+TOOLS = [
+    grounding_dino,
+    grounding_sam,
+    extract_frames,
+    ocr,
+    load_image,
+    save_image,
+    overlay_bounding_boxes,
+    overlay_segmentation_masks,
+]
+TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
+TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
+TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
-    [load_image, save_image, display_bounding_boxes]
+    [load_image, save_image, overlay_bounding_boxes]
 )
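The description extraction in `get_tool_descriptions` and `get_tools_df` is plain docstring slicing: everything before the literal `"Parameters:"` marker, with whitespace collapsed. A self-contained sketch of that logic applied to a toy function (the function here is illustrative, not part of the package):

```python
import inspect

import pandas as pd


def add(a: int, b: int) -> int:
    """'add' returns the sum of two integers.

    Parameters:
        a (int): The first operand.
        b (int): The second operand.
    """
    return a + b


doc = add.__doc__ or ""
# Everything before "Parameters:" becomes the one-line description. Note that
# str.find returns -1 when the marker is absent, which would silently drop the
# docstring's final character for tools without a Parameters section.
desc = " ".join(doc[: doc.find("Parameters:")].split())
full_doc = f"{add.__name__}{inspect.signature(add)}:\n{doc}"
df = pd.DataFrame({"desc": [desc], "doc": [full_doc]})
print(df.loc[0, "desc"])  # 'add' returns the sum of two integers.
```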
vision_agent/utils/execute.py
ADDED
@@ -0,0 +1,104 @@
+"""This code is adapted from MetaGPT's https://github.com/geekan/MetaGPT/blob/main/metagpt/actions/di/execute_nb_code.py
+"""
+
+import base64 as b64
+import io
+import re
+from typing import Dict, List, Tuple
+
+import nbformat
+from nbclient import NotebookClient
+from nbclient.exceptions import CellTimeoutError, DeadKernelError
+from nbclient.util import run_sync
+from nbformat import NotebookNode
+from nbformat.v4 import new_code_cell
+from PIL import Image
+
+
+def remove_escape_and_color_codes(input_str: str) -> str:
+    pattern = re.compile(r"\x1b\[[0-9;]*[mK]")
+    result = pattern.sub("", input_str)
+    return result
+
+
+def parse_outputs(outputs: List[Dict]) -> Tuple[bool, str]:
+    success, parsed_output = True, []
+    for output in outputs:
+        # TODO: add parse image data
+        if output["output_type"] == "stream":
+            parsed_output.append(output["text"])
+        elif output["output_type"] == "text/plain":
+            parsed_output.append(output["data"]["text/plain"])
+        elif output["output_type"] == "display_data":
+            if "image/png" in output["data"]:
+                image_bytes = b64.b64decode(output["data"]["image/png"])
+                Image.open(io.BytesIO(image_bytes)).show()
+        elif output["output_type"] == "error":
+            success = False
+            output_text = remove_escape_and_color_codes("\n".join(output["traceback"]))
+            parsed_output.append(output_text)
+
+    return success, ",".join(parsed_output)
+
+
+class Execute:
+    def __init__(self, timeout: int = 600) -> None:
+        self.nb = nbformat.v4.new_notebook()
+        self.timeout = timeout
+        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+
+    def build(self) -> None:
+        if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)():  # type: ignore
+            self.nb_client.create_kernel_manager()
+            self.nb_client.start_new_kernel()
+            self.nb_client.start_new_kernel_client()
+
+    def terminate(self) -> None:
+        if self.nb_client.km is not None and run_sync(self.nb_client.km.is_alive)():  # type: ignore
+            run_sync(self.nb_client.km.shutdown_kernel)(now=True)
+            run_sync(self.nb_client.km.cleanup_resources)()
+
+            channels = [
+                self.nb_client.kc.stdin_channel,
+                self.nb_client.kc.hb_channel,
+                self.nb_client.kc.control_channel,
+            ]
+
+            for ch in channels:
+                if ch.is_alive():
+                    ch.stop()
+
+            self.nb_client.kc = None
+            self.nb_client.km = None
+
+    def reset(self) -> None:
+        self.terminate()
+        self.nb = nbformat.v4.new_notebook()
+        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        self.build()
+
+    def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
+        try:
+            self.nb_client.execute_cell(cell, cell_index)
+            return parse_outputs(self.nb.cells[-1].outputs)
+        except CellTimeoutError:
+            run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+            return False, "Cell execution timed out."
+        except DeadKernelError:
+            self.reset()
+            return False, "DeadKernelError"
+        except Exception:
+            return parse_outputs(self.nb.cells[-1].outputs)
+
+    def add_code_cell(self, code: str) -> None:
+        self.nb.cells.append(new_code_cell(code))
+
+    def run_additional(self, code: str) -> Tuple[bool, str]:
+        self.build()
+        self.add_code_cell(code)
+        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
+
+    def run_isolation(self, code: str) -> Tuple[bool, str]:
+        self.reset()
+        self.add_code_cell(code)
+        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
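A minimal usage sketch for the new `Execute` class, assuming a local Jupyter kernel (`ipykernel`) is installed so `nbclient` can launch one. `run_isolation` resets to a fresh kernel per call, while `run_additional` reuses the live kernel, so state carries over between the two calls below:

```python
from vision_agent.utils.execute import Execute

exe = Execute(timeout=60)

success, output = exe.run_isolation("x = 21\nprint(x * 2)")
print(success, output)  # True, "42\n"

success, output = exe.run_additional("print(x + 1)")  # same kernel, x still defined
print(success, output)  # True, "22\n"

exe.terminate()
```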
vision_agent/utils/sim.py
ADDED
@@ -0,0 +1,70 @@
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Union
+
+import pandas as pd
+from openai import Client
+from scipy.spatial.distance import cosine  # type: ignore
+
+
+def get_embedding(
+    client: Client, text: str, model: str = "text-embedding-3-small"
+) -> List[float]:
+    text = text.replace("\n", " ")
+    return client.embeddings.create(input=[text], model=model).data[0].embedding
+
+
+class Sim:
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        sim_key: Optional[str] = None,
+        api_key: Optional[str] = None,
+        model: str = "text-embedding-3-small",
+    ) -> None:
+        """Creates a similarity object that can be used to find similar items in a
+        dataframe.
+
+        Parameters:
+            df: pd.DataFrame: The dataframe to use for similarity.
+            sim_key: Optional[str]: The column name that you want to use to construct
+                the embeddings.
+            model: str: The model to use for embeddings.
+        """
+        self.df = df
+        if not api_key:
+            self.client = Client()
+        else:
+            self.client = Client(api_key=api_key)
+
+        self.model = model
+        if "embs" not in df.columns and sim_key is None:
+            raise ValueError("key is required if no column 'embs' is present.")
+
+        if sim_key is not None:
+            self.df["embs"] = self.df[sim_key].apply(
+                lambda x: get_embedding(self.client, x, model=self.model)
+            )
+
+    def save(self, sim_file: Union[str, Path]) -> None:
+        self.df.to_csv(sim_file, index=False)
+
+    def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
+        """Returns the top k most similar items to the query.
+
+        Parameters:
+            query: str: The query to compare to.
+            k: int: The number of items to return.
+
+        Returns:
+            Sequence[Dict]: The top k most similar items.
+        """
+
+        embedding = get_embedding(self.client, query, model=self.model)
+        self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+        res = self.df.sort_values("sim", ascending=False).head(k)
+        return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
+
+
+def load_sim(sim_file: Union[str, Path]) -> Sim:
+    df = pd.read_csv(sim_file)
+    return Sim(df)
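And a usage sketch for `Sim`, which pairs naturally with the kind of table `get_tools_df` produces in `tools_v2.py`: embed a text column once at construction, then rank rows against a query by cosine similarity. Assumes an OpenAI key is available via the environment (or the `api_key` argument):

```python
import pandas as pd

from vision_agent.utils.sim import Sim

df = pd.DataFrame(
    {"desc": ["detect objects in an image", "extract text from an image"]}
)
sim = Sim(df, sim_key="desc")  # embeds every row of 'desc' at construction time

print(sim.top_k("read the words in this photo", k=1))
# e.g. [{'desc': 'extract text from an image', 'sim': 0.6...}]

# Caveat: save() writes the 'embs' column through to_csv, so load_sim() would
# read the embeddings back as strings; they would need re-parsing before top_k.
sim.save("tool_sim.csv")
```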
{vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.13
+Version: 0.2.15
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,6 +10,8 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
@@ -17,6 +19,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: tqdm (>=4.64.0,<5.0.0)
@@ -181,7 +184,6 @@ find an example that creates a custom tool for template matching [here](examples
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
 | DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
-| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
vision_agent-0.2.15.dist-info/RECORD
ADDED
@@ -0,0 +1,34 @@
+vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
+vision_agent/agent/__init__.py,sha256=Zv8lc91mPy0iDySId38_vc4mo56JQ9mCMvUWdAKQjh0,206
+vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
+vision_agent/agent/agent_coder.py,sha256=e3mQn1xenahYk_uGflvuQ10s6dSHHM6p0jZN9UT1ZpE,6508
+vision_agent/agent/agent_coder_prompts.py,sha256=CJe3v7xvHQ32u3RQAXQga_Tk_4UgU64RBAMHZ3S70KY,5538
+vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
+vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
+vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
+vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
+vision_agent/agent/vision_agent.py,sha256=4-GjEX8ZmLhvLebqNRRTSSu1kSaFYVR_wFsrjXgKdYI,26984
+vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
+vision_agent/agent/vision_agent_v2.py,sha256=CDgGBSoa2LoMS0b4JhyDkoS3PJJNmCCPfxIGUc4RfQg,9658
+vision_agent/agent/vision_agent_v2_prompt.py,sha256=-90Hlbtqb5Fp7OVjGabpTdgr-yCr8AYKIfiMRfoL4SY,5141
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
+vision_agent/llm/llm.py,sha256=qWDBpJolGLWNwDjpEXu1NrjlJbo7Fj9efJYkSfVn6oE,5784
+vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
+vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
+vision_agent/tools/__init__.py,sha256=WiEjXzXyaBq7IQMKOMbFAK3FKvLNfzZ3dd7CPN-d7B8,451
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tool_utils.py,sha256=moR7X4hkLKQzC56axdojo_OcIuVOv45bKcHPUVZrPvk,753
+vision_agent/tools/tools.py,sha256=WrNu_L5n2cEpe7e1oy8S1o3dy4JJ4AUxTHcjAdX64_g,46398
+vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
+vision_agent/utils/__init__.py,sha256=AKXf1QVOpO6MnqU8RSaFLQ_4us4DcKf8ibgEbhuHjvI,95
+vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
+vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
+vision_agent/utils/sim.py,sha256=FaD16kKL1-JR2aSCmznF9KkJux9u3_Nr9tF4smBeoK0,2327
+vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
+vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
+vision_agent-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.15.dist-info/METADATA,sha256=qK9rIVOI_IL0dcUcIqtgoRCxuk5GZuQ5HHSrdsuVLKs,9121
+vision_agent-0.2.15.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.15.dist-info/RECORD,,