vision-agent 0.0.40__py3-none-any.whl → 0.0.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +88 -29
- vision_agent/image_utils.py +95 -5
- vision_agent/llm/llm.py +10 -7
- vision_agent/lmm/lmm.py +14 -3
- vision_agent/tools/__init__.py +14 -1
- vision_agent/tools/tools.py +123 -60
- vision_agent/tools/video.py +8 -4
- {vision_agent-0.0.40.dist-info → vision_agent-0.0.42.dist-info}/METADATA +1 -1
- {vision_agent-0.0.40.dist-info → vision_agent-0.0.42.dist-info}/RECORD +11 -11
- {vision_agent-0.0.40.dist-info → vision_agent-0.0.42.dist-info}/LICENSE +0 -0
- {vision_agent-0.0.40.dist-info → vision_agent-0.0.42.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED
@@ -1,11 +1,13 @@
 import json
 import logging
 import sys
+import tempfile
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from tabulate import tabulate
 
+from vision_agent.image_utils import overlay_bboxes, overlay_masks
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM, OpenAILMM
 from vision_agent.tools import TOOLS
@@ -248,13 +250,12 @@ def retrieval(
     tools: Dict[int, Any],
     previous_log: str,
     reflections: str,
-) -> Tuple[
+) -> Tuple[Dict, str]:
     tool_id = choose_tool(
         model, question, {k: v["description"] for k, v in tools.items()}, reflections
     )
     if tool_id is None:
-        return
-    _LOGGER.info(f"\t(Tool ID, name): ({tool_id}, {tools[tool_id]['name']})")
+        return {}, ""
 
     tool_instructions = tools[tool_id]
     tool_usage = tool_instructions["usage"]
@@ -263,16 +264,13 @@ def retrieval(
     parameters = choose_parameter(
         model, question, tool_usage, previous_log, reflections
     )
-    _LOGGER.info(f"\tParameters: {parameters} for {tool_name}")
     if parameters is None:
-        return
-    tool_results =
-        {"task": question, "tool_name": tool_name, "parameters": parameters}
-    ]
+        return {}, ""
+    tool_results = {"task": question, "tool_name": tool_name, "parameters": parameters}
 
     _LOGGER.info(
-        f"""Going to run the following
-        {tabulate(tool_results, headers="keys", tablefmt="mixed_grid")}"""
+        f"""Going to run the following tool(s) in sequence:
+        {tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}"""
     )
 
     def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
@@ -286,13 +284,11 @@ def retrieval(
         call_results.append(function_call(tools[tool_id]["class"], parameters))
         return call_results
 
-    call_results =
-
-        call_results.extend(parse_tool_results(result))
-        tool_results[i]["call_results"] = call_results
+    call_results = parse_tool_results(tool_results)
+    tool_results["call_results"] = call_results
 
-    call_results_str =
-    _LOGGER.info(f"\tCall Results: {call_results_str}")
+    call_results_str = str(call_results)
+    # _LOGGER.info(f"\tCall Results: {call_results_str}")
     return tool_results, call_results_str
 
 
@@ -335,14 +331,70 @@ def self_reflect(
         tool_results=str(tool_result),
         final_answer=final_answer,
     )
-    if
+    if (
+        issubclass(type(reflect_model), LMM)
+        and image is not None
+        and Path(image).suffix in [".jpg", ".jpeg", ".png"]
+    ):
         return reflect_model(prompt, image=image)  # type: ignore
     return reflect_model(prompt)
 
 
 def parse_reflect(reflect: str) -> bool:
     # GPT-4V has a hard time following directions, so make the criteria less strict
-    return
+    return (
+        "finish" in reflect.lower() and len(reflect) < 100
+    ) or "finish" in reflect.lower()[-10:]
+
+
+def visualize_result(all_tool_results: List[Dict]) -> List[str]:
+    image_to_data: Dict[str, Dict] = {}
+    for tool_result in all_tool_results:
+        if not tool_result["tool_name"] in ["grounding_sam_", "grounding_dino_"]:
+            continue
+
+        parameters = tool_result["parameters"]
+        # parameters can either be a dictionary or list, parameters can also be malformed
+        # becaus the LLM builds them
+        if isinstance(parameters, dict):
+            if "image" not in parameters:
+                continue
+            parameters = [parameters]
+        elif isinstance(tool_result["parameters"], list):
+            if (
+                len(tool_result["parameters"]) < 1
+                and "image" not in tool_result["parameters"][0]
+            ):
+                continue
+
+        for param, call_result in zip(parameters, tool_result["call_results"]):
+
+            # calls can fail, so we need to check if the call was successful
+            if not isinstance(call_result, dict):
+                continue
+            if "bboxes" not in call_result:
+                continue
+
+            # if the call was successful, then we can add the image data
+            image = param["image"]
+            if image not in image_to_data:
+                image_to_data[image] = {"bboxes": [], "masks": [], "labels": []}
+
+            image_to_data[image]["bboxes"].extend(call_result["bboxes"])
+            image_to_data[image]["labels"].extend(call_result["labels"])
+            if "masks" in call_result:
+                image_to_data[image]["masks"].extend(call_result["masks"])
+
+    visualized_images = []
+    for image in image_to_data:
+        image_path = Path(image)
+        image_data = image_to_data[image]
+        image = overlay_masks(image_path, image_data)
+        image = overlay_bboxes(image, image_data)
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
+            image.save(f.name)
+            visualized_images.append(f.name)
+    return visualized_images
 
 
 class VisionAgent(Agent):
@@ -371,10 +423,16 @@ class VisionAgent(Agent):
         verbose: bool = False,
     ):
         self.task_model = (
-            OpenAILLM(json_mode=True)
+            OpenAILLM(json_mode=True, temperature=0.1)
+            if task_model is None
+            else task_model
+        )
+        self.answer_model = (
+            OpenAILLM(temperature=0.1) if answer_model is None else answer_model
+        )
+        self.reflect_model = (
+            OpenAILMM(temperature=0.1) if reflect_model is None else reflect_model
         )
-        self.answer_model = OpenAILLM() if answer_model is None else answer_model
-        self.reflect_model = OpenAILMM() if reflect_model is None else reflect_model
         self.max_retries = max_retries
 
         self.tools = TOOLS
@@ -389,7 +447,8 @@ class VisionAgent(Agent):
         """Invoke the vision agent.
 
         Parameters:
-            input: a prompt that describe the task or a conversation in the format of
+            input: a prompt that describe the task or a conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
             image: the input image referenced in the prompt parameter.
 
         Returns:
@@ -413,7 +472,6 @@ class VisionAgent(Agent):
         for _ in range(self.max_retries):
             task_list = create_tasks(self.task_model, question, self.tools, reflections)
 
-            _LOGGER.info(f"Task Dependency: {task_list}")
             task_depend = {"Original Quesiton": question}
             previous_log = ""
             answers = []
@@ -424,7 +482,6 @@ class VisionAgent(Agent):
             for task in task_list:
                 task_str = task["task"]
                 previous_log = str(task_depend)
-                _LOGGER.info(f"\tSubtask: {task_str}")
                 tool_results, call_results = retrieval(
                     self.task_model,
                     task_str,
@@ -436,10 +493,10 @@ class VisionAgent(Agent):
                     self.answer_model, task_str, call_results, previous_log, reflections
                 )
 
-
-
-                all_tool_results.extend(tool_results)
+                tool_results["answer"] = answer
+                all_tool_results.append(tool_results)
 
+                _LOGGER.info(f"\tCall Result: {call_results}")
                 _LOGGER.info(f"\tAnswer: {answer}")
                 answers.append({"task": task_str, "answer": answer})
                 task_depend[task["id"]]["answer"] = answer  # type: ignore
@@ -448,15 +505,17 @@ class VisionAgent(Agent):
                 self.answer_model, question, answers, reflections
             )
 
+            visualized_images = visualize_result(all_tool_results)
+            all_tool_results.append({"visualized_images": visualized_images})
             reflection = self_reflect(
                 self.reflect_model,
                 question,
                 self.tools,
                 all_tool_results,
                 final_answer,
-                image,
+                visualized_images[0] if len(visualized_images) > 0 else image,
             )
-            _LOGGER.info(f"
+            _LOGGER.info(f"Reflection: {reflection}")
            if parse_reflect(reflection):
                break
            else:
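The changes above tighten `retrieval`'s return type, lower the default model temperatures to 0.1, and feed a `visualize_result` image into self-reflection. Below is a minimal usage sketch (not part of the diff) of the updated `VisionAgent`, assuming the class is importable from `vision_agent.agent.vision_agent`, that `OPENAI_API_KEY` is set, and that the image path is a placeholder:

```python
# Hedged sketch: per the diff above, default models are now OpenAILLM(json_mode=True,
# temperature=0.1), OpenAILLM(temperature=0.1) and OpenAILMM(temperature=0.1).
from vision_agent.agent.vision_agent import VisionAgent

agent = VisionAgent(verbose=True)

# Either a plain prompt with an image...
answer = agent("How many cereal boxes are in this image?", image="cereal.jpg")

# ...or a conversation in the format shown in the updated docstring.
answer = agent(
    [{"role": "user", "content": "How many cereal boxes are in this image?"}],
    image="cereal.jpg",
)
print(answer)
```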
vision_agent/image_utils.py CHANGED
@@ -3,15 +3,38 @@
 import base64
 from io import BytesIO
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Dict, Tuple, Union
 
 import numpy as np
-from PIL import Image
+from PIL import Image, ImageDraw, ImageFont
 from PIL.Image import Image as ImageType
 
+COLORS = [
+    (158, 218, 229),
+    (219, 219, 141),
+    (23, 190, 207),
+    (188, 189, 34),
+    (199, 199, 199),
+    (247, 182, 210),
+    (127, 127, 127),
+    (227, 119, 194),
+    (196, 156, 148),
+    (197, 176, 213),
+    (140, 86, 75),
+    (148, 103, 189),
+    (255, 152, 150),
+    (152, 223, 138),
+    (214, 39, 40),
+    (44, 160, 44),
+    (255, 187, 120),
+    (174, 199, 232),
+    (255, 127, 14),
+    (31, 119, 180),
+]
+
 
 def b64_to_pil(b64_str: str) -> ImageType:
-    """Convert a base64 string to a PIL Image.
+    r"""Convert a base64 string to a PIL Image.
 
     Parameters:
         b64_str: the base64 encoded image
@@ -26,7 +49,7 @@ def b64_to_pil(b64_str: str) -> ImageType:
 
 
 def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
-    """Get the size of an image.
+    r"""Get the size of an image.
 
     Parameters:
         data: the input image
@@ -41,7 +64,7 @@ def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int,
 
 
 def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
-    """Convert an image to a base64 string.
+    r"""Convert an image to a base64 string.
 
     Parameters:
         data: the input image
@@ -60,3 +83,70 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     else:
         arr_bytes = data.tobytes()
     return base64.b64encode(arr_bytes).decode("utf-8")
+
+
+def overlay_bboxes(
+    image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
+) -> ImageType:
+    r"""Plots bounding boxes on to an image.
+
+    Parameters:
+        image: the input image
+        bboxes: the bounding boxes to overlay
+
+    Returns:
+        The image with the bounding boxes overlayed
+    """
+    if isinstance(image, (str, Path)):
+        image = Image.open(image)
+    elif isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+
+    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
+
+    draw = ImageDraw.Draw(image)
+    font = ImageFont.load_default()
+    width, height = image.size
+    if "bboxes" not in bboxes:
+        return image.convert("RGB")
+
+    for label, box in zip(bboxes["labels"], bboxes["bboxes"]):
+        box = [box[0] * width, box[1] * height, box[2] * width, box[3] * height]
+        draw.rectangle(box, outline=color[label], width=3)
+        label = f"{label}"
+        text_box = draw.textbbox((box[0], box[1]), text=label, font=font)
+        draw.rectangle(text_box, fill=color[label])
+        draw.text((text_box[0], text_box[1]), label, fill="black", font=font)
+    return image.convert("RGB")
+
+
+def overlay_masks(
+    image: Union[str, Path, np.ndarray, ImageType], masks: Dict, alpha: float = 0.5
+) -> ImageType:
+    r"""Plots masks on to an image.
+
+    Parameters:
+        image: the input image
+        masks: the masks to overlay
+        alpha: the transparency of the overlay
+
+    Returns:
+        The image with the masks overlayed
+    """
+    if isinstance(image, (str, Path)):
+        image = Image.open(image)
+    elif isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+
+    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(masks["labels"])}
+    if "masks" not in masks:
+        return image.convert("RGB")
+
+    for label, mask in zip(masks["labels"], masks["masks"]):
+        if isinstance(mask, str):
+            mask = np.array(Image.open(mask))
+        np_mask = np.zeros((image.size[1], image.size[0], 4))
+        np_mask[mask > 0, :] = color[label] + (255 * alpha,)
+        mask_img = Image.fromarray(np_mask.astype(np.uint8))
+        image = Image.alpha_composite(image.convert("RGBA"), mask_img)
+    return image.convert("RGB")
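A small sketch (not from the package) of driving the new helpers directly. The detection dict layout with normalized `bboxes`, `labels`, and an optional `masks` list mirrors what `visualize_result` passes in; the file names and labels are placeholders:

```python
from vision_agent.image_utils import overlay_bboxes, overlay_masks

detections = {
    "labels": ["cereal box", "cereal box"],
    "bboxes": [[0.2, 0.21, 0.34, 0.42], [0.3, 0.31, 0.44, 0.52]],  # normalized xyxy
    "masks": [],  # mask file paths (or arrays) would go here
}

img = overlay_masks("cereal.jpg", detections)   # no-op with an empty mask list
img = overlay_bboxes(img, detections)           # draws labeled boxes scaled to the image size
img.save("cereal_annotated.png")
```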
vision_agent/llm/llm.py CHANGED
@@ -1,6 +1,6 @@
 import json
 from abc import ABC, abstractmethod
-from typing import Callable, Dict, List, Mapping, Union, cast
+from typing import Any, Callable, Dict, List, Mapping, Union, cast
 
 from openai import OpenAI
 
@@ -31,30 +31,33 @@ class OpenAILLM(LLM):
     r"""An LLM class for any OpenAI LLM model."""
 
     def __init__(
-        self,
+        self,
+        model_name: str = "gpt-4-turbo-preview",
+        json_mode: bool = False,
+        **kwargs: Any
     ):
         self.model_name = model_name
         self.client = OpenAI()
-        self.
+        self.kwargs = kwargs
+        if json_mode:
+            self.kwargs["response_format"] = {"type": "json_object"}
 
     def generate(self, prompt: str) -> str:
-        kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {}
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=[
                 {"role": "user", "content": prompt},
             ],
-            **kwargs,
+            **self.kwargs,
         )
 
         return cast(str, response.choices[0].message.content)
 
     def chat(self, chat: List[Dict[str, str]]) -> str:
-        kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {}
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=chat,  # type: ignore
-            **kwargs,
+            **self.kwargs,
         )
 
         return cast(str, response.choices[0].message.content)
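With the constructor rewrite above, extra keyword arguments are stored on the instance and forwarded to every `chat.completions.create` call; `json_mode` now just seeds `response_format`. A hedged sketch (requires `OPENAI_API_KEY`):

```python
from vision_agent.llm import OpenAILLM

# temperature rides along via **kwargs and is passed to the OpenAI client on every call
llm = OpenAILLM(json_mode=True, temperature=0.1)
plan = llm.generate('Return a JSON object of the form {"tasks": [...]} for counting boxes.')
print(plan)
```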
vision_agent/lmm/lmm.py CHANGED
@@ -97,11 +97,15 @@ class OpenAILMM(LMM):
     r"""An LMM class for the OpenAI GPT-4 Vision model."""
 
     def __init__(
-        self,
+        self,
+        model_name: str = "gpt-4-vision-preview",
+        max_tokens: int = 1024,
+        **kwargs: Any,
     ):
         self.model_name = model_name
         self.max_tokens = max_tokens
         self.client = OpenAI()
+        self.kwargs = kwargs
 
     def __call__(
         self,
@@ -123,6 +127,13 @@ class OpenAILMM(LMM):
 
         if image:
             extension = Path(image).suffix
+            if extension.lower() == ".jpeg" or extension.lower() == ".jpg":
+                extension = "jpg"
+            elif extension.lower() == ".png":
+                extension = "png"
+            else:
+                raise ValueError(f"Unsupported image extension: {extension}")
+
             encoded_image = encode_image(image)
             fixed_chat[0]["content"].append(  # type: ignore
                 {
@@ -135,7 +146,7 @@ class OpenAILMM(LMM):
             )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=fixed_chat, max_tokens=self.max_tokens  # type: ignore
+            model=self.model_name, messages=fixed_chat, max_tokens=self.max_tokens, **self.kwargs  # type: ignore
        )
 
         return cast(str, response.choices[0].message.content)
@@ -163,7 +174,7 @@ class OpenAILMM(LMM):
         )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=message, max_tokens=self.max_tokens  # type: ignore
+            model=self.model_name, messages=message, max_tokens=self.max_tokens, **self.kwargs  # type: ignore
         )
         return cast(str, response.choices[0].message.content)
 
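The LMM gets the same `**kwargs` plumbing plus an up-front extension check. A hedged sketch of the new behaviour; the image paths are placeholders:

```python
from vision_agent.lmm import OpenAILMM

lmm = OpenAILMM(temperature=0.1)  # extra kwargs are forwarded to chat.completions.create
print(lmm("Describe this image", image="photo.png"))  # .jpg/.jpeg/.png pass the new check

try:
    lmm("Describe this image", image="scan.tiff")
except ValueError as err:
    print(err)  # "Unsupported image extension: .tiff"
```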
vision_agent/tools/__init__.py CHANGED
@@ -1,2 +1,15 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
-from .tools import
+from .tools import (
+    CLIP,
+    TOOLS,
+    BboxArea,
+    BboxIoU,
+    Counter,
+    Crop,
+    ExtractFrames,
+    GroundingDINO,
+    GroundingSAM,
+    SegArea,
+    SegIoU,
+    Tool,
+)
vision_agent/tools/tools.py CHANGED
@@ -92,7 +92,7 @@ class CLIP(Tool):
     }
 
     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, prompt: List[str], image: Union[str, ImageType]) ->
+    def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
         """Invoke the CLIP model.
 
         Parameters:
@@ -122,7 +122,7 @@ class CLIP(Tool):
         rets = []
         for elt in resp_json["data"]:
             rets.append({"labels": prompt, "scores": [round(prob, 2) for prob in elt]})
-        return cast(
+        return cast(Dict, rets[0])
 
 
 class GroundingDINO(Tool):
@@ -168,7 +168,7 @@ class GroundingDINO(Tool):
     }
 
     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, prompt: str, image: Union[str, Path, ImageType]) ->
+    def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> Dict:
         """Invoke the Grounding DINO model.
 
         Parameters:
@@ -204,7 +204,7 @@ class GroundingDINO(Tool):
             if "scores" in elt:
                 elt["scores"] = [round(score, 2) for score in elt["scores"]]
             elt["size"] = (image_size[1], image_size[0])
-        return cast(
+        return cast(Dict, resp_data)
 
 
 class GroundingSAM(Tool):
@@ -259,7 +259,7 @@ class GroundingSAM(Tool):
     }
 
     # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, prompt: List[str], image: Union[str, ImageType]) ->
+    def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
         """Invoke the Grounding SAM model.
 
         Parameters:
@@ -294,7 +294,7 @@ class GroundingSAM(Tool):
             ret_pred["labels"].append(pred["label_name"])
             ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size))
             ret_pred["masks"].append(mask)
-        return
+        return ret_pred
 
 
 class AgentGroundingSAM(GroundingSAM):
@@ -302,15 +302,14 @@ class AgentGroundingSAM(GroundingSAM):
     returns the file name. This makes it easier for agents to use.
     """
 
-    def __call__(self, prompt: List[str], image: Union[str, ImageType]) ->
+    def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict:
         rets = super().__call__(prompt, image)
-
-
-
-
-
-
-        ret["masks"] = mask_files
+        mask_files = []
+        for mask in rets["masks"]:
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                Image.fromarray(mask * 255).save(tmp)
+                mask_files.append(tmp.name)
+        rets["masks"] = mask_files
         return rets
 
 
@@ -363,7 +362,7 @@ class Crop(Tool):
         ],
     }
 
-    def __call__(self, bbox: List[float], image: Union[str, Path]) ->
+    def __call__(self, bbox: List[float], image: Union[str, Path]) -> Dict:
         pil_image = Image.open(image)
         width, height = pil_image.size
         bbox = [
@@ -373,10 +372,10 @@ class Crop(Tool):
             int(bbox[3] * height),
         ]
         cropped_image = pil_image.crop(bbox)  # type: ignore
-        with tempfile.NamedTemporaryFile(suffix=".
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
             cropped_image.save(tmp.name)
 
-        return tmp.name
+        return {"image": tmp.name}
 
 
 class BboxArea(Tool):
@@ -388,7 +387,7 @@ class BboxArea(Tool):
         "required_parameters": [{"name": "bbox", "type": "List[int]"}],
         "examples": [
             {
-                "scenario": "If you want to calculate the area of the bounding box [0, 0,
+                "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
                 "parameters": {"bboxes": [0.2, 0.21, 0.34, 0.42]},
             }
         ],
@@ -430,6 +429,109 @@ class SegArea(Tool):
         return cast(float, round(np.sum(np_mask) / 255, 2))
 
 
+class BboxIoU(Tool):
+    name = "bbox_iou_"
+    description = (
+        "'bbox_iou_' returns the intersection over union of two bounding boxes."
+    )
+    usage = {
+        "required_parameters": [
+            {"name": "bbox1", "type": "List[int]"},
+            {"name": "bbox2", "type": "List[int]"},
+        ],
+        "examples": [
+            {
+                "scenario": "If you want to calculate the intersection over union of the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
+                "parameters": {
+                    "bbox1": [0.2, 0.21, 0.34, 0.42],
+                    "bbox2": [0.3, 0.31, 0.44, 0.52],
+                },
+            }
+        ],
+    }
+
+    def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
+        x1, y1, x2, y2 = bbox1
+        x3, y3, x4, y4 = bbox2
+        xA = max(x1, x3)
+        yA = max(y1, y3)
+        xB = min(x2, x4)
+        yB = min(y2, y4)
+        inter_area = max(0, xB - xA) * max(0, yB - yA)
+        boxa_area = (x2 - x1) * (y2 - y1)
+        boxb_area = (x4 - x3) * (y4 - y3)
+        iou = inter_area / float(boxa_area + boxb_area - inter_area)
+        return round(iou, 2)
+
+
+class SegIoU(Tool):
+    name = "seg_iou_"
+    description = "'seg_iou_' returns the intersection over union of two segmentation masks given their segmentation mask files."
+    usage = {
+        "required_parameters": [
+            {"name": "mask1", "type": "str"},
+            {"name": "mask2", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "If you want to calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
+                "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
+            }
+        ],
+    }
+
+    def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
+        pil_mask1 = Image.open(str(mask1))
+        pil_mask2 = Image.open(str(mask2))
+        np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
+        np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
+        intersection = np.logical_and(np_mask1, np_mask2)
+        union = np.logical_or(np_mask1, np_mask2)
+        iou = np.sum(intersection) / np.sum(union)
+        return cast(float, round(iou, 2))
+
+
+class ExtractFrames(Tool):
+    r"""Extract frames from a video."""
+
+    name = "extract_frames_"
+    description = "'extract_frames_' extracts frames where there is motion detected in a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where teh frame was captured. The frame is a local image file path."
+    usage = {
+        "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "examples": [
+            {
+                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
+                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
+            },
+            {
+                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4"},
+            },
+        ],
+    }
+
+    def __call__(self, video_uri: str) -> List[Tuple[str, float]]:
+        """Extract frames from a video.
+
+
+        Parameters:
+            video_uri: the path to the video file or a url points to the video data
+
+        Returns:
+            a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
+        """
+        frames = extract_frames_from_video(video_uri)
+        result = []
+        _LOGGER.info(
+            f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
+        )
+        for frame, ts in frames:
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                Image.fromarray(frame).save(tmp)
+                result.append((tmp.name, ts))
+        return result
+
+
 class Add(Tool):
     r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""
 
@@ -506,47 +608,6 @@ class Divide(Tool):
         return round(input[0] / input[1], 2)
 
 
-class ExtractFrames(Tool):
-    r"""Extract frames from a video."""
-
-    name = "extract_frames_"
-    description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame."
-    usage = {
-        "required_parameters": [{"name": "video_uri", "type": "str"}],
-        "examples": [
-            {
-                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
-                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
-            },
-            {
-                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                "parameters": {"video_uri": "tests/data/test.mp4"},
-            },
-        ],
-    }
-
-    def __call__(self, video_uri: str) -> list[tuple[str, float]]:
-        """Extract frames from a video.
-
-
-        Parameters:
-            video_uri: the path to the video file or a url points to the video data
-
-        Returns:
-            a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
-        """
-        frames = extract_frames_from_video(video_uri)
-        result = []
-        _LOGGER.info(
-            f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
-        )
-        for frame, ts in frames:
-            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
-                Image.fromarray(frame).save(tmp)
-                result.append((tmp.name, ts))
-        return result
-
-
 TOOLS = {
     i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
     for i, c in enumerate(
@@ -554,15 +615,17 @@ TOOLS = {
             CLIP,
             GroundingDINO,
             AgentGroundingSAM,
+            ExtractFrames,
             Counter,
             Crop,
             BboxArea,
             SegArea,
+            BboxIoU,
+            SegIoU,
             Add,
             Subtract,
             Multiply,
             Divide,
-            ExtractFrames,
         ]
     )
     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
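For reference, here is a standalone re-derivation of the IoU arithmetic that the new `BboxIoU` tool implements, checked against the boxes from its usage example (an independent sketch, not package code):

```python
def bbox_iou(bbox1, bbox2):
    # Intersection-over-union of two xyxy boxes (normalized or pixel coordinates).
    x1, y1, x2, y2 = bbox1
    x3, y3, x4, y4 = bbox2
    xA, yA = max(x1, x3), max(y1, y3)   # top-left corner of the overlap
    xB, yB = min(x2, x4), min(y2, y4)   # bottom-right corner of the overlap
    inter = max(0, xB - xA) * max(0, yB - yA)
    area1 = (x2 - x1) * (y2 - y1)
    area2 = (x4 - x3) * (y4 - y3)
    return round(inter / float(area1 + area2 - inter), 2)

print(bbox_iou([0.2, 0.21, 0.34, 0.42], [0.3, 0.31, 0.44, 0.52]))  # 0.08
```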
vision_agent/tools/video.py CHANGED
@@ -22,12 +22,16 @@ def extract_frames_from_video(
     Parameters:
         video_uri: the path to the video file or a video file url
         fps: the frame rate per second to extract the frames
-        motion_detection_threshold: The threshold to detect motion between
-            A value between 0-1, which represents the percentage change
-
+        motion_detection_threshold: The threshold to detect motion between
+            changes/frames. A value between 0-1, which represents the percentage change
+            required for the frames to be considered in motion. For example, a lower
+            value means more frames will be extracted.
 
     Returns:
-        a list of tuples containing the extracted frame and the timestamp in seconds.
+        a list of tuples containing the extracted frame and the timestamp in seconds.
+        E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
+        from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
+        the video. The frames are sorted by the timestamp in ascending order.
     """
     with VideoFileClip(video_uri) as video:
         video_duration: float = video.duration
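A hedged sketch of calling the documented function directly; `ExtractFrames` in tools.py invokes it with only `video_uri`, so `fps` and `motion_detection_threshold` are assumed to have defaults, and the sample path is a placeholder:

```python
from vision_agent.tools.video import extract_frames_from_video

frames = extract_frames_from_video("tests/data/test.mp4")
for frame, ts in frames:
    # frame is an image array, ts is seconds from the start of the video
    print(f"{ts:.3f}s -> frame shape {frame.shape}")
```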
{vision_agent-0.0.40.dist-info → vision_agent-0.0.42.dist-info}/RECORD CHANGED
@@ -5,22 +5,22 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=uNp12LOFRLr3i2zLhNuLuyFms2-s8es2t6P6h76QDow,4493
 vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
 vision_agent/agent/reflexion_prompts.py,sha256=UPGkt_qgHBMUY0VPVoF-BqhR0d_6WPjjrhbYLBYOtnQ,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=P2melU6XQCCiiL1C_4QsxGUaWbwahuJA90eIcQJTR4U,17449
 vision_agent/agent/vision_agent_prompts.py,sha256=otaDRsaHc7bqw_tgWTnu-eUcFeOzBFrn9sPU7_xr2VQ,6151
 vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
 vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
 vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
 vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
-vision_agent/image_utils.py,sha256=
+vision_agent/image_utils.py,sha256=XiOLpHAvlk55URw6iG7hl1OY71FVRA9_25b650amZXA,4420
 vision_agent/llm/__init__.py,sha256=fBKsIjL4z08eA0QYx6wvhRe4Nkp2pJ4VrZK0-uUL5Ec,32
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=l8ZVh6vCZOJBHfenfOoHwPySXEUQoNt_gbL14gkvu2g,3904
 vision_agent/lmm/__init__.py,sha256=I8mbeNUajTfWVNqLsuFQVOaNBDlkIhYp9DFU8H4kB7g,51
-vision_agent/lmm/lmm.py,sha256=
-vision_agent/tools/__init__.py,sha256=
+vision_agent/lmm/lmm.py,sha256=s_A3SKCoWm2biOt-gS9PXOsa9l-zrmR6mInLjAqam-A,8438
+vision_agent/tools/__init__.py,sha256=AKN-T659HpwVearRnkCd6wWNoJ6K5kW9gAZwb8IQSLE,235
 vision_agent/tools/prompts.py,sha256=9RBbyqlNlExsGKlJ89Jkph83DAEJ8PCVGaHoNbyN7TM,1416
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/video.py,sha256=
-vision_agent-0.0.
-vision_agent-0.0.
-vision_agent-0.0.
-vision_agent-0.0.
+vision_agent/tools/tools.py,sha256=aMTBxxaXQp33HwplOS8xrgfbsTJ8e1pwO6byR7HcTJI,23447
+vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
+vision_agent-0.0.42.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.0.42.dist-info/METADATA,sha256=r523uVvu-DsNoA-H-18O2JXF4J9G2nZ2cDSmjXUFq_M,5324
+vision_agent-0.0.42.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.0.42.dist-info/RECORD,,
{vision_agent-0.0.40.dist-info → vision_agent-0.0.42.dist-info}/LICENSE File without changes
{vision_agent-0.0.40.dist-info → vision_agent-0.0.42.dist-info}/WHEEL File without changes