vision-agent 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vision_agent/agent/easytool_prompts.py CHANGED
@@ -56,6 +56,7 @@ Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
 
 These are logs of previous questions and answers:
 {previous_log}
+
 This is the current user's question: {question}
 This is the API tool documentation: {tool_usage}
 Output: """
@@ -67,15 +68,22 @@ Please note that:
 2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 3. If the API tool does not provide useful information in the response, please answer with your knowledge.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
+
 These are logs of previous questions and answers:
 {previous_log}
+
 This is the user's question: {question}
+
 This is the response output by the API tool:
 {call_results}
+
 We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
 Output: """
 
 ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question.
 This is the user's question: {question}
-These are subtasks and their answers: {answers}
+
+These are subtasks and their answers:
+{answers}
+
 Final answer: """
vision_agent/agent/reflexion.py CHANGED
@@ -238,12 +238,20 @@ class Reflexion(Agent):
                     self._build_agent_prompt(question, reflections, scratchpad)
                 )
             )
-        return format_step(
-            self.action_agent(
-                self._build_agent_prompt(question, reflections, scratchpad),
-                image=image,
+        elif isinstance(self.action_agent, LMM):
+            return format_step(
+                self.action_agent(
+                    self._build_agent_prompt(question, reflections, scratchpad),
+                    images=[image] if image is not None else None,
+                )
+            )
+        elif isinstance(self.action_agent, Agent):
+            return format_step(
+                self.action_agent(
+                    self._build_agent_prompt(question, reflections, scratchpad),
+                    image=image,
+                )
             )
-        )
 
     def prompt_reflection(
         self,
@@ -261,7 +269,7 @@ class Reflexion(Agent):
         return format_step(
             self.self_reflect_model(
                 self._build_reflect_prompt(question, context, scratchpad),
-                image=image,
+                images=[image] if image is not None else None,
             )
         )
 
vision_agent/agent/vision_agent.py CHANGED
@@ -3,7 +3,7 @@ import logging
 import sys
 import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 from PIL import Image
 from tabulate import tabulate
@@ -37,10 +37,10 @@ _LOGGER = logging.getLogger(__name__)
 
 def parse_json(s: str) -> Any:
     s = (
-        s.replace(": true", ": True")
-        .replace(": false", ": False")
-        .replace(":true", ": True")
-        .replace(":false", ": False")
+        s.replace(": True", ": true")
+        .replace(": False", ": false")
+        .replace(":True", ": true")
+        .replace(":False", ": false")
         .replace("```", "")
         .strip()
     )
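The boolean replacement is reversed in 0.1.3: model output that uses Python-style True/False is now normalized to JSON-style true/false before parsing, instead of the other way around. A minimal sketch of the idea, assuming the cleaned string is ultimately handed to json.loads (the remainder of parse_json is outside this hunk):

import json
from typing import Any


def parse_json_sketch(s: str) -> Any:
    # Normalize Python-style booleans to valid JSON, drop code fences, then parse.
    s = (
        s.replace(": True", ": true")
        .replace(": False", ": false")
        .replace(":True", ": true")
        .replace(":False", ": false")
        .replace("```", "")
        .strip()
    )
    return json.loads(s)


# '{"Finish": True, ...}' is not valid JSON on its own, but parses after normalization.
print(parse_json_sketch('{"Finish": True, "Reflection": "ok"}'))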
@@ -62,6 +62,19 @@ def format_tools(tools: Dict[int, Any]) -> str:
     return tool_str
 
 
+def format_tool_usage(tools: Dict[int, Any], tool_result: List[Dict]) -> str:
+    usage = []
+    name_to_usage = {v["name"]: v["usage"] for v in tools.values()}
+    for tool_res in tool_result:
+        if "tool_name" in tool_res:
+            usage.append((tool_res["tool_name"], name_to_usage[tool_res["tool_name"]]))
+
+    usage_str = ""
+    for tool_name, tool_usage in usage:
+        usage_str += f"{tool_name} - {tool_usage}\n"
+    return usage_str
+
+
 def topological_sort(tasks: List[Dict]) -> List[Dict]:
     in_degree = {task["id"]: 0 for task in tasks}
     for task in tasks:
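The new format_tool_usage helper pairs each executed tool with its registered usage entry so the reflection prompt can quote the relevant API documentation. A hedged illustration; the registry values below are made up, and only the name/usage/tool_name fields mirror the code above:

tools = {
    0: {"name": "grounding_dino_", "usage": "required: prompt (str), image (str)"},
    1: {"name": "box_iou_", "usage": "required: bbox1 (List[int]), bbox2 (List[int])"},
}
tool_result = [
    {"tool_name": "grounding_dino_", "parameters": {"prompt": "car", "image": "image.jpg"}}
]

print(format_tool_usage(tools, tool_result))
# grounding_dino_ - required: prompt (str), image (str)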
@@ -251,58 +264,46 @@ def self_reflect(
     tools: Dict[int, Any],
     tool_result: List[Dict],
     final_answer: str,
-    image: Optional[Union[str, Path]] = None,
+    images: Optional[Sequence[Union[str, Path]]] = None,
 ) -> str:
     prompt = VISION_AGENT_REFLECTION.format(
         question=question,
-        tools=format_tools(tools),
+        tools=format_tools({k: v["description"] for k, v in tools.items()}),
+        tool_usage=format_tool_usage(tools, tool_result),
         tool_results=str(tool_result),
         final_answer=final_answer,
     )
     if (
         issubclass(type(reflect_model), LMM)
-        and image is not None
-        and Path(image).suffix in [".jpg", ".jpeg", ".png"]
+        and images is not None
+        and all([Path(image).suffix in [".jpg", ".jpeg", ".png"] for image in images])
     ):
-        return reflect_model(prompt, image=image)  # type: ignore
+        return reflect_model(prompt, images=images)  # type: ignore
     return reflect_model(prompt)
 
 
-def parse_reflect(reflect: str) -> bool:
-    # GPT-4V has a hard time following directions, so make the criteria less strict
-    return (
+def parse_reflect(reflect: str) -> Any:
+    reflect = reflect.strip()
+    try:
+        return parse_json(reflect)
+    except Exception:
+        _LOGGER.error(f"Failed parse json reflection: {reflect}")
+        # LMMs have a hard time following directions, so make the criteria less strict
+        finish = (
             "finish" in reflect.lower() and len(reflect) < 100
         ) or "finish" in reflect.lower()[-10:]
-
-
-def visualize_result(all_tool_results: List[Dict]) -> List[str]:
-    image_to_data: Dict[str, Dict] = {}
-    for tool_result in all_tool_results:
-        if tool_result["tool_name"] not in ["grounding_sam_", "grounding_dino_"]:
-            continue
-
-        parameters = tool_result["parameters"]
-        # parameters can either be a dictionary or list, parameters can also be malformed
-        # becaus the LLM builds them
-        if isinstance(parameters, dict):
-            if "image" not in parameters:
-                continue
-            parameters = [parameters]
-        elif isinstance(tool_result["parameters"], list):
-            if len(tool_result["parameters"]) < 1 or (
-                "image" not in tool_result["parameters"][0]
-            ):
-                continue
-
-        for param, call_result in zip(parameters, tool_result["call_results"]):
-            # calls can fail, so we need to check if the call was successful
-            if not isinstance(call_result, dict):
-                continue
-            if "bboxes" not in call_result:
-                continue
-
-            # if the call was successful, then we can add the image data
-            image = param["image"]
+        return {"Finish": finish, "Reflection": reflect}
+
+
+def _handle_extract_frames(
+    image_to_data: Dict[str, Dict], tool_result: Dict
+) -> Dict[str, Dict]:
+    image_to_data = image_to_data.copy()
+    # handle extract_frames_ case, useful if it extracts frames but doesn't do
+    # any following processing
+    for video_file_output in tool_result["call_results"]:
+        for frame, _ in video_file_output:
+            image = frame
             if image not in image_to_data:
                 image_to_data[image] = {
                     "bboxes": [],
@@ -310,17 +311,72 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
                     "labels": [],
                     "scores": [],
                 }
+    return image_to_data
+
+
+def _handle_viz_tools(
+    image_to_data: Dict[str, Dict], tool_result: Dict
+) -> Dict[str, Dict]:
+    image_to_data = image_to_data.copy()
+
+    # handle grounding_sam_ and grounding_dino_
+    parameters = tool_result["parameters"]
+    # parameters can either be a dictionary or list, parameters can also be malformed
+    # becaus the LLM builds them
+    if isinstance(parameters, dict):
+        if "image" not in parameters:
+            return image_to_data
+        parameters = [parameters]
+    elif isinstance(tool_result["parameters"], list):
+        if len(tool_result["parameters"]) < 1 or (
+            "image" not in tool_result["parameters"][0]
+        ):
+            return image_to_data
+
+    for param, call_result in zip(parameters, tool_result["call_results"]):
+        # calls can fail, so we need to check if the call was successful
+        if not isinstance(call_result, dict) or "bboxes" not in call_result:
+            return image_to_data
+
+        # if the call was successful, then we can add the image data
+        image = param["image"]
+        if image not in image_to_data:
+            image_to_data[image] = {
+                "bboxes": [],
+                "masks": [],
+                "labels": [],
+                "scores": [],
+            }
+
+        image_to_data[image]["bboxes"].extend(call_result["bboxes"])
+        image_to_data[image]["labels"].extend(call_result["labels"])
+        image_to_data[image]["scores"].extend(call_result["scores"])
+        if "masks" in call_result:
+            image_to_data[image]["masks"].extend(call_result["masks"])
+
+    return image_to_data
+
+
+def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]:
+    image_to_data: Dict[str, Dict] = {}
+    for tool_result in all_tool_results:
+        # only handle bbox/mask tools or frame extraction
+        if tool_result["tool_name"] not in [
+            "grounding_sam_",
+            "grounding_dino_",
+            "extract_frames_",
+        ]:
+            continue
 
-            image_to_data[image]["bboxes"].extend(call_result["bboxes"])
-            image_to_data[image]["labels"].extend(call_result["labels"])
-            image_to_data[image]["scores"].extend(call_result["scores"])
-            if "masks" in call_result:
-                image_to_data[image]["masks"].extend(call_result["masks"])
+        if tool_result["tool_name"] == "extract_frames_":
+            image_to_data = _handle_extract_frames(image_to_data, tool_result)
+        else:
+            image_to_data = _handle_viz_tools(image_to_data, tool_result)
 
     visualized_images = []
-    for image in image_to_data:
-        image_path = Path(image)
-        image_data = image_to_data[image]
+    for image_str in image_to_data:
+        image_path = Path(image_str)
+        image_data = image_to_data[image_str]
         image = overlay_masks(image_path, image_data)
         image = overlay_bboxes(image, image_data)
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
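visualize_result now only dispatches per tool and delegates the bookkeeping to the two helpers above. A hedged sketch of the input shape it expects, based on the fields referenced in this code; the file name and box values are invented:

all_tool_results = [
    {
        "tool_name": "grounding_dino_",
        "parameters": {"prompt": "car", "image": "street.jpg"},
        "call_results": [
            {"bboxes": [[0.1, 0.2, 0.4, 0.5]], "labels": ["car"], "scores": [0.91]}
        ],
    }
]

# Returns paths of temporary .png files with the boxes/masks overlaid.
overlaid = visualize_result(all_tool_results)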
@@ -351,7 +407,7 @@ class VisionAgent(Agent):
         task_model: Optional[Union[LLM, LMM]] = None,
         answer_model: Optional[Union[LLM, LMM]] = None,
         reflect_model: Optional[Union[LLM, LMM]] = None,
-        max_retries: int = 2,
+        max_retries: int = 3,
         verbose: bool = False,
         report_progress_callback: Optional[Callable[[str], None]] = None,
     ):
@@ -374,7 +430,9 @@ class VisionAgent(Agent):
             OpenAILLM(temperature=0.1) if answer_model is None else answer_model
         )
         self.reflect_model = (
-            OpenAILMM(temperature=0.1) if reflect_model is None else reflect_model
+            OpenAILMM(json_mode=True, temperature=0.1)
+            if reflect_model is None
+            else reflect_model
         )
         self.max_retries = max_retries
         self.tools = TOOLS
@@ -461,20 +519,27 @@ class VisionAgent(Agent):
 
             visualized_output = visualize_result(all_tool_results)
             all_tool_results.append({"visualized_output": visualized_output})
+            if len(visualized_output) > 0:
+                reflection_images = visualized_output
+            elif image is not None:
+                reflection_images = [image]
+            else:
+                reflection_images = None
             reflection = self_reflect(
                 self.reflect_model,
                 question,
                 self.tools,
                 all_tool_results,
                 final_answer,
-                visualized_output[0] if len(visualized_output) > 0 else image,
+                reflection_images,
             )
             self.log_progress(f"Reflection: {reflection}")
-            if parse_reflect(reflection):
+            parsed_reflection = parse_reflect(reflection)
+            if parsed_reflection["Finish"]:
                 break
             else:
-                reflections += "\n" + reflection
-        # '<END>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
+                reflections += "\n" + parsed_reflection["Reflection"]
+        # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
             f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
         )
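parse_reflect now always returns a dict, which is what the parsed_reflection["Finish"] and ["Reflection"] lookups above rely on. A small illustration of both parsing paths, assuming parse_json raises on non-JSON input as its except branch implies:

# Well-formed JSON from the reflection model:
parse_reflect('{"Finish": true, "Reflection": "The answer was correct."}')
# -> {'Finish': True, 'Reflection': 'The answer was correct.'}

# Free text falls back to the "finish" heuristic:
parse_reflect("The plan looks good. Finish")
# -> {'Finish': True, 'Reflection': 'The plan looks good. Finish'}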
vision_agent/agent/vision_agent_prompts.py CHANGED
@@ -1,4 +1,14 @@
-VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, concrete plan that aims to mitigate the same failure with the tools available. Do not make vague steps like re-evaluate the threshold, instead make concrete steps like use a threshold of 0.5 or whatever threshold you think would fix this issue. If the task cannot be completed with the existing tools, respond with Finish. Use complete sentences.
+VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question, the tool usage for each of the tools used and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used.
+
+Please note that:
+1. You must ONLY output parsible JSON format. If the agents output was correct set "Finish" to true, else set "Finish" to false. An example output looks like:
+{{"Finish": true, "Reflection": "The agent's answer was correct."}}
+2. You must utilize the image with the visualized bounding boxes or masks and determine if the tools were used correctly or if the tools were used incorrectly or the wrong tools were used.
+3. If the agent's answer was incorrect, you must diagnose the reason for failure and devise a new concise and concrete plan that aims to mitigate the same failure with the tools available. An example output looks like:
+{{"Finish": false, "Reflection": "I can see from the visualized bounding boxes that the agent's answer was incorrect because the grounding_dino_ tool produced false positive predictions. The agent should use the following tools with the following parameters:
+Step 1: Use 'grounding_dino_' with a 'prompt' of 'baby. bed' and a 'box_threshold' of 0.7 to reduce the false positives.
+Step 2: Use 'box_iou_' with the baby bounding box and the bed bounding box to determine if the baby is on the bed or not."}}
+4. If the task cannot be completed with the existing tools or by adjusting the parameters, set "Finish" to true.
 
 User's question: {question}
 
@@ -8,6 +18,9 @@ Tools available:
 Tasks and tools used:
 {tool_results}
 
+Tool's used API documentation:
+{tool_usage}
+
 Final answer:
 {final_answer}
 
@@ -127,4 +140,5 @@ These are subtasks and their answers:
 
 This is a reflection from a previous failed attempt:
 {reflections}
+
 Final answer: """
vision_agent/data/data.py CHANGED
@@ -63,9 +63,9 @@ class DataStore:
 
         self.df[name] = self.df["image_paths"].progress_apply(  # type: ignore
             lambda x: (
-                func(self.lmm.generate(prompt, image=x))
+                func(self.lmm.generate(prompt, images=[x]))
                 if func
-                else self.lmm.generate(prompt, image=x)
+                else self.lmm.generate(prompt, images=[x])
             )
         )
         return self
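The DataStore change is purely the call-site migration to the list-based images argument. A hedged sketch of the underlying pattern; the import path is taken from the file headers in this diff, while the column and file names are placeholders:

import pandas as pd

from vision_agent.lmm.lmm import OpenAILMM  # assumed import path

lmm = OpenAILMM()  # assumes an OpenAI API key is configured
df = pd.DataFrame({"image_paths": ["dog.jpg", "cat.png"]})
prompt = "Describe the image in one sentence."

# Each image path is now wrapped in a one-element list for the LMM call.
df["caption"] = df["image_paths"].apply(lambda x: lmm.generate(prompt, images=[x]))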
vision_agent/llm/llm.py CHANGED
@@ -33,7 +33,7 @@ class OpenAILLM(LLM):
 
     def __init__(
         self,
-        model_name: str = "gpt-4-turbo-preview",
+        model_name: str = "gpt-4-turbo",
         api_key: Optional[str] = None,
         json_mode: bool = False,
         **kwargs: Any
vision_agent/lmm/lmm.py CHANGED
@@ -30,12 +30,16 @@ def encode_image(image: Union[str, Path]) -> str:
 
 class LMM(ABC):
     @abstractmethod
-    def generate(self, prompt: str, image: Optional[Union[str, Path]] = None) -> str:
+    def generate(
+        self, prompt: str, images: Optional[List[Union[str, Path]]] = None
+    ) -> str:
         pass
 
     @abstractmethod
     def chat(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self,
+        chat: List[Dict[str, str]],
+        images: Optional[List[Union[str, Path]]] = None,
     ) -> str:
         pass
 
@@ -43,7 +47,7 @@ class LMM(ABC):
     def __call__(
         self,
         input: Union[str, List[Dict[str, str]]],
-        image: Optional[Union[str, Path]] = None,
+        images: Optional[List[Union[str, Path]]] = None,
     ) -> str:
         pass
 
@@ -57,27 +61,29 @@ class LLaVALMM(LMM):
     def __call__(
         self,
         input: Union[str, List[Dict[str, str]]],
-        image: Optional[Union[str, Path]] = None,
+        images: Optional[List[Union[str, Path]]] = None,
     ) -> str:
         if isinstance(input, str):
-            return self.generate(input, image)
-        return self.chat(input, image)
+            return self.generate(input, images)
+        return self.chat(input, images)
 
     def chat(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self,
+        chat: List[Dict[str, str]],
+        images: Optional[List[Union[str, Path]]] = None,
     ) -> str:
         raise NotImplementedError("Chat not supported for LLaVA")
 
     def generate(
         self,
         prompt: str,
-        image: Optional[Union[str, Path]] = None,
+        images: Optional[List[Union[str, Path]]] = None,
         temperature: float = 0.1,
         max_new_tokens: int = 1500,
     ) -> str:
         data = {"prompt": prompt}
-        if image:
-            data["image"] = encode_image(image)
+        if images and len(images) > 0:
+            data["image"] = encode_image(images[0])
         data["temperature"] = temperature  # type: ignore
         data["max_new_tokens"] = max_new_tokens  # type: ignore
         res = requests.post(
@@ -99,9 +105,10 @@ class OpenAILMM(LMM):
 
     def __init__(
         self,
-        model_name: str = "gpt-4-vision-preview",
+        model_name: str = "gpt-4-turbo",
         api_key: Optional[str] = None,
         max_tokens: int = 1024,
+        json_mode: bool = False,
         **kwargs: Any,
     ):
         if not api_key:
@@ -111,20 +118,25 @@ class OpenAILMM(LMM):
 
         self.client = OpenAI(api_key=api_key)
         self.model_name = model_name
-        self.max_tokens = max_tokens
+        if "max_tokens" not in kwargs:
+            kwargs["max_tokens"] = max_tokens
+        if json_mode:
+            kwargs["response_format"] = {"type": "json_object"}
         self.kwargs = kwargs
 
     def __call__(
         self,
         input: Union[str, List[Dict[str, str]]],
-        image: Optional[Union[str, Path]] = None,
+        images: Optional[List[Union[str, Path]]] = None,
     ) -> str:
         if isinstance(input, str):
-            return self.generate(input, image)
-        return self.chat(input, image)
+            return self.generate(input, images)
+        return self.chat(input, images)
 
     def chat(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self,
+        chat: List[Dict[str, str]],
+        images: Optional[List[Union[str, Path]]] = None,
     ) -> str:
         fixed_chat = []
         for c in chat:
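With json_mode=True the stored kwargs now carry response_format={"type": "json_object"} into every chat completion request, which is how the VisionAgent constructor above gets a JSON-only reflection model by default. A short hedged sketch of the construction (the prompt is illustrative and an OpenAI API key is assumed to be configured):

from vision_agent.lmm.lmm import OpenAILMM  # import path assumed from the file header above

# Mirrors the VisionAgent default: reflections must come back as a JSON object.
reflect_model = OpenAILMM(json_mode=True, temperature=0.1)

raw = reflect_model('Return a JSON object with keys "Finish" and "Reflection".')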
@@ -132,33 +144,38 @@ class OpenAILMM(LMM):
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
             fixed_chat.append(fixed_c)
 
-        if image:
-            extension = Path(image).suffix
-            if extension.lower() == ".jpeg" or extension.lower() == ".jpg":
-                extension = "jpg"
-            elif extension.lower() == ".png":
-                extension = "png"
-            else:
-                raise ValueError(f"Unsupported image extension: {extension}")
-
-            encoded_image = encode_image(image)
-            fixed_chat[0]["content"].append(  # type: ignore
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/{extension};base64,{encoded_image}",
-                        "detail": "low",
+        if images and len(images) > 0:
+            for image in images:
+                extension = Path(image).suffix
+                if extension.lower() == ".jpeg" or extension.lower() == ".jpg":
+                    extension = "jpg"
+                elif extension.lower() == ".png":
+                    extension = "png"
+                else:
+                    raise ValueError(f"Unsupported image extension: {extension}")
+
+                encoded_image = encode_image(image)
+                fixed_chat[0]["content"].append(  # type: ignore
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/{extension};base64,{encoded_image}",
+                            "detail": "low",
+                        },
                     },
-                },
-            )
+                )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=fixed_chat, max_tokens=self.max_tokens, **self.kwargs  # type: ignore
+            model=self.model_name, messages=fixed_chat, **self.kwargs  # type: ignore
         )
 
         return cast(str, response.choices[0].message.content)
 
-    def generate(self, prompt: str, image: Optional[Union[str, Path]] = None) -> str:
+    def generate(
+        self,
+        prompt: str,
+        images: Optional[List[Union[str, Path]]] = None,
+    ) -> str:
         message: List[Dict[str, Any]] = [
             {
                 "role": "user",
@@ -167,21 +184,22 @@ class OpenAILMM(LMM):
                 ],
             }
         ]
-        if image:
-            extension = Path(image).suffix
-            encoded_image = encode_image(image)
-            message[0]["content"].append(
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/{extension};base64,{encoded_image}",
-                        "detail": "low",
+        if images and len(images) > 0:
+            for image in images:
+                extension = Path(image).suffix
+                encoded_image = encode_image(image)
+                message[0]["content"].append(
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/{extension};base64,{encoded_image}",
+                            "detail": "low",
+                        },
                     },
-                },
-            )
+                )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=message, max_tokens=self.max_tokens, **self.kwargs  # type: ignore
+            model=self.model_name, messages=message, **self.kwargs  # type: ignore
         )
         return cast(str, response.choices[0].message.content)
 
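Both generate and chat now accept a list of image paths and attach each one to the first message as its own image_url part. A hedged usage sketch; the file names are placeholders and an OpenAI API key is assumed to be configured:

from vision_agent.lmm.lmm import OpenAILMM

lmm = OpenAILMM()

# Each .jpg/.png in the list is base64-encoded and appended to the request.
answer = lmm.generate(
    "Which of these frames shows a person near the door?",
    images=["frame_001.png", "frame_002.png"],
)

# The same list-based argument flows through __call__ and chat.
answer = lmm(
    [{"role": "user", "content": "Compare the two frames."}],
    images=["frame_001.png", "frame_002.png"],
)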
vision_agent/tools/__init__.py CHANGED
@@ -1,10 +1,10 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
-from .tools import (
+from .tools import (  # Counter,
     CLIP,
     TOOLS,
     BboxArea,
     BboxIoU,
-    Counter,
+    BoxDistance,
     Crop,
     ExtractFrames,
     GroundingDINO,
vision_agent/tools/tools.py CHANGED
@@ -1,7 +1,6 @@
 import logging
 import tempfile
 from abc import ABC
-from collections import Counter as CounterClass
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union, cast
 
@@ -396,33 +395,6 @@ class AgentGroundingSAM(GroundingSAM):
         return rets
 
 
-class Counter(Tool):
-    r"""Counter detects and counts the number of objects in an image given an input such as a category name or referring expression."""
-
-    name = "counter_"
-    description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression. It returns a dictionary containing the labels and their counts."
-    usage = {
-        "required_parameters": [
-            {"name": "prompt", "type": "str"},
-            {"name": "image", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you count the number of cars in this image? Image name image.jpg",
-                "parameters": {"prompt": "car", "image": "image.jpg"},
-            },
-            {
-                "scenario": "Can you count the number of people? Image name: people.png",
-                "parameters": {"prompt": "person", "image": "people.png"},
-            },
-        ],
-    }
-
-    def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
-        resp = GroundingDINO()(prompt, image)
-        return dict(CounterClass(resp["labels"]))
-
-
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
 
@@ -573,11 +545,42 @@ class SegIoU(Tool):
         return cast(float, round(iou, 2))
 
 
+class BoxDistance(Tool):
+    name = "box_distance_"
+    description = (
+        "'box_distance_' returns the minimum distance between two bounding boxes."
+    )
+    usage = {
+        "required_parameters": [
+            {"name": "bbox1", "type": "List[int]"},
+            {"name": "bbox2", "type": "List[int]"},
+        ],
+        "examples": [
+            {
+                "scenario": "If you want to calculate the distance between the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
+                "parameters": {
+                    "bbox1": [0.2, 0.21, 0.34, 0.42],
+                    "bbox2": [0.3, 0.31, 0.44, 0.52],
+                },
+            }
+        ],
+    }
+
+    def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
+        x11, y11, x12, y12 = bbox1
+        x21, y21, x22, y22 = bbox2
+
+        horizontal_dist = np.max([0, x21 - x12, x11 - x22])
+        vertical_dist = np.max([0, y21 - y12, y11 - y22])
+
+        return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))
+
+
 class ExtractFrames(Tool):
     r"""Extract frames from a video."""
 
     name = "extract_frames_"
-    description = "'extract_frames_' extracts frames where there is motion detected in a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where teh frame was captured. The frame is a local image file path."
+    description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
     usage = {
         "required_parameters": [{"name": "video_uri", "type": "str"}],
         "examples": [
@@ -650,12 +653,12 @@ TOOLS = {
             GroundingDINO,
             AgentGroundingSAM,
             ExtractFrames,
-            Counter,
             Crop,
             BboxArea,
             SegArea,
             BboxIoU,
             SegIoU,
+            BoxDistance,
            Calculator,
        ]
    )
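The new box_distance_ tool registered above measures the gap between the closest edges of two boxes, so overlapping boxes score 0. A quick standalone check of the same arithmetic using the example boxes from the usage block (this mirrors the tool's __call__, not a separate API):

import numpy as np


def box_distance(bbox1, bbox2):
    # Minimum edge-to-edge distance; 0 when the boxes touch or overlap.
    x11, y11, x12, y12 = bbox1
    x21, y21, x22, y22 = bbox2
    horizontal_dist = np.max([0, x21 - x12, x11 - x22])
    vertical_dist = np.max([0, y21 - y12, y11 - y22])
    return round(float(np.sqrt(horizontal_dist**2 + vertical_dist**2)), 2)


# The example boxes overlap, so the reported distance is 0.0.
print(box_distance([0.2, 0.21, 0.34, 0.42], [0.3, 0.31, 0.44, 0.52]))  # 0.0

# Disjoint boxes: horizontal gap 0.1, vertical gap 0.2 -> sqrt(0.01 + 0.04) ~= 0.22
print(box_distance([0.0, 0.0, 0.2, 0.2], [0.3, 0.4, 0.5, 0.6]))  # 0.22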
vision_agent/tools/video.py CHANGED
@@ -15,7 +15,7 @@ _CLIP_LENGTH = 30.0
 
 
 def extract_frames_from_video(
-    video_uri: str, fps: int = 2, motion_detection_threshold: float = 0.06
+    video_uri: str, fps: float = 0.5, motion_detection_threshold: float = 0.0
 ) -> List[Tuple[np.ndarray, float]]:
     """Extract frames from a video
 
@@ -25,7 +25,8 @@ def extract_frames_from_video(
         motion_detection_threshold: The threshold to detect motion between
             changes/frames. A value between 0-1, which represents the percentage change
             required for the frames to be considered in motion. For example, a lower
-            value means more frames will be extracted.
+            value means more frames will be extracted. A non-positive value will disable
+            motion detection and extract all frames.
 
     Returns:
         a list of tuples containing the extracted frame and the timestamp in seconds.
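With the new defaults (fps=0.5, motion_detection_threshold=0.0) the function samples one frame every two seconds and keeps all of them; passing a positive threshold restores the old motion filtering. A hedged usage sketch; the video path is a placeholder:

from vision_agent.tools.video import extract_frames_from_video

# 0.1.3 defaults: 0.5 fps, motion detection disabled.
frames = extract_frames_from_video("driveway.mp4")

# Opt back into motion-based frame skipping with a positive threshold.
moving_frames = extract_frames_from_video(
    "driveway.mp4", fps=2, motion_detection_threshold=0.06
)

for frame, timestamp in frames:
    print(timestamp, frame.shape)  # frame is a numpy array, timestamp in seconds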
@@ -119,18 +120,19 @@ def _extract_frames_by_clip(
         total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
     )
     for i, frame in enumerate(clip.iter_frames(fps=fps, dtype="uint8")):
-        curr_processed_frame = _preprocess_frame(frame)
         total_count += 1
         pbar.update(1)
-        # Skip the frame if it is similar to the previous one
-        if prev_processed_frame is not None and _similar_frame(
-            prev_processed_frame,
-            curr_processed_frame,
-            threshold=motion_detection_threshold,
-        ):
-            skipped_count += 1
-            continue
-        prev_processed_frame = curr_processed_frame
+        if motion_detection_threshold > 0:
+            curr_processed_frame = _preprocess_frame(frame)
+            # Skip the frame if it is similar to the previous one
+            if prev_processed_frame is not None and _similar_frame(
+                prev_processed_frame,
+                curr_processed_frame,
+                threshold=motion_detection_threshold,
+            ):
+                skipped_count += 1
+                continue
+            prev_processed_frame = curr_processed_frame
         ts = round(clip.reader.pos / source_fps, 3)
         frames.append((frame, ts))
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.1.1
+Version: 0.1.3
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -2,28 +2,28 @@ vision_agent/__init__.py,sha256=wD1cssVTAJ55uTViNfBGooqJUV0p9fmVAuTMHHrmUBU,229
 vision_agent/agent/__init__.py,sha256=B4JVrbY4IRVCJfjmrgvcp7h1mTUEk8MZvL0Zmej4Ka0,127
 vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
 vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
-vision_agent/agent/easytool_prompts.py,sha256=dYzWa_RaiaFSQ-CowoQOcFmjZtBTTljRyA809bLgrvU,4519
-vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
+vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
+vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=nHmfr-OuMfdH0N8gECXLzTAgRmTx9cYe5_pnQj-HnBE,19764
-vision_agent/agent/vision_agent_prompts.py,sha256=dPg0mLVK_fGJpYK2xXGhm-zuXX1KVZW_zFXyYsspUz8,6567
+vision_agent/agent/vision_agent.py,sha256=4-milD0iSY_vKdpAIctba04Ak_In5tMBE8gATdaGIr0,22019
+vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
-vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
+vision_agent/data/data.py,sha256=Z2l76OrT0GgyuN52OeJqDitUcP0q1rhfdXd1of3GsVo,5128
 vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
 vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/image_utils.py,sha256=hFdPoRmeVU5jErFr5xaagMQ6Wy7Xbw8H8HXuLGdJIAM,4786
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=tgL6ZtuwZKuxSNiCxJCuP2ETjNMrosdgxXkZJb0_00E,5024
+vision_agent/llm/llm.py,sha256=Jty_RHdqVmIM0Mm31JNk50c882Tx7hHtkmh0WyXeJd8,5016
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
-vision_agent/lmm/lmm.py,sha256=LxwxCArp7DfnPbjf_Gl55xBxPwo2Qx8eDp1gCnGYSO0,9535
-vision_agent/tools/__init__.py,sha256=OEqEysxm5wnnOD73NKNCUggALB72GEmVg9FNsEkSBtA,253
+vision_agent/lmm/lmm.py,sha256=1E7e_S_0fOKnf6mSsEdkXvsIjGmhBGl5XW4By2jvhbY,10045
+vision_agent/tools/__init__.py,sha256=lKv90gLu-mNp4uyGtJ8AUG-73xKwFEugZpe0atpsscA,269
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=Qsqe8X6VjB0EMWhyKJ5EMPyLIc_d5Vtlw4ugV2FB_Ks,25589
-vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
+vision_agent/tools/tools.py,sha256=EK9HauKZ1gq795wBZNER6-8PiDTNZwJ1sXYhDeplDZ0,25410
+vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.1.1.dist-info/METADATA,sha256=rWMocnnZwuRhd3xIGyQUzDbsndVASBSu2jvAqt-3Odc,6233
-vision_agent-0.1.1.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.1.1.dist-info/RECORD,,
+vision_agent-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.1.3.dist-info/METADATA,sha256=iBoN2GBvALl6XxhxRo4o9WaqLgI-UAobSymuZ1RHd9o,6233
+vision_agent-0.1.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.1.3.dist-info/RECORD,,