PyPI - vision-agent - Versions diffs - 0.2.198__py3-none-any.whl → 0.2.200__py3-none-any.whl - Mend

vision-agent 0.2.198__py3-none-any.whl → 0.2.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

vision_agent/.sim_tools/df.csv +18 -18
vision_agent/.sim_tools/embs.npy +0 -0
vision_agent/agent/__init__.py +2 -1
vision_agent/agent/agent.py +33 -0
vision_agent/agent/agent_utils.py +47 -34
vision_agent/agent/types.py +51 -0
vision_agent/agent/vision_agent_coder_v2.py +131 -43
vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
vision_agent/agent/vision_agent_planner_v2.py +109 -50
vision_agent/agent/vision_agent_prompts.py +4 -4
vision_agent/agent/vision_agent_prompts_v2.py +46 -0
vision_agent/agent/vision_agent_v2.py +215 -0
vision_agent/tools/tools.py +1 -1
vision_agent/utils/execute.py +1 -1
{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/METADATA +1 -1
{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/RECORD +18 -15
{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/LICENSE +0 -0
{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/WHEEL +0 -0

vision_agent/.sim_tools/df.csv CHANGED Viewed

@@ -80,24 +80,6 @@ desc,doc,name
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
     ",ocr
-'clip' is a tool that can classify an image or a cropped detection given a list of input classes or tags. It returns the same list of the input classes along with their probability scores based on image content.,"clip(image: numpy.ndarray, classes: List[str]) -> Dict[str, Any]:
-'clip' is a tool that can classify an image or a cropped detection given a list
-    of input classes or tags. It returns the same list of the input classes along with
-    their probability scores based on image content.
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-        classes (List[str]): The list of classes or tags that is associated with the image
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-            contains a list of given labels and other a list of scores.
-    Example
-    -------
-        >>> clip(image, ['dog', 'cat', 'bird'])
-        {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-    ",clip
 'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
 'vit_image_classification' is a tool that can classify an image. It returns a
     list of classes and their probability scores based on image content.
@@ -488,6 +470,24 @@ desc,doc,name
         ... )
         >>> save_image(result, ""inpainted_room.png"")
     ",flux_image_inpainting
+'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
+'siglip_classification' is a tool that can classify an image or a cropped detection given a list
+    of input labels or tags. It returns the same list of the input labels along with
+    their probability scores based on image content.
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+        labels (List[str]): The list of labels or tags that is associated with the image
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+            contains a list of given labels and other a list of scores.
+    Example
+    -------
+        >>> siglip_classification(image, ['dog', 'cat', 'bird'])
+        {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
+    ",siglip_classification
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
 'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries

vision_agent/.sim_tools/embs.npy CHANGED Viewed

Binary file

vision_agent/agent/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .agent import Agent
+from .agent import Agent, AgentCoder, AgentPlanner
 from .vision_agent import VisionAgent
 from .vision_agent_coder import (
     AnthropicVisionAgentCoder,
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
     VisionAgentPlanner,
 )
 from .vision_agent_planner_v2 import VisionAgentPlannerV2
+from .vision_agent_v2 import VisionAgentV2

vision_agent/agent/agent.py CHANGED Viewed

@@ -2,7 +2,9 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
+from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
 from vision_agent.lmm.types import Message
+from vision_agent.utils.execute import CodeInterpreter
 class Agent(ABC):
@@ -20,3 +22,34 @@ class Agent(ABC):
         This is a hook that is intended for reporting the progress of the agent.
         """
         pass
+class AgentCoder(Agent):
+    @abstractmethod
+    def generate_code(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        pass
+    @abstractmethod
+    def generate_code_from_plan(
+        self,
+        chat: List[AgentMessage],
+        plan_context: PlanContext,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        pass
+class AgentPlanner(Agent):
+    @abstractmethod
+    def generate_plan(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> PlanContext:
+        pass

vision_agent/agent/agent_utils.py CHANGED Viewed

@@ -4,16 +4,17 @@ import logging
 import re
 import sys
 import tempfile
-from typing import Any, Dict, List, Optional, Tuple, cast
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 import libcst as cst
-from pydantic import BaseModel
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
 from rich.table import Table
 import vision_agent.tools as T
+from vision_agent.agent.types import AgentMessage, PlanContext
 from vision_agent.lmm.types import Message
 from vision_agent.utils.execute import CodeInterpreter, Execution
 from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
@@ -24,19 +25,6 @@ _CONSOLE = Console()
 _MAX_TABULATE_COL_WIDTH = 80
-class PlanContext(BaseModel):
-    plan: str
-    instructions: List[str]
-    code: str
-class CodeContext(BaseModel):
-    code: str
-    test: str
-    success: bool
-    test_result: Execution
 def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
     json_pattern = r"\{.*\}"
     match = re.search(json_pattern, json_str, re.DOTALL)
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
 def add_media_to_chat(
-    chat: List[Message], code_interpreter: CodeInterpreter
-) -> Tuple[List[Message], List[Message], List[str]]:
+    chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
+) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
     orig_chat = copy.deepcopy(chat)
     int_chat = copy.deepcopy(chat)
-    media_list = []
+    media_list: List[Union[str, Path]] = []
     for chat_i in int_chat:
-        if "media" in chat_i:
-            media_list_i = []
-            for media in chat_i["media"]:
+        if chat_i.media is not None:
+            media_list_i: List[Union[str, Path]] = []
+            for media in chat_i.media:
                 if isinstance(media, str) and media.startswith("data:image/"):
                     media_pil = b64_to_pil(media)
                     with tempfile.NamedTemporaryFile(
@@ -244,25 +232,29 @@ def add_media_to_chat(
                     ) as temp_file:
                         media_pil.save(temp_file, format="PNG")
                         media = str(temp_file.name)
-                media = str(code_interpreter.upload_file(media))  # type: ignore
+                if code_interpreter is not None:
+                    media = str(code_interpreter.upload_file(media))
                 media_list_i.append(media)
-                # don't duplicate appending media name
-                if not str(chat_i["content"]).endswith(f" Media name {media}"):
-                    chat_i["content"] += f" Media name {media}"  # type: ignore
-            chat_i["media"] = media_list_i
+                # don't duplicate appending media name and only add them for user messages
+                if (
+                    not str(chat_i.content).endswith(f" Media name {media}")
+                    and chat_i.role == "user"
+                ):
+                    chat_i.content += f" Media name {media}"
+            chat_i.media = media_list_i if len(media_list_i) > 0 else None
             media_list.extend(media_list_i)
     int_chat = cast(
-        List[Message],
+        List[AgentMessage],
         [
             (
-                {
-                    "role": c["role"],
-                    "content": c["content"],
-                    "media": c["media"],
-                }
-                if "media" in c
-                else {"role": c["role"], "content": c["content"]}
+                AgentMessage(
+                    role=c.role,
+                    content=c.content,
+                    media=c.media,
+                )
+                if c.media is not None
+                else AgentMessage(role=c.role, content=c.content, media=None)
             )
             for c in int_chat
         ],
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
     return images
+def convert_message_to_agentmessage(
+    input: Union[str, List[Message]],
+    media: Optional[Union[str, Path]] = None,
+) -> List[AgentMessage]:
+    if isinstance(input, str):
+        input_msg = [
+            AgentMessage(
+                role="user",
+                content=input,
+                media=([media] if media is not None else None),
+            )
+        ]
+    else:
+        input_msg = [
+            AgentMessage(role=msg["role"], content=msg["content"], media=None)
+            for msg in input
+        ]
+        input_msg[0].media = [media] if media is not None else None
+    return input_msg
 def strip_function_calls(  # noqa: C901
     code: str, exclusions: Optional[List[str]] = None
 ) -> str:

vision_agent/agent/types.py ADDED Viewed

@@ -0,0 +1,51 @@
+from pathlib import Path
+from typing import List, Literal, Optional, Union
+from pydantic import BaseModel
+from vision_agent.utils.execute import Execution
+class AgentMessage(BaseModel):
+    """AgentMessage encompasses messages sent to the entire Agentic system, which includes
+    both LMMs and sub-agents.
+    user: The user's message.
+    assistant: The assistant's message.
+    observation: An observation made after conducting an action, either by the user or
+        assistant.
+    interaction: An interaction between the user and the assistant. For example if the
+        assistant wants to ask the user for help on a task, it could send an
+        interaction message.
+    conversation: Messages coming from the conversation agent, this is a type of
+        assistant messages.
+    planner: Messages coming from the planner agent, this is a type of assistant
+        messages.
+    coder: Messages coming from the coder agent, this is a type of assistant messages.
+    """
+    role: Union[
+        Literal["user"],
+        Literal["assistant"],  # planner, coder and conversation are of type assistant
+        Literal["observation"],
+        Literal["interaction"],
+        Literal["conversation"],
+        Literal["planner"],
+        Literal["coder"],
+    ]
+    content: str
+    media: Optional[List[Union[str, Path]]] = None
+class PlanContext(BaseModel):
+    plan: str
+    instructions: List[str]
+    code: str
+class CodeContext(BaseModel):
+    code: str
+    test: str
+    success: bool
+    test_result: Execution

vision_agent/agent/vision_agent_coder_v2.py CHANGED Viewed

@@ -6,19 +6,19 @@ from rich.console import Console
 from rich.markup import escape
 import vision_agent.tools as T
-from vision_agent.agent import Agent
+from vision_agent.agent import AgentCoder, AgentPlanner
 from vision_agent.agent.agent_utils import (
-    CodeContext,
     DefaultImports,
-    PlanContext,
     add_media_to_chat,
     capture_media_from_exec,
+    convert_message_to_agentmessage,
     extract_tag,
     format_feedback,
     format_plan_v2,
     print_code,
     strip_function_calls,
 )
+from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
 from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
 from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
 from vision_agent.lmm import LMM, AnthropicLMM
@@ -34,6 +34,12 @@ from vision_agent.utils.sim import Sim, load_cached_sim
 _CONSOLE = Console()
+def format_code_context(
+    code_context: CodeContext,
+) -> str:
+    return f"<final_code>{code_context.code}</final_code>\n<final_test>{code_context.test}</final_test>"
 def retrieve_tools(
     plan: List[str],
     tool_recommender: Sim,
@@ -49,46 +55,54 @@ def retrieve_tools(
 def write_code(
     coder: LMM,
-    chat: List[Message],
+    chat: List[AgentMessage],
     tool_docs: str,
     plan: str,
 ) -> str:
     chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
+    if chat[-1].role != "user":
         raise ValueError("Last chat message must be from the user.")
-    user_request = chat[-1]["content"]
+    user_request = chat[-1].content
     prompt = CODE.format(
         docstring=tool_docs,
         question=user_request,
         plan=plan,
     )
-    chat[-1]["content"] = prompt
-    response = coder(chat, stream=False)
-    return extract_tag(response, "code")  # type: ignore
+    response = cast(str, coder([{"role": "user", "content": prompt}], stream=False))
+    maybe_code = extract_tag(response, "code")
+    # if the response wasn't properly formatted with the code tags just return the response
+    if maybe_code is None:
+        return response
+    return maybe_code
 def write_test(
     tester: LMM,
-    chat: List[Message],
+    chat: List[AgentMessage],
     tool_util_docs: str,
     code: str,
     media_list: Optional[Sequence[Union[str, Path]]] = None,
 ) -> str:
     chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
+    if chat[-1].role != "user":
         raise ValueError("Last chat message must be from the user.")
-    user_request = chat[-1]["content"]
+    user_request = chat[-1].content
     prompt = TEST.format(
         docstring=tool_util_docs,
         question=user_request,
         code=code,
         media=media_list,
     )
-    chat[-1]["content"] = prompt
-    response = tester(chat, stream=False)
-    return extract_tag(response, "code")  # type: ignore
+    response = cast(str, tester([{"role": "user", "content": prompt}], stream=False))
+    maybe_code = extract_tag(response, "code")
+    # if the response wasn't properly formatted with the code tags just return the response
+    if maybe_code is None:
+        return response
+    return maybe_code
 def debug_code(
@@ -170,12 +184,11 @@ def write_and_test_code(
     coder: LMM,
     tester: LMM,
     debugger: LMM,
-    chat: List[Message],
+    chat: List[AgentMessage],
     plan: str,
     tool_docs: str,
     code_interpreter: CodeInterpreter,
     media_list: List[Union[str, Path]],
-    update_callback: Callable[[Dict[str, Any]], None],
     verbose: bool,
 ) -> CodeContext:
     code = write_code(
@@ -226,14 +239,6 @@ def write_and_test_code(
                 f"[bold cyan]Code execution result after attempted fix:[/bold cyan] [yellow]{escape(result.text(include_logs=True))}[/yellow]"
             )
-    update_callback(
-        {
-            "role": "assistant",
-            "content": f"<final_code>{DefaultImports.to_code_string()}\n{code}</final_code>\n<final_test>{DefaultImports.to_code_string()}\n{test}</final_test>",
-            "media": capture_media_from_exec(result),
-        }
-    )
     return CodeContext(
         code=f"{DefaultImports.to_code_string()}\n{code}",
         test=f"{DefaultImports.to_code_string()}\n{test}",
@@ -242,10 +247,12 @@ def write_and_test_code(
     )
-class VisionAgentCoderV2(Agent):
+class VisionAgentCoderV2(AgentCoder):
+    """VisionAgentCoderV2 is an agent that will write vision code for you."""
     def __init__(
         self,
-        planner: Optional[Agent] = None,
+        planner: Optional[AgentPlanner] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
@@ -254,6 +261,25 @@ class VisionAgentCoderV2(Agent):
         code_sandbox_runtime: Optional[str] = None,
         update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
     ) -> None:
+        """Initialize the VisionAgentCoderV2.
+        Parameters:
+            planner (Optional[AgentPlanner]): The planner agent to use for generating
+                vision plans. If None, a default VisionAgentPlannerV2 will be used.
+            coder (Optional[LMM]): The language model to use for the coder agent. If
+                None, a default AnthropicLMM will be used.
+            tester (Optional[LMM]): The language model to use for the tester agent. If
+                None, a default AnthropicLMM will be used.
+            debugger (Optional[LMM]): The language model to use for the debugger agent.
+            tool_recommender (Optional[Union[str, Sim]]): The tool recommender to use.
+            verbose (bool): Whether to print out debug information.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
+                be one of: None, "local" or "e2b". If None, it will read from the
+                environment variable CODE_SANDBOX_RUNTIME.
+            update_callback (Callable[[Dict[str, Any]], None]): The callback function
+                that will send back intermediate conversation messages.
+        """
         self.planner = (
             planner
             if planner is not None
@@ -290,20 +316,52 @@ class VisionAgentCoderV2(Agent):
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> Union[str, List[Message]]:
-        if isinstance(input, str):
-            input = [{"role": "user", "content": input}]
-        if media is not None:
-            input[0]["media"] = [media]
-        return self.generate_code(input).code
-    def generate_code(self, chat: List[Message]) -> CodeContext:
+    ) -> str:
+        """Generate vision code from a conversation.
+        Parameters:
+            input (Union[str, List[Message]]): The input to the agent. This can be a
+                string or a list of messages in the format of [{"role": "user",
+                "content": "describe your task here..."}, ...].
+            media (Optional[Union[str, Path]]): The path to the media file to use with
+                the input. This can be an image or video file.
+        Returns:
+            str: The generated code as a string.
+        """
+        input_msg = convert_message_to_agentmessage(input, media)
+        return self.generate_code(input_msg).code
+    def generate_code(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        """Generate vision code from a conversation.
+        Parameters:
+            chat (List[AgentMessage]): The input to the agent. This should be a list of
+                AgentMessage objects.
+            code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
+        Returns:
+            CodeContext: The generated code as a CodeContext object which includes the
+                code, test code, whether or not it was executed successfully, and the
+                execution result.
+        """
         chat = copy.deepcopy(chat)
-        with CodeInterpreterFactory.new_instance(
-            self.code_sandbox_runtime
+        with (
+            CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
+            if code_interpreter is None
+            else code_interpreter
         ) as code_interpreter:
             int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)
-            plan_context = self.planner.generate_plan(int_chat, code_interpreter)  # type: ignore
+            plan_context = self.planner.generate_plan(
+                int_chat, max_steps=max_steps, code_interpreter=code_interpreter
+            )
             code_context = self.generate_code_from_plan(
                 orig_chat,
                 plan_context,
@@ -313,13 +371,30 @@ class VisionAgentCoderV2(Agent):
     def generate_code_from_plan(
         self,
-        chat: List[Message],
+        chat: List[AgentMessage],
         plan_context: PlanContext,
         code_interpreter: Optional[CodeInterpreter] = None,
     ) -> CodeContext:
+        """Generate vision code from a conversation and a previously made plan. This
+        will skip the planning step and go straight to generating code.
+        Parameters:
+            chat (List[AgentMessage]): The input to the agent. This should be a list of
+                AgentMessage objects.
+            plan_context (PlanContext): The plan context that was previously generated.
+            code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
+        Returns:
+            CodeContext: The generated code as a CodeContext object which includes the
+                code, test code, whether or not it was executed successfully, and the
+                execution result.
+        """
         chat = copy.deepcopy(chat)
-        with CodeInterpreterFactory.new_instance(
-            self.code_sandbox_runtime
+        with (
+            CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
+            if code_interpreter is None
+            else code_interpreter
         ) as code_interpreter:
             int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
             tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
@@ -331,10 +406,23 @@ class VisionAgentCoderV2(Agent):
                 plan=format_plan_v2(plan_context),
                 tool_docs=tool_docs,
                 code_interpreter=code_interpreter,
-                media_list=media_list,  # type: ignore
-                update_callback=self.update_callback,
+                media_list=media_list,
                 verbose=self.verbose,
             )
+        self.update_callback(
+            {
+                "role": "coder",
+                "content": format_code_context(code_context),
+                "media": capture_media_from_exec(code_context.test_result),
+            }
+        )
+        self.update_callback(
+            {
+                "role": "observation",
+                "content": code_context.test_result.text(),
+            }
+        )
         return code_context
     def log_progress(self, data: Dict[str, Any]) -> None:

vision_agent/agent/vision_agent_planner_prompts_v2.py CHANGED Viewed

@@ -389,7 +389,7 @@ for infos in obj_to_info:
 print(f"{len(objects_with_tape)} boxes with tape found")
 </execute_python>
-OBJERVATION:
+OBSERVATION:
 3 boxes were tracked
 2 boxes with tape found
 <count>6</count>

vision_agent/agent/vision_agent_planner_v2.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import copy
 import logging
+import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
@@ -10,16 +11,17 @@ from rich.markup import escape
 import vision_agent.tools as T
 import vision_agent.tools.planner_tools as pt
-from vision_agent.agent import Agent
+from vision_agent.agent import AgentPlanner
 from vision_agent.agent.agent_utils import (
-    PlanContext,
     add_media_to_chat,
     capture_media_from_exec,
+    convert_message_to_agentmessage,
     extract_json,
     extract_tag,
     print_code,
     print_table,
 )
+from vision_agent.agent.types import AgentMessage, PlanContext
 from vision_agent.agent.vision_agent_planner_prompts_v2 import (
     CRITIQUE_PLAN,
     EXAMPLE_PLAN1,
@@ -70,26 +72,24 @@ class DefaultPlanningImports:
 def get_planning(
-    chat: List[Message],
+    chat: List[AgentMessage],
 ) -> str:
     chat = copy.deepcopy(chat)
     planning = ""
     for chat_i in chat:
-        if chat_i["role"] == "user":
-            planning += f"USER: {chat_i['content']}\n\n"
-        elif chat_i["role"] == "observation":
-            planning += f"OBSERVATION: {chat_i['content']}\n\n"
-        elif chat_i["role"] == "assistant":
-            planning += f"ASSISTANT: {chat_i['content']}\n\n"
-        else:
-            raise ValueError(f"Unknown role: {chat_i['role']}")
+        if chat_i.role == "user":
+            planning += f"USER: {chat_i.content}\n\n"
+        elif chat_i.role == "observation":
+            planning += f"OBSERVATION: {chat_i.content}\n\n"
+        elif chat_i.role == "planner":
+            planning += f"AGENT: {chat_i.content}\n\n"
     return planning
 def run_planning(
-    chat: List[Message],
-    media_list: List[str],
+    chat: List[AgentMessage],
+    media_list: List[Union[str, Path]],
     model: LMM,
 ) -> str:
     # only keep last 10 messages for planning
@@ -102,16 +102,16 @@ def run_planning(
     )
     message: Message = {"role": "user", "content": prompt}
-    if chat[-1]["role"] == "observation" and "media" in chat[-1]:
-        message["media"] = chat[-1]["media"]
+    if chat[-1].role == "observation" and chat[-1].media is not None:
+        message["media"] = chat[-1].media
     response = model.chat([message])
     return cast(str, response)
 def run_multi_trial_planning(
-    chat: List[Message],
-    media_list: List[str],
+    chat: List[AgentMessage],
+    media_list: List[Union[str, Path]],
     model: LMM,
 ) -> str:
     planning = get_planning(chat)
@@ -123,8 +123,8 @@ def run_multi_trial_planning(
     )
     message: Message = {"role": "user", "content": prompt}
-    if chat[-1]["role"] == "observation" and "media" in chat[-1]:
-        message["media"] = chat[-1]["media"]
+    if chat[-1].role == "observation" and chat[-1].media is not None:
+        message["media"] = chat[-1].media
     responses = []
     with ThreadPoolExecutor() as executor:
@@ -151,7 +151,9 @@ def run_multi_trial_planning(
         return cast(str, responses[0])
-def run_critic(chat: List[Message], media_list: List[str], model: LMM) -> Optional[str]:
+def run_critic(
+    chat: List[AgentMessage], media_list: List[Union[str, Path]], model: LMM
+) -> Optional[str]:
     planning = get_planning(chat)
     prompt = CRITIQUE_PLAN.format(
         planning=planning,
@@ -196,17 +198,19 @@ def response_safeguards(response: str) -> str:
 def execute_code_action(
     code: str,
     code_interpreter: CodeInterpreter,
-    chat: List[Message],
+    chat: List[AgentMessage],
     model: LMM,
     verbose: bool = False,
 ) -> Tuple[Execution, str, str]:
     if verbose:
         print_code("Code to Execute:", code)
+    start = time.time()
     execution = code_interpreter.exec_cell(DefaultPlanningImports.prepend_imports(code))
+    end = time.time()
     obs = execution.text(include_results=False).strip()
     if verbose:
         _CONSOLE.print(
-            f"[bold cyan]Code Execution Output:[/bold cyan] [yellow]{escape(obs)}[/yellow]"
+            f"[bold cyan]Code Execution Output ({end - start:.2f} sec):[/bold cyan] [yellow]{escape(obs)}[/yellow]"
         )
     count = 1
@@ -246,13 +250,13 @@ def find_and_replace_code(response: str, code: str) -> str:
 def maybe_run_code(
     code: Optional[str],
     response: str,
-    chat: List[Message],
-    media_list: List[str],
+    chat: List[AgentMessage],
+    media_list: List[Union[str, Path]],
     model: LMM,
     code_interpreter: CodeInterpreter,
     verbose: bool = False,
-) -> List[Message]:
-    return_chat: List[Message] = []
+) -> List[AgentMessage]:
+    return_chat: List[AgentMessage] = []
     if code is not None:
         code = code_safeguards(code)
         execution, obs, code = execute_code_action(
@@ -262,30 +266,32 @@ def maybe_run_code(
         # if we had to debug the code to fix an issue, replace the old code
         # with the fixed code in the response
         fixed_response = find_and_replace_code(response, code)
-        return_chat.append({"role": "assistant", "content": fixed_response})
+        return_chat.append(
+            AgentMessage(role="planner", content=fixed_response, media=None)
+        )
         media_data = capture_media_from_exec(execution)
-        int_chat_elt: Message = {"role": "observation", "content": obs}
+        int_chat_elt = AgentMessage(role="observation", content=obs, media=None)
         if media_list:
-            int_chat_elt["media"] = media_data
+            int_chat_elt.media = cast(List[Union[str, Path]], media_data)
         return_chat.append(int_chat_elt)
     else:
-        return_chat.append({"role": "assistant", "content": response})
+        return_chat.append(AgentMessage(role="planner", content=response, media=None))
     return return_chat
 def create_finalize_plan(
-    chat: List[Message],
+    chat: List[AgentMessage],
     model: LMM,
     verbose: bool = False,
-) -> Tuple[List[Message], PlanContext]:
+) -> Tuple[List[AgentMessage], PlanContext]:
     prompt = FINALIZE_PLAN.format(
         planning=get_planning(chat),
         excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
     )
     response = model.chat([{"role": "user", "content": prompt}])
     plan_str = cast(str, response)
-    return_chat: List[Message] = [{"role": "assistant", "content": plan_str}]
+    return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]
     plan_json = extract_tag(plan_str, "json")
     plan = (
@@ -305,7 +311,16 @@ def create_finalize_plan(
     return return_chat, PlanContext(**plan)
-class VisionAgentPlannerV2(Agent):
+def get_steps(chat: List[AgentMessage], max_steps: int) -> int:
+    for chat_elt in reversed(chat):
+        if "<count>" in chat_elt.content:
+            return int(extract_tag(chat_elt.content, "count"))  # type: ignore
+    return max_steps
+class VisionAgentPlannerV2(AgentPlanner):
+    """VisionAgentPlannerV2 is a class that generates a plan to solve a vision task."""
     def __init__(
         self,
         planner: Optional[LMM] = None,
@@ -317,6 +332,25 @@ class VisionAgentPlannerV2(Agent):
         code_sandbox_runtime: Optional[str] = None,
         update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
     ) -> None:
+        """Initialize the VisionAgentPlannerV2.
+        Parameters:
+            planner (Optional[LMM]): The language model to use for planning. If None, a
+                default AnthropicLMM will be used.
+            critic (Optional[LMM]): The language model to use for critiquing the plan.
+                If None, a default AnthropicLMM will be used.
+            max_steps (int): The maximum number of steps to plan.
+            use_multi_trial_planning (bool): Whether to use multi-trial planning.
+            critique_steps (int): The number of steps between critiques. If
+                critique_steps is larger than max_steps, no critiques will be made.
+            verbose (bool): Whether to print out debug information.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
+                be one of: None, "local" or "e2b". If None, it will read from the
+                environment variable CODE_SANDBOX_RUNTIME.
+            update_callback (Callable[[Dict[str, Any]], None]): The callback function
+                that will send back intermediate conversation messages.
+        """
         self.planner = (
             planner
             if planner is not None
@@ -339,20 +373,42 @@ class VisionAgentPlannerV2(Agent):
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> Union[str, List[Message]]:
-        if isinstance(input, str):
-            if media is not None:
-                input = [{"role": "user", "content": input, "media": [media]}]
-            else:
-                input = [{"role": "user", "content": input}]
-        plan = self.generate_plan(input)
-        return str(plan)
+    ) -> str:
+        """Generate a plan to solve a vision task.
+        Parameters:
+            input (Union[str, List[Message]]): The input to the agent. This can be a
+                string or a list of messages in the format of [{"role": "user",
+                "content": "describe your task here..."}, ...].
+            media (Optional[Union[str, Path]]): The path to the media file to use with
+                the input. This can be an image or video file.
+        Returns:
+            str: The generated plan as a string.
+        """
+        input_msg = convert_message_to_agentmessage(input, media)
+        plan = self.generate_plan(input_msg)
+        return plan.plan
     def generate_plan(
         self,
-        chat: List[Message],
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
     ) -> PlanContext:
+        """Generate a plan to solve a vision task.
+        Parameters:
+            chat (List[AgentMessage]): The conversation messages to generate a plan for.
+            max_steps (Optional[int]): The maximum number of steps to plan.
+            code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
+        Returns:
+            PlanContext: The generated plan including the instructions and code snippets
+                needed to solve the task.
+        """
         if not chat:
             raise ValueError("Chat cannot be empty")
@@ -360,13 +416,16 @@ class VisionAgentPlannerV2(Agent):
         code_interpreter = code_interpreter or CodeInterpreterFactory.new_instance(
             self.code_sandbox_runtime
         )
+        max_steps = max_steps or self.max_steps
         with code_interpreter:
             critque_steps = 1
-            step = self.max_steps
             finished = False
             int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
-            int_chat[-1]["content"] += f"\n<count>{step}</count>\n"  # type: ignore
+            step = get_steps(int_chat, max_steps)
+            if "<count>" not in int_chat[-1].content and step == max_steps:
+                int_chat[-1].content += f"\n<count>{step}</count>\n"
             while step > 0 and not finished:
                 if self.use_multi_trial_planning:
                     response = run_multi_trial_planning(
@@ -402,29 +461,29 @@ class VisionAgentPlannerV2(Agent):
                 if critque_steps % self.critique_steps == 0:
                     critique = run_critic(int_chat, media_list, self.critic)
-                    if critique is not None and int_chat[-1]["role"] == "observation":
+                    if critique is not None and int_chat[-1].role == "observation":
                         _CONSOLE.print(
                             f"[bold cyan]Critique:[/bold cyan] [red]{critique}[/red]"
                         )
                         critique_str = f"\n[critique]\n{critique}\n[end of critique]"
-                        updated_chat[-1]["content"] += critique_str  # type: ignore
+                        updated_chat[-1].content += critique_str
                         # if plan was critiqued, ensure we don't finish so we can
                         # respond to the critique
                         finished = False
                 critque_steps += 1
                 step -= 1
-                updated_chat[-1]["content"] += f"\n<count>{step}</count>\n"  # type: ignore
+                updated_chat[-1].content += f"\n<count>{step}</count>\n"
                 int_chat.extend(updated_chat)
                 for chat_elt in updated_chat:
-                    self.update_callback(chat_elt)
+                    self.update_callback(chat_elt.model_dump())
             updated_chat, plan_context = create_finalize_plan(
                 int_chat, self.planner, self.verbose
             )
             int_chat.extend(updated_chat)
             for chat_elt in updated_chat:
-                self.update_callback(chat_elt)
+                self.update_callback(chat_elt.model_dump())
         return plan_context

vision_agent/agent/vision_agent_prompts.py CHANGED Viewed

@@ -55,10 +55,10 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
 OBSERVATION:
 [Artifact dog_detector.py (5 lines total)]
-0|from vision_agent.tools import load_image, owl_v2
+0|from vision_agent.tools import load_image, owl_v2_image
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
-3|    dogs = owl_v2("dog", image)
+3|    dogs = owl_v2_image("dog", image)
 4|    return dogs
 [End of artifact]
@@ -96,10 +96,10 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
 OBSERVATION:
 [Artifact dog_detector.py (5 lines total)]
-0|from vision_agent.tools import load_image, owl_v2
+0|from vision_agent.tools import load_image, owl_v2_image
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
-3|    dogs = owl_v2("dog", image, threshold=0.24)
+3|    dogs = owl_v2_image("dog", image, threshold=0.24)
 4|    return dogs
 [End of artifact]

vision_agent/agent/vision_agent_prompts_v2.py ADDED Viewed

@@ -0,0 +1,46 @@
+CONVERSATION = """
+**Role**: You are a help agent that assists users with their vision-related questions.
+**Actions**:
+`generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.
+`edit_code` - If the user already has code for a vision task, from a previous call to `generate_or_edit_vision_code` and wants to quickly modify the code, you can use this action to edit the code. This is good for non-vision related changes such as formatting the response as a json or removing print statements.
+**Exampels**:
+Here is an example of how you can interact with a user and Actions:
+--- START EXAMPLES ---
+USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg
+AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>
+OBSERVATION:
+<final_code>
+from vision_agent.tools import load_image, owl_v2_image
+def detect_dogs(image_path: str):
+    image = load_image(image_path)
+    dogs = owl_v2_image(image)
+    return dogs
+</final_code>
+<final_test>
+def test_detect_dogs():
+    dogs = detect_dogs("images/dogs.jpg")
+    assert len(dogs) > 0
+</final_test>
+AGENT: <response>Here is the code to detect dogs in the image.</response>
+--- END EXAMPLES ---
+**Conversation**:
+Here is the current conversation so far:
+--- START CONVERSATION ---
+{conversation}
+--- END CONVERSATION ---
+**Instructions**:
+1. Only respond with a single <response> tag and a single <action> tag.
+2. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
+<response>Your response to the user's message</response>
+<action>The action you want to take from **Actions**</action>
+"""

vision_agent/agent/vision_agent_v2.py ADDED Viewed

@@ -0,0 +1,215 @@
+import copy
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Union, cast
+from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
+from vision_agent.agent.agent_utils import (
+    add_media_to_chat,
+    convert_message_to_agentmessage,
+    extract_tag,
+)
+from vision_agent.agent.types import AgentMessage, PlanContext
+from vision_agent.agent.vision_agent_coder_v2 import format_code_context
+from vision_agent.agent.vision_agent_prompts_v2 import CONVERSATION
+from vision_agent.lmm import LMM, AnthropicLMM
+from vision_agent.lmm.types import Message
+from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
+def format_conversation(chat: List[AgentMessage]) -> str:
+    chat = copy.deepcopy(chat)
+    prompt = ""
+    for chat_i in chat:
+        if chat_i.role == "user":
+            prompt += f"USER: {chat_i.content}\n\n"
+        elif chat_i.role == "observation" or chat_i.role == "coder":
+            prompt += f"OBSERVATION: {chat_i.content}\n\n"
+        elif chat_i.role == "conversation":
+            prompt += f"AGENT: {chat_i.content}\n\n"
+    return prompt
+def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
+    # only keep last 10 messages
+    conv = format_conversation(chat[-10:])
+    prompt = CONVERSATION.format(
+        conversation=conv,
+    )
+    response = agent([{"role": "user", "content": prompt}], stream=False)
+    return cast(str, response)
+def extract_conversation_for_generate_code(
+    chat: List[AgentMessage],
+) -> List[AgentMessage]:
+    chat = copy.deepcopy(chat)
+    extracted_chat = []
+    for chat_i in chat:
+        if chat_i.role == "user":
+            extracted_chat.append(chat_i)
+        elif chat_i.role == "coder":
+            if "<final_code>" in chat_i.content and "<final_test>" in chat_i.content:
+                extracted_chat.append(chat_i)
+    return extracted_chat
+def maybe_run_action(
+    coder: AgentCoder,
+    action: Optional[str],
+    chat: List[AgentMessage],
+    code_interpreter: Optional[CodeInterpreter] = None,
+) -> Optional[List[AgentMessage]]:
+    if action == "generate_or_edit_vision_code":
+        extracted_chat = extract_conversation_for_generate_code(chat)
+        # there's an issue here because coder.generate_code will send its code_context
+        # to the outside user via its update_callback, but we don't necessarily have
+        # access to that update_callback here, so we re-create the message using
+        # format_code_context.
+        code_context = coder.generate_code(
+            extracted_chat, code_interpreter=code_interpreter
+        )
+        return [
+            AgentMessage(role="coder", content=format_code_context(code_context)),
+            AgentMessage(role="observation", content=code_context.test_result.text()),
+        ]
+    elif action == "edit_code":
+        extracted_chat = extract_conversation_for_generate_code(chat)
+        plan_context = PlanContext(
+            plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
+            instructions=[],
+            code="",
+        )
+        code_context = coder.generate_code_from_plan(
+            extracted_chat, plan_context, code_interpreter=code_interpreter
+        )
+        return [
+            AgentMessage(role="coder", content=format_code_context(code_context)),
+            AgentMessage(role="observation", content=code_context.test_result.text()),
+        ]
+    elif action == "view_image":
+        pass
+    return None
+class VisionAgentV2(Agent):
+    """VisionAgentV2 is a conversational agent that allows you to more easily use a
+    coder agent such as VisionAgentCoderV2 to write vision code for you.
+    """
+    def __init__(
+        self,
+        agent: Optional[LMM] = None,
+        coder: Optional[AgentCoder] = None,
+        verbose: bool = False,
+        code_sandbox_runtime: Optional[str] = None,
+        update_callback: Callable[[Dict[str, Any]], None] = lambda x: None,
+    ) -> None:
+        """Initialize the VisionAgentV2.
+        Parameters:
+            agent (Optional[LMM]): The language model to use for the agent. If None, a
+                default AnthropicLMM will be used.
+            coder (Optional[AgentCoder]): The coder agent to use for generating vision
+                code. If None, a default VisionAgentCoderV2 will be used.
+            verbose (bool): Whether to print out debug information.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
+                be one of: None, "local" or "e2b". If None, it will read from the
+                environment variable CODE_SANDBOX_RUNTIME.
+            update_callback (Callable[[Dict[str, Any]], None]): The callback function
+                that will send back intermediate conversation messages.
+        """
+        self.agent = (
+            agent
+            if agent is not None
+            else AnthropicLMM(
+                model_name="claude-3-5-sonnet-20241022",
+                temperature=0.0,
+            )
+        )
+        self.coder = (
+            coder
+            if coder is not None
+            else VisionAgentCoderV2(verbose=verbose, update_callback=update_callback)
+        )
+        self.verbose = verbose
+        self.code_sandbox_runtime = code_sandbox_runtime
+        self.update_callback = update_callback
+        # force coder to use the same update_callback
+        if hasattr(self.coder, "update_callback"):
+            self.coder.update_callback = update_callback
+    def __call__(
+        self,
+        input: Union[str, List[Message]],
+        media: Optional[Union[str, Path]] = None,
+    ) -> str:
+        """Conversational interface to the agent. This is the main method to use to
+        interact with the agent. It takes in a string or list of messages and returns
+        the agent's response as a string.
+        Parameters:
+            input (Union[str, List[Message]]): The input to the agent. This can be a
+                string or a list of messages in the format of [{"role": "user",
+                "content": "describe your task here..."}, ...].
+            media (Optional[Union[str, Path]]): The path to the media file to use with
+                the input. This can be an image or video file.
+        Returns:
+            str: The agent's response as a string.
+        """
+        input_msg = convert_message_to_agentmessage(input, media)
+        return self.chat(input_msg)[-1].content
+    def chat(
+        self,
+        chat: List[AgentMessage],
+    ) -> List[AgentMessage]:
+        """Conversational interface to the agent. This is the main method to use to
+        interact with the agent. It takes in a list of messages and returns the agent's
+        response as a list of messages.
+        Parameters:
+            chat (List[AgentMessage]): The input to the agent. This should be a list of
+                AgentMessage objects.
+        Returns:
+            List[AgentMessage]: The agent's response as a list of AgentMessage objects.
+        """
+        return_chat = []
+        with CodeInterpreterFactory.new_instance(
+            self.code_sandbox_runtime
+        ) as code_interpreter:
+            int_chat, _, _ = add_media_to_chat(chat, code_interpreter)
+            response_context = run_conversation(self.agent, int_chat)
+            return_chat.append(
+                AgentMessage(role="conversation", content=response_context)
+            )
+            self.update_callback(return_chat[-1].model_dump())
+            action = extract_tag(response_context, "action")
+            updated_chat = maybe_run_action(
+                self.coder, action, int_chat, code_interpreter=code_interpreter
+            )
+            if updated_chat is not None:
+                # do not append updated_chat to return_chat because the observation
+                # from running the action will have already been added via the callbacks
+                obs_response_context = run_conversation(
+                    self.agent, return_chat + updated_chat
+                )
+                return_chat.append(
+                    AgentMessage(role="conversation", content=obs_response_context)
+                )
+                self.update_callback(return_chat[-1].model_dump())
+        return return_chat
+    def log_progress(self, data: Dict[str, Any]) -> None:
+        pass

vision_agent/tools/tools.py CHANGED Viewed

@@ -2453,7 +2453,6 @@ FUNCTION_TOOLS = [
     owl_v2_image,
     owl_v2_video,
     ocr,
-    clip,
     vit_image_classification,
     vit_nsfw_classification,
     countgd_counting,
@@ -2471,6 +2470,7 @@ FUNCTION_TOOLS = [
     qwen2_vl_video_vqa,
     video_temporal_localization,
     flux_image_inpainting,
+    siglip_classification,
 ]
 UTIL_TOOLS = [

vision_agent/utils/execute.py CHANGED Viewed

@@ -38,7 +38,7 @@ from vision_agent.utils.exceptions import (
 load_dotenv()
 _LOGGER = logging.getLogger(__name__)
-_SESSION_TIMEOUT = 600  # 10 minutes
+_SESSION_TIMEOUT = 180  # 3 minutes
 WORKSPACE = Path(os.getenv("WORKSPACE", ""))

{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.198
+Version: 0.2.200
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/RECORD RENAMED Viewed

@@ -1,19 +1,22 @@
-vision_agent/.sim_tools/df.csv,sha256=IPYWrI8W8r7zs2-QRjGsAzlg1O3wqngGRSyz9F-BOpo,34008
-vision_agent/.sim_tools/embs.npy,sha256=ItJgcBpT0--0HeZjUV30INzFXNQh-12HoUVevNY38dc,356480
+vision_agent/.sim_tools/df.csv,sha256=0fmLwTDjnRTiqYwamTOdCPjruE6wZz0AVrONIPTHxZY,34086
+vision_agent/.sim_tools/embs.npy,sha256=xF8Cg7Xd09QCTySj831aL1O2_0kRNaaH8XRJIRjgWzQ,356480
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/agent/__init__.py,sha256=j4W3zHXKE96o93ZziY62ZBWgicLYEink1rIU3gPsfwM,548
-vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
-vision_agent/agent/agent_utils.py,sha256=jDkvanBsT_ZH7MnPWP_Wa_ToPOy4hdy4kTw9FZytwwo,12765
+vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
+vision_agent/agent/agent.py,sha256=sf8JcA3LNy_4GaS_gQb2Q-PXkl4dBuGh-7raI9KAtZo,1470
+vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
+vision_agent/agent/types.py,sha256=aAd_ez1-NQh04k27cmywyOV2uA_vWWYE-Ok7zq_JoAk,1532
 vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
 vision_agent/agent/vision_agent_coder.py,sha256=waCmw_NTgsy9G-UqlRZFhsFJJVuWVrjxVnShe4Xp_lI,27743
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
 vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
-vision_agent/agent/vision_agent_coder_v2.py,sha256=LVV5Ij-2s03Cj27VJZI11dMKios8ALYZ4_mZTpeMDJU,10863
+vision_agent/agent/vision_agent_coder_v2.py,sha256=SVIJC0N5TBgq9z-F99UebLimRuQuAe_HHvTFupBzVfo,14715
 vision_agent/agent/vision_agent_planner.py,sha256=F_5opnc0XmQmNH40rs2T7DFrai4CC6aDYe02Z8e93AM,18875
 vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
-vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=PrihfrkxbeVQNzR2Vu3UwG_PRjFsjoC9IQko3WfUqPM,33143
-vision_agent/agent/vision_agent_planner_v2.py,sha256=11pCfaXXsivV9DKWI7nDcLf5dJV3IyHX0IR4Zn7UC9E,14288
-vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
+vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=Tzon3h5iZdHJglesk8GVS-2myNf5-fhf7HUbkpZWHQk,33143
+vision_agent/agent/vision_agent_planner_v2.py,sha256=mxQxD_B8sKYharh8e7W0uc1tN11YCztyLowc83seScc,17023
+vision_agent/agent/vision_agent_prompts.py,sha256=PENFd8VM_vHKxeZPiotVM1RBVW9NrXimKbpvI1UteKI,13772
+vision_agent/agent/vision_agent_prompts_v2.py,sha256=-vCWat-ARlCOOOeIDIFhg-kcwRRwjTXYEwsvvqPeaCs,1972
+vision_agent/agent/vision_agent_v2.py,sha256=Cudp_ZZBI9rDwMjIYlvY4jzh_srsulYgfRWZLo4_2TQ,8366
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -27,16 +30,16 @@ vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB
 vision_agent/tools/planner_tools.py,sha256=FROahw_6Taqvytv6pOjCHUEypOfjsi_f8Vo1c5vz6Mw,8823
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=GDGOmBCo4UfYz-DJ-olREJHPsqs5mzHu0YXiAnpNE8E,10179
-vision_agent/tools/tools.py,sha256=Q8QSuOUk0df_XueU856vi21GOolp2TB3_f0WvMayjIA,87835
+vision_agent/tools/tools.py,sha256=wXDs0m_Yb601FQVp5fPYYVtt4lHUeMnuqIbfDZhsE4Q,87852
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo,28028
+vision_agent/utils/execute.py,sha256=b3AA1G16Ixwlgd-kke13brKclxh5nJXQTrk25oj1W3o,28027
 vision_agent/utils/image_utils.py,sha256=rRWcxKggPXIRXIY_XT9rZt30ECDRq8zq7FDeXRDqQWs,11679
 vision_agent/utils/sim.py,sha256=NZc9QGD6BTY5O29NVbHH7oxDePL_QMnylT1lYcDUn1Y,7437
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
-vision_agent-0.2.198.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.198.dist-info/METADATA,sha256=M_eocBYeL4bo3mFRuvvQ9wPiu2vieHArd8BZbintgTc,19026
-vision_agent-0.2.198.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.198.dist-info/RECORD,,
+vision_agent-0.2.200.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.200.dist-info/METADATA,sha256=goRTW73tD79-UlJiy4cL0twnVYm9iSjU9f5HsC4A1ZI,19026
+vision_agent-0.2.200.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.200.dist-info/RECORD,,

{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.198__py3-none-any.whl → 0.2.200__py3-none-any.whl

vision-agent 0.2.198py3-none-any.whl → 0.2.200py3-none-any.whl