PyPI - vision-agent - Versions diffs - 0.2.198__tar.gz → 0.2.200__tar.gz - Mend

vision-agent 0.2.198__tar.gz → 0.2.200__tar.gz

Files changed (46) hide show

{vision_agent-0.2.198 → vision_agent-0.2.200}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.198
+Version: 0.2.200
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.198 → vision_agent-0.2.200}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.198"
+version = "0.2.200"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/.sim_tools/df.csv RENAMED Viewed

@@ -80,24 +80,6 @@ desc,doc,name
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
     ",ocr
-'clip' is a tool that can classify an image or a cropped detection given a list of input classes or tags. It returns the same list of the input classes along with their probability scores based on image content.,"clip(image: numpy.ndarray, classes: List[str]) -> Dict[str, Any]:
-'clip' is a tool that can classify an image or a cropped detection given a list
-    of input classes or tags. It returns the same list of the input classes along with
-    their probability scores based on image content.
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-        classes (List[str]): The list of classes or tags that is associated with the image
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-            contains a list of given labels and other a list of scores.
-    Example
-    -------
-        >>> clip(image, ['dog', 'cat', 'bird'])
-        {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-    ",clip
 'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
 'vit_image_classification' is a tool that can classify an image. It returns a
     list of classes and their probability scores based on image content.
@@ -488,6 +470,24 @@ desc,doc,name
         ... )
         >>> save_image(result, ""inpainted_room.png"")
     ",flux_image_inpainting
+'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
+'siglip_classification' is a tool that can classify an image or a cropped detection given a list
+    of input labels or tags. It returns the same list of the input labels along with
+    their probability scores based on image content.
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+        labels (List[str]): The list of labels or tags that is associated with the image
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+            contains a list of given labels and other a list of scores.
+    Example
+    -------
+        >>> siglip_classification(image, ['dog', 'cat', 'bird'])
+        {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
+    ",siglip_classification
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
 'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries

{vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/.sim_tools/embs.npy RENAMED Viewed

Binary file

{vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from .agent import Agent
+from .agent import Agent, AgentCoder, AgentPlanner
 from .vision_agent import VisionAgent
 from .vision_agent_coder import (
     AnthropicVisionAgentCoder,
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
     VisionAgentPlanner,
 )
 from .vision_agent_planner_v2 import VisionAgentPlannerV2
+from .vision_agent_v2 import VisionAgentV2

vision_agent-0.2.200/vision_agent/agent/agent.py ADDED Viewed

@@ -0,0 +1,55 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
+from vision_agent.lmm.types import Message
+from vision_agent.utils.execute import CodeInterpreter
+class Agent(ABC):
+    @abstractmethod
+    def __call__(
+        self,
+        input: Union[str, List[Message]],
+        media: Optional[Union[str, Path]] = None,
+    ) -> Union[str, List[Message]]:
+        pass
+    @abstractmethod
+    def log_progress(self, data: Dict[str, Any]) -> None:
+        """Log the progress of the agent.
+        This is a hook that is intended for reporting the progress of the agent.
+        """
+        pass
+class AgentCoder(Agent):
+    @abstractmethod
+    def generate_code(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        pass
+    @abstractmethod
+    def generate_code_from_plan(
+        self,
+        chat: List[AgentMessage],
+        plan_context: PlanContext,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        pass
+class AgentPlanner(Agent):
+    @abstractmethod
+    def generate_plan(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> PlanContext:
+        pass

{vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/agent_utils.py RENAMED Viewed

@@ -4,16 +4,17 @@ import logging
 import re
 import sys
 import tempfile
-from typing import Any, Dict, List, Optional, Tuple, cast
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 import libcst as cst
-from pydantic import BaseModel
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
 from rich.table import Table
 import vision_agent.tools as T
+from vision_agent.agent.types import AgentMessage, PlanContext
 from vision_agent.lmm.types import Message
 from vision_agent.utils.execute import CodeInterpreter, Execution
 from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
@@ -24,19 +25,6 @@ _CONSOLE = Console()
 _MAX_TABULATE_COL_WIDTH = 80
-class PlanContext(BaseModel):
-    plan: str
-    instructions: List[str]
-    code: str
-class CodeContext(BaseModel):
-    code: str
-    test: str
-    success: bool
-    test_result: Execution
 def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
     json_pattern = r"\{.*\}"
     match = re.search(json_pattern, json_str, re.DOTALL)
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
 def add_media_to_chat(
-    chat: List[Message], code_interpreter: CodeInterpreter
-) -> Tuple[List[Message], List[Message], List[str]]:
+    chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
+) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
     orig_chat = copy.deepcopy(chat)
     int_chat = copy.deepcopy(chat)
-    media_list = []
+    media_list: List[Union[str, Path]] = []
     for chat_i in int_chat:
-        if "media" in chat_i:
-            media_list_i = []
-            for media in chat_i["media"]:
+        if chat_i.media is not None:
+            media_list_i: List[Union[str, Path]] = []
+            for media in chat_i.media:
                 if isinstance(media, str) and media.startswith("data:image/"):
                     media_pil = b64_to_pil(media)
                     with tempfile.NamedTemporaryFile(
@@ -244,25 +232,29 @@ def add_media_to_chat(
                     ) as temp_file:
                         media_pil.save(temp_file, format="PNG")
                         media = str(temp_file.name)
-                media = str(code_interpreter.upload_file(media))  # type: ignore
+                if code_interpreter is not None:
+                    media = str(code_interpreter.upload_file(media))
                 media_list_i.append(media)
-                # don't duplicate appending media name
-                if not str(chat_i["content"]).endswith(f" Media name {media}"):
-                    chat_i["content"] += f" Media name {media}"  # type: ignore
-            chat_i["media"] = media_list_i
+                # don't duplicate appending media name and only add them for user messages
+                if (
+                    not str(chat_i.content).endswith(f" Media name {media}")
+                    and chat_i.role == "user"
+                ):
+                    chat_i.content += f" Media name {media}"
+            chat_i.media = media_list_i if len(media_list_i) > 0 else None
             media_list.extend(media_list_i)
     int_chat = cast(
-        List[Message],
+        List[AgentMessage],
         [
             (
-                {
-                    "role": c["role"],
-                    "content": c["content"],
-                    "media": c["media"],
-                }
-                if "media" in c
-                else {"role": c["role"], "content": c["content"]}
+                AgentMessage(
+                    role=c.role,
+                    content=c.content,
+                    media=c.media,
+                )
+                if c.media is not None
+                else AgentMessage(role=c.role, content=c.content, media=None)
             )
             for c in int_chat
         ],
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
     return images
+def convert_message_to_agentmessage(
+    input: Union[str, List[Message]],
+    media: Optional[Union[str, Path]] = None,
+) -> List[AgentMessage]:
+    if isinstance(input, str):
+        input_msg = [
+            AgentMessage(
+                role="user",
+                content=input,
+                media=([media] if media is not None else None),
+            )
+        ]
+    else:
+        input_msg = [
+            AgentMessage(role=msg["role"], content=msg["content"], media=None)
+            for msg in input
+        ]
+        input_msg[0].media = [media] if media is not None else None
+    return input_msg
 def strip_function_calls(  # noqa: C901
     code: str, exclusions: Optional[List[str]] = None
 ) -> str:

vision_agent-0.2.200/vision_agent/agent/types.py ADDED Viewed

@@ -0,0 +1,51 @@
+from pathlib import Path
+from typing import List, Literal, Optional, Union
+from pydantic import BaseModel
+from vision_agent.utils.execute import Execution
+class AgentMessage(BaseModel):
+    """AgentMessage encompases messages sent to the entire Agentic system, which includes
+    both LMMs and sub-agents.
+    user: The user's message.
+    assistant: The assistant's message.
+    observation: An observation made after conducting an action, either by the user or
+        assistant.
+    interaction: An interaction between the user and the assistant. For example if the
+        assistant wants to ask the user for help on a task, it could send an
+        interaction message.
+    conversation: Messages coming from the conversation agent, this is a type of
+        assistant messages.
+    planner: Messages coming from the planner agent, this is a type of assistant
+        messages.
+    coder: Messages coming from the coder agent, this is a type of assistant messages.
+    """
+    role: Union[
+        Literal["user"],
+        Literal["assistant"],  # planner, coder and conversation are of type assistant
+        Literal["observation"],
+        Literal["interaction"],
+        Literal["conversation"],
+        Literal["planner"],
+        Literal["coder"],
+    ]
+    content: str
+    media: Optional[List[Union[str, Path]]] = None
+class PlanContext(BaseModel):
+    plan: str
+    instructions: List[str]
+    code: str
+class CodeContext(BaseModel):
+    code: str
+    test: str
+    success: bool
+    test_result: Execution

{vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder_v2.py RENAMED Viewed

@@ -6,19 +6,19 @@ from rich.console import Console
 from rich.markup import escape
 import vision_agent.tools as T
-from vision_agent.agent import Agent
+from vision_agent.agent import AgentCoder, AgentPlanner
 from vision_agent.agent.agent_utils import (
-    CodeContext,
     DefaultImports,
-    PlanContext,
     add_media_to_chat,
     capture_media_from_exec,
+    convert_message_to_agentmessage,
     extract_tag,
     format_feedback,
     format_plan_v2,
     print_code,
     strip_function_calls,
 )
+from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
 from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
 from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
 from vision_agent.lmm import LMM, AnthropicLMM
@@ -34,6 +34,12 @@ from vision_agent.utils.sim import Sim, load_cached_sim
 _CONSOLE = Console()
+def format_code_context(
+    code_context: CodeContext,
+) -> str:
+    return f"<final_code>{code_context.code}</final_code>\n<final_test>{code_context.test}</final_test>"
 def retrieve_tools(
     plan: List[str],
     tool_recommender: Sim,
@@ -49,46 +55,54 @@ def retrieve_tools(
 def write_code(
     coder: LMM,
-    chat: List[Message],
+    chat: List[AgentMessage],
     tool_docs: str,
     plan: str,
 ) -> str:
     chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
+    if chat[-1].role != "user":
         raise ValueError("Last chat message must be from the user.")
-    user_request = chat[-1]["content"]
+    user_request = chat[-1].content
     prompt = CODE.format(
         docstring=tool_docs,
         question=user_request,
         plan=plan,
     )
-    chat[-1]["content"] = prompt
-    response = coder(chat, stream=False)
-    return extract_tag(response, "code")  # type: ignore
+    response = cast(str, coder([{"role": "user", "content": prompt}], stream=False))
+    maybe_code = extract_tag(response, "code")
+    # if the response wasn't properly formatted with the code tags just retrun the response
+    if maybe_code is None:
+        return response
+    return maybe_code
 def write_test(
     tester: LMM,
-    chat: List[Message],
+    chat: List[AgentMessage],
     tool_util_docs: str,
     code: str,
     media_list: Optional[Sequence[Union[str, Path]]] = None,
 ) -> str:
     chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
+    if chat[-1].role != "user":
         raise ValueError("Last chat message must be from the user.")
-    user_request = chat[-1]["content"]
+    user_request = chat[-1].content
     prompt = TEST.format(
         docstring=tool_util_docs,
         question=user_request,
         code=code,
         media=media_list,
     )
-    chat[-1]["content"] = prompt
-    response = tester(chat, stream=False)
-    return extract_tag(response, "code")  # type: ignore
+    response = cast(str, tester([{"role": "user", "content": prompt}], stream=False))
+    maybe_code = extract_tag(response, "code")
+    # if the response wasn't properly formatted with the code tags just retrun the response
+    if maybe_code is None:
+        return response
+    return maybe_code
 def debug_code(
@@ -170,12 +184,11 @@ def write_and_test_code(
     coder: LMM,
     tester: LMM,
     debugger: LMM,
-    chat: List[Message],
+    chat: List[AgentMessage],
     plan: str,
     tool_docs: str,
     code_interpreter: CodeInterpreter,
     media_list: List[Union[str, Path]],
-    update_callback: Callable[[Dict[str, Any]], None],
     verbose: bool,
 ) -> CodeContext:
     code = write_code(
@@ -226,14 +239,6 @@ def write_and_test_code(
                 f"[bold cyan]Code execution result after attempted fix:[/bold cyan] [yellow]{escape(result.text(include_logs=True))}[/yellow]"
             )
-    update_callback(
-        {
-            "role": "assistant",
-            "content": f"<final_code>{DefaultImports.to_code_string()}\n{code}</final_code>\n<final_test>{DefaultImports.to_code_string()}\n{test}</final_test>",
-            "media": capture_media_from_exec(result),
-        }
-    )
     return CodeContext(
         code=f"{DefaultImports.to_code_string()}\n{code}",
         test=f"{DefaultImports.to_code_string()}\n{test}",
@@ -242,10 +247,12 @@ def write_and_test_code(
     )
-class VisionAgentCoderV2(Agent):
+class VisionAgentCoderV2(AgentCoder):
+    """VisionAgentCoderV2 is an agent that will write vision code for you."""
     def __init__(
         self,
-        planner: Optional[Agent] = None,
+        planner: Optional[AgentPlanner] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
@@ -254,6 +261,25 @@ class VisionAgentCoderV2(Agent):
         code_sandbox_runtime: Optional[str] = None,
         update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
     ) -> None:
+        """Initialize the VisionAgentCoderV2.
+        Parameters:
+            planner (Optional[AgentPlanner]): The planner agent to use for generating
+                vision plans. If None, a default VisionAgentPlannerV2 will be used.
+            coder (Optional[LMM]): The language model to use for the coder agent. If
+                None, a default AnthropicLMM will be used.
+            tester (Optional[LMM]): The language model to use for the tester agent. If
+                None, a default AnthropicLMM will be used.
+            debugger (Optional[LMM]): The language model to use for the debugger agent.
+            tool_recommender (Optional[Union[str, Sim]]): The tool recommender to use.
+            verbose (bool): Whether to print out debug information.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
+                be one of: None, "local" or "e2b". If None, it will read from the
+                environment variable CODE_SANDBOX_RUNTIME.
+            update_callback (Callable[[Dict[str, Any]], None]): The callback function
+                that will send back intermediate conversation messages.
+        """
         self.planner = (
             planner
             if planner is not None
@@ -290,20 +316,52 @@ class VisionAgentCoderV2(Agent):
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> Union[str, List[Message]]:
-        if isinstance(input, str):
-            input = [{"role": "user", "content": input}]
-        if media is not None:
-            input[0]["media"] = [media]
-        return self.generate_code(input).code
-    def generate_code(self, chat: List[Message]) -> CodeContext:
+    ) -> str:
+        """Generate vision code from a conversation.
+        Parameters:
+            input (Union[str, List[Message]]): The input to the agent. This can be a
+                string or a list of messages in the format of [{"role": "user",
+                "content": "describe your task here..."}, ...].
+            media (Optional[Union[str, Path]]): The path to the media file to use with
+                the input. This can be an image or video file.
+        Returns:
+            str: The generated code as a string.
+        """
+        input_msg = convert_message_to_agentmessage(input, media)
+        return self.generate_code(input_msg).code
+    def generate_code(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        """Generate vision code from a conversation.
+        Parameters:
+            chat (List[AgentMessage]): The input to the agent. This should be a list of
+                AgentMessage objects.
+            code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
+        Returns:
+            CodeContext: The generated code as a CodeContext object which includes the
+                code, test code, whether or not it was exceuted successfully, and the
+                execution result.
+        """
         chat = copy.deepcopy(chat)
-        with CodeInterpreterFactory.new_instance(
-            self.code_sandbox_runtime
+        with (
+            CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
+            if code_interpreter is None
+            else code_interpreter
         ) as code_interpreter:
             int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)
-            plan_context = self.planner.generate_plan(int_chat, code_interpreter)  # type: ignore
+            plan_context = self.planner.generate_plan(
+                int_chat, max_steps=max_steps, code_interpreter=code_interpreter
+            )
             code_context = self.generate_code_from_plan(
                 orig_chat,
                 plan_context,
@@ -313,13 +371,30 @@ class VisionAgentCoderV2(Agent):
     def generate_code_from_plan(
         self,
-        chat: List[Message],
+        chat: List[AgentMessage],
         plan_context: PlanContext,
         code_interpreter: Optional[CodeInterpreter] = None,
     ) -> CodeContext:
+        """Generate vision code from a conversation and a previously made plan. This
+        will skip the planning step and go straight to generating code.
+        Parameters:
+            chat (List[AgentMessage]): The input to the agent. This should be a list of
+                AgentMessage objects.
+            plan_context (PlanContext): The plan context that was previously generated.
+            code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
+        Returns:
+            CodeContext: The generated code as a CodeContext object which includes the
+                code, test code, whether or not it was exceuted successfully, and the
+                execution result.
+        """
         chat = copy.deepcopy(chat)
-        with CodeInterpreterFactory.new_instance(
-            self.code_sandbox_runtime
+        with (
+            CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
+            if code_interpreter is None
+            else code_interpreter
         ) as code_interpreter:
             int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
             tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
@@ -331,10 +406,23 @@ class VisionAgentCoderV2(Agent):
                 plan=format_plan_v2(plan_context),
                 tool_docs=tool_docs,
                 code_interpreter=code_interpreter,
-                media_list=media_list,  # type: ignore
-                update_callback=self.update_callback,
+                media_list=media_list,
                 verbose=self.verbose,
             )
+        self.update_callback(
+            {
+                "role": "coder",
+                "content": format_code_context(code_context),
+                "media": capture_media_from_exec(code_context.test_result),
+            }
+        )
+        self.update_callback(
+            {
+                "role": "observation",
+                "content": code_context.test_result.text(),
+            }
+        )
         return code_context
     def log_progress(self, data: Dict[str, Any]) -> None:

{vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner_prompts_v2.py RENAMED Viewed

@@ -389,7 +389,7 @@ for infos in obj_to_info:
 print(f"{len(objects_with_tape)} boxes with tape found")
 </execute_python>
-OBJERVATION:
+OBSERVATION:
 3 boxes were tracked
 2 boxes with tape found
 <count>6</count>

vision-agent 0.2.198__tar.gz → 0.2.200__tar.gz

vision-agent 0.2.198__tar.gz → 0.2.200__tar.gz