PyPI - vision-agent - Versions diffs - 0.2.199__py3-none-any.whl → 0.2.201__py3-none-any.whl - Mend

vision-agent 0.2.199__py3-none-any.whl → 0.2.201__py3-none-any.whl

Files changed (19) hide show

vision_agent/agent/__init__.py +2 -1
vision_agent/agent/agent.py +33 -0
vision_agent/agent/agent_utils.py +47 -34
vision_agent/agent/types.py +51 -0
vision_agent/agent/vision_agent.py +20 -77
vision_agent/agent/vision_agent_coder.py +0 -6
vision_agent/agent/vision_agent_coder_v2.py +131 -43
vision_agent/agent/vision_agent_planner.py +0 -6
vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
vision_agent/agent/vision_agent_planner_v2.py +109 -50
vision_agent/agent/vision_agent_prompts.py +4 -4
vision_agent/agent/vision_agent_prompts_v2.py +46 -0
vision_agent/agent/vision_agent_v2.py +215 -0
vision_agent/tools/meta_tools.py +18 -94
vision_agent/utils/execute.py +1 -1
{vision_agent-0.2.199.dist-info → vision_agent-0.2.201.dist-info}/METADATA +1 -1
{vision_agent-0.2.199.dist-info → vision_agent-0.2.201.dist-info}/RECORD +19 -16
{vision_agent-0.2.199.dist-info → vision_agent-0.2.201.dist-info}/LICENSE +0 -0
{vision_agent-0.2.199.dist-info → vision_agent-0.2.201.dist-info}/WHEEL +0 -0

vision_agent/agent/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .agent import Agent
+from .agent import Agent, AgentCoder, AgentPlanner
 from .vision_agent import VisionAgent
 from .vision_agent_coder import (
     AnthropicVisionAgentCoder,
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
     VisionAgentPlanner,
 )
 from .vision_agent_planner_v2 import VisionAgentPlannerV2
+from .vision_agent_v2 import VisionAgentV2

vision_agent/agent/agent.py CHANGED Viewed

@@ -2,7 +2,9 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
+from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
 from vision_agent.lmm.types import Message
+from vision_agent.utils.execute import CodeInterpreter
 class Agent(ABC):
@@ -20,3 +22,34 @@ class Agent(ABC):
         This is a hook that is intended for reporting the progress of the agent.
         """
         pass
+class AgentCoder(Agent):
+    @abstractmethod
+    def generate_code(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        pass
+    @abstractmethod
+    def generate_code_from_plan(
+        self,
+        chat: List[AgentMessage],
+        plan_context: PlanContext,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> CodeContext:
+        pass
+class AgentPlanner(Agent):
+    @abstractmethod
+    def generate_plan(
+        self,
+        chat: List[AgentMessage],
+        max_steps: Optional[int] = None,
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> PlanContext:
+        pass

vision_agent/agent/agent_utils.py CHANGED Viewed

@@ -4,16 +4,17 @@ import logging
 import re
 import sys
 import tempfile
-from typing import Any, Dict, List, Optional, Tuple, cast
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 import libcst as cst
-from pydantic import BaseModel
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
 from rich.table import Table
 import vision_agent.tools as T
+from vision_agent.agent.types import AgentMessage, PlanContext
 from vision_agent.lmm.types import Message
 from vision_agent.utils.execute import CodeInterpreter, Execution
 from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
@@ -24,19 +25,6 @@ _CONSOLE = Console()
 _MAX_TABULATE_COL_WIDTH = 80
-class PlanContext(BaseModel):
-    plan: str
-    instructions: List[str]
-    code: str
-class CodeContext(BaseModel):
-    code: str
-    test: str
-    success: bool
-    test_result: Execution
 def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
     json_pattern = r"\{.*\}"
     match = re.search(json_pattern, json_str, re.DOTALL)
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
 def add_media_to_chat(
-    chat: List[Message], code_interpreter: CodeInterpreter
-) -> Tuple[List[Message], List[Message], List[str]]:
+    chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
+) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
     orig_chat = copy.deepcopy(chat)
     int_chat = copy.deepcopy(chat)
-    media_list = []
+    media_list: List[Union[str, Path]] = []
     for chat_i in int_chat:
-        if "media" in chat_i:
-            media_list_i = []
-            for media in chat_i["media"]:
+        if chat_i.media is not None:
+            media_list_i: List[Union[str, Path]] = []
+            for media in chat_i.media:
                 if isinstance(media, str) and media.startswith("data:image/"):
                     media_pil = b64_to_pil(media)
                     with tempfile.NamedTemporaryFile(
@@ -244,25 +232,29 @@ def add_media_to_chat(
                     ) as temp_file:
                         media_pil.save(temp_file, format="PNG")
                         media = str(temp_file.name)
-                media = str(code_interpreter.upload_file(media))  # type: ignore
+                if code_interpreter is not None:
+                    media = str(code_interpreter.upload_file(media))
                 media_list_i.append(media)
-                # don't duplicate appending media name
-                if not str(chat_i["content"]).endswith(f" Media name {media}"):
-                    chat_i["content"] += f" Media name {media}"  # type: ignore
-            chat_i["media"] = media_list_i
+                # don't duplicate appending media name and only add them for user messages
+                if (
+                    not str(chat_i.content).endswith(f" Media name {media}")
+                    and chat_i.role == "user"
+                ):
+                    chat_i.content += f" Media name {media}"
+            chat_i.media = media_list_i if len(media_list_i) > 0 else None
             media_list.extend(media_list_i)
     int_chat = cast(
-        List[Message],
+        List[AgentMessage],
         [
             (
-                {
-                    "role": c["role"],
-                    "content": c["content"],
-                    "media": c["media"],
-                }
-                if "media" in c
-                else {"role": c["role"], "content": c["content"]}
+                AgentMessage(
+                    role=c.role,
+                    content=c.content,
+                    media=c.media,
+                )
+                if c.media is not None
+                else AgentMessage(role=c.role, content=c.content, media=None)
             )
             for c in int_chat
         ],
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
     return images
+def convert_message_to_agentmessage(
+    input: Union[str, List[Message]],
+    media: Optional[Union[str, Path]] = None,
+) -> List[AgentMessage]:
+    if isinstance(input, str):
+        input_msg = [
+            AgentMessage(
+                role="user",
+                content=input,
+                media=([media] if media is not None else None),
+            )
+        ]
+    else:
+        input_msg = [
+            AgentMessage(role=msg["role"], content=msg["content"], media=None)
+            for msg in input
+        ]
+        input_msg[0].media = [media] if media is not None else None
+    return input_msg
 def strip_function_calls(  # noqa: C901
     code: str, exclusions: Optional[List[str]] = None
 ) -> str:

vision_agent/agent/types.py ADDED Viewed

@@ -0,0 +1,51 @@
+from pathlib import Path
+from typing import List, Literal, Optional, Union
+from pydantic import BaseModel
+from vision_agent.utils.execute import Execution
+class AgentMessage(BaseModel):
+    """AgentMessage encompases messages sent to the entire Agentic system, which includes
+    both LMMs and sub-agents.
+    user: The user's message.
+    assistant: The assistant's message.
+    observation: An observation made after conducting an action, either by the user or
+        assistant.
+    interaction: An interaction between the user and the assistant. For example if the
+        assistant wants to ask the user for help on a task, it could send an
+        interaction message.
+    conversation: Messages coming from the conversation agent, this is a type of
+        assistant messages.
+    planner: Messages coming from the planner agent, this is a type of assistant
+        messages.
+    coder: Messages coming from the coder agent, this is a type of assistant messages.
+    """
+    role: Union[
+        Literal["user"],
+        Literal["assistant"],  # planner, coder and conversation are of type assistant
+        Literal["observation"],
+        Literal["interaction"],
+        Literal["conversation"],
+        Literal["planner"],
+        Literal["coder"],
+    ]
+    content: str
+    media: Optional[List[Union[str, Path]]] = None
+class PlanContext(BaseModel):
+    plan: str
+    instructions: List[str]
+    code: str
+class CodeContext(BaseModel):
+    code: str
+    test: str
+    success: bool
+    test_result: Execution

vision_agent/agent/vision_agent.py CHANGED Viewed

@@ -36,14 +36,10 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
-        "artifacts = Artifacts('{remote_path}', '{remote_path}')",
-        "artifacts.load('{remote_path}')",
-    ]
-    post_code = [
-        "capture_files_into_artifacts(artifacts)",
-        "artifacts.save()",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts",
+        "artifacts = Artifacts('{cwd}')",
     ]
+    post_code: List[str] = []
     @staticmethod
     def add_boilerplate(code: str, **format: Any) -> str:
@@ -149,9 +145,7 @@ def execute_code_action(
     code_interpreter: CodeInterpreter,
 ) -> Tuple[Execution, str]:
     result = code_interpreter.exec_isolation(
-        BoilerplateCode.add_boilerplate(
-            code, remote_path=str(artifacts.remote_save_path)
-        )
+        BoilerplateCode.add_boilerplate(code, cwd=str(artifacts.cwd))
     )
     obs = str(result.logs)
@@ -212,19 +206,6 @@ def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
     return response
-def setup_artifacts() -> Artifacts:
-    # this is setting remote artifacts path
-    sandbox = os.environ.get("CODE_SANDBOX_RUNTIME", None)
-    if sandbox is None or sandbox == "local":
-        remote = WORKSPACE / "artifacts.pkl"
-    elif sandbox == "e2b":
-        remote = Path("/home/user/artifacts.pkl")
-    else:
-        raise ValueError(f"Unknown code sandbox runtime {sandbox}")
-    artifacts = Artifacts(remote, Path(os.getcwd()) / "artifacts.pkl")
-    return artifacts
 def new_format_to_old_format(new_format: Dict[str, Any]) -> Dict[str, Any]:
     thoughts = new_format["thinking"] if new_format["thinking"] is not None else ""
     response = new_format["response"] if new_format["response"] is not None else ""
@@ -297,9 +278,10 @@ class VisionAgent(Agent):
     def __init__(
         self,
         agent: Optional[LMM] = None,
+        cwd: Optional[Union[Path, str]] = None,
         verbosity: int = 0,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
+        code_sandbox_runtime: Optional[str] = None,
     ) -> None:
         """Initialize the VisionAgent.
@@ -317,9 +299,10 @@ class VisionAgent(Agent):
         self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
         self.max_iterations = 12
+        self.cwd = Path(cwd) if cwd is not None else Path.cwd()
         self.verbosity = verbosity
-        self.code_interpreter = code_interpreter
         self.callback_message = callback_message
+        self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
@@ -397,40 +380,21 @@ class VisionAgent(Agent):
             raise ValueError("chat cannot be empty")
         if not artifacts:
-            artifacts = setup_artifacts()
-        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
-        code_interpreter = (
-            self.code_interpreter
-            if self.code_interpreter is not None
-            and not isinstance(self.code_interpreter, str)
-            else CodeInterpreterFactory.new_instance(
-                code_sandbox_runtime=self.code_interpreter,
-                remote_path=artifacts.remote_save_path.parent,
-            )
-        )
+            artifacts = Artifacts(self.cwd)
-        if code_interpreter.remote_path != artifacts.remote_save_path.parent:
-            raise ValueError(
-                f"Code interpreter remote path {code_interpreter.remote_path} does not match artifacts remote path {artifacts.remote_save_path.parent}"
-            )
-        with code_interpreter:
+        with CodeInterpreterFactory.new_instance(
+            code_sandbox_runtime=self.code_sandbox_runtime,
+            remote_path=self.cwd,
+        ) as code_interpreter:
             orig_chat = copy.deepcopy(chat)
             int_chat = copy.deepcopy(chat)
             last_user_message = chat[-1]
-            media_list = []
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
                         media = cast(str, media)
-                        artifacts.artifacts[Path(media).name] = open(media, "rb").read()
-                        media_remote_path = (
-                            Path(artifacts.remote_save_path.parent) / Path(media).name
-                        )
+                        media_remote_path = Path(artifacts.cwd) / Path(media).name
                         chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore
-                        media_list.append(media_remote_path)
             int_chat = cast(
                 List[Message],
@@ -452,15 +416,10 @@ class VisionAgent(Agent):
             iterations = 0
             last_response = None
-            # Save the current state of artifacts, will include any images the user
-            # passed in.
-            artifacts.save()
             # Upload artifacts to remote location and show where they are going
             # to be loaded to. The actual loading happens in BoilerplateCode as
             # part of the pre_code.
-            code_interpreter.upload_file(artifacts.local_save_path)
-            artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
+            artifacts_loaded = artifacts.show()
             int_chat.append({"role": "observation", "content": artifacts_loaded})
             orig_chat.append({"role": "observation", "content": artifacts_loaded})
             self.streaming_message({"role": "observation", "content": artifacts_loaded})
@@ -487,10 +446,6 @@ class VisionAgent(Agent):
                 )
             while not finished and iterations < self.max_iterations:
-                # ensure we upload the artifacts before each turn, so any local
-                # modifications we made to it will be reflected in the remote
-                code_interpreter.upload_file(artifacts.local_save_path)
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
                     _LOGGER.info(response)
@@ -555,11 +510,8 @@ class VisionAgent(Agent):
                     obs_chat_elt: Message = {"role": "observation", "content": obs}
                     media_obs = check_and_load_image(code_action)
                     if media_obs and result.success:
-                        # media paths will be under the local_save_path when we download
-                        # them after each turn
                         obs_chat_elt["media"] = [
-                            artifacts.local_save_path.parent / media_ob
-                            for media_ob in media_obs
+                            artifacts.cwd / media_ob for media_ob in media_obs
                         ]
                     if self.verbosity >= 1:
@@ -581,15 +533,6 @@ class VisionAgent(Agent):
                 iterations += 1
                 last_response = response
-                # after each turn, download the artifacts locally
-                code_interpreter.download_file(
-                    str(artifacts.remote_save_path.name),
-                    str(artifacts.local_save_path),
-                )
-                artifacts.load(
-                    artifacts.local_save_path, artifacts.local_save_path.parent
-                )
         return orig_chat, artifacts
     def streaming_message(self, message: Dict[str, Any]) -> None:
@@ -604,9 +547,9 @@ class OpenAIVisionAgent(VisionAgent):
     def __init__(
         self,
         agent: Optional[LMM] = None,
+        cwd: Optional[Union[Path, str]] = None,
         verbosity: int = 0,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the VisionAgent using OpenAI LMMs.
@@ -625,9 +568,9 @@ class OpenAIVisionAgent(VisionAgent):
         agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
         super().__init__(
             agent,
+            cwd,
             verbosity,
             callback_message,
-            code_interpreter,
         )
@@ -635,9 +578,9 @@ class AnthropicVisionAgent(VisionAgent):
     def __init__(
         self,
         agent: Optional[LMM] = None,
+        cwd: Optional[Union[Path, str]] = None,
         verbosity: int = 0,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the VisionAgent using Anthropic LMMs.
@@ -656,7 +599,7 @@ class AnthropicVisionAgent(VisionAgent):
         agent = AnthropicLMM(temperature=0.0) if agent is None else agent
         super().__init__(
             agent,
+            cwd,
             verbosity,
             callback_message,
-            code_interpreter,
         )

vision_agent/agent/vision_agent_coder.py CHANGED Viewed

@@ -450,12 +450,6 @@ class VisionAgentCoder(Agent):
             for chat_i in chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = (
-                            media
-                            if type(media) is str
-                            and media.startswith(("http", "https"))
-                            else code_interpreter.upload_file(cast(str, media))
-                        )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
                         media_list.append(str(media))

vision-agent 0.2.199__py3-none-any.whl → 0.2.201__py3-none-any.whl

vision-agent 0.2.199__py3-none-any.whl → 0.2.201__py3-none-any.whl