vision-agent 0.2.140__py3-none-any.whl → 0.2.141__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +64 -32
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/METADATA +60 -12
- vision_agent-0.2.141.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/WHEEL +0 -0
vision_agent/agent/agent_utils.py
CHANGED
@@ -40,12 +40,18 @@ def _strip_markdown_code(inp_str: str) -> str:
 
 
 def extract_json(json_str: str) -> Dict[str, Any]:
-    json_str = json_str.replace("\n", " ").strip()
+    json_str_mod = json_str.replace("\n", " ").strip()
+    json_str_mod = json_str_mod.replace("'", '"')
+    json_str_mod = json_str_mod.replace(": True", ": true").replace(
+        ": False", ": false"
+    )
 
     try:
-        return json.loads(json_str)  # type: ignore
+        return json.loads(json_str_mod)  # type: ignore
    except json.JSONDecodeError:
         json_orig = json_str
+        # don't replace quotes here or booleans since it can also introduce errors
+        json_str = json_str.replace("\n", " ").strip()
         json_str = _strip_markdown_code(json_str)
         json_str = _find_markdown_json(json_str)
         json_dict = _extract_sub_json(json_str)
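The change makes `extract_json` tolerant of Python-literal output (single-quoted keys, `True`/`False`) before falling back to the markdown-stripping path. A standalone sketch of the same normalization, not the library helper itself:

```python
import json
from typing import Any, Dict


def parse_lenient_json(raw: str) -> Dict[str, Any]:
    # Normalize Python-literal style dicts ('single quotes', True/False)
    # into strict JSON before attempting json.loads.
    mod = raw.replace("\n", " ").strip()
    mod = mod.replace("'", '"')
    mod = mod.replace(": True", ": true").replace(": False", ": false")
    return json.loads(mod)


# An orchestrator reply written as a Python dict rather than JSON:
print(parse_lenient_json("{'thoughts': 'done', 'let_user_respond': True}"))
# -> {'thoughts': 'done', 'let_user_respond': True}
```

Note that the `except` branch above deliberately skips the quote/boolean replacement: applying it to content that legitimately contains apostrophes could corrupt otherwise-valid JSON, as the new inline comment says.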
vision_agent/agent/vision_agent.py
CHANGED
@@ -3,18 +3,23 @@ import logging
 import os
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
 from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE1,
     EXAMPLES_CODE2,
+    EXAMPLES_CODE3,
     VA_CODE,
 )
-from vision_agent.lmm import LMM, Message, OpenAILMM
+from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
+from vision_agent.tools.meta_tools import (
+    Artifacts,
+    check_and_load_image,
+    use_extra_vision_agent_args,
+)
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
@@ -30,7 +35,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact,
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -68,10 +73,18 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
-        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
+        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
         conversation=conversation,
     )
-    return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
+    message: Message = {"role": "user", "content": prompt}
+    # only add recent media so we don't overload the model with old images
+    if (
+        chat[-1]["role"] == "observation"
+        and "media" in chat[-1]
+        and len(chat[-1]["media"]) > 0  # type: ignore
+    ):
+        message["media"] = chat[-1]["media"]
+    return extract_json(orch([message], stream=False))  # type: ignore
 
 
 def run_code_action(
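`run_conversation` now forwards media only from the most recent observation message. A minimal standalone sketch of that guard, assuming chat messages are plain dicts with an optional "media" list:

```python
from typing import Any, Dict, List

chat: List[Dict[str, Any]] = [
    {"role": "user", "content": "count the cars"},
    {"role": "observation", "content": "wrote output.png", "media": ["output.png"]},
]

message: Dict[str, Any] = {"role": "user", "content": "<formatted VA_CODE prompt>"}
last = chat[-1]
# Attach media only from the latest observation so older images don't
# accumulate in the orchestrator's context window.
if last["role"] == "observation" and len(last.get("media", [])) > 0:
    message["media"] = last["media"]

print(message["media"])  # ['output.png']
```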
@@ -136,10 +149,8 @@ class VisionAgent(Agent):
         code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
         """
 
-        self.agent = (
-            OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
-        )
-        self.max_iterations = 100
+        self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
+        self.max_iterations = 12
         self.verbosity = verbosity
         self.code_sandbox_runtime = code_sandbox_runtime
         self.callback_message = callback_message
@@ -267,7 +278,8 @@ class VisionAgent(Agent):
             orig_chat.append({"role": "observation", "content": artifacts_loaded})
             self.streaming_message({"role": "observation", "content": artifacts_loaded})
 
-            if
+            if int_chat[-1]["role"] == "user":
+                last_user_message_content = cast(str, int_chat[-1].get("content", ""))
             user_code_action = parse_execution(last_user_message_content, False)
             if user_code_action is not None:
                 user_result, user_obs = run_code_action(
@@ -309,8 +321,7 @@ class VisionAgent(Agent):
             else:
                 self.streaming_message({"role": "assistant", "content": response})
 
-            if response["let_user_respond"]:
-                break
+            finished = response["let_user_respond"]
 
             code_action = parse_execution(
                 response["response"], test_multi_plan, customized_tool_names
@@ -321,13 +332,22 @@ class VisionAgent(Agent):
                     code_action, code_interpreter, str(remote_artifacts_path)
                 )
 
+                media_obs = check_and_load_image(code_action)
+
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)
+
+                chat_elt: Message = {"role": "observation", "content": obs}
+                if media_obs and result.success:
+                    chat_elt["media"] = [
+                        Path(code_interpreter.remote_path) / media_ob
+                        for media_ob in media_obs
+                    ]
+
                 # don't add execution results to internal chat
-                int_chat.append({"role": "observation", "content": obs})
-                orig_chat.append(
-                    {"role": "observation", "content": obs, "execution": result}
-                )
+                int_chat.append(chat_elt)
+                chat_elt["execution"] = result
+                orig_chat.append(chat_elt)
                 self.streaming_message(
                     {
                         "role": "observation",
@@ -353,3 +373,63 @@ class VisionAgent(Agent):
 
     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
+
+
+class OpenAIVisionAgent(VisionAgent):
+    def __init__(
+        self,
+        agent: Optional[LMM] = None,
+        verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
+    ) -> None:
+        """Initialize the VisionAgent using OpenAI LMMs.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
+        agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
+        super().__init__(
+            agent,
+            verbosity,
+            local_artifacts_path,
+            code_sandbox_runtime,
+            callback_message,
+        )
+
+
+class AnthropicVisionAgent(VisionAgent):
+    def __init__(
+        self,
+        agent: Optional[LMM] = None,
+        verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
+    ) -> None:
+        """Initialize the VisionAgent using Anthropic LMMs.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
+        agent = AnthropicLMM(temperature=0.0) if agent is None else agent
+        super().__init__(
+            agent,
+            verbosity,
+            local_artifacts_path,
+            code_sandbox_runtime,
+            callback_message,
+        )
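Based on the constructors added above, the two subclasses differ only in the default orchestration LMM. A hypothetical usage sketch (assumes the relevant ANTHROPIC_API_KEY / OPENAI_API_KEY is configured; nothing beyond construction is shown in this diff):

```python
from vision_agent.agent.vision_agent import AnthropicVisionAgent, OpenAIVisionAgent

# Defaults per the constructors above: AnthropicLMM(temperature=0.0) vs.
# OpenAILMM(temperature=0.0, json_mode=True); all other parameters are
# forwarded unchanged to VisionAgent.__init__.
agent = AnthropicVisionAgent(verbosity=1)
# agent = OpenAIVisionAgent(verbosity=1)  # OpenAI-backed alternative
```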
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -2,12 +2,10 @@ import copy
 import logging
 import os
 import sys
-import tempfile
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
 
-from PIL import Image
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
@@ -29,8 +27,8 @@ from vision_agent.agent.vision_agent_coder_prompts import (
 )
 from vision_agent.lmm import (
     LMM,
+    AnthropicLMM,
     AzureOpenAILMM,
-    ClaudeSonnetLMM,
     Message,
     OllamaLMM,
     OpenAILMM,
@@ -53,6 +51,9 @@ class DefaultImports:
     """Container for default imports used in the code execution."""
 
     common_imports = [
+        "import os",
+        "import numpy as np",
+        "from vision_agent.tools import *",
         "from typing import *",
         "from pillow_heif import register_heif_opener",
         "register_heif_opener()",
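`DefaultImports.prepend_imports(code)` is called later in this diff to put these lines ahead of generated code before it runs in the sandbox. A minimal sketch of that pattern; the method's actual body isn't shown in this hunk, so treat this as an assumption:

```python
# Sketch of the prepend-imports pattern used by DefaultImports.
COMMON_IMPORTS = [
    "import os",
    "import numpy as np",
    "from vision_agent.tools import *",
    "from typing import *",
    "from pillow_heif import register_heif_opener",
    "register_heif_opener()",
]


def prepend_imports(code: str) -> str:
    # Generated code then executes with os/numpy/tools already in scope.
    return "\n".join(COMMON_IMPORTS) + "\n\n" + code


print(prepend_imports("image = load_image(os.path.join('data', 'dog.jpg'))"))
```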
@@ -92,29 +93,6 @@ def format_plans(plans: Dict[str, Any]) -> str:
     return plan_str
 
 
-def extract_image(
-    media: Optional[Sequence[Union[str, Path]]],
-) -> Optional[Sequence[Union[str, Path]]]:
-    if media is None:
-        return None
-
-    new_media = []
-    for m in media:
-        m = Path(m)
-        extension = m.suffix
-        if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
-            new_media.append(m)
-        elif extension in [".mp4", ".mov"]:
-            frames = T.extract_frames(m)
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                if len(frames) > 0:
-                    Image.fromarray(frames[0][0]).save(tmp.name)
-                    new_media.append(Path(tmp.name))
-    if len(new_media) == 0:
-        return None
-    return new_media
-
-
 def write_plans(
     chat: List[Message],
     tool_desc: str,
@@ -146,7 +124,7 @@ def pick_plan(
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
     max_retries: int = 3,
-) -> Tuple[str, str]:
+) -> Tuple[Dict[str, str], str]:
     log_progress(
         {
             "type": "log",
@@ -199,7 +177,10 @@ def pick_plan(
 
     # retry if the tool output is empty or code fails
     count = 0
-    while (
+    while (
+        not tool_output.success
+        or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
+    ) and count < max_retries:
         prompt = TEST_PLANS.format(
             docstring=tool_info,
             plans=plan_str,
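The reworked loop retries not only when execution fails but also when it succeeds silently, since empty output gives the plan-picking model nothing to compare plans with. A runnable standalone sketch of the condition; `run_candidate_code` and the dataclasses are hypothetical stand-ins for the regenerate-and-execute step:

```python
import random
from dataclasses import dataclass, field
from typing import List


@dataclass
class Logs:
    stdout: List[str] = field(default_factory=list)
    stderr: List[str] = field(default_factory=list)


@dataclass
class Execution:
    success: bool
    logs: Logs


def run_candidate_code() -> Execution:
    # Hypothetical stand-in for "regenerate the test code and execute it".
    ok = random.random() > 0.5
    return Execution(success=ok, logs=Logs(stdout=["plan1: 3 cars"] if ok else []))


max_retries = 3
count = 0
tool_output = run_candidate_code()
# Retry while the run failed OR succeeded with no stdout/stderr at all.
while (
    not tool_output.success
    or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
) and count < max_retries:
    tool_output = run_candidate_code()
    count += 1
```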
@@ -238,6 +219,7 @@ def pick_plan(
         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
             _LOGGER.info(f"Code execution result after attempt {count + 1}")
+            _LOGGER.info(f"{tool_output_str}")
 
         count += 1
 
@@ -256,10 +238,10 @@ def pick_plan(
     chat[-1]["content"] = prompt
 
     count = 0
-    best_plan = None
-    while best_plan is None and count < max_retries:
+    plan_thoughts = None
+    while plan_thoughts is None and count < max_retries:
         try:
-            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+            plan_thoughts = extract_json(model(chat, stream=False))  # type: ignore
         except JSONDecodeError as e:
             _LOGGER.exception(
                 f"Error while extracting JSON during picking best plan {str(e)}"
@@ -268,23 +250,27 @@ def pick_plan(
         count += 1
 
     if (
-        best_plan is None
-        or "best_plan" not in best_plan
-        or ("best_plan" in best_plan and best_plan["best_plan"] not in plans)
+        plan_thoughts is None
+        or "best_plan" not in plan_thoughts
+        or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
     ):
-        best_plan = {"best_plan": list(plans.keys())[0]}
+        _LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
+        plan_thoughts = {"best_plan": list(plans.keys())[0]}
+
+    if "thoughts" not in plan_thoughts:
+        plan_thoughts["thoughts"] = ""
 
     if verbosity >= 1:
-        _LOGGER.info(f"Best plan:\n{best_plan}")
+        _LOGGER.info(f"Best plan:\n{plan_thoughts}")
     log_progress(
         {
             "type": "log",
             "log_content": "Picked best plan",
             "status": "completed",
-            "payload": plans[best_plan["best_plan"]],
+            "payload": plans[plan_thoughts["best_plan"]],
         }
     )
-    return best_plan["best_plan"], "```python\n" + code + "\n```\n" + tool_output_str
+    return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
 
 
 def write_code(
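`pick_plan` now returns the whole `plan_thoughts` dict and guards against malformed model output: a missing or unknown plan name falls back to the first plan, and a missing "thoughts" key becomes an empty string. A standalone sketch of that fallback logic:

```python
from typing import Any, Dict, Optional

plans = {
    "plan1": {"instructions": ["use tool A"]},
    "plan2": {"instructions": ["use tool B"]},
}

# Simulate malformed model output naming a plan that doesn't exist.
plan_thoughts: Optional[Dict[str, Any]] = {"best_plan": "plan9"}

if (
    plan_thoughts is None
    or "best_plan" not in plan_thoughts
    or plan_thoughts["best_plan"] not in plans
):
    plan_thoughts = {"best_plan": list(plans.keys())[0]}
plan_thoughts.setdefault("thoughts", "")

print(plan_thoughts)  # {'best_plan': 'plan1', 'thoughts': ''}
```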
@@ -292,6 +278,7 @@ def write_code(
     chat: List[Message],
     plan: str,
     tool_info: str,
+    plan_thoughts: str,
     tool_output: str,
     feedback: str,
 ) -> str:
@@ -304,6 +291,7 @@ def write_code(
         docstring=tool_info,
         question=FULL_TASK.format(user_request=user_request, subtasks=plan),
         tool_output=tool_output,
+        plan_thoughts=plan_thoughts,
         feedback=feedback,
     )
     chat[-1]["content"] = prompt
@@ -339,6 +327,7 @@ def write_and_test_code(
     plan: str,
     tool_info: str,
     tool_output: str,
+    plan_thoughts: str,
     tool_utils: str,
     working_memory: List[Dict[str, str]],
     coder: LMM,
@@ -363,6 +352,7 @@ def write_and_test_code(
         plan,
         tool_info,
         tool_output,
+        plan_thoughts,
         format_memory(working_memory),
     )
     test = write_test(
@@ -634,31 +624,30 @@ class VisionAgentCoder(Agent):
         """Initialize the Vision Agent Coder.
 
         Parameters:
-            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
-            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
-            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
-            debugger (Optional[LMM]): The debugger model to use. Defaults to OpenAILMM.
+            planner (Optional[LMM]): The planner model to use. Defaults to AnthropicLMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
+            debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
                 code.
-            report_progress_callback: a callback to report the progress of the agent.
-                This is useful for streaming logs in a web application where multiple
-                VisionAgentCoder instances are running in parallel. This callback
-                ensures that the progress are not mixed up.
-            code_sandbox_runtime: the code sandbox runtime to use. A code sandbox is
-                used to run the generated code. It can be one of the following
-                values: None, "local" or "e2b". If None, VisionAgentCoder will read
-                the value from the environment variable CODE_SANDBOX_RUNTIME. If it's
-                also None, the local python runtime environment will be used.
+            report_progress_callback (Optional[Callable[Dict[str, Any]]]): a callback
+                to report the progress of the agent. This is useful for streaming logs
+                in a web application where multiple VisionAgentCoder instances are
+                running in parallel. This callback ensures that the progress are not
+                mixed up.
+            code_sandbox_runtime (Optional[str]): the code sandbox runtime to use. A
+                code sandbox is used to run the generated code. It can be one of the
+                following values: None, "local" or "e2b". If None, VisionAgentCoder
+                will read the value from the environment variable CODE_SANDBOX_RUNTIME.
+                If it's also None, the local python runtime environment will be used.
         """
 
-        self.planner = (
-            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
-        )
-        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
-        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
-        self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
+        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
+        self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
+        self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
         self.verbosity = verbosity
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
@@ -785,7 +774,7 @@ class VisionAgentCoder(Agent):
         )
 
         if test_multi_plan:
-            best_plan, tool_output_str = pick_plan(
+            plan_thoughts, tool_output_str = pick_plan(
                 int_chat,
                 plans,
                 tool_infos["all"],
@@ -795,9 +784,12 @@ class VisionAgentCoder(Agent):
                 self.log_progress,
                 verbosity=self.verbosity,
             )
+            best_plan = plan_thoughts["best_plan"]
+            plan_thoughts_str = plan_thoughts["thoughts"]
         else:
             best_plan = list(plans.keys())[0]
             tool_output_str = ""
+            plan_thoughts_str = ""
 
         if best_plan in plans and best_plan in tool_infos:
             plan_i = plans[best_plan]
@@ -832,6 +824,7 @@ class VisionAgentCoder(Agent):
                 + "\n-".join([e for e in plan_i["instructions"]]),
                 tool_info=tool_info,
                 tool_output=tool_output_str,
+                plan_thoughts=plan_thoughts_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
                 working_memory=working_memory,
                 coder=self.coder,
@@ -862,7 +855,8 @@ class VisionAgentCoder(Agent):
             "code": DefaultImports.prepend_imports(code),
             "test": test,
             "test_result": execution_result,
-            "
+            "plans": plans,
+            "plan_thoughts": plan_thoughts_str,
             "working_memory": working_memory,
         }
 
@@ -904,7 +898,9 @@ class VisionAgentCoder(Agent):
         )
 
 
-class ClaudeVisionAgentCoder(VisionAgentCoder):
+class OpenAIVisionAgentCoder(VisionAgentCoder):
+    """Initializes Vision Agent Coder using OpenAI models for planning, coding, testing."""
+
     def __init__(
         self,
         planner: Optional[LMM] = None,
@@ -916,13 +912,44 @@ class ClaudeVisionAgentCoder(VisionAgentCoder):
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
-        self.planner = ClaudeSonnetLMM(temperature=0.0) if planner is None else planner
-        self.coder = ClaudeSonnetLMM(temperature=0.0) if coder is None else coder
-        self.tester = ClaudeSonnetLMM(temperature=0.0) if tester is None else tester
-        self.debugger = ClaudeSonnetLMM(temperature=0.0) if debugger is None else debugger
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc") if tool_recommender is None else tool_recommender
+        self.planner = (
+            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
+        )
+        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
+        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
+        self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
+        self.verbosity = verbosity
+        if self.verbosity > 0:
+            _LOGGER.setLevel(logging.INFO)
+
+        self.tool_recommender = (
+            Sim(T.TOOLS_DF, sim_key="desc")
+            if tool_recommender is None
+            else tool_recommender
         )
+        self.report_progress_callback = report_progress_callback
+        self.code_sandbox_runtime = code_sandbox_runtime
+
+
+class AnthropicVisionAgentCoder(VisionAgentCoder):
+    """Initializes Vision Agent Coder using Anthropic models for planning, coding, testing."""
+
+    def __init__(
+        self,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
+        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+    ) -> None:
+        # NOTE: Claude doesn't have an official JSON mode
+        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
+        self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
+        self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
         self.verbosity = verbosity
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)