PyPI - vision-agent - Versions diffs - 0.2.161__py3-none-any.whl → 0.2.163__py3-none-any.whl - Mend

vision-agent 0.2.161__py3-none-any.whl → 0.2.163__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

vision_agent/agent/__init__.py +8 -0
vision_agent/agent/agent_utils.py +98 -2
vision_agent/agent/vision_agent.py +54 -22
vision_agent/agent/vision_agent_coder.py +222 -512
vision_agent/agent/vision_agent_coder_prompts.py +12 -221
vision_agent/agent/vision_agent_planner.py +583 -0
vision_agent/agent/vision_agent_planner_prompts.py +199 -0
vision_agent/tools/__init__.py +0 -1
vision_agent/tools/meta_tools.py +107 -35
vision_agent/tools/tools.py +2 -2
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/METADATA +8 -7
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/RECORD +14 -12
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/LICENSE +0 -0
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/WHEEL +0 -0

vision_agent/agent/vision_agent_coder.py CHANGED Viewed

@@ -2,32 +2,35 @@ import copy
 import logging
 import os
 import sys
-from json import JSONDecodeError
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
-from rich.console import Console
-from rich.style import Style
-from rich.syntax import Syntax
+from redbaron import RedBaron  # type: ignore
 from tabulate import tabulate
 import vision_agent.tools as T
-from vision_agent.agent import Agent
+from vision_agent.agent.agent import Agent
 from vision_agent.agent.agent_utils import (
+    _MAX_TABULATE_COL_WIDTH,
+    DefaultImports,
     extract_code,
-    extract_json,
+    extract_tag,
+    format_memory,
+    print_code,
     remove_installs_from_code,
 )
 from vision_agent.agent.vision_agent_coder_prompts import (
     CODE,
     FIX_BUG,
     FULL_TASK,
-    PICK_PLAN,
-    PLAN,
-    PREVIOUS_FAILED,
     SIMPLE_TEST,
-    TEST_PLANS,
-    USER_REQ,
+)
+from vision_agent.agent.vision_agent_planner import (
+    AnthropicVisionAgentPlanner,
+    AzureVisionAgentPlanner,
+    OllamaVisionAgentPlanner,
+    OpenAIVisionAgentPlanner,
+    PlanContext,
 )
 from vision_agent.lmm import (
     LMM,
@@ -40,241 +43,48 @@ from vision_agent.lmm import (
 from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
-from vision_agent.utils.image_utils import b64_to_pil
-from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
-from vision_agent.utils.video import play_video
 logging.basicConfig(stream=sys.stdout)
 WORKSPACE = Path(os.getenv("WORKSPACE", ""))
 _LOGGER = logging.getLogger(__name__)
-_MAX_TABULATE_COL_WIDTH = 80
-_CONSOLE = Console()
-class DefaultImports:
-    """Container for default imports used in the code execution."""
-    common_imports = [
-        "import os",
-        "import numpy as np",
-        "from vision_agent.tools import *",
-        "from typing import *",
-        "from pillow_heif import register_heif_opener",
-        "register_heif_opener()",
-    ]
-    @staticmethod
-    def to_code_string() -> str:
-        return "\n".join(DefaultImports.common_imports + T.__new_tools__)
-    @staticmethod
-    def prepend_imports(code: str) -> str:
-        """Run this method to prepend the default imports to the code.
-        NOTE: be sure to run this method after the custom tools have been registered.
-        """
-        return DefaultImports.to_code_string() + "\n\n" + code
-def format_memory(memory: List[Dict[str, str]]) -> str:
-    output_str = ""
-    for i, m in enumerate(memory):
-        output_str += f"### Feedback {i}:\n"
-        output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
-        output_str += f"Feedback {i}: {m['feedback']}\n\n"
-        if "edits" in m:
-            output_str += f"Edits {i}:\n{m['edits']}\n"
-        output_str += "\n"
-    return output_str
-def format_plans(plans: Dict[str, Any]) -> str:
-    plan_str = ""
-    for k, v in plans.items():
-        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
-        plan_str += "    -" + "\n    -".join([e for e in v["instructions"]])
-    return plan_str
-def write_plans(
-    chat: List[Message],
-    tool_desc: str,
-    working_memory: str,
-    model: LMM,
-) -> Dict[str, Any]:
-    chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
-        raise ValueError("Last chat message must be from the user.")
-    user_request = chat[-1]["content"]
-    context = USER_REQ.format(user_request=user_request)
-    prompt = PLAN.format(
-        context=context,
-        tool_desc=tool_desc,
-        feedback=working_memory,
-    )
-    chat[-1]["content"] = prompt
-    return extract_json(model(chat, stream=False))  # type: ignore
-def pick_plan(
-    chat: List[Message],
-    plans: Dict[str, Any],
-    tool_info: str,
-    model: LMM,
-    code_interpreter: CodeInterpreter,
-    media: List[str],
-    log_progress: Callable[[Dict[str, Any]], None],
-    verbosity: int = 0,
-    max_retries: int = 3,
-) -> Tuple[Dict[str, str], str]:
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Generating code to pick the best plan",
-            "status": "started",
-        }
-    )
-    chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
-        raise ValueError("Last chat message must be from the user.")
-    plan_str = format_plans(plans)
-    prompt = TEST_PLANS.format(
-        docstring=tool_info, plans=plan_str, previous_attempts="", media=media
-    )
-    code = extract_code(model(prompt, stream=False))  # type: ignore
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Executing code to test plans",
-            "code": DefaultImports.prepend_imports(code),
-            "status": "running",
-        }
-    )
-    tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
-    # Because of the way we trace function calls the trace information ends up in the
-    # results. We don't want to show this info to the LLM so we don't include it in the
-    # tool_output_str.
-    tool_output_str = tool_output.text(include_results=False).strip()
-    if verbosity == 2:
-        _print_code("Initial code and tests:", code)
-        _LOGGER.info(f"Initial code execution result:\n{tool_output_str}")
-    log_progress(
-        {
-            "type": "log",
-            "log_content": (
-                "Code execution succeeded"
-                if tool_output.success
-                else "Code execution failed"
-            ),
-            "code": DefaultImports.prepend_imports(code),
-            # "payload": tool_output.to_json(),
-            "status": "completed" if tool_output.success else "failed",
-        }
-    )
-    # retry if the tool output is empty or code fails
-    count = 0
-    while (
-        not tool_output.success
-        or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
-    ) and count < max_retries:
-        prompt = TEST_PLANS.format(
-            docstring=tool_info,
-            plans=plan_str,
-            previous_attempts=PREVIOUS_FAILED.format(
-                code=code, error="\n".join(tool_output_str.splitlines()[-50:])
-            ),
-            media=media,
-        )
-        log_progress(
-            {
-                "type": "log",
-                "log_content": "Retrying code to test plans",
-                "status": "running",
-                "code": DefaultImports.prepend_imports(code),
-            }
-        )
-        code = extract_code(model(prompt, stream=False))  # type: ignore
-        tool_output = code_interpreter.exec_isolation(
-            DefaultImports.prepend_imports(code)
-        )
-        log_progress(
-            {
-                "type": "log",
-                "log_content": (
-                    "Code execution succeeded"
-                    if tool_output.success
-                    else "Code execution failed"
-                ),
-                "code": DefaultImports.prepend_imports(code),
-                # "payload": tool_output.to_json(),
-                "status": "completed" if tool_output.success else "failed",
-            }
-        )
-        tool_output_str = tool_output.text(include_results=False).strip()
-        if verbosity == 2:
-            _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempt {count + 1}")
-            _LOGGER.info(f"{tool_output_str}")
-        count += 1
-    if verbosity >= 1:
-        _print_code("Final code:", code)
-    user_req = chat[-1]["content"]
-    context = USER_REQ.format(user_request=user_req)
-    # because the tool picker model gets the image as well, we have to be careful with
-    # how much text we send it, so we truncate the tool output to 20,000 characters
-    prompt = PICK_PLAN.format(
-        context=context,
-        plans=format_plans(plans),
-        tool_output=tool_output_str[:20_000],
-    )
-    chat[-1]["content"] = prompt
-    count = 0
-    plan_thoughts = None
-    while plan_thoughts is None and count < max_retries:
-        try:
-            plan_thoughts = extract_json(model(chat, stream=False))  # type: ignore
-        except JSONDecodeError as e:
-            _LOGGER.exception(
-                f"Error while extracting JSON during picking best plan {str(e)}"
-            )
-            pass
-        count += 1
-    if (
-        plan_thoughts is None
-        or "best_plan" not in plan_thoughts
-        or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
-    ):
-        _LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
-        plan_thoughts = {"best_plan": list(plans.keys())[0]}
-    if "thoughts" not in plan_thoughts:
-        plan_thoughts["thoughts"] = ""
-    if verbosity >= 1:
-        _LOGGER.info(f"Best plan:\n{plan_thoughts}")
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Picked best plan",
-            "status": "completed",
-            "payload": plans[plan_thoughts["best_plan"]],
-        }
-    )
-    return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
+def strip_function_calls(code: str, exclusions: Optional[List[str]] = None) -> str:
+    """This will strip out all code that calls functions except for functions included
+    in exclusions.
+    """
+    if exclusions is None:
+        exclusions = []
+    red = RedBaron(code)
+    nodes_to_remove = []
+    for node in red:
+        if node.type == "def":
+            continue
+        elif node.type == "import" or node.type == "from_import":
+            continue
+        elif node.type == "call":
+            if node.value and node.value[0].value in exclusions:
+                continue
+            nodes_to_remove.append(node)
+        elif node.type == "atomtrailers":
+            if node[0].value in exclusions:
+                continue
+            nodes_to_remove.append(node)
+        elif node.type == "assignment":
+            if node.value.type == "call" or node.value.type == "atomtrailers":
+                func_name = node.value[0].value
+                if func_name in exclusions:
+                    continue
+                nodes_to_remove.append(node)
+        elif node.type == "endl":
+            continue
+        else:
+            nodes_to_remove.append(node)
+    for node in nodes_to_remove:
+        node.parent.remove(node)
+    cleaned_code = red.dumps().strip()
+    return cleaned_code if isinstance(cleaned_code, str) else code
 def write_code(
@@ -359,6 +169,7 @@ def write_and_test_code(
         plan_thoughts,
         format_memory(working_memory),
     )
+    code = strip_function_calls(code)
     test = write_test(
         tester, chat, tool_utils, code, format_memory(working_memory), media
     )
@@ -393,7 +204,7 @@ def write_and_test_code(
         }
     )
     if verbosity == 2:
-        _print_code("Initial code and tests:", code, test)
+        print_code("Initial code and tests:", code, test)
         _LOGGER.info(
             f"Initial code execution result:\n{result.text(include_logs=True)}"
         )
@@ -418,7 +229,7 @@ def write_and_test_code(
         count += 1
     if verbosity >= 1:
-        _print_code("Final code and tests:", code, test)
+        print_code("Final code and tests:", code, test)
     return {
         "code": code,
@@ -449,7 +260,9 @@ def debug_code(
         }
     )
-    fixed_code_and_test = {"code": "", "test": "", "reflections": ""}
+    fixed_code = None
+    fixed_test = None
+    thoughts = ""
     success = False
     count = 0
     while not success and count < 3:
@@ -472,21 +285,16 @@ def debug_code(
                 stream=False,
             )
             fixed_code_and_test_str = cast(str, fixed_code_and_test_str)
-            fixed_code_and_test = extract_json(fixed_code_and_test_str)
-            code = extract_code(fixed_code_and_test_str)
-            if (
-                "which_code" in fixed_code_and_test
-                and fixed_code_and_test["which_code"] == "test"
-            ):
-                fixed_code_and_test["code"] = ""
-                fixed_code_and_test["test"] = code
-            else:  # for everything else always assume it's updating code
-                fixed_code_and_test["code"] = code
-                fixed_code_and_test["test"] = ""
-            if "which_code" in fixed_code_and_test:
-                del fixed_code_and_test["which_code"]
-            success = True
+            thoughts_tag = extract_tag(fixed_code_and_test_str, "thoughts")
+            thoughts = thoughts_tag if thoughts_tag is not None else ""
+            fixed_code = extract_tag(fixed_code_and_test_str, "code")
+            fixed_test = extract_tag(fixed_code_and_test_str, "test")
+            if fixed_code is None and fixed_test is None:
+                success = False
+            else:
+                success = True
         except Exception as e:
             _LOGGER.exception(f"Error while extracting JSON: {e}")
@@ -495,15 +303,15 @@ def debug_code(
     old_code = code
     old_test = test
-    if fixed_code_and_test["code"].strip() != "":
-        code = fixed_code_and_test["code"]
-    if fixed_code_and_test["test"].strip() != "":
-        test = fixed_code_and_test["test"]
+    if fixed_code is not None and fixed_code.strip() != "":
+        code = fixed_code
+    if fixed_test is not None and fixed_test.strip() != "":
+        test = fixed_test
     new_working_memory.append(
         {
             "code": f"{code}\n{test}",
-            "feedback": fixed_code_and_test["reflections"],
+            "feedback": thoughts,
             "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
         }
     )
@@ -537,70 +345,14 @@ def debug_code(
         }
     )
     if verbosity == 2:
-        _print_code("Code and test after attempted fix:", code, test)
+        print_code("Code and test after attempted fix:", code, test)
         _LOGGER.info(
-            f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
+            f"Reflection: {thoughts}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
         )
     return code, test, result
-def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
-    _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
-    _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
-    _CONSOLE.print(
-        Syntax(
-            DefaultImports.prepend_imports(code),
-            "python",
-            theme="gruvbox-dark",
-            line_numbers=True,
-        )
-    )
-    if test:
-        _CONSOLE.print("=" * 30 + " Test " + "=" * 30)
-        _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
-def retrieve_tools(
-    plans: Dict[str, Dict[str, Any]],
-    tool_recommender: Sim,
-    log_progress: Callable[[Dict[str, Any]], None],
-    verbosity: int = 0,
-) -> Dict[str, str]:
-    log_progress(
-        {
-            "type": "log",
-            "log_content": ("Retrieving tools for each plan"),
-            "status": "started",
-        }
-    )
-    tool_info = []
-    tool_desc = []
-    tool_lists: Dict[str, List[Dict[str, str]]] = {}
-    for k, plan in plans.items():
-        tool_lists[k] = []
-        for task in plan["instructions"]:
-            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
-            tool_info.extend([e["doc"] for e in tools])
-            tool_desc.extend([e["desc"] for e in tools])
-            tool_lists[k].extend(
-                {"description": e["desc"], "documentation": e["doc"]} for e in tools
-            )
-    if verbosity == 2:
-        tool_desc_str = "\n".join(set(tool_desc))
-        _LOGGER.info(f"Tools Description:\n{tool_desc_str}")
-    tool_lists_unique = {}
-    for k in tool_lists:
-        tool_lists_unique[k] = "\n\n".join(
-            set(e["documentation"] for e in tool_lists[k])
-        )
-    all_tools = "\n\n".join(set(tool_info))
-    tool_lists_unique["all"] = all_tools
-    return tool_lists_unique
 class VisionAgentCoder(Agent):
     """Vision Agent Coder is an agentic framework that can output code based on a user
     request. It can plan tasks, retrieve relevant tools, write code, write tests and
@@ -616,23 +368,22 @@ class VisionAgentCoder(Agent):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_sandbox_runtime: Optional[str] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the Vision Agent Coder.
         Parameters:
-            planner (Optional[LMM]): The planner model to use. Defaults to AnthropicLMM.
+            planner (Optional[Agent]): The planner model to use. Defaults to
+                AnthropicVisionAgentPlanner.
             coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
             tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
             debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
-            tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
                 code.
@@ -641,14 +392,17 @@ class VisionAgentCoder(Agent):
                 in a web application where multiple VisionAgentCoder instances are
                 running in parallel. This callback ensures that the progress are not
                 mixed up.
-            code_sandbox_runtime (Optional[str]): the code sandbox runtime to use. A
-                code sandbox is used to run the generated code. It can be one of the
-                following values: None, "local" or "e2b". If None, VisionAgentCoder
-                will read the value from the environment variable CODE_SANDBOX_RUNTIME.
-                If it's also None, the local python runtime environment will be used.
+            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
+                it can be one of: None, "local" or "e2b". If None, it will read from
+                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
+                object is provided it will use that.
         """
-        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.planner = (
+            AnthropicVisionAgentPlanner(verbosity=verbosity)
+            if planner is None
+            else planner
+        )
         self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
         self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
         self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
@@ -656,21 +410,15 @@ class VisionAgentCoder(Agent):
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc")
-            if tool_recommender is None
-            else tool_recommender
-        )
         self.report_progress_callback = report_progress_callback
-        self.code_sandbox_runtime = code_sandbox_runtime
+        self.code_interpreter = code_interpreter
     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
-        """Chat with VisionAgentCoder and return intermediate information regarding the
-        task.
+        """Generate code based on a user request.
         Parameters:
             input (Union[str, List[Message]]): A conversation in the format of
@@ -686,46 +434,58 @@ class VisionAgentCoder(Agent):
             input = [{"role": "user", "content": input}]
             if media is not None:
                 input[0]["media"] = [media]
-        results = self.chat_with_workflow(input)
-        results.pop("working_memory")
-        return results["code"]  # type: ignore
+        code_and_context = self.generate_code(input)
+        return code_and_context["code"]  # type: ignore
-    def chat_with_workflow(
+    def generate_code_from_plan(
         self,
         chat: List[Message],
-        test_multi_plan: bool = True,
-        display_visualization: bool = False,
-        custom_tool_names: Optional[List[str]] = None,
+        plan_context: PlanContext,
+        code_interpreter: Optional[CodeInterpreter] = None,
     ) -> Dict[str, Any]:
-        """Chat with VisionAgentCoder and return intermediate information regarding the
-        task.
+        """Generates code and other intermediate outputs from a chat input and a plan.
+        The plan includes:
+            - plans: The plans generated by the planner.
+            - best_plan: The best plan selected by the planner.
+            - plan_thoughts: The thoughts of the planner, including any modifications
+                to the plan.
+            - tool_doc: The tool documentation for the best plan.
+            - tool_output: The tool output from the tools used by the best plan.
         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
-                [{"role": "user", "content": "describe your task here..."}]
-                or if it contains media files, it should be in the format of:
-                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
-            test_multi_plan (bool): If True, it will test tools for multiple plans and
-                pick the best one based off of the tool results. If False, it will go
-                with the first plan.
-            display_visualization (bool): If True, it opens a new window locally to
-                show the image(s) created by visualization code (if there is any).
-            custom_tool_names (List[str]): A list of custom tools for the agent to pick
-                and use. If not provided, default to full tool set from vision_agent.tools.
+            chat (List[Message]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
+            plan_context (PlanContext): The context of the plan, including the plans,
+                best_plan, plan_thoughts, tool_doc, and tool_output.
+            test_multi_plan (bool): Whether to test multiple plans or just the best plan.
+            custom_tool_names (Optional[List[str]]): A list of custom tool names to use
+                for the planner.
         Returns:
-            Dict[str, Any]: A dictionary containing the code, test, test result, plan,
-                and working memory of the agent.
+            Dict[str, Any]: A dictionary containing the code output by the
+                VisionAgentCoder and other intermediate outputs. include:
+                - status (str): Whether or not the agent completed or failed generating
+                    the code.
+                - code (str): The code output by the VisionAgentCoder.
+                - test (str): The test output by the VisionAgentCoder.
+                - test_result (Execution): The result of the test execution.
+                - plans (Dict[str, Any]): The plans generated by the planner.
+                - plan_thoughts (str): The thoughts of the planner.
+                - working_memory (List[Dict[str, str]]): The working memory of the agent.
         """
         if not chat:
             raise ValueError("Chat cannot be empty.")
         # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
-        with CodeInterpreterFactory.new_instance(
-            code_sandbox_runtime=self.code_sandbox_runtime
-        ) as code_interpreter:
+        code_interpreter = (
+            self.code_interpreter
+            if self.code_interpreter is not None
+            and not isinstance(self.code_interpreter, str)
+            else CodeInterpreterFactory.new_instance(
+                code_sandbox_runtime=self.code_interpreter,
+            )
+        )
+        with code_interpreter:
             chat = copy.deepcopy(chat)
             media_list = []
             for chat_i in chat:
@@ -759,74 +519,22 @@ class VisionAgentCoder(Agent):
             code = ""
             test = ""
             working_memory: List[Dict[str, str]] = []
-            results = {"code": "", "test": "", "plan": []}
-            plan = []
-            success = False
-            plans = self._create_plans(
-                int_chat, custom_tool_names, working_memory, self.planner
-            )
-            if test_multi_plan:
-                self._log_plans(plans, self.verbosity)
-            tool_infos = retrieve_tools(
-                plans,
-                self.tool_recommender,
-                self.log_progress,
-                self.verbosity,
-            )
-            if test_multi_plan:
-                plan_thoughts, tool_output_str = pick_plan(
-                    int_chat,
-                    plans,
-                    tool_infos["all"],
-                    self.coder,
-                    code_interpreter,
-                    media_list,
-                    self.log_progress,
-                    verbosity=self.verbosity,
-                )
-                best_plan = plan_thoughts["best_plan"]
-                plan_thoughts_str = plan_thoughts["thoughts"]
-            else:
-                best_plan = list(plans.keys())[0]
-                tool_output_str = ""
-                plan_thoughts_str = ""
-            if best_plan in plans and best_plan in tool_infos:
-                plan_i = plans[best_plan]
-                tool_info = tool_infos[best_plan]
-            else:
-                if self.verbosity >= 1:
-                    _LOGGER.warning(
-                        f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
-                    )
-                k = list(plans.keys())[0]
-                plan_i = plans[k]
-                tool_info = tool_infos[k]
-            self.log_progress(
-                {
-                    "type": "log",
-                    "log_content": "Creating plans",
-                    "status": "completed",
-                    "payload": tool_info,
-                }
-            )
+            plan = plan_context.plans[plan_context.best_plan]
+            tool_doc = plan_context.tool_doc
+            tool_output_str = plan_context.tool_output
+            plan_thoughts_str = str(plan_context.plan_thoughts)
             if self.verbosity >= 1:
-                plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
+                plan_fixed = [{"instructions": e} for e in plan["instructions"]]
                 _LOGGER.info(
-                    f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                    f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
             results = write_and_test_code(
                 chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-                plan=f"\n{plan_i['thoughts']}\n-"
-                + "\n-".join([e for e in plan_i["instructions"]]),
-                tool_info=tool_info,
+                plan=f"\n{plan['thoughts']}\n-"
+                + "\n-".join([e for e in plan["instructions"]]),
+                tool_info=tool_doc,
                 tool_output=tool_output_str,
                 plan_thoughts=plan_thoughts_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
@@ -842,64 +550,82 @@ class VisionAgentCoder(Agent):
             success = cast(bool, results["success"])
             code = remove_installs_from_code(cast(str, results["code"]))
             test = remove_installs_from_code(cast(str, results["test"]))
-            working_memory.extend(results["working_memory"])  # type: ignore
-            plan.append({"code": code, "test": test, "plan": plan_i})
+            working_memory.extend(results["working_memory"])
             execution_result = cast(Execution, results["test_result"])
-            if display_visualization:
-                for res in execution_result.results:
-                    if res.png:
-                        b64_to_pil(res.png).show()
-                    if res.mp4:
-                        play_video(res.mp4)
             return {
                 "status": "completed" if success else "failed",
                 "code": DefaultImports.prepend_imports(code),
                 "test": test,
                 "test_result": execution_result,
-                "plans": plans,
+                "plans": plan_context.plans,
                 "plan_thoughts": plan_thoughts_str,
                 "working_memory": working_memory,
             }
-    def log_progress(self, data: Dict[str, Any]) -> None:
-        if self.report_progress_callback is not None:
-            self.report_progress_callback(data)
-    def _create_plans(
+    def generate_code(
         self,
-        int_chat: List[Message],
-        customized_tool_names: Optional[List[str]],
-        working_memory: List[Dict[str, str]],
-        planner: LMM,
+        chat: List[Message],
+        test_multi_plan: bool = True,
+        custom_tool_names: Optional[List[str]] = None,
     ) -> Dict[str, Any]:
-        self.log_progress(
-            {
-                "type": "log",
-                "log_content": "Creating plans",
-                "status": "started",
-            }
-        )
-        plans = write_plans(
-            int_chat,
-            T.get_tool_descriptions_by_names(
-                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
-            ),
-            format_memory(working_memory),
-            planner,
+        """Generates code and other intermediate outputs from a chat input.
+        Parameters:
+            chat (List[Message]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
+            test_multi_plan (bool): Whether to test multiple plans or just the best plan.
+            custom_tool_names (Optional[List[str]]): A list of custom tool names to use
+                for the planner.
+        Returns:
+            Dict[str, Any]: A dictionary containing the code output by the
+                VisionAgentCoder and other intermediate outputs. include:
+                - status (str): Whether or not the agent completed or failed generating
+                    the code.
+                - code (str): The code output by the VisionAgentCoder.
+                - test (str): The test output by the VisionAgentCoder.
+                - test_result (Execution): The result of the test execution.
+                - plans (Dict[str, Any]): The plans generated by the planner.
+                - plan_thoughts (str): The thoughts of the planner.
+                - working_memory (List[Dict[str, str]]): The working memory of the agent.
+        """
+        if not chat:
+            raise ValueError("Chat cannot be empty.")
+        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
+        code_interpreter = (
+            self.code_interpreter
+            if self.code_interpreter is not None
+            and not isinstance(self.code_interpreter, str)
+            else CodeInterpreterFactory.new_instance(
+                code_sandbox_runtime=self.code_interpreter,
+            )
         )
-        return plans
+        with code_interpreter:
+            plan_context = self.planner.generate_plan(  # type: ignore
+                chat,
+                test_multi_plan=test_multi_plan,
+                custom_tool_names=custom_tool_names,
+                code_interpreter=code_interpreter,
+            )
-    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
-        if verbosity >= 1:
-            for p in plans:
-                # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
-                _LOGGER.info(
-                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                )
+            code_and_context = self.generate_code_from_plan(
+                chat,
+                plan_context,
+                code_interpreter=code_interpreter,
+            )
+        return code_and_context
+    def chat(self, chat: List[Message]) -> List[Message]:
+        chat = copy.deepcopy(chat)
+        code = self.generate_code(chat)
+        chat.append({"role": "agent", "content": code["code"]})
+        return chat
+    def log_progress(self, data: Dict[str, Any]) -> None:
+        if self.report_progress_callback is not None:
+            self.report_progress_callback(data)
 class OpenAIVisionAgentCoder(VisionAgentCoder):
@@ -907,17 +633,18 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_sandbox_runtime: Optional[str] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         self.planner = (
-            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
+            OpenAIVisionAgentPlanner(verbosity=verbosity)
+            if planner is None
+            else planner
         )
         self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
         self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
@@ -926,13 +653,8 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc")
-            if tool_recommender is None
-            else tool_recommender
-        )
         self.report_progress_callback = report_progress_callback
-        self.code_sandbox_runtime = code_sandbox_runtime
+        self.code_interpreter = code_interpreter
 class AnthropicVisionAgentCoder(VisionAgentCoder):
@@ -940,17 +662,20 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_sandbox_runtime: Optional[str] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         # NOTE: Claude doesn't have an official JSON mode
-        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.planner = (
+            AnthropicVisionAgentPlanner(verbosity=verbosity)
+            if planner is None
+            else planner
+        )
         self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
         self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
         self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
@@ -958,15 +683,8 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
-        # Anthropic does not offer any embedding models and instead recomends Voyage,
-        # we're using OpenAI's embedder for now.
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc")
-            if tool_recommender is None
-            else tool_recommender
-        )
         self.report_progress_callback = report_progress_callback
-        self.code_sandbox_runtime = code_sandbox_runtime
+        self.code_interpreter = code_interpreter
 class OllamaVisionAgentCoder(VisionAgentCoder):
@@ -988,17 +706,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0, json_mode=True)
+                OllamaVisionAgentPlanner(verbosity=verbosity)
                 if planner is None
                 else planner
             ),
@@ -1017,13 +735,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 if debugger is None
                 else debugger
             ),
-            tool_recommender=(
-                OllamaSim(T.TOOLS_DF, sim_key="desc")
-                if tool_recommender is None
-                else tool_recommender
-            ),
             verbosity=verbosity,
             report_progress_callback=report_progress_callback,
+            code_interpreter=code_interpreter,
         )
@@ -1043,22 +757,22 @@ class AzureVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the Vision Agent Coder.
         Parameters:
-            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            planner (Optional[Agent]): The planner model to use. Defaults to
+                AzureVisionAgentPlanner.
             coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
             tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
             debugger (Optional[LMM]): The debugger model to
-            tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
                 code.
@@ -1069,7 +783,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
         """
         super().__init__(
             planner=(
-                AzureOpenAILMM(temperature=0.0, json_mode=True)
+                AzureVisionAgentPlanner(verbosity=verbosity)
                 if planner is None
                 else planner
             ),
@@ -1078,11 +792,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
             debugger=(
                 AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
             ),
-            tool_recommender=(
-                AzureSim(T.TOOLS_DF, sim_key="desc")
-                if tool_recommender is None
-                else tool_recommender
-            ),
             verbosity=verbosity,
             report_progress_callback=report_progress_callback,
+            code_interpreter=code_interpreter,
         )

vision-agent 0.2.161__py3-none-any.whl → 0.2.163__py3-none-any.whl

vision-agent 0.2.161py3-none-any.whl → 0.2.163py3-none-any.whl