PyPI - vision-agent - Versions diffs - 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl - Mend

vision-agent 0.2.161py3-none-any.whl → 0.2.162py3-none-any.whl

Files changed (13) hide show

vision_agent/agent/__init__.py +8 -0
vision_agent/agent/agent_utils.py +76 -2
vision_agent/agent/vision_agent.py +49 -17
vision_agent/agent/vision_agent_coder.py +163 -489
vision_agent/agent/vision_agent_coder_prompts.py +0 -203
vision_agent/agent/vision_agent_planner.py +553 -0
vision_agent/agent/vision_agent_planner_prompts.py +199 -0
vision_agent/tools/__init__.py +0 -1
vision_agent/tools/meta_tools.py +84 -3
{vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
{vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
{vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
{vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0

vision_agent/agent/vision_agent_coder.py CHANGED Viewed

@@ -2,32 +2,33 @@ import copy
 import logging
 import os
 import sys
-from json import JSONDecodeError
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
-from rich.console import Console
-from rich.style import Style
-from rich.syntax import Syntax
 from tabulate import tabulate
 import vision_agent.tools as T
-from vision_agent.agent import Agent
+from vision_agent.agent.agent import Agent
 from vision_agent.agent.agent_utils import (
+    DefaultImports,
     extract_code,
     extract_json,
+    format_memory,
+    print_code,
     remove_installs_from_code,
 )
 from vision_agent.agent.vision_agent_coder_prompts import (
     CODE,
     FIX_BUG,
     FULL_TASK,
-    PICK_PLAN,
-    PLAN,
-    PREVIOUS_FAILED,
     SIMPLE_TEST,
-    TEST_PLANS,
-    USER_REQ,
+)
+from vision_agent.agent.vision_agent_planner import (
+    AnthropicVisionAgentPlanner,
+    AzureVisionAgentPlanner,
+    OllamaVisionAgentPlanner,
+    OpenAIVisionAgentPlanner,
+    PlanContext,
 )
 from vision_agent.lmm import (
     LMM,
@@ -40,241 +41,11 @@ from vision_agent.lmm import (
 from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
-from vision_agent.utils.image_utils import b64_to_pil
-from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
-from vision_agent.utils.video import play_video
 logging.basicConfig(stream=sys.stdout)
 WORKSPACE = Path(os.getenv("WORKSPACE", ""))
 _LOGGER = logging.getLogger(__name__)
 _MAX_TABULATE_COL_WIDTH = 80
-_CONSOLE = Console()
-class DefaultImports:
-    """Container for default imports used in the code execution."""
-    common_imports = [
-        "import os",
-        "import numpy as np",
-        "from vision_agent.tools import *",
-        "from typing import *",
-        "from pillow_heif import register_heif_opener",
-        "register_heif_opener()",
-    ]
-    @staticmethod
-    def to_code_string() -> str:
-        return "\n".join(DefaultImports.common_imports + T.__new_tools__)
-    @staticmethod
-    def prepend_imports(code: str) -> str:
-        """Run this method to prepend the default imports to the code.
-        NOTE: be sure to run this method after the custom tools have been registered.
-        """
-        return DefaultImports.to_code_string() + "\n\n" + code
-def format_memory(memory: List[Dict[str, str]]) -> str:
-    output_str = ""
-    for i, m in enumerate(memory):
-        output_str += f"### Feedback {i}:\n"
-        output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
-        output_str += f"Feedback {i}: {m['feedback']}\n\n"
-        if "edits" in m:
-            output_str += f"Edits {i}:\n{m['edits']}\n"
-        output_str += "\n"
-    return output_str
-def format_plans(plans: Dict[str, Any]) -> str:
-    plan_str = ""
-    for k, v in plans.items():
-        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
-        plan_str += "    -" + "\n    -".join([e for e in v["instructions"]])
-    return plan_str
-def write_plans(
-    chat: List[Message],
-    tool_desc: str,
-    working_memory: str,
-    model: LMM,
-) -> Dict[str, Any]:
-    chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
-        raise ValueError("Last chat message must be from the user.")
-    user_request = chat[-1]["content"]
-    context = USER_REQ.format(user_request=user_request)
-    prompt = PLAN.format(
-        context=context,
-        tool_desc=tool_desc,
-        feedback=working_memory,
-    )
-    chat[-1]["content"] = prompt
-    return extract_json(model(chat, stream=False))  # type: ignore
-def pick_plan(
-    chat: List[Message],
-    plans: Dict[str, Any],
-    tool_info: str,
-    model: LMM,
-    code_interpreter: CodeInterpreter,
-    media: List[str],
-    log_progress: Callable[[Dict[str, Any]], None],
-    verbosity: int = 0,
-    max_retries: int = 3,
-) -> Tuple[Dict[str, str], str]:
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Generating code to pick the best plan",
-            "status": "started",
-        }
-    )
-    chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
-        raise ValueError("Last chat message must be from the user.")
-    plan_str = format_plans(plans)
-    prompt = TEST_PLANS.format(
-        docstring=tool_info, plans=plan_str, previous_attempts="", media=media
-    )
-    code = extract_code(model(prompt, stream=False))  # type: ignore
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Executing code to test plans",
-            "code": DefaultImports.prepend_imports(code),
-            "status": "running",
-        }
-    )
-    tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
-    # Because of the way we trace function calls the trace information ends up in the
-    # results. We don't want to show this info to the LLM so we don't include it in the
-    # tool_output_str.
-    tool_output_str = tool_output.text(include_results=False).strip()
-    if verbosity == 2:
-        _print_code("Initial code and tests:", code)
-        _LOGGER.info(f"Initial code execution result:\n{tool_output_str}")
-    log_progress(
-        {
-            "type": "log",
-            "log_content": (
-                "Code execution succeeded"
-                if tool_output.success
-                else "Code execution failed"
-            ),
-            "code": DefaultImports.prepend_imports(code),
-            # "payload": tool_output.to_json(),
-            "status": "completed" if tool_output.success else "failed",
-        }
-    )
-    # retry if the tool output is empty or code fails
-    count = 0
-    while (
-        not tool_output.success
-        or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
-    ) and count < max_retries:
-        prompt = TEST_PLANS.format(
-            docstring=tool_info,
-            plans=plan_str,
-            previous_attempts=PREVIOUS_FAILED.format(
-                code=code, error="\n".join(tool_output_str.splitlines()[-50:])
-            ),
-            media=media,
-        )
-        log_progress(
-            {
-                "type": "log",
-                "log_content": "Retrying code to test plans",
-                "status": "running",
-                "code": DefaultImports.prepend_imports(code),
-            }
-        )
-        code = extract_code(model(prompt, stream=False))  # type: ignore
-        tool_output = code_interpreter.exec_isolation(
-            DefaultImports.prepend_imports(code)
-        )
-        log_progress(
-            {
-                "type": "log",
-                "log_content": (
-                    "Code execution succeeded"
-                    if tool_output.success
-                    else "Code execution failed"
-                ),
-                "code": DefaultImports.prepend_imports(code),
-                # "payload": tool_output.to_json(),
-                "status": "completed" if tool_output.success else "failed",
-            }
-        )
-        tool_output_str = tool_output.text(include_results=False).strip()
-        if verbosity == 2:
-            _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempt {count + 1}")
-            _LOGGER.info(f"{tool_output_str}")
-        count += 1
-    if verbosity >= 1:
-        _print_code("Final code:", code)
-    user_req = chat[-1]["content"]
-    context = USER_REQ.format(user_request=user_req)
-    # because the tool picker model gets the image as well, we have to be careful with
-    # how much text we send it, so we truncate the tool output to 20,000 characters
-    prompt = PICK_PLAN.format(
-        context=context,
-        plans=format_plans(plans),
-        tool_output=tool_output_str[:20_000],
-    )
-    chat[-1]["content"] = prompt
-    count = 0
-    plan_thoughts = None
-    while plan_thoughts is None and count < max_retries:
-        try:
-            plan_thoughts = extract_json(model(chat, stream=False))  # type: ignore
-        except JSONDecodeError as e:
-            _LOGGER.exception(
-                f"Error while extracting JSON during picking best plan {str(e)}"
-            )
-            pass
-        count += 1
-    if (
-        plan_thoughts is None
-        or "best_plan" not in plan_thoughts
-        or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
-    ):
-        _LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
-        plan_thoughts = {"best_plan": list(plans.keys())[0]}
-    if "thoughts" not in plan_thoughts:
-        plan_thoughts["thoughts"] = ""
-    if verbosity >= 1:
-        _LOGGER.info(f"Best plan:\n{plan_thoughts}")
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Picked best plan",
-            "status": "completed",
-            "payload": plans[plan_thoughts["best_plan"]],
-        }
-    )
-    return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
 def write_code(
@@ -393,7 +164,7 @@ def write_and_test_code(
         }
     )
     if verbosity == 2:
-        _print_code("Initial code and tests:", code, test)
+        print_code("Initial code and tests:", code, test)
         _LOGGER.info(
             f"Initial code execution result:\n{result.text(include_logs=True)}"
         )
@@ -418,7 +189,7 @@ def write_and_test_code(
         count += 1
     if verbosity >= 1:
-        _print_code("Final code and tests:", code, test)
+        print_code("Final code and tests:", code, test)
     return {
         "code": code,
@@ -537,7 +308,7 @@ def debug_code(
         }
     )
     if verbosity == 2:
-        _print_code("Code and test after attempted fix:", code, test)
+        print_code("Code and test after attempted fix:", code, test)
         _LOGGER.info(
             f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
         )
@@ -545,62 +316,6 @@ def debug_code(
     return code, test, result
-def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
-    _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
-    _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
-    _CONSOLE.print(
-        Syntax(
-            DefaultImports.prepend_imports(code),
-            "python",
-            theme="gruvbox-dark",
-            line_numbers=True,
-        )
-    )
-    if test:
-        _CONSOLE.print("=" * 30 + " Test " + "=" * 30)
-        _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
-def retrieve_tools(
-    plans: Dict[str, Dict[str, Any]],
-    tool_recommender: Sim,
-    log_progress: Callable[[Dict[str, Any]], None],
-    verbosity: int = 0,
-) -> Dict[str, str]:
-    log_progress(
-        {
-            "type": "log",
-            "log_content": ("Retrieving tools for each plan"),
-            "status": "started",
-        }
-    )
-    tool_info = []
-    tool_desc = []
-    tool_lists: Dict[str, List[Dict[str, str]]] = {}
-    for k, plan in plans.items():
-        tool_lists[k] = []
-        for task in plan["instructions"]:
-            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
-            tool_info.extend([e["doc"] for e in tools])
-            tool_desc.extend([e["desc"] for e in tools])
-            tool_lists[k].extend(
-                {"description": e["desc"], "documentation": e["doc"]} for e in tools
-            )
-    if verbosity == 2:
-        tool_desc_str = "\n".join(set(tool_desc))
-        _LOGGER.info(f"Tools Description:\n{tool_desc_str}")
-    tool_lists_unique = {}
-    for k in tool_lists:
-        tool_lists_unique[k] = "\n\n".join(
-            set(e["documentation"] for e in tool_lists[k])
-        )
-    all_tools = "\n\n".join(set(tool_info))
-    tool_lists_unique["all"] = all_tools
-    return tool_lists_unique
 class VisionAgentCoder(Agent):
     """Vision Agent Coder is an agentic framework that can output code based on a user
     request. It can plan tasks, retrieve relevant tools, write code, write tests and
@@ -616,23 +331,22 @@ class VisionAgentCoder(Agent):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_sandbox_runtime: Optional[str] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the Vision Agent Coder.
         Parameters:
-            planner (Optional[LMM]): The planner model to use. Defaults to AnthropicLMM.
+            planner (Optional[Agent]): The planner model to use. Defaults to
+                AnthropicVisionAgentPlanner.
             coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
             tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
             debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
-            tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
                 code.
@@ -641,14 +355,17 @@ class VisionAgentCoder(Agent):
                 in a web application where multiple VisionAgentCoder instances are
                 running in parallel. This callback ensures that the progress is not
                 mixed up.
-            code_sandbox_runtime (Optional[str]): the code sandbox runtime to use. A
-                code sandbox is used to run the generated code. It can be one of the
-                following values: None, "local" or "e2b". If None, VisionAgentCoder
-                will read the value from the environment variable CODE_SANDBOX_RUNTIME.
-                If it's also None, the local python runtime environment will be used.
+            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
+                it can be one of: None, "local" or "e2b". If None, it will read from
+                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
+                object is provided it will use that.
         """
-        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.planner = (
+            AnthropicVisionAgentPlanner(verbosity=verbosity)
+            if planner is None
+            else planner
+        )
         self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
         self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
         self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
@@ -656,21 +373,15 @@ class VisionAgentCoder(Agent):
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc")
-            if tool_recommender is None
-            else tool_recommender
-        )
         self.report_progress_callback = report_progress_callback
-        self.code_sandbox_runtime = code_sandbox_runtime
+        self.code_interpreter = code_interpreter
     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
-        """Chat with VisionAgentCoder and return intermediate information regarding the
-        task.
+        """Generate code based on a user request.
         Parameters:
             input (Union[str, List[Message]]): A conversation in the format of
@@ -686,46 +397,58 @@ class VisionAgentCoder(Agent):
             input = [{"role": "user", "content": input}]
             if media is not None:
                 input[0]["media"] = [media]
-        results = self.chat_with_workflow(input)
-        results.pop("working_memory")
-        return results["code"]  # type: ignore
+        code_and_context = self.generate_code(input)
+        return code_and_context["code"]  # type: ignore
-    def chat_with_workflow(
+    def generate_code_from_plan(
         self,
         chat: List[Message],
-        test_multi_plan: bool = True,
-        display_visualization: bool = False,
-        custom_tool_names: Optional[List[str]] = None,
+        plan_context: PlanContext,
+        code_interpreter: Optional[CodeInterpreter] = None,
     ) -> Dict[str, Any]:
-        """Chat with VisionAgentCoder and return intermediate information regarding the
-        task.
+        """Generates code and other intermediate outputs from a chat input and a plan.
+        The plan includes:
+            - plans: The plans generated by the planner.
+            - best_plan: The best plan selected by the planner.
+            - plan_thoughts: The thoughts of the planner, including any modifications
+                to the plan.
+            - tool_doc: The tool documentation for the best plan.
+            - tool_output: The tool output from the tools used by the best plan.
         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
-                [{"role": "user", "content": "describe your task here..."}]
-                or if it contains media files, it should be in the format of:
-                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
-            test_multi_plan (bool): If True, it will test tools for multiple plans and
-                pick the best one based off of the tool results. If False, it will go
-                with the first plan.
-            display_visualization (bool): If True, it opens a new window locally to
-                show the image(s) created by visualization code (if there is any).
-            custom_tool_names (List[str]): A list of custom tools for the agent to pick
-                and use. If not provided, default to full tool set from vision_agent.tools.
+            chat (List[Message]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
+            plan_context (PlanContext): The context of the plan, including the plans,
+                best_plan, plan_thoughts, tool_doc, and tool_output.
+            code_interpreter (Optional[CodeInterpreter]): A code interpreter to run
+                the generated code with. NOTE(review): the body shown below rebinds
+                this name from self.code_interpreter, so confirm the argument is used.
         Returns:
-            Dict[str, Any]: A dictionary containing the code, test, test result, plan,
-                and working memory of the agent.
+            Dict[str, Any]: A dictionary containing the code output by the
+                VisionAgentCoder and other intermediate outputs, including:
+                - status (str): Whether or not the agent completed or failed generating
+                    the code.
+                - code (str): The code output by the VisionAgentCoder.
+                - test (str): The test output by the VisionAgentCoder.
+                - test_result (Execution): The result of the test execution.
+                - plans (Dict[str, Any]): The plans generated by the planner.
+                - plan_thoughts (str): The thoughts of the planner.
+                - working_memory (List[Dict[str, str]]): The working memory of the agent.
         """
         if not chat:
             raise ValueError("Chat cannot be empty.")
         # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
-        with CodeInterpreterFactory.new_instance(
-            code_sandbox_runtime=self.code_sandbox_runtime
-        ) as code_interpreter:
+        code_interpreter = (
+            self.code_interpreter
+            if self.code_interpreter is not None
+            and not isinstance(self.code_interpreter, str)
+            else CodeInterpreterFactory.new_instance(
+                code_sandbox_runtime=self.code_interpreter,
+            )
+        )
+        with code_interpreter:
             chat = copy.deepcopy(chat)
             media_list = []
             for chat_i in chat:
@@ -759,74 +482,22 @@ class VisionAgentCoder(Agent):
             code = ""
             test = ""
             working_memory: List[Dict[str, str]] = []
-            results = {"code": "", "test": "", "plan": []}
-            plan = []
-            success = False
-            plans = self._create_plans(
-                int_chat, custom_tool_names, working_memory, self.planner
-            )
-            if test_multi_plan:
-                self._log_plans(plans, self.verbosity)
-            tool_infos = retrieve_tools(
-                plans,
-                self.tool_recommender,
-                self.log_progress,
-                self.verbosity,
-            )
-            if test_multi_plan:
-                plan_thoughts, tool_output_str = pick_plan(
-                    int_chat,
-                    plans,
-                    tool_infos["all"],
-                    self.coder,
-                    code_interpreter,
-                    media_list,
-                    self.log_progress,
-                    verbosity=self.verbosity,
-                )
-                best_plan = plan_thoughts["best_plan"]
-                plan_thoughts_str = plan_thoughts["thoughts"]
-            else:
-                best_plan = list(plans.keys())[0]
-                tool_output_str = ""
-                plan_thoughts_str = ""
-            if best_plan in plans and best_plan in tool_infos:
-                plan_i = plans[best_plan]
-                tool_info = tool_infos[best_plan]
-            else:
-                if self.verbosity >= 1:
-                    _LOGGER.warning(
-                        f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
-                    )
-                k = list(plans.keys())[0]
-                plan_i = plans[k]
-                tool_info = tool_infos[k]
-            self.log_progress(
-                {
-                    "type": "log",
-                    "log_content": "Creating plans",
-                    "status": "completed",
-                    "payload": tool_info,
-                }
-            )
+            plan = plan_context.plans[plan_context.best_plan]
+            tool_doc = plan_context.tool_doc
+            tool_output_str = plan_context.tool_output
+            plan_thoughts_str = str(plan_context.plan_thoughts)
             if self.verbosity >= 1:
-                plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
+                plan_fixed = [{"instructions": e} for e in plan["instructions"]]
                 _LOGGER.info(
-                    f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                    f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
             results = write_and_test_code(
                 chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-                plan=f"\n{plan_i['thoughts']}\n-"
-                + "\n-".join([e for e in plan_i["instructions"]]),
-                tool_info=tool_info,
+                plan=f"\n{plan['thoughts']}\n-"
+                + "\n-".join([e for e in plan["instructions"]]),
+                tool_info=tool_doc,
                 tool_output=tool_output_str,
                 plan_thoughts=plan_thoughts_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
@@ -842,64 +513,83 @@ class VisionAgentCoder(Agent):
             success = cast(bool, results["success"])
             code = remove_installs_from_code(cast(str, results["code"]))
             test = remove_installs_from_code(cast(str, results["test"]))
-            working_memory.extend(results["working_memory"])  # type: ignore
-            plan.append({"code": code, "test": test, "plan": plan_i})
+            working_memory.extend(results["working_memory"])
             execution_result = cast(Execution, results["test_result"])
-            if display_visualization:
-                for res in execution_result.results:
-                    if res.png:
-                        b64_to_pil(res.png).show()
-                    if res.mp4:
-                        play_video(res.mp4)
             return {
                 "status": "completed" if success else "failed",
                 "code": DefaultImports.prepend_imports(code),
                 "test": test,
                 "test_result": execution_result,
-                "plans": plans,
+                "plans": plan_context.plans,
                 "plan_thoughts": plan_thoughts_str,
                 "working_memory": working_memory,
             }
-    def log_progress(self, data: Dict[str, Any]) -> None:
-        if self.report_progress_callback is not None:
-            self.report_progress_callback(data)
-    def _create_plans(
+    def generate_code(
         self,
-        int_chat: List[Message],
-        customized_tool_names: Optional[List[str]],
-        working_memory: List[Dict[str, str]],
-        planner: LMM,
+        chat: List[Message],
+        test_multi_plan: bool = True,
+        custom_tool_names: Optional[List[str]] = None,
     ) -> Dict[str, Any]:
-        self.log_progress(
-            {
-                "type": "log",
-                "log_content": "Creating plans",
-                "status": "started",
-            }
-        )
-        plans = write_plans(
-            int_chat,
-            T.get_tool_descriptions_by_names(
-                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
-            ),
-            format_memory(working_memory),
-            planner,
+        """Generates code and other intermediate outputs from a chat input.
+        Parameters:
+            chat (List[Message]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
+            test_multi_plan (bool): Whether to test multiple plans or just the best plan.
+            custom_tool_names (Optional[List[str]]): A list of custom tool names to use
+                for the planner.
+        Returns:
+            Dict[str, Any]: A dictionary containing the code output by the
+                VisionAgentCoder and other intermediate outputs, including:
+                - status (str): Whether or not the agent completed or failed generating
+                    the code.
+                - code (str): The code output by the VisionAgentCoder.
+                - test (str): The test output by the VisionAgentCoder.
+                - test_result (Execution): The result of the test execution.
+                - plans (Dict[str, Any]): The plans generated by the planner.
+                - plan_thoughts (str): The thoughts of the planner.
+                - working_memory (List[Dict[str, str]]): The working memory of the agent.
+        """
+        if not chat:
+            raise ValueError("Chat cannot be empty.")
+        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
+        code_interpreter = (
+            self.code_interpreter
+            if self.code_interpreter is not None
+            and not isinstance(self.code_interpreter, str)
+            else CodeInterpreterFactory.new_instance(
+                code_sandbox_runtime=self.code_interpreter,
+            )
         )
-        return plans
+        with code_interpreter:
+            plan_context = self.planner.generate_plan(  # type: ignore
+                chat,
+                test_multi_plan=test_multi_plan,
+                custom_tool_names=custom_tool_names,
+                code_interpreter=code_interpreter,
+            )
-    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
-        if verbosity >= 1:
-            for p in plans:
-                # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
-                _LOGGER.info(
-                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                )
+            code_and_context = self.generate_code_from_plan(
+                chat,
+                plan_context,
+                code_interpreter=code_interpreter,
+            )
+        return code_and_context
+    def chat(self, chat: List[Message]) -> List[Message]:
+        chat = copy.deepcopy(chat)
+        code = self.generate_code(chat)
+        chat.append({"role": "agent", "content": code["code"]})
+        return chat
+    def log_progress(self, data: Dict[str, Any]) -> None:
+        if self.report_progress_callback is not None:
+            self.report_progress_callback(data)
 class OpenAIVisionAgentCoder(VisionAgentCoder):
@@ -907,17 +597,18 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_sandbox_runtime: Optional[str] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         self.planner = (
-            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
+            OpenAIVisionAgentPlanner(verbosity=verbosity)
+            if planner is None
+            else planner
         )
         self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
         self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
@@ -926,13 +617,8 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc")
-            if tool_recommender is None
-            else tool_recommender
-        )
         self.report_progress_callback = report_progress_callback
-        self.code_sandbox_runtime = code_sandbox_runtime
+        self.code_interpreter = code_interpreter
 class AnthropicVisionAgentCoder(VisionAgentCoder):
@@ -940,17 +626,20 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_sandbox_runtime: Optional[str] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         # NOTE: Claude doesn't have an official JSON mode
-        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.planner = (
+            AnthropicVisionAgentPlanner(verbosity=verbosity)
+            if planner is None
+            else planner
+        )
         self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
         self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
         self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
@@ -958,15 +647,8 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
-        # Anthropic does not offer any embedding models and instead recomends Voyage,
-        # we're using OpenAI's embedder for now.
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc")
-            if tool_recommender is None
-            else tool_recommender
-        )
         self.report_progress_callback = report_progress_callback
-        self.code_sandbox_runtime = code_sandbox_runtime
+        self.code_interpreter = code_interpreter
 class OllamaVisionAgentCoder(VisionAgentCoder):
@@ -988,17 +670,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0, json_mode=True)
+                OllamaVisionAgentPlanner(verbosity=verbosity)
                 if planner is None
                 else planner
             ),
@@ -1017,13 +699,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 if debugger is None
                 else debugger
             ),
-            tool_recommender=(
-                OllamaSim(T.TOOLS_DF, sim_key="desc")
-                if tool_recommender is None
-                else tool_recommender
-            ),
             verbosity=verbosity,
             report_progress_callback=report_progress_callback,
+            code_interpreter=code_interpreter,
         )
@@ -1043,22 +721,22 @@ class AzureVisionAgentCoder(VisionAgentCoder):
     def __init__(
         self,
-        planner: Optional[LMM] = None,
+        planner: Optional[Agent] = None,
         coder: Optional[LMM] = None,
         tester: Optional[LMM] = None,
         debugger: Optional[LMM] = None,
-        tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the Vision Agent Coder.
         Parameters:
-            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            planner (Optional[Agent]): The planner model to use. Defaults to
+                AzureVisionAgentPlanner.
             coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
             tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
             debugger (Optional[LMM]): The debugger model to
-            tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
                 code.
@@ -1069,7 +747,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
         """
         super().__init__(
             planner=(
-                AzureOpenAILMM(temperature=0.0, json_mode=True)
+                AzureVisionAgentPlanner(verbosity=verbosity)
                 if planner is None
                 else planner
             ),
@@ -1078,11 +756,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
             debugger=(
                 AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
             ),
-            tool_recommender=(
-                AzureSim(T.TOOLS_DF, sim_key="desc")
-                if tool_recommender is None
-                else tool_recommender
-            ),
             verbosity=verbosity,
             report_progress_callback=report_progress_callback,
+            code_interpreter=code_interpreter,
         )

vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl

vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl