vision-agent 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vision_agent/agent/__init__.py CHANGED
@@ -3,3 +3,4 @@ from .agent_coder import AgentCoder
 from .easytool import EasyTool
 from .reflexion import Reflexion
 from .vision_agent import VisionAgent
+from .vision_agent_v2 import VisionAgentV2
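The only change here is the new top-level export. A minimal sketch of what this enables downstream:

```python
# VisionAgentV2 is now importable next to the existing agents.
from vision_agent.agent import VisionAgent, VisionAgentV2
```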
vision_agent/agent/agent_coder.py CHANGED
@@ -6,15 +6,40 @@ from pathlib import Path
 from typing import Dict, List, Optional, Union
 
 from vision_agent.agent import Agent
+from vision_agent.agent.agent_coder_prompts import (
+    DEBUG,
+    FIX_BUG,
+    PROGRAM,
+    TEST,
+    VISUAL_TEST,
+)
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools.tools_v2 import TOOLS_DOCSTRING, UTILITIES_DOCSTRING
-
-from .agent_coder_prompts import DEBUG, FIX_BUG, PROGRAM, TEST, VISUAL_TEST
-from .execution import IMPORT_HELPER, check_correctness
+from vision_agent.tools.tools_v2 import TOOL_DOCSTRING, UTILITIES_DOCSTRING
+from vision_agent.utils import Execute
 
+IMPORT_HELPER = """
+import math
+import re
+import sys
+import copy
+import datetime
+import itertools
+import collections
+import heapq
+import statistics
+import functools
+import hashlib
+import numpy
+import numpy as np
+import string
+from typing import *
+from collections import *
+from vision_agent.tools.tools_v2 import *
+"""
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
+_EXECUTE = Execute()
 
 
 def write_tests(question: str, code: str, model: LLM) -> str:
@@ -40,7 +65,7 @@ def parse_file_name(s: str) -> str:
 
 def write_program(question: str, feedback: str, model: LLM) -> str:
     prompt = PROGRAM.format(
-        docstring=TOOLS_DOCSTRING, question=question, feedback=feedback
+        docstring=TOOL_DOCSTRING, question=question, feedback=feedback
     )
     completion = model(prompt)
     return preprocess_data(completion)
@@ -59,14 +84,15 @@ def write_debug(question: str, code: str, feedback: str, model: LLM) -> str:
 
 def execute_tests(code: str, tests: str) -> Dict[str, Union[str, bool]]:
     full_code = f"{IMPORT_HELPER}\n{code}\n{tests}"
-    return check_correctness(full_code, 20.0)
+    success, result = _EXECUTE.run_isolation(full_code)
+    return {"code": code, "result": result, "passed": success}
 
 
 def run_visual_tests(
     question: str, code: str, viz_file: str, feedback: str, model: LMM
 ) -> Dict[str, Union[str, bool]]:
     prompt = VISUAL_TEST.format(
-        docstring=TOOLS_DOCSTRING,
+        docstring=TOOL_DOCSTRING,
        code=code,
        question=question,
        feedback=feedback,
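These hunks replace the module-local `check_correctness` helper with the shared `Execute` runner from `vision_agent.utils`. Judging from its use in `execute_tests`, `run_isolation` returns a `(success, output)` pair; a minimal sketch of the new contract (the sample code string is illustrative):

```python
from vision_agent.utils import Execute

executor = Execute()
success, result = executor.run_isolation("print(1 + 1)")
# execute_tests repackages the pair into the dict shape callers expect:
report = {"code": "print(1 + 1)", "result": result, "passed": success}
```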
vision_agent/agent/vision_agent.py CHANGED
@@ -8,18 +8,8 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from PIL import Image
 from tabulate import tabulate
 
-from vision_agent.image_utils import (
-    convert_to_b64,
-    overlay_bboxes,
-    overlay_heat_map,
-    overlay_masks,
-)
-from vision_agent.llm import LLM, OpenAILLM
-from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools import TOOLS
-
-from .agent import Agent
-from .easytool_prompts import (
+from vision_agent.agent.agent import Agent
+from vision_agent.agent.easytool_prompts import (
     ANSWER_GENERATE,
     ANSWER_SUMMARIZE,
     CHOOSE_PARAMETER,
@@ -27,7 +17,7 @@ from .easytool_prompts import (
     TASK_DECOMPOSE,
     TASK_TOPOLOGY,
 )
-from .vision_agent_prompts import (
+from vision_agent.agent.vision_agent_prompts import (
     ANSWER_GENERATE_DEPENDS,
     ANSWER_SUMMARIZE_DEPENDS,
     CHOOSE_PARAMETER_DEPENDS,
@@ -35,6 +25,15 @@ from .vision_agent_prompts import (
     TASK_DECOMPOSE_DEPENDS,
     VISION_AGENT_REFLECTION,
 )
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.lmm import LMM, OpenAILMM
+from vision_agent.tools import TOOLS
+from vision_agent.utils.image_utils import (
+    convert_to_b64,
+    overlay_bboxes,
+    overlay_heat_map,
+    overlay_masks,
+)
 
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
@@ -309,7 +308,7 @@ def _handle_extract_frames(
     # any following processing
     for video_file_output in tool_result["call_results"]:
         # When the video tool is run with wrong parameters, exit the loop
-        if len(video_file_output) < 2:
+        if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
             break
         for frame, _ in video_file_output:
             image = frame
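The sharper guard matters because a failed tool call can put something other than frame tuples into `call_results`, and a length check alone lets, say, an error string slip through to the unpacking loop below. A small illustration (the error string is hypothetical):

```python
bad_output = "invalid video parameters"  # hypothetical failed tool result
len(bad_output) < 2                      # False: the old guard would not break
not isinstance(bad_output, tuple) or len(bad_output) < 2  # True: the new guard exits
```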
@@ -561,6 +560,9 @@ class VisionAgent(Agent):
             list of all the tool results. The last item in the tool results also
             contains the visualized output.
         """
+        if len(chat) == 0:
+            raise ValueError("Input cannot be empty.")
+
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
vision_agent/agent/vision_agent_v2.py ADDED
@@ -0,0 +1,300 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from rich.console import Console
+from rich.syntax import Syntax
+from tabulate import tabulate
+
+from vision_agent.agent import Agent
+from vision_agent.agent.vision_agent_v2_prompt import (
+    CODE,
+    CODE_SYS_MSG,
+    DEBUG,
+    DEBUG_EXAMPLE,
+    DEBUG_SYS_MSG,
+    PLAN,
+    PREV_CODE_CONTEXT,
+    PREV_CODE_CONTEXT_WITH_REFLECTION,
+    TEST,
+    USER_REQ_CONTEXT,
+    USER_REQ_SUBTASK_CONTEXT,
+)
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
+from vision_agent.utils import Execute, Sim
+
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
+_MAX_TABULATE_COL_WIDTH = 80
+_CONSOLE = Console()
+
+
+def extract_code(code: str) -> str:
+    if "```python" in code:
+        code = code[code.find("```python") + len("```python") :]
+        code = code[: code.find("```")]
+    return code
+
+
+def write_plan(
+    user_requirements: str, tool_desc: str, model: LLM
+) -> List[Dict[str, Any]]:
+    context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
+    prompt = PLAN.format(context=context, plan="", tool_desc=tool_desc)
+    plan = json.loads(model(prompt).replace("```", "").strip())
+    return plan["plan"]  # type: ignore
+
+
+def write_code(
+    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+) -> str:
+    prompt = CODE.format(
+        context=USER_REQ_SUBTASK_CONTEXT.format(
+            user_requirement=user_req, subtask=subtask
+        ),
+        tool_info=tool_info,
+        code=code,
+    )
+    messages = [
+        {"role": "system", "content": CODE_SYS_MSG},
+        {"role": "user", "content": prompt},
+    ]
+    code = model.chat(messages)
+    return extract_code(code)
+
+
+def write_test(
+    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+) -> str:
+    prompt = TEST.format(
+        context=USER_REQ_SUBTASK_CONTEXT.format(
+            user_requirement=user_req, subtask=subtask
+        ),
+        tool_info=tool_info,
+        code=code,
+    )
+    messages = [
+        {"role": "system", "content": CODE_SYS_MSG},
+        {"role": "user", "content": prompt},
+    ]
+    code = model.chat(messages)
+    return extract_code(code)
+
+
+def debug_code(sub_task: str, working_memory: List[str], model: LLM) -> Tuple[str, str]:
+    # Make debug model output JSON
+    if hasattr(model, "kwargs"):
+        model.kwargs["response_format"] = {"type": "json_object"}
+    prompt = DEBUG.format(
+        debug_example=DEBUG_EXAMPLE,
+        context=USER_REQ_CONTEXT.format(user_requirement=sub_task),
+        previous_impl="\n".join(working_memory),
+    )
+    messages = [
+        {"role": "system", "content": DEBUG_SYS_MSG},
+        {"role": "user", "content": prompt},
+    ]
+    code_and_ref = json.loads(model.chat(messages).replace("```", "").strip())
+    if hasattr(model, "kwargs"):
+        del model.kwargs["response_format"]
+    return extract_code(code_and_ref["improved_impl"]), code_and_ref["reflection"]
+
+
+def write_and_exec_code(
+    user_req: str,
+    subtask: str,
+    orig_code: str,
+    code_writer_call: Callable,
+    model: LLM,
+    tool_info: str,
+    exec: Execute,
+    max_retry: int = 3,
+    verbose: bool = False,
+) -> Tuple[bool, str, str, Dict[str, List[str]]]:
+    success = False
+    counter = 0
+    reflection = ""
+
+    # TODO: add working memory to code_writer_call and debug_code
+    code = code_writer_call(user_req, subtask, tool_info, orig_code, model)
+    success, result = exec.run_isolation(code)
+    working_memory: Dict[str, List[str]] = {}
+    while not success and counter < max_retry:
+        if subtask not in working_memory:
+            working_memory[subtask] = []
+
+        if reflection:
+            working_memory[subtask].append(
+                PREV_CODE_CONTEXT_WITH_REFLECTION.format(
+                    code=code, result=result, reflection=reflection
+                )
+            )
+        else:
+            working_memory[subtask].append(
+                PREV_CODE_CONTEXT.format(code=code, result=result)
+            )
+
+        code, reflection = debug_code(subtask, working_memory[subtask], model)
+        success, result = exec.run_isolation(code)
+        counter += 1
+        if verbose:
+            _CONSOLE.print(
+                Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+            )
+        _LOGGER.info(f"\tDebugging reflection, result: {reflection}, {result}")
+
+    if success:
+        working_memory.setdefault(subtask, []).append(  # avoid KeyError on first-try success
+            PREV_CODE_CONTEXT_WITH_REFLECTION.format(
+                code=code, result=result, reflection=reflection
+            )
+        )
+
+    return success, code, result, working_memory
+
+
+def run_plan(
+    user_req: str,
+    plan: List[Dict[str, Any]],
+    coder: LLM,
+    exec: Execute,
+    code: str,
+    tool_recommender: Sim,
+    verbose: bool = False,
+) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
+    active_plan = [e for e in plan if "success" not in e or not e["success"]]
+    working_memory: Dict[str, List[str]] = {}
+    current_code = code
+    current_test = ""
+    for task in active_plan:
+        _LOGGER.info(
+            f"""
+{tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+        )
+        tool_info = "\n".join(
+            [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
+        )
+        success, code, result, task_memory = write_and_exec_code(
+            user_req,
+            task["instruction"],
+            current_code,
+            write_code if task["type"] == "code" else write_test,
+            coder,
+            tool_info,
+            exec,
+            verbose=verbose,  # keyword arg so verbose is not swallowed by max_retry
+        )
+        if task["type"] == "code":
+            current_code = code
+        else:
+            current_test = code
+
+        working_memory.update(task_memory)
+
+        if verbose:
+            _CONSOLE.print(
+                Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+            )
+        _LOGGER.info(f"\tCode success, result: {success}, {str(result)}")
+
+        task["success"] = success
+        task["result"] = result
+        task["code"] = code
+
+        if not success:
+            break
+
+    return current_code, current_test, plan, working_memory
+
+
+class VisionAgentV2(Agent):
+ """Vision Agent is an AI agentic framework geared towards outputting Python code to
214
+ solve vision tasks. It is inspired by MetaGPT's Data Interpreter
215
+ https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
216
+ generate code:
217
+ - A planner to generate a plan of tasks to solve a user requirement. The planner
218
+ can output code tasks or test tasks, where test tasks are used to verify the code.
219
+ - Automatic debugging, if a task fails, the agent will attempt to debug the code
220
+ using the failed output to fix it.
221
+ - A tool recommender to recommend tools to use for a given task. LLM performance
222
+ on tool retrieval starts to decrease as you add more tools, tool retrieval helps
223
+ keep the number of tools to choose from low.
224
+ - Memory retrieval, the agent can remember previous iterations on tasks to help it
225
+ with new tasks.
226
+ - Dynamic replanning, the agent can ask for feedback and replan remaining tasks
227
+ based off of that feedback.
228
+ """
+
+    def __init__(
+        self,
+        timeout: int = 600,
+        tool_recommender: Optional[Sim] = None,
+        verbose: bool = False,
+    ) -> None:
+        self.planner = OpenAILLM(temperature=0.1, json_mode=True)
+        self.coder = OpenAILLM(temperature=0.1)
+        self.exec = Execute(timeout=timeout)
+        if tool_recommender is None:
+            self.tool_recommender = Sim(TOOLS_DF, sim_key="desc")
+        else:
+            self.tool_recommender = tool_recommender
+        self.verbose = verbose
+        if self.verbose:
+            _LOGGER.setLevel(logging.INFO)
+
+    def __call__(
+        self,
+        input: Union[List[Dict[str, str]], str],
+        image: Optional[Union[str, Path]] = None,
+    ) -> str:
+        if isinstance(input, str):
+            input = [{"role": "user", "content": input}]
+        code, _ = self.chat_with_tests(input, image)
+        return code
+
+    def chat_with_tests(
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+    ) -> Tuple[str, str]:
+        if len(chat) == 0:
+            raise ValueError("Input cannot be empty.")
+
+        user_req = chat[0]["content"]
+        if image is not None:
+            user_req += f" Image name {image}"
+
+        plan = write_plan(user_req, TOOL_DESCRIPTIONS, self.planner)
+        _LOGGER.info(
+            f"""Plan:
+{tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+        )
+        working_memory: Dict[str, List[str]] = {}
+
+        working_code = ""
+        working_test = ""
+        success = False
+
+        while not success:
+            working_code, working_test, plan, working_memory_i = run_plan(
+                user_req,
+                plan,
+                self.coder,
+                self.exec,
+                working_code,
+                self.tool_recommender,
+                self.verbose,
+            )
+            success = all(task["success"] for task in plan)
+            working_memory.update(working_memory_i)
+
+            if not success:
+                # TODO: ask for feedback and replan
+                break
+
+        return working_code, working_test
+
+    def log_progress(self, description: str) -> None:
+        pass
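Taken together, the new module wires plan → code → test → debug into a single loop. A minimal usage sketch, assuming an OpenAI key is configured and that `cars.jpg` and the request text are illustrative:

```python
from vision_agent.agent import VisionAgentV2

agent = VisionAgentV2(verbose=True)

# __call__ accepts a plain string or a chat list and returns only the code.
code = agent("Count the cars in the image", image="cars.jpg")

# chat_with_tests additionally returns the generated test code.
code, test = agent.chat_with_tests(
    [{"role": "user", "content": "Count the cars in the image"}],
    image="cars.jpg",
)
```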
vision_agent/agent/vision_agent_v2_prompt.py ADDED
@@ -0,0 +1,170 @@
+USER_REQ_SUBTASK_CONTEXT = """
+## User Requirement
+{user_requirement}
+
+## Current Subtask
+{subtask}
+"""
+
+USER_REQ_CONTEXT = """
+## User Requirement
+{user_requirement}
+"""
+
+
+PLAN = """
+# Context
+{context}
+
+# Current Plan
+{plan}
+
+# Tools Available
+{tool_desc}
+
+# Task:
+Based on the context and the tools you have available, write a plan of subtasks to achieve the user request that adhere to the following requirements:
+- For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
+- You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
+- You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
+
+Output a list of jsons in the following format:
+
+```json
+{{
+    "plan":
+        [
+            {{
+                "task_id": int, # "unique identifier for a task in plan, can be an ordinal"
+                "dependent_task_ids": list[int], # "ids of tasks prerequisite to this task"
+                "instruction": str, # "what you should do in this task, one short phrase or sentence"
+                "type": str, # "the type of the task, tasks can either be 'code' for coding tasks or 'test' for testing tasks"
+            }},
+            ...
+        ]
+}}
+```
+"""
+
+
+CODE_SYS_MSG = """You are an AI Python assistant. You need to help user to achieve their goal by implementing a function. Your code will be run in a jupyter notebook environment so don't use asyncio.run. Instead, use await if you need to call an async function. Do not use 'display' for showing images, instead use matplotlib or PIL."""
+
+
+CODE = """
+# Context
+{context}
+
+# Tool Info for Current Subtask
+{tool_info}
+
+# Previous Code
+{code}
+
+# Constraints
+- Write a function that accomplishes the User Requirement. You are supplied code from a previous task, feel free to copy over that code into your own implementation if you need it.
+- Always prioritize using pre-defined tools or code for the same functionality. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- Write clean, readable, and well-documented code.
+
+# Output
+While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
+```python
+from vision_agent.tools.tools_v2 import *
+
+# your code goes here
+```
+"""
+
+
+DEBUG_SYS_MSG = """You are an AI Python assistant. You will be given your previous implementation code of a task, runtime error results, and a hint to change the implementation appropriately. Your code will be run in a jupyter notebook environment. Write your full implementation."""
+
+
+DEBUG_EXAMPLE = '''
+[previous impl]:
+```python
+def add(a: int, b: int) -> int:
+    """Given integers a and b, return the total value of a and b."""
+    return a - b
+```
+
+[previous output]
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 3) == 4 # output: -2
+
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from `-` to `+` in the return statement. This will ensure that the function returns the correct output for the given input.
+
+[improved impl]:
+def add(a: int, b: int) -> int:
+    """Given integers a and b, return the total value of a and b."""
+    return a + b
+'''
+
+
+PREV_CODE_CONTEXT = """
+```python
+{code}
+```
+
+[previous output]
+{result}
+"""
+
+
+PREV_CODE_CONTEXT_WITH_REFLECTION = """
+```python
+{code}
+```
+
+[previous output]
+{result}
+
+[reflection on previous impl]
+{reflection}
+"""
+
+
+DEBUG = """
+[example]
+Here is an example of debugging with reflection.
+{debug_example}
+[/example]
+
+[context]
+{context}
+
+[previous impl]
+{previous_impl}
+
+[instruction]
+Analyze your previous code and error in [context] step by step, provide me with improved method and code. Remember to follow [context] requirement. Because you are writing code in a jupyter notebook, you can run `!pip install` to install missing packages. Output a json following the format:
+```json
+{{
+    "reflection": str = "Reflection on previous implementation",
+    "improved_impl": str = "Refined code after reflection.",
+}}
+```
+"""
+
+
+TEST = """
+# Context
+{context}
+
+# Tool Info for Current Subtask
+{tool_info}
+
+# Code to Test
+{code}
+
+# Constraints
+- Write code to test the functionality of the provided code according to the Current Subtask. If you cannot test the code, then write code to visualize the result by calling the code.
+- Always prioritize using pre-defined tools for the same functionality.
+- Write clean, readable, and well-documented code.
+
+# Output
+While some concise thoughts are helpful, code is absolutely required. Always output one and only one code block in your response. Output code in the following format:
+```python
+your code
+```
+"""
vision_agent/llm/llm.py CHANGED
@@ -37,6 +37,7 @@ class OpenAILLM(LLM):
         model_name: str = "gpt-4-turbo",
         api_key: Optional[str] = None,
         json_mode: bool = False,
+        system_prompt: Optional[str] = None,
         **kwargs: Any
     ):
         if not api_key:
@@ -45,22 +46,29 @@ class OpenAILLM(LLM):
         self.client = OpenAI(api_key=api_key)
 
         self.model_name = model_name
+        self.system_prompt = system_prompt
         self.kwargs = kwargs
         if json_mode:
             self.kwargs["response_format"] = {"type": "json_object"}
 
     def generate(self, prompt: str) -> str:
+        messages = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": prompt})
+
         response = self.client.chat.completions.create(
             model=self.model_name,
-            messages=[
-                {"role": "user", "content": prompt},
-            ],
+            messages=messages,  # type: ignore
             **self.kwargs,
         )
 
         return cast(str, response.choices[0].message.content)
 
     def chat(self, chat: List[Dict[str, str]]) -> str:
+        if self.system_prompt and not any(msg["role"] == "system" for msg in chat):
+            chat.insert(0, {"role": "system", "content": self.system_prompt})
+
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=chat,  # type: ignore
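A sketch of the new `system_prompt` behavior; the calls require a valid `OPENAI_API_KEY`, and the prompt text is illustrative:

```python
from vision_agent.llm import OpenAILLM

llm = OpenAILLM(system_prompt="You are a terse Python assistant.")

# generate() now sends [system, user] instead of a lone user message.
print(llm.generate("One-line hello world."))

# chat() prepends the system prompt only when the history lacks one.
print(llm.chat([{"role": "user", "content": "And in Bash?"}]))
```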
vision_agent/tools/__init__.py CHANGED
@@ -3,11 +3,9 @@ from .tools import (  # Counter,
     CLIP,
     OCR,
     TOOLS,
-    BboxArea,
+    BboxStats,
     BboxIoU,
-    ObjectDistance,
     BoxDistance,
-    MaskDistance,
     Crop,
     DINOv,
     ExtractFrames,
@@ -15,6 +13,8 @@ from .tools import (  # Counter,
     GroundingSAM,
     ImageCaption,
     ImageQuestionAnswering,
+    MaskDistance,
+    ObjectDistance,
     SegArea,
     SegIoU,
     Tool,
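For callers, the visible change is the renamed export (`BboxArea` → `BboxStats`) plus a re-sorted list; `MaskDistance` and `ObjectDistance` merely move. Whether `BboxStats` is a drop-in replacement for `BboxArea` is not shown in this diff. Imports move like so:

```python
# 0.2.14
# from vision_agent.tools import BboxArea
# 0.2.16
from vision_agent.tools import BboxStats
```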
vision_agent/tools/tool_utils.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict
 
 import requests
 
-from vision_agent.type_defs import LandingaiAPIKey
+from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
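Same pattern as the other hunks in this release: utility modules now live under `vision_agent.utils`. Downstream imports move accordingly:

```python
# 0.2.14
# from vision_agent.type_defs import LandingaiAPIKey
# 0.2.16
from vision_agent.utils.type_defs import LandingaiAPIKey

api_key = LandingaiAPIKey().api_key  # reads the key as before
```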