PyPI - vision-agent - Versions diffs - 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl - Mend

vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

vision_agent/agent/__init__.py +2 -0
vision_agent/agent/agent_coder.py +196 -0
vision_agent/agent/agent_coder_prompts.py +135 -0
vision_agent/agent/vision_agent.py +46 -30
vision_agent/agent/vision_agent_prompts.py +3 -3
vision_agent/agent/vision_agent_v2.py +396 -0
vision_agent/agent/vision_agent_v2_prompt.py +185 -0
vision_agent/llm/llm.py +12 -4
vision_agent/tools/__init__.py +3 -1
vision_agent/tools/tool_utils.py +30 -0
vision_agent/tools/tools.py +157 -79
vision_agent/tools/tools_v2.py +442 -0
vision_agent/utils/__init__.py +3 -0
vision_agent/utils/execute.py +104 -0
vision_agent/utils/sim.py +85 -0
{vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/METADATA +7 -3
vision_agent-0.2.22.dist-info/RECORD +34 -0
vision_agent-0.2.10.dist-info/RECORD +0 -25
/vision_agent/{image_utils.py → utils/image_utils.py} +0 -0
/vision_agent/{type_defs.py → utils/type_defs.py} +0 -0
/vision_agent/{tools → utils}/video.py +0 -0
{vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/LICENSE +0 -0
{vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/WHEEL +0 -0

vision_agent/agent/vision_agent_v2.py ADDED Viewed

@@ -0,0 +1,396 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
+import pandas as pd
+from rich.console import Console
+from rich.syntax import Syntax
+from tabulate import tabulate
+from vision_agent.agent import Agent
+from vision_agent.agent.vision_agent_v2_prompt import (
+    CODE,
+    CODE_SYS_MSG,
+    DEBUG,
+    DEBUG_EXAMPLE,
+    DEBUG_SYS_MSG,
+    PLAN,
+    PREV_CODE_CONTEXT,
+    PREV_CODE_CONTEXT_WITH_REFLECTION,
+    TEST,
+    USER_REQ_CONTEXT,
+    USER_REQ_SUBTASK_CONTEXT,
+    USER_REQ_SUBTASK_WM_CONTEXT,
+)
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
+from vision_agent.utils import Execute, Sim
+# Configure the root logger at import time with INFO level.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger used for plan, task and execution-result messages.
+_LOGGER = logging.getLogger(__name__)
+# Maximum column width handed to tabulate when rendering plans and tasks.
+_MAX_TABULATE_COL_WIDTH = 80
+# Shared rich console used to pretty-print generated code.
+_CONSOLE = Console()
+def build_working_memory(working_memory: Mapping[str, List[str]]) -> Sim:
+    """Turn a working-memory mapping into a ``Sim`` index searched on ``desc``.
+
+    Every key of ``working_memory`` becomes one row: the key is stored in the
+    ``desc`` column and its list of entries, joined with newlines, is stored in
+    the ``doc`` column.
+    """
+    descriptions = list(working_memory.keys())
+    documents = ["\n".join(entries) for entries in working_memory.values()]
+    frame = pd.DataFrame({"desc": descriptions, "doc": documents})
+    return Sim(frame, sim_key="desc")
+def extract_code(code: str) -> str:
+    """Extract Python source from an LLM response.
+
+    If the response contains a ```python fenced block, the text between the
+    opening fence and the next ``` is returned. A leading ``python\n`` marker
+    (left over from a bare fence) is stripped as well. Text without a fence is
+    returned unchanged.
+
+    Args:
+        code: Raw model output, possibly wrapped in a markdown code fence.
+
+    Returns:
+        The extracted code string.
+    """
+    opening_fence = "```python"
+    start = code.find(opening_fence)
+    if start != -1:
+        code = code[start + len(opening_fence) :]
+        end = code.find("```")
+        # A truncated response may have no closing fence. Slicing with the -1
+        # returned by find() would silently drop the last character, so keep
+        # the remainder intact in that case.
+        if end != -1:
+            code = code[:end]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
+    return code
+def extract_json(json_str: str) -> Dict[str, Any]:
+    """Parse a JSON object from an LLM response, tolerating markdown fences.
+
+    The string is first parsed as-is. If that fails, the content of a ```json
+    fenced block (or, failing that, a generic ``` fenced block) is extracted
+    and parsed instead.
+
+    Args:
+        json_str: Raw model output expected to contain a JSON object.
+
+    Returns:
+        The decoded JSON value.
+
+    Raises:
+        json.JSONDecodeError: If no parseable JSON could be extracted.
+    """
+    try:
+        return json.loads(json_str)  # type: ignore
+    except json.JSONDecodeError:
+        pass
+    if "```json" in json_str:
+        json_str = json_str[json_str.find("```json") + len("```json") :]
+        end = json_str.find("```")
+        # Only cut at the closing fence when there is one; slicing with -1
+        # would drop the final character of a truncated response.
+        if end != -1:
+            json_str = json_str[:end]
+    elif "```" in json_str:
+        json_str = json_str[json_str.find("```") + len("```") :]
+        # get the last ``` not one from an intermediate string; the closing
+        # brace itself belongs to the JSON payload, so keep it in the slice.
+        end = json_str.rfind("}```")
+        if end != -1:
+            json_str = json_str[: end + 1]
+        else:
+            # The fence may be separated from the brace by whitespace.
+            end = json_str.rfind("```")
+            if end != -1:
+                json_str = json_str[:end]
+    return json.loads(json_str)  # type: ignore
+def write_plan(
+    chat: List[Dict[str, str]],
+    plan: Optional[List[Dict[str, Any]]],
+    tool_desc: str,
+    model: LLM,
+) -> Tuple[str, List[Dict[str, Any]]]:
+    """Ask the planner model for a (re)written plan for the latest user request.
+
+    The content of the last chat message is wrapped in the PLAN prompt together
+    with the current plan and the tool descriptions, and the model's JSON reply
+    is decoded.
+
+    Args:
+        chat: Conversation history; the last message must have role "user".
+        plan: The existing plan (may be None); it is rendered with ``str``.
+        tool_desc: Text describing the available tools.
+        model: LLM used to produce the plan.
+
+    Returns:
+        A ``(user_req, plan)`` tuple taken from the model's JSON response.
+
+    Raises:
+        ValueError: If the chat is empty or its last message is not from the user.
+    """
+    # Get last user request
+    if not chat or chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+    user_requirements = chat[-1]["content"]
+    context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
+    prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc)
+    # Send a copy with the prompt substituted for the last message so the
+    # caller's chat history is not overwritten with the full planning prompt.
+    planning_chat = [*chat[:-1], {**chat[-1], "content": prompt}]
+    new_plan = extract_json(model.chat(planning_chat))
+    return new_plan["user_req"], new_plan["plan"]
+def write_code(
+    user_req: str,
+    subtask: str,
+    working_memory: str,
+    tool_info: str,
+    code: str,
+    model: LLM,
+) -> str:
+    """Prompt the model to implement a coding subtask and return the extracted code.
+
+    The CODE prompt is filled with the user requirement, the subtask, the
+    working memory, the tool documentation and the previous code, then sent
+    with the CODE_SYS_MSG system message.
+    """
+    context = USER_REQ_SUBTASK_WM_CONTEXT.format(
+        user_requirement=user_req, working_memory=working_memory, subtask=subtask
+    )
+    user_prompt = CODE.format(context=context, tool_info=tool_info, code=code)
+    response = model.chat(
+        [
+            {"role": "system", "content": CODE_SYS_MSG},
+            {"role": "user", "content": user_prompt},
+        ]
+    )
+    return extract_code(response)
+def write_test(
+    user_req: str, subtask: str, tool_info: str, _: str, code: str, model: LLM
+) -> str:
+    """Prompt the model to write test code for ``code`` and return the extracted code.
+
+    The fourth positional parameter is ignored; it keeps the call signature
+    interchangeable with ``write_code``.
+    """
+    context = USER_REQ_SUBTASK_CONTEXT.format(
+        user_requirement=user_req, subtask=subtask
+    )
+    user_prompt = TEST.format(context=context, tool_info=tool_info, code=code)
+    response = model.chat(
+        [
+            {"role": "system", "content": CODE_SYS_MSG},
+            {"role": "user", "content": user_prompt},
+        ]
+    )
+    return extract_code(response)
+def debug_code(
+    user_req: str,
+    subtask: str,
+    retrieved_ltm: str,
+    working_memory: str,
+    model: LLM,
+) -> Tuple[str, str]:
+    """Ask the model to reflect on failed code and return an improved version.
+
+    Args:
+        user_req: The user requirement.
+        subtask: The subtask being solved.
+        retrieved_ltm: Retrieved long-term memory, placed in the context block.
+        working_memory: Previous implementations and outputs for this subtask.
+        model: LLM used for debugging. If it exposes a ``kwargs`` dict, JSON
+            output is requested for the duration of this call only.
+
+    Returns:
+        A ``(improved_code, reflection)`` tuple decoded from the model's JSON reply.
+    """
+    prompt = DEBUG.format(
+        debug_example=DEBUG_EXAMPLE,
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req,
+            subtask=subtask,
+            working_memory=retrieved_ltm,
+        ),
+        previous_impl=working_memory,
+    )
+    messages = [
+        {"role": "system", "content": DEBUG_SYS_MSG},
+        {"role": "user", "content": prompt},
+    ]
+    # Make debug model output JSON
+    has_kwargs = hasattr(model, "kwargs")
+    missing = object()
+    previous_format: Any = missing
+    if has_kwargs:
+        previous_format = model.kwargs.get("response_format", missing)
+        model.kwargs["response_format"] = {"type": "json_object"}
+    try:
+        code_and_ref = extract_json(model.chat(messages))
+    finally:
+        # Always undo the temporary override, even when the request or the
+        # JSON parsing fails, so the shared model is not left in JSON mode;
+        # a response_format that was already configured is put back.
+        if has_kwargs:
+            if previous_format is missing:
+                model.kwargs.pop("response_format", None)
+            else:
+                model.kwargs["response_format"] = previous_format
+    return extract_code(code_and_ref["improved_impl"]), code_and_ref["reflection"]
+def write_and_exec_code(
+    user_req: str,
+    subtask: str,
+    orig_code: str,
+    code_writer_call: Callable[..., str],
+    model: LLM,
+    tool_info: str,
+    exec: Execute,
+    retrieved_ltm: str,
+    max_retry: int = 3,
+    verbosity: int = 0,
+) -> Tuple[bool, str, str, Dict[str, List[str]]]:
+    """Write code for one subtask, execute it, and debug it until it passes or retries run out.
+
+    Args:
+        user_req: The user requirement.
+        subtask: Instruction for the current subtask; also the working-memory key.
+        orig_code: Code from previous tasks, passed to the code writer.
+        code_writer_call: Callable invoked as
+            ``(user_req, subtask, retrieved_ltm, tool_info, orig_code, model)``
+            that returns the first code attempt.
+        model: LLM used for writing and debugging.
+        tool_info: Documentation of the tools recommended for this subtask.
+        exec: Executor whose ``run_isolation`` returns ``(success, result)``.
+        retrieved_ltm: Retrieved long-term memory text.
+        max_retry: Maximum number of debug attempts after the first run fails.
+        verbosity: When equal to 2, each attempt's code and result are printed/logged.
+
+    Returns:
+        ``(success, code, result, working_memory)`` where ``working_memory`` maps
+        the subtask to the recorded attempts; it is empty when the first run succeeds.
+    """
+    success = False
+    counter = 0
+    reflection = ""
+    # First attempt comes from the supplied writer (code or test writer).
+    code = code_writer_call(
+        user_req, subtask, retrieved_ltm, tool_info, orig_code, model
+    )
+    success, result = exec.run_isolation(code)
+    if verbosity == 2:
+        _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True))
+        _LOGGER.info(f"\tCode success: {success}, result: {str(result)}")
+    working_memory: Dict[str, List[str]] = {}
+    while not success and counter < max_retry:
+        if subtask not in working_memory:
+            working_memory[subtask] = []
+        # Record the attempt that just failed; from the second debug round on
+        # the reflection that produced it is recorded alongside.
+        if reflection:
+            working_memory[subtask].append(
+                PREV_CODE_CONTEXT_WITH_REFLECTION.format(
+                    code=code, result=result, reflection=reflection
+                )
+            )
+        else:
+            working_memory[subtask].append(
+                PREV_CODE_CONTEXT.format(code=code, result=result)
+            )
+        # The debugger sees the full history of attempts for this subtask.
+        code, reflection = debug_code(
+            user_req, subtask, retrieved_ltm, "\n".join(working_memory[subtask]), model
+        )
+        success, result = exec.run_isolation(code)
+        counter += 1
+        if verbosity == 2:
+            _CONSOLE.print(
+                Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+            )
+            _LOGGER.info(f"\tDebugging reflection: {reflection}, result: {result}")
+        # A successful fix is recorded too, since the loop will not come back
+        # around to store it.
+        if success:
+            working_memory[subtask].append(
+                PREV_CODE_CONTEXT_WITH_REFLECTION.format(
+                    reflection=reflection, code=code, result=result
+                )
+            )
+    return success, code, result, working_memory
+def run_plan(
+    user_req: str,
+    plan: List[Dict[str, Any]],
+    coder: LLM,
+    exec: Execute,
+    code: str,
+    tool_recommender: Sim,
+    long_term_memory: Optional[Sim] = None,
+    verbosity: int = 0,
+) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
+    """Execute the not-yet-successful tasks of a plan in order, stopping at the first failure.
+
+    Args:
+        user_req: The user requirement.
+        plan: List of task dicts; each executed task is updated in place with
+            ``success``, ``result`` and ``code`` keys.
+        coder: LLM used to write and debug code.
+        exec: Executor used to run the generated code.
+        code: Starting code carried over from earlier work.
+        tool_recommender: ``Sim`` index queried with each task instruction for tool docs.
+        long_term_memory: Optional ``Sim`` index; its top match per instruction is
+            passed along as retrieved memory.
+        verbosity: When equal to 1, the final code of each task is printed; the
+            value is also forwarded to ``write_and_exec_code``.
+
+    Returns:
+        ``(current_code, current_test, plan, working_memory)``.
+    """
+    # Only tasks without a truthy "success" entry are (re)run.
+    active_plan = [e for e in plan if "success" not in e or not e["success"]]
+    current_code = code
+    current_test = ""
+    retrieved_ltm = ""
+    working_memory: Dict[str, List[str]] = {}
+    for task in active_plan:
+        _LOGGER.info(
+            f"""
+{tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+        )
+        tool_info = "\n".join(
+            [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
+        )
+        if long_term_memory is not None:
+            retrieved_ltm = "\n".join(
+                [e["doc"] for e in long_term_memory.top_k(task["instruction"], 1)]
+            )
+        # NOTE: `code` is rebound here to the code produced for this task.
+        success, code, result, working_memory_i = write_and_exec_code(
+            user_req,
+            task["instruction"],
+            current_code,
+            write_code if task["type"] == "code" else write_test,
+            coder,
+            tool_info,
+            exec,
+            retrieved_ltm,
+            verbosity=verbosity,
+        )
+        # Code tasks replace the running implementation; any other task type
+        # is treated as a test and replaces the running test code.
+        if task["type"] == "code":
+            current_code = code
+        else:
+            current_test = code
+        working_memory.update(working_memory_i)
+        if verbosity == 1:
+            _CONSOLE.print(
+                Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+            )
+        _LOGGER.info(f"\tCode success: {success} result: {str(result)}")
+        task["success"] = success
+        task["result"] = result
+        task["code"] = code
+        # Remaining tasks are left untouched once one task fails.
+        if not success:
+            break
+    return current_code, current_test, plan, working_memory
+class VisionAgentV2(Agent):
+    """Vision Agent is an AI agentic framework geared towards outputting Python code to
+    solve vision tasks. It is inspired by MetaGPT's Data Interpreter
+    https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
+    generate code:
+    - A planner to generate a plan of tasks to solve a user requirement. The planner
+    can output code tasks or test tasks, where test tasks are used to verify the code.
+    - Automatic debugging, if a task fails, the agent will attempt to debug the code
+    using the failed output to fix it.
+    - A tool recommender to recommend tools to use for a given task. LLM performance
+    on tool retrieval starts to decrease as you add more tools, tool retrieval helps
+    keep the number of tools to choose from low.
+    - Memory retrieval, the agent can remember previous iterations on tasks to help it
+    with new tasks.
+    - Dynamic replanning, the agent can ask for feedback and replan remaining tasks
+    based off of that feedback.
+    """
+    def __init__(
+        self,
+        timeout: int = 600,
+        tool_recommender: Optional[Sim] = None,
+        long_term_memory: Optional[Sim] = None,
+        verbosity: int = 0,
+    ) -> None:
+        """Create the agent with its planner, coder and code executor.
+
+        Args:
+            timeout: Timeout forwarded to the ``Execute`` code runner.
+            tool_recommender: ``Sim`` index used to recommend tools; defaults to
+                an index built over ``TOOLS_DF`` keyed on ``desc``.
+            long_term_memory: Optional ``Sim`` index whose dataframe must have a
+                ``doc`` column.
+            verbosity: Verbosity level forwarded to plan execution.
+
+        Raises:
+            ValueError: If ``long_term_memory`` has no ``doc`` column.
+        """
+        # The planner is created in JSON mode; the coder is not.
+        self.planner = OpenAILLM(temperature=0.0, json_mode=True)
+        self.coder = OpenAILLM(temperature=0.0)
+        self.exec = Execute(timeout=timeout)
+        if tool_recommender is None:
+            self.tool_recommender = Sim(TOOLS_DF, sim_key="desc")
+        else:
+            self.tool_recommender = tool_recommender
+        self.verbosity = verbosity
+        self._working_memory: Dict[str, List[str]] = {}
+        if long_term_memory is not None:
+            if "doc" not in long_term_memory.df.columns:
+                raise ValueError("Long term memory must have a 'doc' column.")
+        self.long_term_memory = long_term_memory
+        self.max_retries = 3
+        if self.verbosity:
+            _LOGGER.setLevel(logging.INFO)
+    def __call__(
+        self,
+        input: Union[List[Dict[str, str]], str],
+        image: Optional[Union[str, Path]] = None,
+        plan: Optional[List[Dict[str, Any]]] = None,
+    ) -> str:
+        """Run the workflow and return only the generated code.
+
+        A plain string ``input`` is wrapped into a single user message.
+        """
+        if isinstance(input, str):
+            input = [{"role": "user", "content": input}]
+        results = self.chat_with_workflow(input, image, plan)
+        return results["code"]  # type: ignore
+    def chat_with_workflow(
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+        plan: Optional[List[Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
+        """Plan, write and execute code for the request in ``chat``.
+
+        Args:
+            chat: Conversation messages; must not be empty.
+            image: Optional image path whose name is appended to every user message.
+            plan: Optional previous plan; the code of its last successful task
+                is used as the starting code.
+
+        Returns:
+            A dict with keys ``code``, ``test``, ``success``, ``working_memory``
+            (a ``Sim`` built from the collected working memory) and ``plan``.
+
+        Raises:
+            ValueError: If ``chat`` is empty.
+        """
+        if len(chat) == 0:
+            raise ValueError("Input cannot be empty.")
+        if image is not None:
+            # append file names to all user messages
+            # NOTE(review): this edits the caller's message dicts in place.
+            for chat_i in chat:
+                if chat_i["role"] == "user":
+                    chat_i["content"] += f" Image name {image}"
+        working_code = ""
+        if plan is not None:
+            # grab the latest working code from a previous plan
+            for task in plan:
+                if "success" in task and "code" in task and task["success"]:
+                    working_code = task["code"]
+        user_req, plan = write_plan(chat, plan, TOOL_DESCRIPTIONS, self.planner)
+        _LOGGER.info(
+            f"""Plan:
+{tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+        )
+        working_test = ""
+        working_memory: Dict[str, List[str]] = {}
+        success = False
+        retries = 0
+        # NOTE(review): the body breaks on failure and the loop condition ends
+        # on success, so run_plan is invoked at most once per call and
+        # max_retries is never reached.
+        while not success and retries < self.max_retries:
+            working_code, working_test, plan, working_memory_i = run_plan(
+                user_req,
+                plan,
+                self.coder,
+                self.exec,
+                working_code,
+                self.tool_recommender,
+                self.long_term_memory,
+                self.verbosity,
+            )
+            success = all(task["success"] for task in plan)
+            working_memory.update(working_memory_i)
+            if not success:
+                # return to user and request feedback
+                break
+            retries += 1
+        return {
+            "code": working_code,
+            "test": working_test,
+            "success": success,
+            "working_memory": build_working_memory(working_memory),
+            "plan": plan,
+        }
+    def log_progress(self, description: str) -> None:
+        """No-op; progress descriptions are ignored by this agent."""
+        pass

vision_agent/agent/vision_agent_v2_prompt.py ADDED Viewed

@@ -0,0 +1,185 @@
+# Context section holding only the user requirement.
+# Placeholder: {user_requirement}.
+USER_REQ_CONTEXT = """
+## User Requirement
+{user_requirement}
+"""
+# Context section holding the user requirement and the current subtask.
+# Placeholders: {user_requirement}, {subtask}.
+USER_REQ_SUBTASK_CONTEXT = """
+## User Requirement
+{user_requirement}
+## Current Subtask
+{subtask}
+"""
+# Context section that additionally carries memory from previous work.
+# Placeholders: {user_requirement}, {subtask}, {working_memory}.
+USER_REQ_SUBTASK_WM_CONTEXT = """
+## User Requirement
+{user_requirement}
+## Current Subtask
+{subtask}
+## Previous Task
+{working_memory}
+"""
+# Planner prompt. Placeholders: {context}, {plan}, {tool_desc}.
+# Doubled braces ({{ }}) are literal braces after str.format; the model is
+# asked to answer with a JSON object containing "user_req" and "plan".
+PLAN = """
+# Context
+{context}
+# Current Plan
+{plan}
+# Tools Available
+{tool_desc}
+# Task:
+Based on the context and the tools you have available, write a plan of subtasks to achieve the user request that adhere to the following requirements:
+- For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
+- You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
+- You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
+- If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful.
+Output a list of jsons in the following format:
+```json
+{{
+    "user_req": str, # "a summarized version of the user requirement"
+    "plan":
+        [
+            {{
+                "task_id": int, # "unique identifier for a task in plan, can be an ordinal"
+                "dependent_task_ids": list[int], # "ids of tasks prerequisite to this task"
+                "instruction": str, # "what you should do in this task, one short phrase or sentence"
+                "type": str, # "the type of the task, tasks can either be 'code' for coding tasks or 'test' for testing tasks"
+            }},
+            ...
+        ]
+}}
+```
+"""
+# System message for the code-writing and test-writing prompts.
+CODE_SYS_MSG = """You are an AI Python assistant. You need to help user to achieve their goal by implementing a function. Your code will be run in a jupyter notebook environment so don't use asyncio.run. Instead, use await if you need to call an async function. Do not use 'display' for showing images, instead use matplotlib or PIL."""
+# Code-writing prompt. Placeholders: {context}, {tool_info}, {code}.
+# The output example must show a syntactically valid import line, because the
+# model is told to follow this format in the code it returns.
+CODE = """
+# Context
+{context}
+# Tool Info for Current Subtask
+{tool_info}
+# Previous Code
+{code}
+# Constraints
+- Write a function that accomplishes the 'User Requirement'. You are supplied code from a previous task under 'Previous Code', feel free to copy over that code into your own implementation if you need it.
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info for Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
+- Write clean, readable, and well-documented code.
+# Output
+While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
+```python
+from vision_agent.tools.tools_v2 import *
+# your code goes here
+```
+"""
+# System message for the debugging prompt.
+DEBUG_SYS_MSG = """You are an AI Python assistant. You will be given your previous implementation code of a task, runtime error results, and a hint to change the implementation appropriately. Your code will be run in a jupyter notebook environment. Write your full implementation."""
+# Worked example of a debug round (previous impl, output, reflection, improved
+# impl); inserted into DEBUG through its {debug_example} placeholder.
+DEBUG_EXAMPLE = '''
+[previous impl]:
+```python
+def add(a: int, b: int) -> int:
+   """Given integers a and b, return the total value of a and b."""
+   return a - b
+```
+[previous output]
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 3) == 4 # output: -2
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from `-` to `+` in the return statement. This will ensure that the function returns the correct output for the given input.
+[improved impl]:
+def add(a: int, b: int) -> int:
+   """Given integers a and b, return the total value of a and b."""
+   return a + b
+'''
+# Record of one attempt without a reflection. Placeholders: {code}, {result}.
+PREV_CODE_CONTEXT = """
+[previous impl]
+```python
+{code}
+```
+[previous output]
+{result}
+"""
+# Record of an attempt together with the reflection that led to it.
+# Placeholders: {reflection}, {code}, {result}.
+PREV_CODE_CONTEXT_WITH_REFLECTION = """
+[reflection on previous impl]
+{reflection}
+[new impl]
+```python
+{code}
+```
+[new output]
+{result}
+"""
+# Debugging prompt. Placeholders: {debug_example}, {context}, {previous_impl}.
+# don't need [previous impl] because it will come from PREV_CODE_CONTEXT or PREV_CODE_CONTEXT_WITH_REFLECTION
+DEBUG = """
+[example]
+Here is an example of debugging with reflection.
+{debug_example}
+[/example]
+[context]
+{context}
+{previous_impl}
+[instruction]
+Analyze your previous code and error in [context] step by step, provide me with improved method and code. Remember to follow [context] requirement. Because you are writing code in a jupyter notebook, you can run `!pip install` to install missing packages. Output a json following the format:
+```json
+{{
+    "reflection": str = "Reflection on previous implementation",
+    "improved_impl": str = "Refined code after reflection.",
+}}
+```
+"""
+# Test-writing prompt. Placeholders: {context}, {tool_info}, {code}.
+TEST = """
+# Context
+{context}
+# Tool Info for Current Subtask
+{tool_info}
+# Code to Test
+{code}
+# Constraints
+- Write code to test the functionality of the provided code according to the 'Current Subtask'. If you cannot test the code, then write code to visualize the result by calling the code.
+- Always prioritize using pre-defined tools for the same functionality.
+- Write clean, readable, and well-documented code.
+# Output
+While some concise thoughts are helpful, code is absolutely required. Always output one and only one code block in your response. Output code in the following format:
+```python
+your code
+```
+"""

vision_agent/llm/llm.py CHANGED Viewed

@@ -34,9 +34,10 @@ class OpenAILLM(LLM):
     def __init__(
         self,
-        model_name: str = "gpt-4-turbo",
+        model_name: str = "gpt-4o",
         api_key: Optional[str] = None,
         json_mode: bool = False,
+        system_prompt: Optional[str] = None,
         **kwargs: Any
     ):
         if not api_key:
@@ -45,22 +46,29 @@ class OpenAILLM(LLM):
             self.client = OpenAI(api_key=api_key)
         self.model_name = model_name
+        self.system_prompt = system_prompt
         self.kwargs = kwargs
         if json_mode:
             self.kwargs["response_format"] = {"type": "json_object"}
     def generate(self, prompt: str) -> str:
+        messages = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": prompt})
         response = self.client.chat.completions.create(
             model=self.model_name,
-            messages=[
-                {"role": "user", "content": prompt},
-            ],
+            messages=messages,  # type: ignore
             **self.kwargs,
         )
         return cast(str, response.choices[0].message.content)
     def chat(self, chat: List[Dict[str, str]]) -> str:
+        if self.system_prompt and not any(msg["role"] == "system" for msg in chat):
+            chat.insert(0, {"role": "system", "content": self.system_prompt})
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=chat,  # type: ignore

vision_agent/tools/__init__.py CHANGED Viewed

@@ -3,7 +3,7 @@ from .tools import (  # Counter,
     CLIP,
     OCR,
     TOOLS,
-    BboxArea,
+    BboxStats,
     BboxIoU,
     BoxDistance,
     Crop,
@@ -13,6 +13,8 @@ from .tools import (  # Counter,
     GroundingSAM,
     ImageCaption,
     ImageQuestionAnswering,
+    MaskDistance,
+    ObjectDistance,
     SegArea,
     SegIoU,
     Tool,

vision_agent/tools/tool_utils.py ADDED Viewed

@@ -0,0 +1,30 @@
+import logging
+import os
+from typing import Any, Dict
+import requests
+from vision_agent.utils.type_defs import LandingaiAPIKey
+# Module-level logger for inference request failures.
+_LOGGER = logging.getLogger(__name__)
+# API key read once at import time from the LandingaiAPIKey settings object.
+_LND_API_KEY = LandingaiAPIKey().api_key
+# Base URL for agent endpoints.
+# NOTE(review): this points at an "api.dev" host — confirm it is intended for released builds.
+_LND_API_URL = "https://api.dev.landing.ai/v1/agent"
+def _send_inference_request(
+    payload: Dict[str, Any], endpoint_name: str
+) -> Dict[str, Any]:
+    """POST ``payload`` as JSON to the named model endpoint and return its ``data`` field.
+
+    If the ``RUNTIME_TAG`` environment variable is set to a non-empty value it
+    is added to ``payload`` under ``runtime_tag`` before sending.
+
+    Raises:
+        ValueError: If the response status code is not 200.
+    """
+    runtime_tag = os.environ.get("RUNTIME_TAG", "")
+    if runtime_tag:
+        payload["runtime_tag"] = runtime_tag
+    url = f"{_LND_API_URL}/model/{endpoint_name}"
+    request_headers = {
+        "Content-Type": "application/json",
+        "apikey": _LND_API_KEY,
+    }
+    res = requests.post(url, headers=request_headers, json=payload)
+    if res.status_code == 200:
+        return res.json()["data"]  # type: ignore
+    # Any non-200 status is logged and surfaced to the caller.
+    _LOGGER.error(f"Request failed: {res.text}")
+    raise ValueError(f"Request failed: {res.text}")

vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl

vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl