PyPI - vision-agent - Versions diffs - 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl - Mend

vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

vision_agent/.sim_tools/df.csv +46 -47
vision_agent/.sim_tools/embs.npy +0 -0
vision_agent/agent/__init__.py +0 -16
vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
vision_agent/agent/vision_agent_planner_v2.py +3 -2
vision_agent/configs/anthropic_config.py +29 -16
vision_agent/configs/config.py +14 -15
vision_agent/configs/openai_config.py +10 -10
vision_agent/lmm/lmm.py +2 -2
vision_agent/tools/__init__.py +0 -6
vision_agent/tools/meta_tools.py +1 -492
vision_agent/tools/planner_tools.py +13 -14
vision_agent/tools/tools.py +16 -27
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/METADATA +31 -3
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/RECORD +17 -24
vision_agent/agent/vision_agent.py +0 -605
vision_agent/agent/vision_agent_coder.py +0 -742
vision_agent/agent/vision_agent_coder_prompts.py +0 -290
vision_agent/agent/vision_agent_planner.py +0 -564
vision_agent/agent/vision_agent_planner_prompts.py +0 -199
vision_agent/agent/vision_agent_prompts.py +0 -312
vision_agent/configs/anthropic_openai_config.py +0 -164
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/LICENSE +0 -0
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/WHEEL +0 -0

vision_agent/tools/meta_tools.py CHANGED Viewed

@@ -1,17 +1,11 @@
 import difflib
-import json
 import os
 import re
-import subprocess
-import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
-import libcst as cst
 from IPython.display import display
-import vision_agent as va
-from vision_agent.models import Message
 from vision_agent.tools.tools import get_tools_descriptions as _get_tool_descriptions
 from vision_agent.utils.execute import Execution, MimeType
 from vision_agent.utils.tools_doc import get_tool_documentation
@@ -152,392 +146,6 @@ def view_lines(
     return return_str
-def open_code_artifact(
-    artifacts: Artifacts, name: str, line_num: int = 0, window_size: int = 100
-) -> str:
-    """Opens the provided code artifact. If `line_num` is provided, the window will be
-    moved to include that line. It only shows the first 100 lines by default! Max
-    `window_size` supported is 2000.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to open the artifact from.
-        name (str): The name of the artifact to open.
-        line_num (int): The line number to move the window to.
-        window_size (int): The number of lines to show above and below the line.
-    """
-    if name not in artifacts:
-        return f"[Artifact {name} does not exist]"
-    total_lines = len(artifacts[name].splitlines())
-    window_size = min(window_size, 2000)
-    window_size = window_size // 2
-    if line_num - window_size < 0:
-        line_num = window_size
-    elif line_num >= total_lines:
-        line_num = total_lines - 1 - window_size
-    lines = artifacts[name].splitlines(keepends=True)
-    return view_lines(lines, line_num, window_size, name, total_lines)
-def create_code_artifact(artifacts: Artifacts, name: str) -> str:
-    """Creates a new code artifiact with the given name.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to add the new artifact to.
-        name (str): The name of the new artifact.
-    """
-    if name in artifacts:
-        return_str = f"[Artifact {name} already exists]"
-    else:
-        artifacts[name] = ""
-        return_str = f"[Artifact {name} created]"
-    print(return_str)
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {
-                    "name": name,
-                    "content": artifacts[name],
-                    "action": "create",
-                }
-            )
-        },
-        raw=True,
-    )
-    return return_str
-def edit_code_artifact(
-    artifacts: Artifacts, name: str, start: int, end: int, content: str
-) -> str:
-    """Edits the given code artifact with the provided content. The content will be
-    inserted between the `start` and `end` line numbers. If the `start` and `end` are
-    the same, the content will be inserted at the `start` line number. If the `end` is
-    greater than the total number of lines in the file, the content will be inserted at
-    the end of the file. If the `start` or `end` are negative, the function will return
-    an error message.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to edit the artifact from.
-        name (str): The name of the artifact to edit.
-        start (int): The line number to start the edit, can be in [-1, total_lines]
-            where -1 represents the end of the file.
-        end (int): The line number to end the edit, can be in [-1, total_lines] where
-            -1 represents the end of the file.
-        content (str): The content to insert.
-    """
-    # just make the artifact if it doesn't exist instead of forcing agent to call
-    # create_artifact
-    if name not in artifacts:
-        artifacts[name] = ""
-    total_lines = len(artifacts[name].splitlines())
-    if start == -1:
-        start = total_lines
-    if end == -1:
-        end = total_lines
-    if start < 0 or end < 0 or start > end or end > total_lines:
-        print("[Invalid line range]")
-        return "[Invalid line range]"
-    new_content_lines = content.splitlines(keepends=True)
-    new_content_lines = [
-        line if line.endswith("\n") else line + "\n" for line in new_content_lines
-    ]
-    lines = artifacts[name].splitlines(keepends=True)
-    lines = [line if line.endswith("\n") else line + "\n" for line in lines]
-    edited_lines = lines[:start] + new_content_lines + lines[end:]
-    cur_line = start + len(content.split("\n")) // 2
-    with tempfile.NamedTemporaryFile(delete=True) as f:
-        with open(f.name, "w") as f:  # type: ignore
-            f.writelines(edited_lines)
-        process = subprocess.Popen(
-            [
-                "flake8",
-                "--isolated",
-                "--select=F821,F822,F831,E111,E112,E113,E999,E902",
-                f.name,
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-        )
-        stdout, _ = process.communicate()
-        if stdout != "":
-            stdout = stdout.replace(f.name, name)
-            error_msg = "[Edit failed with the following status]\n" + stdout
-            original_view = view_lines(
-                lines,
-                start + ((end - start) // 2),
-                DEFAULT_WINDOW_SIZE,
-                name,
-                total_lines,
-                print_output=False,
-            )
-            total_lines_edit = sum(1 for _ in edited_lines)
-            edited_view = view_lines(
-                edited_lines,
-                cur_line,
-                DEFAULT_WINDOW_SIZE,
-                name,
-                total_lines_edit,
-                print_output=False,
-            )
-            error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
-            print(error_msg)
-            return error_msg
-    artifacts[name] = "".join(edited_lines)
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {
-                    "name": name,
-                    "content": artifacts[name],
-                    "action": "edit",
-                }
-            )
-        },
-        raw=True,
-    )
-    return open_code_artifact(artifacts, name, cur_line)
-def generate_vision_plan(
-    artifacts: Artifacts,
-    name: str,
-    chat: str,
-    media: List[str],
-    test_multi_plan: bool = True,
-    custom_tool_names: Optional[List[str]] = None,
-) -> str:
-    """Generates a plan to solve vision based tasks.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to save the plan to.
-        name (str): The name of the artifact to save the plan context to.
-        chat (str): The chat message from the user.
-        media (List[str]): The media files to use.
-        test_multi_plan (bool): Do not change this parameter.
-        custom_tool_names (Optional[List[str]]): Do not change this parameter.
-    Returns:
-        str: The generated plan.
-    Examples
-    --------
-        >>> generate_vision_plan(artifacts, "plan.json", "Can you detect the dogs in this image?", ["image.jpg"])
-        [Start Plan Context]
-        plan1: This is a plan to detect dogs in an image
-        -load image
-        -detect dogs
-        -return detections
-        [End Plan Context]
-    """
-    # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
-    if ZMQ_PORT is not None:
-        agent = va.agent.VisionAgentPlanner(
-            report_progress_callback=lambda inp: report_progress_callback(
-                int(ZMQ_PORT), inp
-            ),
-            verbosity=0,
-        )
-    else:
-        agent = va.agent.VisionAgentPlanner(verbosity=0)
-    fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.generate_plan(
-        fixed_chat,
-        test_multi_plan=test_multi_plan,
-        custom_tool_names=custom_tool_names,
-    )
-    if response.test_results is not None:
-        redisplay_results(response.test_results)
-    response.test_results = None
-    artifacts[name] = response.model_dump_json()
-    output_str = f"[Start Plan Context, saved at {name}]"
-    for plan in response.plans.keys():
-        output_str += f"\n{plan}: {response.plans[plan]['thoughts'].strip()}\n"  # type: ignore
-        output_str += "    -" + "\n    -".join(
-            e.strip() for e in response.plans[plan]["instructions"]
-        )
-    output_str += f"\nbest plan: {response.best_plan}\n"
-    output_str += "thoughts: " + response.plan_thoughts.strip() + "\n"
-    output_str += "[End Plan Context]"
-    print(output_str)
-    return output_str
-def generate_vision_code(
-    artifacts: Artifacts,
-    name: str,
-    chat: str,
-    media: List[str],
-    test_multi_plan: bool = True,
-    custom_tool_names: Optional[List[str]] = None,
-) -> str:
-    """Generates python code to solve vision based tasks.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to save the code to.
-        name (str): The name of the artifact to save the code to.
-        chat (str): The chat message from the user.
-        media (List[str]): The media files to use.
-        test_multi_plan (bool): Do not change this parameter.
-        custom_tool_names (Optional[List[str]]): Do not change this parameter.
-    Returns:
-        str: The generated code.
-    Examples
-    --------
-        >>> generate_vision_code(artifacts, "code.py", "Can you detect the dogs in this image?", ["image.jpg"])
-        from vision_agent.tools import load_image, owl_v2
-        def detect_dogs(image_path: str):
-            image = load_image(image_path)
-            dogs = owl_v2("dog", image)
-            return dogs
-    """
-    # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
-    if ZMQ_PORT is not None:
-        agent = va.agent.VisionAgentCoder(
-            report_progress_callback=lambda inp: report_progress_callback(
-                int(ZMQ_PORT), inp
-            ),
-            verbosity=0,
-        )
-    else:
-        agent = va.agent.VisionAgentCoder(verbosity=0)
-    fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.generate_code(
-        fixed_chat,
-        test_multi_plan=test_multi_plan,
-        custom_tool_names=custom_tool_names,
-    )
-    redisplay_results(response["test_result"])
-    code = response["code"]
-    artifacts[name] = code
-    code_lines = code.splitlines(keepends=True)
-    total_lines = len(code_lines)
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {
-                    "name": name,
-                    "content": code,
-                    "contentType": "vision_code",
-                    "action": "create",
-                }
-            )
-        },
-        raw=True,
-    )
-    return view_lines(code_lines, 0, total_lines, name, total_lines)
-def edit_vision_code(
-    artifacts: Artifacts,
-    name: str,
-    chat_history: List[str],
-    media: List[str],
-    custom_tool_names: Optional[List[str]] = None,
-) -> str:
-    """Edits python code to solve a vision based task.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to save the code to.
-        name (str): The file path to the code.
-        chat_history (List[str]): The chat history to used to generate the code.
-        custom_tool_names (Optional[List[str]]): Do not change this parameter.
-    Returns:
-        str: The edited code.
-    Examples
-    --------
-        >>> edit_vision_code(
-        >>>     artifacts,
-        >>>     "code.py",
-        >>>     ["Can you detect the dogs in this image?", "Can you use a higher threshold?"],
-        >>>     ["dog.jpg"],
-        >>> )
-        from vision_agent.tools import load_image, owl_v2
-        def detect_dogs(image_path: str):
-            image = load_image(image_path)
-            dogs = owl_v2("dog", image, threshold=0.8)
-            return dogs
-    """
-    # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
-    agent = va.agent.VisionAgentCoder(verbosity=0)
-    if name not in artifacts:
-        print(f"[Artifact {name} does not exist]")
-        return f"[Artifact {name} does not exist]"
-    code = artifacts[name]
-    # Append latest code to second to last message from assistant
-    fixed_chat_history: List[Message] = []
-    user_message = "Previous user requests:"
-    for i, chat in enumerate(chat_history):
-        if i < len(chat_history) - 1:
-            user_message += " " + chat
-        else:
-            fixed_chat_history.append(
-                {"role": "user", "content": user_message, "media": media}
-            )
-            fixed_chat_history.append({"role": "assistant", "content": code})
-            fixed_chat_history.append({"role": "user", "content": chat})
-    response = agent.generate_code(
-        fixed_chat_history,
-        test_multi_plan=False,
-        custom_tool_names=custom_tool_names,
-    )
-    redisplay_results(response["test_result"])
-    code = response["code"]
-    artifacts[name] = code
-    code_lines = code.splitlines(keepends=True)
-    total_lines = len(code_lines)
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {
-                    "name": name,
-                    "content": code,
-                    "action": "edit",
-                }
-            )
-        },
-        raw=True,
-    )
-    return view_lines(code_lines, 0, total_lines, name, total_lines)
-def list_artifacts(artifacts: Artifacts) -> str:
-    """Lists all the artifacts that have been loaded into the artifacts object."""
-    output_str = artifacts.show()
-    print(output_str)
-    return output_str
 def check_and_load_image(code: str) -> List[str]:
     if not code.strip():
         return []
@@ -584,108 +192,9 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
     return f"[Artifact {name} edits]\n{diff}\n[End of edits]"
-def use_extra_vision_agent_args(
-    code: Optional[str],
-    test_multi_plan: bool = True,
-    custom_tool_names: Optional[List[str]] = None,
-) -> Optional[str]:
-    """This is for forcing arguments passed by the user to VisionAgent into the
-    VisionAgentCoder call.
-    Parameters:
-        code (str): The code to edit.
-        test_multi_plan (bool): Do not change this parameter.
-        custom_tool_names (Optional[List[str]]): Do not change this parameter.
-    Returns:
-        str: The edited code.
-    """
-    if code is None:
-        return None
-    class VisionAgentTransformer(cst.CSTTransformer):
-        def __init__(
-            self, test_multi_plan: bool, custom_tool_names: Optional[List[str]]
-        ):
-            self.test_multi_plan = test_multi_plan
-            self.custom_tool_names = custom_tool_names
-        def leave_Call(
-            self, original_node: cst.Call, updated_node: cst.Call
-        ) -> cst.Call:
-            # Check if the function being called is generate_vision_code or edit_vision_code
-            if isinstance(updated_node.func, cst.Name) and updated_node.func.value in [
-                "generate_vision_code",
-                "edit_vision_code",
-            ]:
-                # Add test_multi_plan argument to generate_vision_code calls
-                if updated_node.func.value == "generate_vision_code":
-                    new_arg = cst.Arg(
-                        keyword=cst.Name("test_multi_plan"),
-                        value=cst.Name(str(self.test_multi_plan)),
-                        equal=cst.AssignEqual(
-                            whitespace_before=cst.SimpleWhitespace(""),
-                            whitespace_after=cst.SimpleWhitespace(""),
-                        ),
-                    )
-                    updated_node = updated_node.with_changes(
-                        args=[*updated_node.args, new_arg]
-                    )
-                # Add custom_tool_names if provided
-                if self.custom_tool_names is not None:
-                    list_arg = []
-                    for i, tool_name in enumerate(self.custom_tool_names):
-                        if i < len(self.custom_tool_names) - 1:
-                            list_arg.append(
-                                cst._nodes.expression.Element(
-                                    value=cst.SimpleString(value=f'"{tool_name}"'),
-                                    comma=cst.Comma(
-                                        whitespace_before=cst.SimpleWhitespace(""),
-                                        whitespace_after=cst.SimpleWhitespace(" "),
-                                    ),
-                                )
-                            )
-                        else:
-                            list_arg.append(
-                                cst._nodes.expression.Element(
-                                    value=cst.SimpleString(value=f'"{tool_name}"'),
-                                )
-                            )
-                    new_arg = cst.Arg(
-                        keyword=cst.Name("custom_tool_names"),
-                        value=cst.List(list_arg),
-                        equal=cst.AssignEqual(
-                            whitespace_before=cst.SimpleWhitespace(""),
-                            whitespace_after=cst.SimpleWhitespace(""),
-                        ),
-                    )
-                    updated_node = updated_node.with_changes(
-                        args=[*updated_node.args, new_arg]
-                    )
-            return updated_node
-    # Parse the input code into a CST node
-    tree = cst.parse_module(code)
-    # Apply the transformer to modify the CST
-    transformer = VisionAgentTransformer(test_multi_plan, custom_tool_names)
-    modified_tree = tree.visit(transformer)
-    # Return the modified code as a string
-    return modified_tree.code
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
-        open_code_artifact,
-        create_code_artifact,
-        edit_code_artifact,
-        generate_vision_code,
-        edit_vision_code,
         view_media_artifact,
-        list_artifacts,
     ]
 )

vision_agent/tools/planner_tools.py CHANGED Viewed

@@ -236,7 +236,7 @@ def retrieve_tool_docs(lmm: LMM, task: str, exclude_tools: Optional[List[str]])
     all_tool_docs = []
     all_tool_doc_names = set()
     exclude_tools = [] if exclude_tools is None else exclude_tools
-    for category in categories:
+    for category in categories + [task]:
         tool_docs = sim.top_k(category, k=3, thresh=0.3)
         for tool_doc in tool_docs:
@@ -248,9 +248,7 @@ def retrieve_tool_docs(lmm: LMM, task: str, exclude_tools: Optional[List[str]])
                 all_tool_doc_names.add(tool_doc["name"])
     tool_docs_str = explanation + "\n\n" + "\n".join([e["doc"] for e in all_tool_docs])
-    tool_docs_str += (
-        "\n" + get_load_tools_docstring() + get_tool_documentation([judge_od_results])
-    )
+    tool_docs_str += get_load_tools_docstring()
     return tool_docs_str
@@ -346,22 +344,22 @@ def get_tool_for_task(
     and output signatures are.
     Parameters:
-        task: str: The task to accomplish.
-        images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]]: The images to use
+        task (str): The task to accomplish.
+        images (Union[Dict[str, List[np.ndarray]], List[np.ndarray]]): The images to use
             for the task. If a key is provided, it is used as the file name.
-        exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
+        exclude_tools (Optional[List[str]]): A list of tool names to exclude from the
             recommendations. This is helpful if you are calling get_tool_for_task twice
             and do not want the same tool recommended.
     Returns:
-        The tool to use for the task is printed to stdout
+        None: The function does not return the tool but prints it to stdout.
     Examples
     --------
         >>> get_tool_for_task(
         >>>     "Give me an OCR model that can find 'hot chocolate' in the image",
         >>>     {"image": [image]})
-        >>> get_tool_for_taks(
+        >>> get_tool_for_task(
         >>>     "I need a tool that can paint a background for this image and maks",
         >>>     {"image": [image], "mask": [mask]})
     """
@@ -497,8 +495,8 @@ def finalize_plan(user_request: str, chain_of_thoughts: str) -> str:
     return finalized_plan
-def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
-    """Asks the Claude-3.5 model a question about the given media and returns an answer.
+def vqa(prompt: str, medias: List[np.ndarray]) -> None:
+    """Asks the VQA model a question about the given media and returns an answer.
     Parameters:
         prompt: str: The question to ask the model.
@@ -515,13 +513,14 @@ def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
     ]
     response = cast(str, vqa.generate(prompt, media=all_media_b64))
-    print(f"[claude35_vqa output]\n{response}\n[end of claude35_vqa output]")
+    print(f"[vqa output]\n{response}\n[end of vqa output]")
 def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
     """Given your problem statement and the images, this will provide you with a
     suggested plan on how to proceed. Always call suggestion when starting to solve
-    a problem.
+    a problem. 'suggestion' will only print pseudo code for you to execute, it will not
+    execute the code for you.
     Parameters:
         prompt: str: The problem statement, provide a detailed description of the
@@ -538,7 +537,7 @@ def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
 PLANNER_TOOLS = [
-    claude35_vqa,
+    vqa,
     suggestion,
     get_tool_for_task,
 ]

vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl