PyPI - vision-agent - Versions diffs - 0.2.164__py3-none-any.whl → 0.2.165__py3-none-any.whl - Mend

vision-agent 0.2.164py3-none-any.whl → 0.2.165py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

vision_agent/agent/vision_agent.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import copy
+import json
 import logging
 import os
 import tempfile
@@ -6,7 +7,7 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_json
+from vision_agent.agent.agent_utils import extract_json, extract_tag
 from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE1,
     EXAMPLES_CODE2,
@@ -18,6 +19,7 @@ from vision_agent.tools.meta_tools import (
     META_TOOL_DOCSTRING,
     Artifacts,
     check_and_load_image,
+    extract_and_save_files_to_artifacts,
     use_extra_vision_agent_args,
 )
 from vision_agent.utils import CodeInterpreterFactory
@@ -35,7 +37,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -57,6 +59,32 @@ class BoilerplateCode:
         )
+def format_agent_message(agent_message: str) -> str:
+    agent_message_json = extract_json(agent_message)
+    output = ""
+    if "thinking" in agent_message_json and agent_message_json["thinking"]:
+        output += "<thinking>" + agent_message_json["thinking"] + "</thinking>"
+    if "response" in agent_message_json and agent_message_json["response"]:
+        output += "<response>" + agent_message_json["response"] + "</response>"
+    if "execute_python" in agent_message_json and agent_message_json["execute_python"]:
+        output += (
+            "\n<execute_python>\n"
+            + agent_message_json["execute_python"]
+            + "\n</execute_python>\n"
+        )
+    if (
+        "let_user_respond" in agent_message_json
+        and agent_message_json["let_user_respond"]
+    ):
+        output += (
+            "<let_user_respond>"
+            + str(agent_message_json["let_user_respond"])
+            + "</let_user_respond>"
+        )
+    return output
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     chat = copy.deepcopy(chat)
@@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         elif chat_i["role"] == "observation":
             conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
         elif chat_i["role"] == "assistant":
-            conversation += f"AGENT: {chat_i['content']}\n\n"
+            conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n"  # type: ignore
         else:
             raise ValueError(f"role {chat_i['role']} is not supported")
@@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         and len(chat[-1]["media"]) > 0  # type: ignore
     ):
         message["media"] = chat[-1]["media"]
-    return extract_json(orch([message], stream=False))  # type: ignore
+    conv_resp = cast(str, orch([message], stream=False))
+    let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
+    let_user_respond = (
+        "true" in let_user_respond_str.lower() if let_user_respond_str else False
+    )
+    return {
+        "thinking": extract_tag(conv_resp, "thinking"),
+        "response": extract_tag(conv_resp, "response"),
+        "execute_python": extract_tag(conv_resp, "execute_python"),
+        "let_user_respond": let_user_respond,
+    }
 def execute_code_action(
-    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+    artifacts: Artifacts,
+    code: str,
+    code_interpreter: CodeInterpreter,
+    artifact_remote_path: str,
 ) -> Tuple[Execution, str]:
     result = code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )
+    extract_and_save_files_to_artifacts(artifacts, code)
     obs = str(result.logs)
     if result.error:
@@ -100,33 +144,8 @@ def execute_code_action(
     return result, obs
-def parse_execution(
-    response: str,
-    test_multi_plan: bool = True,
-    custom_tool_names: Optional[List[str]] = None,
-) -> Optional[str]:
-    code = None
-    remaining = response
-    all_code = []
-    while "<execute_python>" in remaining:
-        code_i = remaining[
-            remaining.find("<execute_python>") + len("<execute_python>") :
-        ]
-        code_i = code_i[: code_i.find("</execute_python>")]
-        remaining = remaining[
-            remaining.find("</execute_python>") + len("</execute_python>") :
-        ]
-        all_code.append(code_i)
-    if len(all_code) > 0:
-        code = "\n".join(all_code)
-    if code is not None:
-        code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
-    return code
 def execute_user_code_action(
+    artifacts: Artifacts,
     last_user_message: Message,
     code_interpreter: CodeInterpreter,
     artifact_remote_path: str,
@@ -138,40 +157,43 @@ def execute_user_code_action(
         return user_result, user_obs
     last_user_content = cast(str, last_user_message.get("content", ""))
+    try:
+        user_code_action = json.loads(last_user_content).get("execute_python", None)
+    except json.JSONDecodeError:
+        return user_result, user_obs
-    user_code_action = parse_execution(last_user_content, False)
     if user_code_action is not None:
+        user_code_action = use_extra_vision_agent_args(user_code_action, False)
         user_result, user_obs = execute_code_action(
-            user_code_action, code_interpreter, artifact_remote_path
+            artifacts, user_code_action, code_interpreter, artifact_remote_path
         )
         if user_result.error:
             user_obs += f"\n{user_result.error}"
     return user_result, user_obs
-def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
+def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
+    """Returns a copy of `response` with a short description of the code step.
+    When "execute_python" is present and non-empty, "response" is replaced by the
+    concatenated descriptions of every known meta tool named in the code, or by
+    "Executing code." when none is named. Otherwise the copy is returned as is.
+    The input dict is never modified because a deep copy is taken first.
+    """
     response = copy.deepcopy(response)
-    if "response" in response:
-        resp_str = response["response"]
-        if "<execute_python>" in resp_str:
-            # only include descriptions for these, the rest will just have executing
-            # code
-            description_map = {
-                "open_code_artifact": "Reading file.",
-                "create_code_artifact": "Creating file.",
-                "edit_code_artifact": "Editing file.",
-                "generate_vision_code": "Generating vision code.",
-                "edit_vision_code": "Editing vision code.",
-            }
-            description = ""
-            for k, v in description_map.items():
-                if k in resp_str:
-                    description += v + " "
-            if description == "":
-                description = "Executing code."
-            resp_str = resp_str[resp_str.find("<execute_python>") :]
-            resp_str = description + resp_str
-        response["response"] = resp_str
+    if "execute_python" in response and response["execute_python"]:
+        # only include descriptions for these, the rest will just have executing
+        # code
+        description_map = {
+            "open_code_artifact": "Reading file.",
+            "create_code_artifact": "Creating file.",
+            "edit_code_artifact": "Editing file.",
+            "generate_vision_code": "Generating vision code.",
+            "edit_vision_code": "Editing vision code.",
+        }
+        description = ""
+        for k, v in description_map.items():
+            if k in response["execute_python"]:
+                description += v + " "
+        if description == "":
+            description = "Executing code."
+        response["response"] = description
     return response
@@ -369,7 +391,10 @@ class VisionAgent(Agent):
             self.streaming_message({"role": "observation", "content": artifacts_loaded})
             user_result, user_obs = execute_user_code_action(
-                last_user_message, code_interpreter, str(remote_artifacts_path)
+                artifacts,
+                last_user_message,
+                code_interpreter,
+                str(remote_artifacts_path),
             )
             finished = user_result is not None and user_obs is not None
             if user_result is not None and user_obs is not None:
@@ -394,13 +419,13 @@ class VisionAgent(Agent):
                 int_chat.append(
                     {
                         "role": "assistant",
-                        "content": str(add_step_descriptions(response)),
+                        "content": json.dumps(add_step_descriptions(response)),
                     }
                 )
                 orig_chat.append(
                     {
                         "role": "assistant",
-                        "content": str(add_step_descriptions(response)),
+                        "content": json.dumps(add_step_descriptions(response)),
                     }
                 )
@@ -408,11 +433,13 @@ class VisionAgent(Agent):
                 if last_response == response:
                     response["let_user_respond"] = True
-                finished = response["let_user_respond"]
+                finished = response.get("let_user_respond", False)
-                code_action = parse_execution(
-                    response["response"], test_multi_plan, custom_tool_names
-                )
+                code_action = response.get("execute_python", None)
+                if code_action is not None:
+                    code_action = use_extra_vision_agent_args(
+                        code_action, test_multi_plan, custom_tool_names
+                    )
                 if last_response == response:
                     self.streaming_message(
@@ -431,14 +458,17 @@ class VisionAgent(Agent):
                     self.streaming_message(
                         {
                             "role": "assistant",
-                            "content": response,
+                            "content": json.dumps(response),
                             "finished": finished and code_action is None,
                         }
                     )
                 if code_action is not None:
                     result, obs = execute_code_action(
-                        code_action, code_interpreter, str(remote_artifacts_path)
+                        artifacts,
+                        code_action,
+                        code_interpreter,
+                        str(remote_artifacts_path),
                     )
                     media_obs = check_and_load_image(code_action)

vision_agent/agent/vision_agent_prompts.py CHANGED Viewed

@@ -27,13 +27,14 @@ Here is the current conversation so far:
 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
-3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
+3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
-5. **Output in JSON**: Respond in the following format in JSON:
+5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
-```json
-{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
-```
+<thinking>Your thoughts here...</thinking>
+<response>Your response to the user here...</response>
+<execute_python>Your code here...</execute_python>
+<let_user_respond>true/false for whether or not you want to let the user respond.</let_user_respond>
 """
@@ -45,7 +46,11 @@ OBSERVATION:
 Artifact dog.jpg loaded to /path/to/images/dog.jpg
 [End of artifacts]
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
+<execute_python>
+generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 [Artifact dog_detector.py]
@@ -56,14 +61,21 @@ OBSERVATION:
 4|    return dogs
 [End of artifact]
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n    print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
+<execute_python>
+from dog_detector import detect_dogs
+print(detect_dogs('/example/workspace/dog.jpg'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
+AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>The code detected two dogs, do the results look good to you?</response>
+<let_user_respond>true</let_user_respond>
 """
 EXAMPLES_CODE1_EXTRA = """
@@ -75,7 +87,11 @@ Artifact dog.jpg loaded to /path/to/images/dog.jpg
 Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
 [End of artifacts]
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
+<execute_python>
+edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 [Artifact dog_detector.py]
@@ -86,13 +102,20 @@ OBSERVATION:
 4|    return dogs
 [End of artifact]
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n    print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
+<execute_python>
+from dog_detector import detect_dogs
+print(detect_dogs('/path/to/images/dog.jpg'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
+AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>The code detected one dog, do these results look good to you?</response>
+<let_user_respond>true</let_user_respond>
 """
 EXAMPLES_CODE2 = """
@@ -103,12 +126,18 @@ OBSERVATION:
 Artifact image.jpg loaded to /path/to/images/image.jpg
 [End of artifacts]
-AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
+AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
+<execute_python>
+view_media_artifact(artifacts, 'image.jpg')
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 [Image image.jpg displayed]
-AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
+AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
+<response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
+<let_user_respond>true</let_user_respond>
 """
@@ -119,7 +148,9 @@ OBSERVATION:
 [Artifacts loaded]
 [End of artifacts]
-AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
+AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
+<response>Can you provide an image of workers with helmets?</response>
+<let_user_respond>true</let_user_respond>
 USER: Yes you can use workers.png
@@ -128,7 +159,11 @@ OBSERVATION:
 Artifact workers.png loaded to /path/to/images/workers.png
 [End of artifacts]
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
+<execute_python>
+generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?', media=['/paths/to/images/workers.png'])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 [Artifact code.py]
@@ -152,22 +187,37 @@ OBSERVATION:
 16|    return count
 [End of artifact]
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
+<execute_python>
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 ----- stdout -----
 2
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
+<let_user_respond>true</let_user_respond>
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
+<execute_python>
+object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
+<execute_python>
+use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 [Artifact code.py edits]
@@ -184,11 +234,18 @@ OBSERVATION:
      count = 0
 [End of edits]
-AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
+<execute_python>
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 OBSERVATION:
 ----- stdout -----
 3
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
+<let_user_respond>true</let_user_respond>
 """

vision_agent/tools/__init__.py CHANGED Viewed

@@ -37,7 +37,6 @@ from .tools import (
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
-    ixc25_temporal_localization,
     ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,

vision_agent/tools/meta_tools.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import base64
 import difflib
 import json
 import os
@@ -9,7 +8,6 @@ import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
-import numpy as np
 from IPython.display import display
 from redbaron import RedBaron  # type: ignore
@@ -22,8 +20,7 @@ from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
 from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
-from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
-from vision_agent.utils.video import frames_to_bytes
+from vision_agent.utils.image_utils import convert_to_b64
 CURRENT_FILE = None
 CURRENT_LINE = 0
@@ -393,19 +390,6 @@ def generate_vision_plan(
         redisplay_results(response.test_results)
     response.test_results = None
     artifacts[name] = response.model_dump_json()
-    media_names = extract_json(
-        AnthropicLMM()(  # type: ignore
-            f"""Extract any media file names from this output in the following JSON format:
-{{"media": ["image1.jpg", "image2.jpg"]}}
-{artifacts[name]}"""
-        )
-    )
-    if "media" in media_names and isinstance(media_names, dict):
-        for media in media_names["media"]:
-            if isinstance(media, str):
-                with open(media, "rb") as f:
-                    artifacts[media] = f.read()
     output_str = f"[Start Plan Context, saved at {name}]"
     for plan in response.plans.keys():
@@ -466,6 +450,12 @@ def generate_vision_code(
         test_multi_plan=test_multi_plan,
         custom_tool_names=custom_tool_names,
     )
+    # capture and save any files that were saved in the code to the artifacts
+    extract_and_save_files_to_artifacts(
+        artifacts, response["code"] + "\n" + response["test"]
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -546,6 +536,11 @@ def edit_vision_code(
         test_multi_plan=False,
         custom_tool_names=custom_tool_names,
     )
+    # capture and save any files that were saved in the code to the artifacts
+    extract_and_save_files_to_artifacts(
+        artifacts, response["code"] + "\n" + response["test"]
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -567,49 +562,6 @@ def edit_vision_code(
     return view_lines(code_lines, 0, total_lines, name, total_lines)
-def write_media_artifact(
-    artifacts: Artifacts,
-    name: str,
-    media: Union[str, np.ndarray, List[np.ndarray]],
-    fps: Optional[float] = None,
-) -> str:
-    """Writes a media file to the artifacts object.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to save the media to.
-        name (str): The name of the media artifact to save.
-        media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
-            be a file path, single image or list of frames for a video.
-        fps (Optional[float]): The frames per second if you are writing a video.
-    """
-    if isinstance(media, str):
-        with open(media, "rb") as f:
-            media_bytes = f.read()
-    elif isinstance(media, list):
-        media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
-    elif isinstance(media, np.ndarray):
-        media_bytes = numpy_to_bytes(media)
-    else:
-        print(f"[Invalid media type {type(media)}]")
-        return f"[Invalid media type {type(media)}]"
-    artifacts[name] = media_bytes
-    print(f"[Media {name} saved]")
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {
-                    "name": name,
-                    "action": "create",
-                    "content": base64.b64encode(media_bytes).decode("utf-8"),
-                    "contentType": "media_output",
-                }
-            )
-        },
-        raw=True,
-    )
-    return f"[Media {name} saved]"
 def list_artifacts(artifacts: Artifacts) -> str:
     """Lists all the artifacts that have been loaded into the artifacts object."""
     output_str = artifacts.show()
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
     return diff
+def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
+    """Extracts and saves files used in the code to the artifacts object.
+    Parameters:
+        artifacts (Artifacts): The artifacts object to save the files to.
+        code (str): The code to extract the files from.
+    """
+    try:
+        response = extract_json(
+            AnthropicLMM()(  # type: ignore
+                f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:
+```python
+{code}
+```
+Return the file paths in the following JSON format:
+{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
+            )
+        )
+    except json.JSONDecodeError:
+        return
+    text_file_ext = [
+        ".txt",
+        ".md",
+        "rtf",
+        ".html",
+        ".htm",
+        "xml",
+        ".json",
+        ".csv",
+        ".tsv",
+        ".yaml",
+        ".yml",
+        ".toml",
+        ".conf",
+        ".env" ".ini",
+        ".log",
+        ".py",
+        ".java",
+        ".js",
+        ".cpp",
+        ".c" ".sql",
+        ".sh",
+    ]
+    if "file_paths" in response and isinstance(response["file_paths"], list):
+        for file_path in response["file_paths"]:
+            read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
+            if Path(file_path).is_file():
+                with open(file_path, read_mode) as f:
+                    artifacts[Path(file_path).name] = f.read()
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -822,7 +829,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_plan,
         generate_vision_code,
         edit_vision_code,
-        write_media_artifact,
         view_media_artifact,
         object_detection_fine_tuning,
         use_object_detection_fine_tuning,

vision_agent/tools/tools.py CHANGED Viewed

@@ -181,6 +181,8 @@ def owl_v2_image(
     """
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     if fine_tune_id is not None:
         image_b64 = convert_to_b64(image)
@@ -413,6 +415,9 @@ def florence2_sam2_image(
             },
         ]
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return []
     if fine_tune_id is not None:
         image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
@@ -701,6 +706,8 @@ def countgd_counting(
         ]
     """
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     prompt = prompt.replace(", ", " .")
@@ -759,6 +766,8 @@ def countgd_example_based_counting(
         ]
     """
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     visual_prompts = [
@@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
         >>> ixc25_image_vqa('What is the cat doing?', image)
         'drinking milk'
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        raise ValueError(f"Image is empty, image shape: {image.shape}")
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
@@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
-def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
-    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
-    given a prompt that can be either an object or a phrase. It returns a list of
-    boolean values indicating whether the object or phrase is present in the
-    corresponding frame.
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-    Returns:
-        List[bool]: A list of boolean values indicating whether the object or phrase is
-            present in the corresponding frame.
-    Example
-    -------
-        >>> output = ixc25_temporal_localization('soccer goal', frames)
-        >>> print(output)
-        [False, False, False, True, True, True, False, False, False, False]
-        >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
-    """
-    # Serialize the frames and attach them to the request as the "video" file.
-    buffer_bytes = frames_to_bytes(frames)
-    files = [("video", buffer_bytes)]
-    payload = {
-        "prompt": prompt,
-        "chunk_length": 2,
-        "function_name": "ixc25_temporal_localization",
-    }
-    # presumably the service returns one 0/1 flag per chunk of frames -- TODO confirm
-    data: List[int] = send_inference_request(
-        payload,
-        "video-temporal-localization?model=internlm-xcomposer",
-        files=files,
-        v2=True,
-    )
-    # Number of frames each returned flag is stretched over.
-    # NOTE(review): an empty `data` raises ZeroDivisionError here, and a
-    # `data` much longer than `frames` rounds chunk_size down to 0.
-    chunk_size = round(len(frames) / len(data))
-    # Repeat every flag chunk_size times, then flatten into one bool per frame.
-    data_explode = [[elt] * chunk_size for elt in data]
-    data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
-    # Rounding can overshoot, so never return more entries than there are frames.
-    return data_bool[: len(frames)]
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text
@@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
         {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return {"labels": [], "scores": []}
     image_b64 = convert_to_b64(image)
     data = {
         "prompt": ",".join(classes),
@@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
         >>> vit_image_classification(image)
         {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return {"labels": [], "scores": []}
     image_b64 = convert_to_b64(image)
     data = {
@@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
         >>> vit_nsfw_classification(image)
         {"label": "normal", "scores": 0.68},
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        raise ValueError(f"Image is empty, image shape: {image.shape}")
     image_b64 = convert_to_b64(image)
     data = {
@@ -1180,6 +1157,8 @@ def florence2_phrase_grounding(
         ]
     """
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     image_b64 = convert_to_b64(image)
     if fine_tune_id is not None:
@@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
             },
         ]
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return []
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
@@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
                 [10, 11, 15, ..., 202, 202, 205],
                 [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        raise ValueError(f"Image is empty, image shape: {image.shape}")
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,

{vision_agent-0.2.164.dist-info → vision_agent-0.2.165.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.164
+Version: 0.2.165
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -41,7 +41,7 @@ Project-URL: repository, https://github.com/landing-ai/vision-agent
 Description-Content-Type: text/markdown
 <div align="center">
-    <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+    <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">
 # 🔍🤖 Vision Agent
 [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
@@ -387,6 +387,11 @@ result = agent.generate_code(conv)
 ## Additional Backends
+### E2B Code Execution
+If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
+set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
+run all the agent-generated code on the E2B backend.
 ### Anthropic
 `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
 Anthropic API key and set it in your environment variables:

{vision_agent-0.2.164.dist-info → vision_agent-0.2.165.dist-info}/RECORD RENAMED Viewed

@@ -2,12 +2,12 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
 vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
-vision_agent/agent/vision_agent.py,sha256=MUigVufYML2sYn9Hsngswa77XxlZBgCwQyBfK8tlsio,22551
+vision_agent/agent/vision_agent.py,sha256=cbY_V3f85_g8JmASa3m2LBX4G6xgsOKX1n7YtCf-C98,23676
 vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
 vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
 vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
-vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
+vision_agent/agent/vision_agent_prompts.py,sha256=_xAITNDKcS45tqhEax5i6vDQa4V39f9n55iRGk2R6RM,11218
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=50wwisjudmZn7_SEwigTiiDxQ0HXbSIhVI4O8kvE9Es,2365
-vision_agent/tools/meta_tools.py,sha256=MULJrZiTODOAN20TGceLdXcwoSGMNaE7bQbywySITnA,28458
+vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
+vision_agent/tools/meta_tools.py,sha256=ZF-7z3KT-Su08MvF5OhSm3Taqeu1Ek-EZjFhpN5w1uU,28257
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=hjv1mZdq8AHgsX_0zmya0i9yiEK6My5FO6mWbGjZvV4,78521
+vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.164.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.164.dist-info/METADATA,sha256=v8XdfcxjcFNC1sgOI5BBGklOoEtb6QxpHremU02onX0,17785
-vision_agent-0.2.164.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.164.dist-info/RECORD,,
+vision_agent-0.2.165.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.165.dist-info/METADATA,sha256=jvrYb4IyKp79Sqrhyul6pu0EtEZRewumAZCVR6qWZWg,18034
+vision_agent-0.2.165.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.165.dist-info/RECORD,,

{vision_agent-0.2.164.dist-info → vision_agent-0.2.165.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.164.dist-info → vision_agent-0.2.165.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.164__py3-none-any.whl → 0.2.165__py3-none-any.whl

vision-agent 0.2.164py3-none-any.whl → 0.2.165py3-none-any.whl