vision-agent 0.2.164__tar.gz → 0.2.165__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. {vision_agent-0.2.164 → vision_agent-0.2.165}/PKG-INFO +7 -2
  2. {vision_agent-0.2.164 → vision_agent-0.2.165}/README.md +6 -1
  3. {vision_agent-0.2.164 → vision_agent-0.2.165}/pyproject.toml +1 -1
  4. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/vision_agent.py +94 -64
  5. vision_agent-0.2.165/vision_agent/agent/vision_agent_prompts.py +251 -0
  6. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/tools/__init__.py +0 -1
  7. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/tools/meta_tools.py +67 -61
  8. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/tools/tools.py +25 -41
  9. vision_agent-0.2.164/vision_agent/agent/vision_agent_prompts.py +0 -194
  10. {vision_agent-0.2.164 → vision_agent-0.2.165}/LICENSE +0 -0
  11. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/__init__.py +0 -0
  12. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/__init__.py +0 -0
  13. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/agent.py +0 -0
  14. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/agent_utils.py +0 -0
  15. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/vision_agent_coder.py +0 -0
  16. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  17. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/vision_agent_planner.py +0 -0
  18. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  19. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/clients/__init__.py +0 -0
  20. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/clients/http.py +0 -0
  21. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/clients/landing_public_api.py +0 -0
  22. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/fonts/__init__.py +0 -0
  23. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  24. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/lmm/__init__.py +0 -0
  25. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/lmm/lmm.py +0 -0
  26. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/lmm/types.py +0 -0
  27. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/tools/prompts.py +0 -0
  28. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/tools/tool_utils.py +0 -0
  29. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/tools/tools_types.py +0 -0
  30. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/utils/__init__.py +0 -0
  31. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/utils/exceptions.py +0 -0
  32. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/utils/execute.py +0 -0
  33. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/utils/image_utils.py +0 -0
  34. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/utils/sim.py +0 -0
  35. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/utils/type_defs.py +0 -0
  36. {vision_agent-0.2.164 → vision_agent-0.2.165}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.164
3
+ Version: 0.2.165
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -41,7 +41,7 @@ Project-URL: repository, https://github.com/landing-ai/vision-agent
41
41
  Description-Content-Type: text/markdown
42
42
 
43
43
  <div align="center">
44
- <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
44
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">
45
45
 
46
46
  # 🔍🤖 Vision Agent
47
47
  [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
@@ -387,6 +387,11 @@ result = agent.generate_code(conv)
387
387
 
388
388
 
389
389
  ## Additional Backends
390
+ ### E2B Code Execution
391
+ If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
392
+ set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
393
+ run all the agent generated code on the E2B backend.
394
+
390
395
  ### Anthropic
391
396
  `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
392
397
  Anthropic API key and set it in your environment variables:
@@ -1,5 +1,5 @@
1
1
  <div align="center">
2
- <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
2
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">
3
3
 
4
4
  # 🔍🤖 Vision Agent
5
5
  [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
@@ -345,6 +345,11 @@ result = agent.generate_code(conv)
345
345
 
346
346
 
347
347
  ## Additional Backends
348
+ ### E2B Code Execution
349
+ If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
350
+ set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
351
+ run all the agent generated code on the E2B backend.
352
+
348
353
  ### Anthropic
349
354
  `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
350
355
  Anthropic API key and set it in your environment variables:
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.164"
7
+ version = "0.2.165"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -1,4 +1,5 @@
1
1
  import copy
2
+ import json
2
3
  import logging
3
4
  import os
4
5
  import tempfile
@@ -6,7 +7,7 @@ from pathlib import Path
6
7
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
7
8
 
8
9
  from vision_agent.agent import Agent
9
- from vision_agent.agent.agent_utils import extract_json
10
+ from vision_agent.agent.agent_utils import extract_json, extract_tag
10
11
  from vision_agent.agent.vision_agent_prompts import (
11
12
  EXAMPLES_CODE1,
12
13
  EXAMPLES_CODE2,
@@ -18,6 +19,7 @@ from vision_agent.tools.meta_tools import (
18
19
  META_TOOL_DOCSTRING,
19
20
  Artifacts,
20
21
  check_and_load_image,
22
+ extract_and_save_files_to_artifacts,
21
23
  use_extra_vision_agent_args,
22
24
  )
23
25
  from vision_agent.utils import CodeInterpreterFactory
@@ -35,7 +37,7 @@ class BoilerplateCode:
35
37
  pre_code = [
36
38
  "from typing import *",
37
39
  "from vision_agent.utils.execute import CodeInterpreter",
38
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
40
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
39
41
  "artifacts = Artifacts('{remote_path}')",
40
42
  "artifacts.load('{remote_path}')",
41
43
  ]
@@ -57,6 +59,32 @@ class BoilerplateCode:
57
59
  )
58
60
 
59
61
 
62
+ def format_agent_message(agent_message: str) -> str:
63
+ agent_message_json = extract_json(agent_message)
64
+ output = ""
65
+ if "thinking" in agent_message_json and agent_message_json["thinking"]:
66
+ output += "<thinking>" + agent_message_json["thinking"] + "</thinking>"
67
+ if "response" in agent_message_json and agent_message_json["response"]:
68
+ output += "<response>" + agent_message_json["response"] + "</response>"
69
+ if "execute_python" in agent_message_json and agent_message_json["execute_python"]:
70
+ output += (
71
+ "\n<execute_python>\n"
72
+ + agent_message_json["execute_python"]
73
+ + "\n</execute_python>\n"
74
+ )
75
+ if (
76
+ "let_user_respond" in agent_message_json
77
+ and agent_message_json["let_user_respond"]
78
+ ):
79
+ output += (
80
+ "<let_user_respond>"
81
+ + str(agent_message_json["let_user_respond"])
82
+ + "</let_user_respond>"
83
+ )
84
+
85
+ return output
86
+
87
+
60
88
  def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
61
89
  chat = copy.deepcopy(chat)
62
90
 
@@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
67
95
  elif chat_i["role"] == "observation":
68
96
  conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
69
97
  elif chat_i["role"] == "assistant":
70
- conversation += f"AGENT: {chat_i['content']}\n\n"
98
+ conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore
71
99
  else:
72
100
  raise ValueError(f"role {chat_i['role']} is not supported")
73
101
 
@@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
84
112
  and len(chat[-1]["media"]) > 0 # type: ignore
85
113
  ):
86
114
  message["media"] = chat[-1]["media"]
87
- return extract_json(orch([message], stream=False)) # type: ignore
115
+ conv_resp = cast(str, orch([message], stream=False))
116
+
117
+ let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
118
+ let_user_respond = (
119
+ "true" in let_user_respond_str.lower() if let_user_respond_str else False
120
+ )
121
+
122
+ return {
123
+ "thinking": extract_tag(conv_resp, "thinking"),
124
+ "response": extract_tag(conv_resp, "response"),
125
+ "execute_python": extract_tag(conv_resp, "execute_python"),
126
+ "let_user_respond": let_user_respond,
127
+ }
88
128
 
89
129
 
90
130
  def execute_code_action(
91
- code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
131
+ artifacts: Artifacts,
132
+ code: str,
133
+ code_interpreter: CodeInterpreter,
134
+ artifact_remote_path: str,
92
135
  ) -> Tuple[Execution, str]:
93
136
  result = code_interpreter.exec_isolation(
94
137
  BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
95
138
  )
139
+ extract_and_save_files_to_artifacts(artifacts, code)
96
140
 
97
141
  obs = str(result.logs)
98
142
  if result.error:
@@ -100,33 +144,8 @@ def execute_code_action(
100
144
  return result, obs
101
145
 
102
146
 
103
- def parse_execution(
104
- response: str,
105
- test_multi_plan: bool = True,
106
- custom_tool_names: Optional[List[str]] = None,
107
- ) -> Optional[str]:
108
- code = None
109
- remaining = response
110
- all_code = []
111
- while "<execute_python>" in remaining:
112
- code_i = remaining[
113
- remaining.find("<execute_python>") + len("<execute_python>") :
114
- ]
115
- code_i = code_i[: code_i.find("</execute_python>")]
116
- remaining = remaining[
117
- remaining.find("</execute_python>") + len("</execute_python>") :
118
- ]
119
- all_code.append(code_i)
120
-
121
- if len(all_code) > 0:
122
- code = "\n".join(all_code)
123
-
124
- if code is not None:
125
- code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
126
- return code
127
-
128
-
129
147
  def execute_user_code_action(
148
+ artifacts: Artifacts,
130
149
  last_user_message: Message,
131
150
  code_interpreter: CodeInterpreter,
132
151
  artifact_remote_path: str,
@@ -138,40 +157,43 @@ def execute_user_code_action(
138
157
  return user_result, user_obs
139
158
 
140
159
  last_user_content = cast(str, last_user_message.get("content", ""))
160
+ try:
161
+ user_code_action = json.loads(last_user_content).get("execute_python", None)
162
+ except json.JSONDecodeError:
163
+ return user_result, user_obs
141
164
 
142
- user_code_action = parse_execution(last_user_content, False)
143
165
  if user_code_action is not None:
166
+ user_code_action = use_extra_vision_agent_args(user_code_action, False)
144
167
  user_result, user_obs = execute_code_action(
145
- user_code_action, code_interpreter, artifact_remote_path
168
+ artifacts, user_code_action, code_interpreter, artifact_remote_path
146
169
  )
147
170
  if user_result.error:
148
171
  user_obs += f"\n{user_result.error}"
149
172
  return user_result, user_obs
150
173
 
151
174
 
152
- def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
175
+ def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
153
176
  response = copy.deepcopy(response)
154
- if "response" in response:
155
- resp_str = response["response"]
156
- if "<execute_python>" in resp_str:
157
- # only include descriptions for these, the rest will just have executing
158
- # code
159
- description_map = {
160
- "open_code_artifact": "Reading file.",
161
- "create_code_artifact": "Creating file.",
162
- "edit_code_artifact": "Editing file.",
163
- "generate_vision_code": "Generating vision code.",
164
- "edit_vision_code": "Editing vision code.",
165
- }
166
- description = ""
167
- for k, v in description_map.items():
168
- if k in resp_str:
169
- description += v + " "
170
- if description == "":
171
- description = "Executing code."
172
- resp_str = resp_str[resp_str.find("<execute_python>") :]
173
- resp_str = description + resp_str
174
- response["response"] = resp_str
177
+
178
+ if "execute_python" in response and response["execute_python"]:
179
+ # only include descriptions for these, the rest will just have executing
180
+ # code
181
+ description_map = {
182
+ "open_code_artifact": "Reading file.",
183
+ "create_code_artifact": "Creating file.",
184
+ "edit_code_artifact": "Editing file.",
185
+ "generate_vision_code": "Generating vision code.",
186
+ "edit_vision_code": "Editing vision code.",
187
+ }
188
+ description = ""
189
+ for k, v in description_map.items():
190
+ if k in response["execute_python"]:
191
+ description += v + " "
192
+ if description == "":
193
+ description = "Executing code."
194
+
195
+ response["response"] = description
196
+
175
197
  return response
176
198
 
177
199
 
@@ -369,7 +391,10 @@ class VisionAgent(Agent):
369
391
  self.streaming_message({"role": "observation", "content": artifacts_loaded})
370
392
 
371
393
  user_result, user_obs = execute_user_code_action(
372
- last_user_message, code_interpreter, str(remote_artifacts_path)
394
+ artifacts,
395
+ last_user_message,
396
+ code_interpreter,
397
+ str(remote_artifacts_path),
373
398
  )
374
399
  finished = user_result is not None and user_obs is not None
375
400
  if user_result is not None and user_obs is not None:
@@ -394,13 +419,13 @@ class VisionAgent(Agent):
394
419
  int_chat.append(
395
420
  {
396
421
  "role": "assistant",
397
- "content": str(add_step_descriptions(response)),
422
+ "content": json.dumps(add_step_descriptions(response)),
398
423
  }
399
424
  )
400
425
  orig_chat.append(
401
426
  {
402
427
  "role": "assistant",
403
- "content": str(add_step_descriptions(response)),
428
+ "content": json.dumps(add_step_descriptions(response)),
404
429
  }
405
430
  )
406
431
 
@@ -408,11 +433,13 @@ class VisionAgent(Agent):
408
433
  if last_response == response:
409
434
  response["let_user_respond"] = True
410
435
 
411
- finished = response["let_user_respond"]
436
+ finished = response.get("let_user_respond", False)
412
437
 
413
- code_action = parse_execution(
414
- response["response"], test_multi_plan, custom_tool_names
415
- )
438
+ code_action = response.get("execute_python", None)
439
+ if code_action is not None:
440
+ code_action = use_extra_vision_agent_args(
441
+ code_action, test_multi_plan, custom_tool_names
442
+ )
416
443
 
417
444
  if last_response == response:
418
445
  self.streaming_message(
@@ -431,14 +458,17 @@ class VisionAgent(Agent):
431
458
  self.streaming_message(
432
459
  {
433
460
  "role": "assistant",
434
- "content": response,
461
+ "content": json.dumps(response),
435
462
  "finished": finished and code_action is None,
436
463
  }
437
464
  )
438
465
 
439
466
  if code_action is not None:
440
467
  result, obs = execute_code_action(
441
- code_action, code_interpreter, str(remote_artifacts_path)
468
+ artifacts,
469
+ code_action,
470
+ code_interpreter,
471
+ str(remote_artifacts_path),
442
472
  )
443
473
 
444
474
  media_obs = check_and_load_image(code_action)
@@ -0,0 +1,251 @@
1
+ VA_CODE = """
2
+ **Role**: You are a helpful agent that assists users with writing code.
3
+
4
+ **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
5
+
6
+ <execute_python>
7
+ print("Hello World!")
8
+ </execute_python>
9
+
10
+ **Documentation**:
11
+ This is the documentation for the different actions you can take:
12
+
13
+ {documentation}
14
+
15
+ **Examples**:
16
+ Here is an example of how you can interact with a user and Actions to complete a task:
17
+ --- START EXAMPLES ---
18
+ {examples}
19
+ --- END EXAMPLES ---
20
+
21
+ **Conversation**:
22
+ Here is the current conversation so far:
23
+ --- START CONVERSATION ---
24
+ {conversation}
25
+ --- END CONVERSATION ---
26
+
27
+ **Instructions**:
28
+ 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
+ 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
30
+ 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
31
+ 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
32
+ 5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
33
+
34
+ <thinking>Your thoughts here...</thinking>
35
+ <response>Your response to the user here...</response>
36
+ <execute_python>Your code here...</execute_python>
37
+ <let_user_respond>true/false for whether or not to you want to let the user respond.</let_user_respond>
38
+ """
39
+
40
+
41
+ EXAMPLES_CODE1 = """
42
+ USER: Can you write code to detect the dogs in this image? Media name dog.jpg
43
+
44
+ OBSERVATION:
45
+ [Artifacts loaded]
46
+ Artifact dog.jpg loaded to /path/to/images/dog.jpg
47
+ [End of artifacts]
48
+
49
+ AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
50
+ <execute_python>
51
+ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
52
+ </execute_python>
53
+ <let_user_respond>false</let_user_respond>
54
+
55
+ OBSERVATION:
56
+ [Artifact dog_detector.py]
57
+ 0|from vision_agent.tools import load_image, owl_v2
58
+ 1|def detect_dogs(image_path: str):
59
+ 2| image = load_image(image_path)
60
+ 3| dogs = owl_v2("dog", image)
61
+ 4| return dogs
62
+ [End of artifact]
63
+
64
+ AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
65
+ <execute_python>
66
+ from dog_detector import detect_dogs
67
+ print(detect_dogs('/example/workspace/dog.jpg'))
68
+ </execute_python>
69
+ <let_user_respond>false</let_user_respond>
70
+
71
+ OBSERVATION:
72
+ ----- stdout -----
73
+ [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
74
+
75
+
76
+ AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
77
+ <response>The code detectd two dogs, do the results look good to you?</response>
78
+ <let_user_respond>true</let_user_respond>
79
+ """
80
+
81
+ EXAMPLES_CODE1_EXTRA = """
82
+ USER: The the image only has one dog, can you fix this?
83
+
84
+ OBSERVATION:
85
+ [Artifacts loaded]
86
+ Artifact dog.jpg loaded to /path/to/images/dog.jpg
87
+ Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
88
+ [End of artifacts]
89
+
90
+ AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
91
+ <execute_python>
92
+ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
93
+ </execute_python>
94
+ <let_user_respond>false</let_user_respond>
95
+
96
+ OBSERVATION:
97
+ [Artifact dog_detector.py]
98
+ 0|from vision_agent.tools import load_image, owl_v2
99
+ 1|def detect_dogs(image_path: str):
100
+ 2| image = load_image(image_path)
101
+ 3| dogs = owl_v2("dog", image, threshold=0.24)
102
+ 4| return dogs
103
+ [End of artifact]
104
+
105
+ AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
106
+ <execute_python>
107
+ from dog_detector import detect_dogs
108
+ print(detect_dogs('/path/to/images/dog.jpg'))
109
+ </execute_python>
110
+ <let_user_respond>false</let_user_respond>
111
+
112
+ OBSERVATION:
113
+ ----- stdout -----
114
+ [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
115
+
116
+ AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
117
+ <response>The code detected one dog, do these results look good to you?</response>
118
+ <let_user_respond>true</let_user_respond>
119
+ """
120
+
121
+ EXAMPLES_CODE2 = """
122
+ USER: Can you describe this image?
123
+
124
+ OBSERVATION:
125
+ [Artifacts loaded]
126
+ Artifact image.jpg loaded to /path/to/images/image.jpg
127
+ [End of artifacts]
128
+
129
+ AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
130
+ <execute_python>
131
+ view_media_artifacts('image.jpg')
132
+ </execute_python>
133
+ <let_user_respond>false</let_user_respond>
134
+
135
+ OBSERVATION:
136
+ [Image image.jpg displayed]
137
+
138
+ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
139
+ <response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
140
+ <let_user_respond>true</let_user_respond>
141
+ """
142
+
143
+
144
+ EXAMPLES_CODE3 = """
145
+ USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
146
+
147
+ OBSERVATION:
148
+ [Artifacts loaded]
149
+ [End of artifacts]
150
+
151
+ AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
152
+ <response>Can you provide an image of workers with helmets?</response>
153
+ <let_user_respond>true</let_user_respond>
154
+
155
+ USER: Yes you can use workers.png
156
+
157
+ OBSERVATION:
158
+ [Artifacts loaded]
159
+ Artifact workers.png loaded to /path/to/images/workers.png
160
+ [End of artifacts]
161
+
162
+ AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
163
+ <execute_python>
164
+ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])
165
+ </execute_python>
166
+ <let_user_respond>false</let_user_respond>
167
+
168
+ OBSERVATION:
169
+ [Artifact code.py]
170
+ 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
171
+ 1|def count_workers_with_helmets(image_path: str, output_path: str):
172
+ 2| image = load_image(image_path)
173
+ 3| detections = florence2_phrase_grounding("worker, helmet", image)
174
+ 4| workers = [d for d in detections if d['label'] == 'worker']
175
+ 5| helmets = [d for d in detections if d['label'] == 'helmet']
176
+ 6| count = 0
177
+ 6| for worker in workers:
178
+ 7| person_has_helmet = False
179
+ 8| for helmet in helmets:
180
+ 9| if closest_box_distance(worker['box'], helmet['box']) < 0.01:
181
+ 10| person_has_helmet = True
182
+ 11| break
183
+ 11| if person_has_helmet:
184
+ 13| count += 1
185
+ 14| overlay_bounding_boxes(image, detections)
186
+ 15| save_image(output_path, image)
187
+ 16| return count
188
+ [End of artifact]
189
+
190
+ AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
191
+ <execute_python>
192
+ from code import count_workers_with_helmets
193
+ print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
194
+ </execute_python>
195
+ <let_user_respond>false</let_user_respond>
196
+
197
+ OBSERVATION:
198
+ ----- stdout -----
199
+ 2
200
+
201
+ AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
202
+ <response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
203
+ <let_user_respond>true</let_user_respond>
204
+
205
+ USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
206
+
207
+ AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
208
+ <execute_python>
209
+ object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
210
+ </execute_python>
211
+ <let_user_respond>false</let_user_respond>
212
+
213
+ OBSERVATION:
214
+ [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
215
+
216
+ AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
217
+ <execute_python>
218
+ use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
219
+ </execute_python>
220
+ <let_user_respond>false</let_user_respond>
221
+
222
+ OBSERVATION:
223
+ [Artifact code.py edits]
224
+ ---
225
+ +++
226
+ @@ -1,7 +1,7 @@
227
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
228
+ def count_workers_with_helmets(image_path: str, output_path: str):
229
+ image = load_image(image_path)
230
+ - detections = florence2_phrase_grounding("worker, helmet", image)
231
+ + detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
232
+ workers = [d for d in detections if d['label'] == 'worker']
233
+ helmets = [d for d in detections if d['label'] == 'helmet']
234
+ count = 0
235
+ [End of edits]
236
+
237
+ AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
238
+ <execute_python>
239
+ from code import count_workers_with_helmets
240
+ print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
241
+ </execute_python>
242
+ <let_user_respond>false</let_user_respond>
243
+
244
+ OBSERVATION:
245
+ ----- stdout -----
246
+ 3
247
+
248
+ AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
249
+ <response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
250
+ <let_user_respond>true</let_user_respond>
251
+ """
@@ -37,7 +37,6 @@ from .tools import (
37
37
  grounding_dino,
38
38
  grounding_sam,
39
39
  ixc25_image_vqa,
40
- ixc25_temporal_localization,
41
40
  ixc25_video_vqa,
42
41
  load_image,
43
42
  loca_visual_prompt_counting,
@@ -1,4 +1,3 @@
1
- import base64
2
1
  import difflib
3
2
  import json
4
3
  import os
@@ -9,7 +8,6 @@ import tempfile
9
8
  from pathlib import Path
10
9
  from typing import Any, Dict, List, Optional, Union
11
10
 
12
- import numpy as np
13
11
  from IPython.display import display
14
12
  from redbaron import RedBaron # type: ignore
15
13
 
@@ -22,8 +20,7 @@ from vision_agent.tools.tool_utils import get_tool_documentation
22
20
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
23
21
  from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
24
22
  from vision_agent.utils.execute import Execution, MimeType
25
- from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
26
- from vision_agent.utils.video import frames_to_bytes
23
+ from vision_agent.utils.image_utils import convert_to_b64
27
24
 
28
25
  CURRENT_FILE = None
29
26
  CURRENT_LINE = 0
@@ -393,19 +390,6 @@ def generate_vision_plan(
393
390
  redisplay_results(response.test_results)
394
391
  response.test_results = None
395
392
  artifacts[name] = response.model_dump_json()
396
- media_names = extract_json(
397
- AnthropicLMM()( # type: ignore
398
- f"""Extract any media file names from this output in the following JSON format:
399
- {{"media": ["image1.jpg", "image2.jpg"]}}
400
-
401
- {artifacts[name]}"""
402
- )
403
- )
404
- if "media" in media_names and isinstance(media_names, dict):
405
- for media in media_names["media"]:
406
- if isinstance(media, str):
407
- with open(media, "rb") as f:
408
- artifacts[media] = f.read()
409
393
 
410
394
  output_str = f"[Start Plan Context, saved at {name}]"
411
395
  for plan in response.plans.keys():
@@ -466,6 +450,12 @@ def generate_vision_code(
466
450
  test_multi_plan=test_multi_plan,
467
451
  custom_tool_names=custom_tool_names,
468
452
  )
453
+
454
+ # capture and save any files that were saved in the code to the artifacts
455
+ extract_and_save_files_to_artifacts(
456
+ artifacts, response["code"] + "\n" + response["test"]
457
+ )
458
+
469
459
  redisplay_results(response["test_result"])
470
460
  code = response["code"]
471
461
  artifacts[name] = code
@@ -546,6 +536,11 @@ def edit_vision_code(
546
536
  test_multi_plan=False,
547
537
  custom_tool_names=custom_tool_names,
548
538
  )
539
+ # capture and save any files that were saved in the code to the artifacts
540
+ extract_and_save_files_to_artifacts(
541
+ artifacts, response["code"] + "\n" + response["test"]
542
+ )
543
+
549
544
  redisplay_results(response["test_result"])
550
545
  code = response["code"]
551
546
  artifacts[name] = code
@@ -567,49 +562,6 @@ def edit_vision_code(
567
562
  return view_lines(code_lines, 0, total_lines, name, total_lines)
568
563
 
569
564
 
570
- def write_media_artifact(
571
- artifacts: Artifacts,
572
- name: str,
573
- media: Union[str, np.ndarray, List[np.ndarray]],
574
- fps: Optional[float] = None,
575
- ) -> str:
576
- """Writes a media file to the artifacts object.
577
-
578
- Parameters:
579
- artifacts (Artifacts): The artifacts object to save the media to.
580
- name (str): The name of the media artifact to save.
581
- media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
582
- be a file path, single image or list of frames for a video.
583
- fps (Optional[float]): The frames per second if you are writing a video.
584
- """
585
- if isinstance(media, str):
586
- with open(media, "rb") as f:
587
- media_bytes = f.read()
588
- elif isinstance(media, list):
589
- media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
590
- elif isinstance(media, np.ndarray):
591
- media_bytes = numpy_to_bytes(media)
592
- else:
593
- print(f"[Invalid media type {type(media)}]")
594
- return f"[Invalid media type {type(media)}]"
595
- artifacts[name] = media_bytes
596
- print(f"[Media {name} saved]")
597
- display(
598
- {
599
- MimeType.APPLICATION_ARTIFACT: json.dumps(
600
- {
601
- "name": name,
602
- "action": "create",
603
- "content": base64.b64encode(media_bytes).decode("utf-8"),
604
- "contentType": "media_output",
605
- }
606
- )
607
- },
608
- raw=True,
609
- )
610
- return f"[Media {name} saved]"
611
-
612
-
613
565
  def list_artifacts(artifacts: Artifacts) -> str:
614
566
  """Lists all the artifacts that have been loaded into the artifacts object."""
615
567
  output_str = artifacts.show()
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
813
765
  return diff
814
766
 
815
767
 
768
+ def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
769
+ """Extracts and saves files used in the code to the artifacts object.
770
+
771
+ Parameters:
772
+ artifacts (Artifacts): The artifacts object to save the files to.
773
+ code (str): The code to extract the files from.
774
+ """
775
+ try:
776
+ response = extract_json(
777
+ AnthropicLMM()( # type: ignore
778
+ f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:
779
+
780
+ ```python
781
+ {code}
782
+ ```
783
+
784
+ Return the file paths in the following JSON format:
785
+ {{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
786
+ )
787
+ )
788
+ except json.JSONDecodeError:
789
+ return
790
+
791
+ text_file_ext = [
792
+ ".txt",
793
+ ".md",
794
+ "rtf",
795
+ ".html",
796
+ ".htm",
797
+ "xml",
798
+ ".json",
799
+ ".csv",
800
+ ".tsv",
801
+ ".yaml",
802
+ ".yml",
803
+ ".toml",
804
+ ".conf",
805
+ ".env" ".ini",
806
+ ".log",
807
+ ".py",
808
+ ".java",
809
+ ".js",
810
+ ".cpp",
811
+ ".c" ".sql",
812
+ ".sh",
813
+ ]
814
+
815
+ if "file_paths" in response and isinstance(response["file_paths"], list):
816
+ for file_path in response["file_paths"]:
817
+ read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
818
+ if Path(file_path).is_file():
819
+ with open(file_path, read_mode) as f:
820
+ artifacts[Path(file_path).name] = f.read()
821
+
822
+
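The text/binary split in `extract_and_save_files_to_artifacts` hinges on exact `Path.suffix` matches, which is why stray entries in `text_file_ext` matter: Python concatenates adjacent string literals, so `".env" ".ini"` is the single string `".env.ini"` and neither extension ever matches. A minimal sketch of the check, assuming the illustrative names `TEXT_FILE_EXT` and `read_mode_for` (they are not part of the package):

```python
from pathlib import Path

# Known text-file extensions; Path.suffix includes the leading dot,
# so every entry here must start with "." to ever match.
TEXT_FILE_EXT = {".txt", ".md", ".json", ".csv", ".env", ".ini"}

def read_mode_for(file_path: str) -> str:
    """Return "r" for known text files, "rb" for everything else."""
    return "r" if Path(file_path).suffix in TEXT_FILE_EXT else "rb"

# Pitfall the diff fixes: adjacent string literals concatenate,
# so a list entry written as ".env" ".ini" is really ".env.ini".
assert ".env" ".ini" == ".env.ini"
```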
816
823
  META_TOOL_DOCSTRING = get_tool_documentation(
817
824
  [
818
825
  get_tool_descriptions,
@@ -822,7 +829,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
822
829
  generate_vision_plan,
823
830
  generate_vision_code,
824
831
  edit_vision_code,
825
- write_media_artifact,
826
832
  view_media_artifact,
827
833
  object_detection_fine_tuning,
828
834
  use_object_detection_fine_tuning,
@@ -181,6 +181,8 @@ def owl_v2_image(
181
181
  """
182
182
 
183
183
  image_size = image.shape[:2]
184
+ if image_size[0] < 1 or image_size[1] < 1:
185
+ return []
184
186
 
185
187
  if fine_tune_id is not None:
186
188
  image_b64 = convert_to_b64(image)
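This release adds the same empty-image guard to many tools: a zero-height or zero-width array is rejected before any network call. A minimal sketch of the pattern, with an illustrative `guarded_detect` function and stub result that are not part of the package:

```python
import numpy as np

def guarded_detect(image: np.ndarray) -> list:
    """Short-circuit on degenerate images before doing any real work."""
    if image.shape[0] < 1 or image.shape[1] < 1:
        return []
    # ... real inference would happen here; a stub result stands in.
    return [{"label": "stub", "score": 1.0}]
```

Detection- and classification-style tools return an empty result this way, while tools whose return type has no natural empty value (`ixc25_image_vqa`, `vit_nsfw_classification`, `depth_anything_v2`) raise a `ValueError` instead.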
@@ -413,6 +415,9 @@ def florence2_sam2_image(
413
415
  },
414
416
  ]
415
417
  """
418
+ if image.shape[0] < 1 or image.shape[1] < 1:
419
+ return []
420
+
416
421
  if fine_tune_id is not None:
417
422
  image_b64 = convert_to_b64(image)
418
423
  landing_api = LandingPublicAPI()
@@ -701,6 +706,8 @@ def countgd_counting(
701
706
  ]
702
707
  """
703
708
  image_size = image.shape[:2]
709
+ if image_size[0] < 1 or image_size[1] < 1:
710
+ return []
704
711
  buffer_bytes = numpy_to_bytes(image)
705
712
  files = [("image", buffer_bytes)]
706
713
  prompt = prompt.replace(", ", " .")
@@ -759,6 +766,8 @@ def countgd_example_based_counting(
759
766
  ]
760
767
  """
761
768
  image_size = image.shape[:2]
769
+ if image_size[0] < 1 or image_size[1] < 1:
770
+ return []
762
771
  buffer_bytes = numpy_to_bytes(image)
763
772
  files = [("image", buffer_bytes)]
764
773
  visual_prompts = [
@@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
828
837
  >>> ixc25_image_vqa('What is the cat doing?', image)
829
838
  'drinking milk'
830
839
  """
840
+ if image.shape[0] < 1 or image.shape[1] < 1:
841
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
831
842
 
832
843
  buffer_bytes = numpy_to_bytes(image)
833
844
  files = [("image", buffer_bytes)]
@@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
871
882
  return cast(str, data["answer"])
872
883
 
873
884
 
874
- def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
875
- """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
876
- given a prompt that can be other an object or a phrase. It returns a list of
877
- boolean values indicating whether the object or phrase is present in the
878
- corresponding frame.
879
-
880
- Parameters:
881
- prompt (str): The question about the video
882
- frames (List[np.ndarray]): The reference frames used for the question
883
-
884
- Returns:
885
- List[bool]: A list of boolean values indicating whether the object or phrase is
886
- present in the corresponding frame.
887
-
888
- Example
889
- -------
890
- >>> output = ixc25_temporal_localization('soccer goal', frames)
891
- >>> print(output)
892
- [False, False, False, True, True, True, False, False, False, False]
893
- >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
894
- """
895
-
896
- buffer_bytes = frames_to_bytes(frames)
897
- files = [("video", buffer_bytes)]
898
- payload = {
899
- "prompt": prompt,
900
- "chunk_length": 2,
901
- "function_name": "ixc25_temporal_localization",
902
- }
903
- data: List[int] = send_inference_request(
904
- payload,
905
- "video-temporal-localization?model=internlm-xcomposer",
906
- files=files,
907
- v2=True,
908
- )
909
- chunk_size = round(len(frames) / len(data))
910
- data_explode = [[elt] * chunk_size for elt in data]
911
- data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
912
- return data_bool[: len(frames)]
913
-
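For reference, the chunk-to-frame expansion that the removed `ixc25_temporal_localization` performed can be sketched as follows (the `expand_chunks_to_frames` helper name is illustrative): the backend returns one 0/1 value per chunk, each value is repeated `chunk_size` times to cover the frames, and the flattened list is truncated back to the frame count.

```python
from typing import List

def expand_chunks_to_frames(data: List[int], num_frames: int) -> List[bool]:
    """Expand per-chunk 0/1 predictions into per-frame booleans."""
    chunk_size = round(num_frames / len(data))
    # Repeat each chunk's value chunk_size times, then flatten.
    data_explode = [[bool(elt)] * chunk_size for elt in data]
    flat = [b for sublist in data_explode for b in sublist]
    # Truncate in case rounding over-produced frames.
    return flat[:num_frames]
```

Note that when `num_frames / len(data)` rounds down, the result can be shorter than `num_frames`; the original code truncated but did not pad.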
914
-
915
885
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
916
886
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
917
887
  including regular images or images of documents or presentations. It returns text
@@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
1024
994
  {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
1025
995
  """
1026
996
 
997
+ if image.shape[0] < 1 or image.shape[1] < 1:
998
+ return {"labels": [], "scores": []}
999
+
1027
1000
  image_b64 = convert_to_b64(image)
1028
1001
  data = {
1029
1002
  "prompt": ",".join(classes),
@@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1052
1025
  >>> vit_image_classification(image)
1053
1026
  {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
1054
1027
  """
1028
+ if image.shape[0] < 1 or image.shape[1] < 1:
1029
+ return {"labels": [], "scores": []}
1055
1030
 
1056
1031
  image_b64 = convert_to_b64(image)
1057
1032
  data = {
@@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
1080
1055
  >>> vit_nsfw_classification(image)
1081
1056
  {"label": "normal", "scores": 0.68},
1082
1057
  """
1058
+ if image.shape[0] < 1 or image.shape[1] < 1:
1059
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
1083
1060
 
1084
1061
  image_b64 = convert_to_b64(image)
1085
1062
  data = {
@@ -1180,6 +1157,8 @@ def florence2_phrase_grounding(
1180
1157
  ]
1181
1158
  """
1182
1159
  image_size = image.shape[:2]
1160
+ if image_size[0] < 1 or image_size[1] < 1:
1161
+ return []
1183
1162
  image_b64 = convert_to_b64(image)
1184
1163
 
1185
1164
  if fine_tune_id is not None:
@@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
1399
1378
  },
1400
1379
  ]
1401
1380
  """
1381
+ if image.shape[0] < 1 or image.shape[1] < 1:
1382
+ return []
1402
1383
  image_b64 = convert_to_b64(image)
1403
1384
  data = {
1404
1385
  "image": image_b64,
@@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
1442
1423
  [10, 11, 15, ..., 202, 202, 205],
1443
1424
  [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1444
1425
  """
1426
+ if image.shape[0] < 1 or image.shape[1] < 1:
1427
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
1428
+
1445
1429
  image_b64 = convert_to_b64(image)
1446
1430
  data = {
1447
1431
  "image": image_b64,
@@ -1,194 +0,0 @@
1
- VA_CODE = """
2
- **Role**: You are a helpful agent that assists users with writing code.
3
-
4
- **Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved every time you execute Python code.
5
-
6
- <execute_python>
7
- print("Hello World!")
8
- </execute_python>
9
-
10
- **Documentation**:
11
- This is the documentation for the different actions you can take:
12
-
13
- {documentation}
14
-
15
- **Examples**:
16
- Here is an example of how you can interact with a user and Actions to complete a task:
17
- --- START EXAMPLES ---
18
- {examples}
19
- --- END EXAMPLES ---
20
-
21
- **Conversation**:
22
- Here is the current conversation so far:
23
- --- START CONVERSATION ---
24
- {conversation}
25
- --- END CONVERSATION ---
26
-
27
- **Instructions**:
28
- 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
- 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
30
- 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
31
- 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
32
- 5. **Output in JSON**: Respond in the following format in JSON:
33
-
34
- ```json
35
- {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
36
- ```
37
- """
38
-
39
-
40
- EXAMPLES_CODE1 = """
41
- USER: Can you write code to detect the dogs in this image? Media name dog.jpg
42
-
43
- OBSERVATION:
44
- [Artifacts loaded]
45
- Artifact dog.jpg loaded to /path/to/images/dog.jpg
46
- [End of artifacts]
47
-
48
- AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
49
-
50
- OBSERVATION:
51
- [Artifact dog_detector.py]
52
- 0|from vision_agent.tools import load_image, owl_v2
53
- 1|def detect_dogs(image_path: str):
54
- 2| image = load_image(image_path)
55
- 3| dogs = owl_v2("dog", image)
56
- 4| return dogs
57
- [End of artifact]
58
-
59
- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
60
-
61
- OBSERVATION:
62
- ----- stdout -----
63
- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
64
-
65
-
66
- AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
67
- """
68
-
69
- EXAMPLES_CODE1_EXTRA = """
70
- USER: The image only has one dog, can you fix this?
71
-
72
- OBSERVATION:
73
- [Artifacts loaded]
74
- Artifact dog.jpg loaded to /path/to/images/dog.jpg
75
- Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
76
- [End of artifacts]
77
-
78
- AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
79
-
80
- OBSERVATION:
81
- [Artifact dog_detector.py]
82
- 0|from vision_agent.tools import load_image, owl_v2
83
- 1|def detect_dogs(image_path: str):
84
- 2| image = load_image(image_path)
85
- 3| dogs = owl_v2("dog", image, threshold=0.24)
86
- 4| return dogs
87
- [End of artifact]
88
-
89
- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
90
-
91
- OBSERVATION:
92
- ----- stdout -----
93
- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
94
-
95
- AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
96
- """
97
-
98
- EXAMPLES_CODE2 = """
99
- USER: Can you describe this image?
100
-
101
- OBSERVATION:
102
- [Artifacts loaded]
103
- Artifact image.jpg loaded to /path/to/images/image.jpg
104
- [End of artifacts]
105
-
106
- AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
107
-
108
- OBSERVATION:
109
- [Image image.jpg displayed]
110
-
111
- AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
112
- """
113
-
114
-
115
- EXAMPLES_CODE3 = """
116
- USER: Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?
117
-
118
- OBSERVATION:
119
- [Artifacts loaded]
120
- [End of artifacts]
121
-
122
- AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
123
-
124
- USER: Yes you can use workers.png
125
-
126
- OBSERVATION:
127
- [Artifacts loaded]
128
- Artifact workers.png loaded to /path/to/images/workers.png
129
- [End of artifacts]
130
-
131
- AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
132
-
133
- OBSERVATION:
134
- [Artifact code.py]
135
- 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
136
- 1|def count_workers_with_helmets(image_path: str, output_path: str):
137
- 2| image = load_image(image_path)
138
- 3| detections = florence2_phrase_grounding("worker, helmet", image)
139
- 4| workers = [d for d in detections if d['label'] == 'worker']
140
- 5| helmets = [d for d in detections if d['label'] == 'helmet']
141
- 6| count = 0
142
- 7|    for worker in workers:
143
- 8|        person_has_helmet = False
144
- 9|        for helmet in helmets:
145
- 10|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
146
- 11|                person_has_helmet = True
147
- 12|                break
148
- 13|        if person_has_helmet:
149
- 14|            count += 1
150
- 15|    overlay_bounding_boxes(image, detections)
151
- 16|    save_image(output_path, image)
152
- 17|    return count
153
- [End of artifact]
154
-
155
- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
156
-
157
- OBSERVATION:
158
- ----- stdout -----
159
- 2
160
-
161
- AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
162
-
163
- USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
164
-
165
- AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
166
-
167
- OBSERVATION:
168
- [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
169
-
170
- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
171
-
172
- OBSERVATION:
173
- [Artifact code.py edits]
174
- ---
175
- +++
176
- @@ -1,7 +1,7 @@
177
- from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
178
- def count_workers_with_helmets(image_path: str, output_path: str):
179
- image = load_image(image_path)
180
- - detections = florence2_phrase_grounding("worker, helmet", image)
181
- + detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
182
- workers = [d for d in detections if d['label'] == 'worker']
183
- helmets = [d for d in detections if d['label'] == 'helmet']
184
- count = 0
185
- [End of edits]
186
-
187
- AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
188
-
189
- OBSERVATION:
190
- ----- stdout -----
191
- 3
192
-
193
- AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
194
- """
File without changes