vision-agent 0.2.90__py3-none-any.whl → 0.2.92__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent.py +1 -1
- vision_agent/agent/agent_utils.py +43 -0
- vision_agent/agent/vision_agent.py +116 -824
- vision_agent/agent/vision_agent_coder.py +897 -0
- vision_agent/agent/vision_agent_coder_prompts.py +328 -0
- vision_agent/agent/vision_agent_prompts.py +89 -302
- vision_agent/lmm/__init__.py +2 -1
- vision_agent/lmm/lmm.py +3 -5
- vision_agent/lmm/types.py +5 -0
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/meta_tools.py +402 -0
- vision_agent/tools/tool_utils.py +48 -2
- vision_agent/tools/tools.py +7 -49
- vision_agent/utils/execute.py +52 -76
- vision_agent/utils/image_utils.py +1 -1
- vision_agent/utils/type_defs.py +1 -1
- {vision_agent-0.2.90.dist-info → vision_agent-0.2.92.dist-info}/METADATA +42 -12
- vision_agent-0.2.92.dist-info/RECORD +29 -0
- vision_agent-0.2.90.dist-info/RECORD +0 -24
- {vision_agent-0.2.90.dist-info → vision_agent-0.2.92.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.90.dist-info → vision_agent-0.2.92.dist-info}/WHEEL +0 -0
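The headline change is a split of the agent module: vision_agent.py drops roughly 800 lines while new vision_agent_coder.py (+897), vision_agent_coder_prompts.py (+328), and meta_tools.py (+402) modules appear. A hedged sketch of what the resulting import surface presumably looks like; the class names are inferred from the module split and the small agent/__init__.py change (+2 -1), not confirmed by this diff:

```python
# Presumed 0.2.92 layout (assumption, verify against the release):
# the code-writing workflow moves into VisionAgentCoder, while VisionAgent
# becomes the conversational agent driven by the new VA_CODE prompt below.
from vision_agent.agent import VisionAgent, VisionAgentCoder

coder = VisionAgentCoder()  # plans, generates, and tests vision code
agent = VisionAgent()       # chats with the user and calls the coder via meta-tools
```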
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -1,327 +1,114 @@
-
-
-{user_request}
-"""
-
-FULL_TASK = """
-## User Request
-{user_request}
-
-## Subtasks
-{subtasks}
-"""
-
-FEEDBACK = """
-## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again.
-
-{feedback}
-"""
-
-
-PLAN = """
-**Context**:
-{context}
-
-**Tools Available**:
-{tool_desc}
-
-**Previous Feedback**:
-{feedback}
-
-**Instructions**:
-1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or tool.
-
-Output a list of jsons in the following format
-
-```json
-{{
-"plan1":
-[
-{{
-"instructions": str # what you should do in this task associated with a tool
-}}
-],
-"plan2": ...,
-"plan3": ...
-}}
-```
-"""
-
-
-TEST_PLANS = """
-**Role**: You are a software programmer responsible for testing different tools.
-
-**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
-
-**Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
-
-{docstring}
-
-**Plans**:
-{plans}
-
-{previous_attempts}
-
-**Instructions**:
-1. Write a program to load the media and call each tool and save it's output.
-2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove any array types from the printed dictionary.
-3. Print this final dictionary.
-
-**Example**:
-plan1:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image.
-plan2:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image.
-- Count the number of detected objects labeled as 'person'.
-plan3:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people.
-
-```python
-from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
-image = load_image("image.jpg")
-owl_v2_out = owl_v2("person", image)
-
-gsam_out = grounding_sam("person", image)
-gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
-
-loca_out = loca_zero_shot_counting(image)
-loca_out = loca_out["count"]
-
-final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "loca_zero_shot_counting": loca_out}}
-print(final_out)
-```
-"""
-
-
-PREVIOUS_FAILED = """
-**Previous Failed Attempts**:
-You previously ran this code:
-```python
-{code}
-```
-
-But got the following error or no stdout:
-{error}
-"""
-
-
-PICK_PLAN = """
-**Role**: You are a software programmer.
-
-**Task**: Your responsibility is to pick the best plan from the three plans provided.
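The planning and testing prompts removed above (PLAN, TEST_PLANS, PREVIOUS_FAILED, PICK_PLAN) appear to migrate into the new vision_agent_coder_prompts.py (+328 lines) rather than disappear. As a reminder of the contract they define, the PLAN prompt asks the model for three alternative plans as JSON; a minimal, hypothetical sketch of consuming that output (not code from the package):

```python
import json

# Example of the three-plan JSON requested by the PLAN prompt; in a real
# response this payload arrives inside a fenced ```json block.
sample_plans = (
    '{"plan1": [{"instructions": "Load image.jpg and count people with owl_v2"}],'
    ' "plan2": [{"instructions": "Load image.jpg and count people with grounding_sam"}],'
    ' "plan3": [{"instructions": "Count people with loca_zero_shot_counting"}]}'
)

plans = json.loads(sample_plans)
for name, steps in plans.items():
    print(name, [step["instructions"] for step in steps])
```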
+VA_CODE = """
+**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.

-**
-{context}
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.

-
-
-
-**Tool Output**:
-{tool_output}
-
-**Instructions**:
-1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Output a JSON object with the following format:
-{{
-"thoughts": str # your thought process for choosing the best plan
-"best_plan": str # the best plan you have chosen
-}}
-"""
-
-CODE = """
-**Role**: You are a software programmer.
-
-**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code, a test will be run after the code is submitted.
-
-**Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
-
-{docstring}
-
-**Input Code Snippet**:
-```python
-# Your code here
-```
-
-**User Instructions**:
-{question}
-
-**Tool Output**:
-{tool_output}
-
-**Previous Feedback**:
-{feedback}
-
-**Instructions**:
-1. **Understand and Clarify**: Make sure you understand the task.
-2. **Algorithm/Method Selection**: Decide on the most efficient way.
-3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
-4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
-"""
-
-TEST = """
-**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible.
+<execute_python>
+print("Hello World!")
+</execute_python>

 **Documentation**:
-This is the documentation for the
-
-{docstring}
+This is the documentation for the different actions you can take:

-
-{question}
-
-**Input Code Snippet**:
-```python
-### Please decided how would you want to generate test cases. Based on incomplete code or completed version.
-{code}
-```
-
-**Instructions**:
-1. Verify the fundamental functionality under normal conditions.
-2. Ensure each test case is well-documented with comments explaining the scenario it covers.
-3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
-4. DO NOT mock any functions, you must test their functionality as is.
-
-You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
-```python
-# You can run assertions to ensure the function is working as expected
-assert function(input) == expected_output, "Test case description"
-
-# You can simply call the function to ensure it runs
-function(input)
-
-# Or you can visualize the output
-output = function(input)
-visualize(output)
-```
+{documentation}

 **Examples**:
-
-
-
-
-
-"cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...]
-}}
-\"""
-```
-
-## Completion 1:
-```python
-# We can test to ensure the output has the correct structure but we cannot test the
-# content of the output without knowing the image. We can test on "image.jpg" because
-# it is provided by the user so we know it exists.
-output = detect_cats_and_dogs("image.jpg")
-assert "cats" in output, "The output should contain 'cats'
-assert "dogs" in output, "The output should contain 'dogs'
-```
-
-## Prompt 2:
-```python
-def find_text(image_path: str, text: str) -> str:
-\""" Finds the text in the image and returns the text. \"""
-
-## Completion 2:
-```python
-# Because we do not know ahead of time what text is in the image, we can only run the
-# code and print the results. We can test on "image.jpg" because it is provided by the
-# user so we know it exists.
-found_text = find_text("image.jpg", "Hello World")
-print(found_text)
-```
-"""
-
-SIMPLE_TEST = """
-**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.
-
-**Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions, only the code provided by the user.
-
-{docstring}
-
-**User Instructions**:
-{question}
-
-**Input Code Snippet**:
-```python
-### Please decide how would you want to generate test cases. Based on incomplete code or completed version.
-{code}
-```
-
-**Previous Feedback**:
-{feedback}
+Here is an example of how you can interact with a user and Actions to complete a task:
+--- START EXAMPLES ---
+[Current directory: /workspace/test]
+{examples}
+--- END EXAMPLES ---

 **Instructions**:
-1.
-2.
-
-
-
-
-
-
-9. DO NOT import the testing function as it will available in the testing environment.
-10. Print the output of the function that is being tested.
-11. Use the output of the function that is being tested as the return value of the testing function.
-12. Run the testing function in the end and don't assign a variable to its output.
+1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
+2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
+
+**Conversation**:
+Here is the current conversation so far:
+--- START CONVERSATION ---
+[Current directory: {dir}]
+{conversation}
 """

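VA_CODE replaces the old plan-picking flow with a conversational loop: the model must answer with a JSON object containing "thoughts", "response", and "let_user_respond", and may embed code to run between <execute_python> tags inside "response". A minimal, illustrative sketch of how such a reply could be consumed; the helper below is hypothetical, not the package's actual parser:

```python
import json
import re

def parse_agent_reply(reply: str) -> dict:
    # Hypothetical helper: decode the JSON reply required by VA_CODE and pull
    # out any <execute_python>...</execute_python> payload for execution.
    data = json.loads(reply)
    match = re.search(r"<execute_python>(.*?)</execute_python>", data["response"], re.DOTALL)
    data["code"] = match.group(1) if match else None
    return data

sample = (
    '{"thoughts": "Run the detector.", '
    '"response": "<execute_python>print(1 + 1)</execute_python>", '
    '"let_user_respond": false}'
)
parsed = parse_agent_reply(sample)
print(parsed["let_user_respond"], parsed["code"])
```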
+EXAMPLES_CODE1 = """
+USER: Can you detect the dogs in this image? Media name dog.jpg

-
-**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages.
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}

-
-
-
-
-
+OBSERVATION:
+[File /workspace/test/dog_detector.py]
+0|from vision_agent.tools import load_image, owl_v2
+1|def detect_dogs(image_path: str):
+2| image = load_image(image_path)
+3| dogs = owl_v2("dog", image)
+4| return dogs
+[End of file]

-
-```python
-{tests}
-```
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}

-
-
+OBSERVATION:
+----- stdout -----
+[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]

-This is previous feedback provided on the code:
-{feedback}

-
-{{
-"reflections": str # any thoughts you have about the bug and how you fixed it
-"code": str # the fixed code if any, else an empty string
-"test": str # the fixed test code if any, else an empty string
-}}
-"""
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}

+USER: The the image only has one dog, can you fix this?

-
-**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad.
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}

-
-
+OBSERVATION:
+[File /workspace/test/dog_detector.py]
+0|from vision_agent.tools import load_image, owl_v2
+1|def detect_dogs(image_path: str):
+2| image = load_image(image_path)
+3| dogs = owl_v2("dog", image, threshold=0.24)
+4| return dogs
+[End of file]

-
-{plan}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}

-
-
+OBSERVATION:
+----- stdout -----
+[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]

-
-
-2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request.
-3. **Review the Code**: Check the code to see if it solves the user request.
-4. DO NOT add any reflections for test cases, these are taken care of.
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+"""

-
-
-
-
-
+EXAMPLES_CODE2 = """
+USER: Can you create a function to count workers with helmets?
+
+AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
+
+USER: Yes you can use workers.png
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[File /workspace/test/code.py]
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
+1|def count_workers_with_helmets(image_path: str):
+2| image = load_image(image_path)
+3| workers = owl_v2("worker", image)
+4| helmets = owl_v2("helmet", image)
+5| count = 0
+6| for worker in workers:
+7| person_box = worker['bbox']
+8| person_has_helmet = False
+9| for helmet in helmets:
+10| if closest_box_distance(worker['box'], helmet['box']) < 0.01:
+11| person_has_helmet = True
+12| break
+13| if person_has_helmet:
+14| count += 1
+15| return count
+[End of file]
+
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+----- stdout -----
+2
+
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
 """
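Both example transcripts drive the new meta-tools, generate_vision_code and edit_vision_code, which presumably live in the new vision_agent/tools/meta_tools.py (+402 lines). A hedged sketch of the call shapes exactly as they appear in the transcripts above; the import path and exact signatures are assumptions to verify against meta_tools.py:

```python
# Call shapes copied from the EXAMPLES_CODE1 transcript; treat the import path
# and signatures as assumptions to be checked against meta_tools.py.
from vision_agent.tools.meta_tools import edit_vision_code, generate_vision_code

# First pass: write a new vision script for one request and one image.
generate_vision_code(
    "/workspace/test/dog_detector.py",
    "Can you write code to detect dogs in this image?",
    media=["/workspace/test/dog.jpg"],
)

# Follow-up: re-edit the same file given the accumulated chat history.
edit_vision_code(
    "/workspace/test/dog_detector.py",
    [
        "Can you write code to detect dogs in this image?",
        "Can you increase the threshold to 0.24?",
    ],
    media=["/workspace/test/dog.jpg"],
)
```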
vision_agent/lmm/__init__.py
CHANGED
@@ -1 +1,2 @@
-from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM,
+from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, OllamaLMM, OpenAILMM
+from .types import Message
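Message now comes from a new vision_agent/lmm/types.py (+5 lines in the file list). Its contents are not shown in this diff, but given the two aliases removed from lmm.py below, it plausibly looks like this sketch:

```python
# Plausible contents of vision_agent/lmm/types.py (assumption based on the
# aliases removed from lmm.py in this release).
from pathlib import Path
from typing import Dict, List, Union

TextOrImage = Union[str, List[Union[str, Path]]]
Message = Dict[str, TextOrImage]
```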
vision_agent/lmm/lmm.py
CHANGED
@@ -16,6 +16,8 @@ from PIL import Image
 import vision_agent.tools as T
 from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT

+from .types import Message
+
 _LOGGER = logging.getLogger(__name__)


@@ -53,10 +55,6 @@ def encode_media(media: Union[str, Path]) -> str:
 return encode_image_bytes(image_bytes)


-TextOrImage = Union[str, List[Union[str, Path]]]
-Message = Dict[str, TextOrImage]
-
-
 class LMM(ABC):
 @abstractmethod
 def generate(
@@ -136,7 +134,7 @@ class OpenAILMM(LMM):
 {
 "type": "image_url",
 "image_url": {
-"url": f"data:image/png;base64,{encoded_media}",
+"url": f"data:image/png;base64,{encoded_media}",
 "detail": "low",
 },
 },
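For downstream code the change is import-only: a Message is still a plain dict mapping string keys to text or media paths. A small usage sketch; the specific keys shown are illustrative, not mandated by the type:

```python
# New import location in 0.2.92 (previously defined inside vision_agent.lmm.lmm).
from vision_agent.lmm import Message

# Illustrative message; keys are free-form strings under Dict[str, TextOrImage].
msg: Message = {
    "role": "user",
    "content": "What is in this image?",
    "media": ["dog.jpg"],
}
print(msg["content"])
```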