vision-agent 0.2.90__py3-none-any.whl → 0.2.92__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent.py +1 -1
- vision_agent/agent/agent_utils.py +43 -0
- vision_agent/agent/vision_agent.py +116 -824
- vision_agent/agent/vision_agent_coder.py +897 -0
- vision_agent/agent/vision_agent_coder_prompts.py +328 -0
- vision_agent/agent/vision_agent_prompts.py +89 -302
- vision_agent/lmm/__init__.py +2 -1
- vision_agent/lmm/lmm.py +3 -5
- vision_agent/lmm/types.py +5 -0
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/meta_tools.py +402 -0
- vision_agent/tools/tool_utils.py +48 -2
- vision_agent/tools/tools.py +7 -49
- vision_agent/utils/execute.py +52 -76
- vision_agent/utils/image_utils.py +1 -1
- vision_agent/utils/type_defs.py +1 -1
- {vision_agent-0.2.90.dist-info → vision_agent-0.2.92.dist-info}/METADATA +42 -12
- vision_agent-0.2.92.dist-info/RECORD +29 -0
- vision_agent-0.2.90.dist-info/RECORD +0 -24
- {vision_agent-0.2.90.dist-info → vision_agent-0.2.92.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.90.dist-info → vision_agent-0.2.92.dist-info}/WHEEL +0 -0
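The headline change is a split of the agent module: vision_agent.py drops roughly 800 lines while new vision_agent_coder.py (+897), vision_agent_coder_prompts.py (+328), and meta_tools.py (+402) modules appear. A hedged sketch of what the resulting import surface presumably looks like; the class names are inferred from the module split and the small agent/__init__.py change (+2 -1), not confirmed by this diff:

```python
# Presumed 0.2.92 layout (assumption, verify against the release):
# the code-writing workflow moves into VisionAgentCoder, while VisionAgent
# becomes the conversational agent driven by the new VA_CODE prompt below.
from vision_agent.agent import VisionAgent, VisionAgentCoder

coder = VisionAgentCoder()  # plans, generates, and tests vision code
agent = VisionAgent()       # chats with the user and calls the coder via meta-tools
```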
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -1,327 +1,114 @@
-
-
-{user_request}
-"""
-
-FULL_TASK = """
-## User Request
-{user_request}
-
-## Subtasks
-{subtasks}
-"""
-
-FEEDBACK = """
-## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again.
-
-{feedback}
-"""
-
-
-PLAN = """
-**Context**:
-{context}
-
-**Tools Available**:
-{tool_desc}
-
-**Previous Feedback**:
-{feedback}
-
-**Instructions**:
-1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or tool.
-
-Output a list of jsons in the following format
-
-```json
-{{
-"plan1":
-[
-{{
-"instructions": str # what you should do in this task associated with a tool
-}}
-],
-"plan2": ...,
-"plan3": ...
-}}
-```
-"""
-
-
-TEST_PLANS = """
-**Role**: You are a software programmer responsible for testing different tools.
-
-**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
-
-**Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
-
-{docstring}
-
-**Plans**:
-{plans}
-
-{previous_attempts}
-
-**Instructions**:
-1. Write a program to load the media and call each tool and save it's output.
-2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove any array types from the printed dictionary.
-3. Print this final dictionary.
-
-**Example**:
-plan1:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image.
-plan2:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image.
-- Count the number of detected objects labeled as 'person'.
-plan3:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people.
-
-```python
-from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
-image = load_image("image.jpg")
-owl_v2_out = owl_v2("person", image)
-
-gsam_out = grounding_sam("person", image)
-gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
-
-loca_out = loca_zero_shot_counting(image)
-loca_out = loca_out["count"]
-
-final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "loca_zero_shot_counting": loca_out}}
-print(final_out)
-```
-"""
-
-
-PREVIOUS_FAILED = """
-**Previous Failed Attempts**:
-You previously ran this code:
-```python
-{code}
-```
-
-But got the following error or no stdout:
-{error}
-"""
-
-
-PICK_PLAN = """
-**Role**: You are a software programmer.
-
-**Task**: Your responsibility is to pick the best plan from the three plans provided.
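The planning and testing prompts removed above (PLAN, TEST_PLANS, PREVIOUS_FAILED, PICK_PLAN) appear to migrate into the new vision_agent_coder_prompts.py (+328 lines) rather than disappear. As a reminder of the contract they define, the PLAN prompt asks the model for three alternative plans as JSON; a minimal, hypothetical sketch of consuming that output (not code from the package):

```python
import json

# Example of the three-plan JSON requested by the PLAN prompt; in a real
# response this payload arrives inside a fenced ```json block.
sample_plans = (
    '{"plan1": [{"instructions": "Load image.jpg and count people with owl_v2"}],'
    ' "plan2": [{"instructions": "Load image.jpg and count people with grounding_sam"}],'
    ' "plan3": [{"instructions": "Count people with loca_zero_shot_counting"}]}'
)

plans = json.loads(sample_plans)
for name, steps in plans.items():
    print(name, [step["instructions"] for step in steps])
```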
+VA_CODE = """
+**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.

-**
-{context}
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.

-
-
-
-**Tool Output**:
-{tool_output}
-
-**Instructions**:
-1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Output a JSON object with the following format:
-{{
-"thoughts": str # your thought process for choosing the best plan
-"best_plan": str # the best plan you have chosen
-}}
-"""
-
-CODE = """
-**Role**: You are a software programmer.
-
-**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code, a test will be run after the code is submitted.
-
-**Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
-
-{docstring}
-
-**Input Code Snippet**:
-```python
-# Your code here
-```
-
-**User Instructions**:
-{question}
-
-**Tool Output**:
-{tool_output}
-
-**Previous Feedback**:
-{feedback}
-
-**Instructions**:
-1. **Understand and Clarify**: Make sure you understand the task.
-2. **Algorithm/Method Selection**: Decide on the most efficient way.
-3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
-4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
-"""
-
-TEST = """
-**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible.
+<execute_python>
+print("Hello World!")
+</execute_python>

 **Documentation**:
-This is the documentation for the
-
-{docstring}
+This is the documentation for the different actions you can take:

-
-{question}
-
-**Input Code Snippet**:
-```python
-### Please decided how would you want to generate test cases. Based on incomplete code or completed version.
-{code}
-```
-
-**Instructions**:
-1. Verify the fundamental functionality under normal conditions.
-2. Ensure each test case is well-documented with comments explaining the scenario it covers.
-3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
-4. DO NOT mock any functions, you must test their functionality as is.
-
-You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
-```python
-# You can run assertions to ensure the function is working as expected
-assert function(input) == expected_output, "Test case description"
-
-# You can simply call the function to ensure it runs
-function(input)
-
-# Or you can visualize the output
-output = function(input)
-visualize(output)
-```
+{documentation}

 **Examples**:
-
-
-
-
-
-"cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...]
-}}
-\"""
-```
-
-## Completion 1:
-```python
-# We can test to ensure the output has the correct structure but we cannot test the
-# content of the output without knowing the image. We can test on "image.jpg" because
-# it is provided by the user so we know it exists.
-output = detect_cats_and_dogs("image.jpg")
-assert "cats" in output, "The output should contain 'cats'
-assert "dogs" in output, "The output should contain 'dogs'
-```
-
-## Prompt 2:
-```python
-def find_text(image_path: str, text: str) -> str:
-\""" Finds the text in the image and returns the text. \"""
-
-## Completion 2:
-```python
-# Because we do not know ahead of time what text is in the image, we can only run the
-# code and print the results. We can test on "image.jpg" because it is provided by the
-# user so we know it exists.
-found_text = find_text("image.jpg", "Hello World")
-print(found_text)
-```
-"""
-
-SIMPLE_TEST = """
-**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.
-
-**Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions, only the code provided by the user.
-
-{docstring}
-
-**User Instructions**:
-{question}
-
-**Input Code Snippet**:
-```python
-### Please decide how would you want to generate test cases. Based on incomplete code or completed version.
-{code}
-```
-
-**Previous Feedback**:
-{feedback}
+Here is an example of how you can interact with a user and Actions to complete a task:
+--- START EXAMPLES ---
+[Current directory: /workspace/test]
+{examples}
+--- END EXAMPLES ---

 **Instructions**:
-1.
-2.
-
-
-
-
-
-
-9. DO NOT import the testing function as it will available in the testing environment.
-10. Print the output of the function that is being tested.
-11. Use the output of the function that is being tested as the return value of the testing function.
-12. Run the testing function in the end and don't assign a variable to its output.
+1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
+2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
+
+**Conversation**:
+Here is the current conversation so far:
+--- START CONVERSATION ---
+[Current directory: {dir}]
+{conversation}
 """

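VA_CODE replaces the old plan-picking flow with a conversational loop: the model must answer with a JSON object containing "thoughts", "response", and "let_user_respond", and may embed code to run between <execute_python> tags inside "response". A minimal, illustrative sketch of how such a reply could be consumed; the helper below is hypothetical, not the package's actual parser:

```python
import json
import re

def parse_agent_reply(reply: str) -> dict:
    # Hypothetical helper: decode the JSON reply required by VA_CODE and pull
    # out any <execute_python>...</execute_python> payload for execution.
    data = json.loads(reply)
    match = re.search(r"<execute_python>(.*?)</execute_python>", data["response"], re.DOTALL)
    data["code"] = match.group(1) if match else None
    return data

sample = (
    '{"thoughts": "Run the detector.", '
    '"response": "<execute_python>print(1 + 1)</execute_python>", '
    '"let_user_respond": false}'
)
parsed = parse_agent_reply(sample)
print(parsed["let_user_respond"], parsed["code"])
```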
+EXAMPLES_CODE1 = """
+USER: Can you detect the dogs in this image? Media name dog.jpg

-
-**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages.
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}

-
-
-
-
-
+OBSERVATION:
+[File /workspace/test/dog_detector.py]
+0|from vision_agent.tools import load_image, owl_v2
+1|def detect_dogs(image_path: str):
+2| image = load_image(image_path)
+3| dogs = owl_v2("dog", image)
+4| return dogs
+[End of file]

-
-```python
-{tests}
-```
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}

-
-
+OBSERVATION:
+----- stdout -----
+[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]

-This is previous feedback provided on the code:
-{feedback}

-
-{{
-"reflections": str # any thoughts you have about the bug and how you fixed it
-"code": str # the fixed code if any, else an empty string
-"test": str # the fixed test code if any, else an empty string
-}}
-"""
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}

+USER: The the image only has one dog, can you fix this?

-
-**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad.
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}

-
-
+OBSERVATION:
+[File /workspace/test/dog_detector.py]
+0|from vision_agent.tools import load_image, owl_v2
+1|def detect_dogs(image_path: str):
+2| image = load_image(image_path)
+3| dogs = owl_v2("dog", image, threshold=0.24)
+4| return dogs
+[End of file]

-
-{plan}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}

-
-
+OBSERVATION:
+----- stdout -----
+[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]

-
-
-2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request.
-3. **Review the Code**: Check the code to see if it solves the user request.
-4. DO NOT add any reflections for test cases, these are taken care of.
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+"""

-
-
-
-
-
+EXAMPLES_CODE2 = """
+USER: Can you create a function to count workers with helmets?
+
+AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
+
+USER: Yes you can use workers.png
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[File /workspace/test/code.py]
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
+1|def count_workers_with_helmets(image_path: str):
+2| image = load_image(image_path)
+3| workers = owl_v2("worker", image)
+4| helmets = owl_v2("helmet", image)
+5| count = 0
+6| for worker in workers:
+7| person_box = worker['bbox']
+8| person_has_helmet = False
+9| for helmet in helmets:
+10| if closest_box_distance(worker['box'], helmet['box']) < 0.01:
+11| person_has_helmet = True
+12| break
+13| if person_has_helmet:
+14| count += 1
+15| return count
+[End of file]
+
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+----- stdout -----
+2
+
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
 """
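Both example transcripts drive the new meta-tools, generate_vision_code and edit_vision_code, which presumably live in the new vision_agent/tools/meta_tools.py (+402 lines). A hedged sketch of the call shapes exactly as they appear in the transcripts above; the import path and exact signatures are assumptions to verify against meta_tools.py:

```python
# Call shapes copied from the EXAMPLES_CODE1 transcript; treat the import path
# and signatures as assumptions to be checked against meta_tools.py.
from vision_agent.tools.meta_tools import edit_vision_code, generate_vision_code

# First pass: write a new vision script for one request and one image.
generate_vision_code(
    "/workspace/test/dog_detector.py",
    "Can you write code to detect dogs in this image?",
    media=["/workspace/test/dog.jpg"],
)

# Follow-up: re-edit the same file given the accumulated chat history.
edit_vision_code(
    "/workspace/test/dog_detector.py",
    [
        "Can you write code to detect dogs in this image?",
        "Can you increase the threshold to 0.24?",
    ],
    media=["/workspace/test/dog.jpg"],
)
```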
vision_agent/lmm/__init__.py
CHANGED
@@ -1 +1,2 @@
-from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM,
+from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, OllamaLMM, OpenAILMM
+from .types import Message
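Message now comes from a new vision_agent/lmm/types.py (+5 lines in the file list). Its contents are not shown in this diff, but given the two aliases removed from lmm.py below, it plausibly looks like this sketch:

```python
# Plausible contents of vision_agent/lmm/types.py (assumption based on the
# aliases removed from lmm.py in this release).
from pathlib import Path
from typing import Dict, List, Union

TextOrImage = Union[str, List[Union[str, Path]]]
Message = Dict[str, TextOrImage]
```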
vision_agent/lmm/lmm.py
CHANGED
@@ -16,6 +16,8 @@ from PIL import Image
 import vision_agent.tools as T
 from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT

+from .types import Message
+
 _LOGGER = logging.getLogger(__name__)


@@ -53,10 +55,6 @@ def encode_media(media: Union[str, Path]) -> str:
 return encode_image_bytes(image_bytes)


-TextOrImage = Union[str, List[Union[str, Path]]]
-Message = Dict[str, TextOrImage]
-
-
 class LMM(ABC):
 @abstractmethod
 def generate(
@@ -136,7 +134,7 @@ class OpenAILMM(LMM):
 {
 "type": "image_url",
 "image_url": {
-"url": f"data:image/png;base64,{encoded_media}",
+"url": f"data:image/png;base64,{encoded_media}",
 "detail": "low",
 },
 },
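For downstream code the change is import-only: a Message is still a plain dict mapping string keys to text or media paths. A small usage sketch; the specific keys shown are illustrative, not mandated by the type:

```python
# New import location in 0.2.92 (previously defined inside vision_agent.lmm.lmm).
from vision_agent.lmm import Message

# Illustrative message; keys are free-form strings under Dict[str, TextOrImage].
msg: Message = {
    "role": "user",
    "content": "What is in this image?",
    "media": ["dog.jpg"],
}
print(msg["content"])
```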