vision-agent 0.2.90__py3-none-any.whl → 0.2.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,327 +1,114 @@
1
- USER_REQ = """
2
- ## User Request
3
- {user_request}
4
- """
5
-
6
- FULL_TASK = """
7
- ## User Request
8
- {user_request}
9
-
10
- ## Subtasks
11
- {subtasks}
12
- """
13
-
14
- FEEDBACK = """
15
- ## Code and feedback from previous runs, provided as context so you do not make the same mistakes again.
16
-
17
- {feedback}
18
- """
19
-
20
-
21
- PLAN = """
22
- **Context**:
23
- {context}
24
-
25
- **Tools Available**:
26
- {tool_desc}
27
-
28
- **Previous Feedback**:
29
- {feedback}
30
-
31
- **Instructions**:
32
- 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
33
- 2. Output three different plans, each utilizing a different strategy or tool.
34
-
35
- Output a list of JSON objects in the following format:
36
-
37
- ```json
38
- {{
39
- "plan1":
40
- [
41
- {{
42
- "instructions": str # what you should do in this task associated with a tool
43
- }}
44
- ],
45
- "plan2": ...,
46
- "plan3": ...
47
- }}
48
- ```
49
- """
50
-
51
-
52
- TEST_PLANS = """
53
- **Role**: You are a software programmer responsible for testing different tools.
54
-
55
- **Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
56
-
57
- **Documentation**:
58
- This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
59
-
60
- {docstring}
61
-
62
- **Plans**:
63
- {plans}
64
-
65
- {previous_attempts}
66
-
67
- **Instructions**:
68
- 1. Write a program to load the media, call each tool, and save its output.
69
- 2. Create a dictionary where the keys are the tool names and the values are the tool outputs. Remove any array types from the printed dictionary.
70
- 3. Print this final dictionary.
71
-
72
- **Example**:
73
- plan1:
74
- - Load the image from the provided file path 'image.jpg'.
75
- - Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image.
76
- plan2:
77
- - Load the image from the provided file path 'image.jpg'.
78
- - Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image.
79
- - Count the number of detected objects labeled as 'person'.
80
- plan3:
81
- - Load the image from the provided file path 'image.jpg'.
82
- - Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people.
83
-
84
- ```python
85
- from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
86
- image = load_image("image.jpg")
87
- owl_v2_out = owl_v2("person", image)
88
-
89
- gsam_out = grounding_sam("person", image)
90
- gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
91
-
92
- loca_out = loca_zero_shot_counting(image)
93
- loca_out = loca_out["count"]
94
-
95
- final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "loca_zero_shot_counting": loca_out}}
96
- print(final_out)
97
- ```
98
- """
99
-
100
-
101
- PREVIOUS_FAILED = """
102
- **Previous Failed Attempts**:
103
- You previously ran this code:
104
- ```python
105
- {code}
106
- ```
107
-
108
- But got the following error or no stdout:
109
- {error}
110
- """
111
-
112
-
113
- PICK_PLAN = """
114
- **Role**: You are a software programmer.
115
-
116
- **Task**: Your responsibility is to pick the best plan from the three plans provided.
1
+ VA_CODE = """
2
+ **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve them.
117
3
 
118
- **Context**:
119
- {context}
4
+ **Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>.
120
5
 
121
- **Plans**:
122
- {plans}
123
-
124
- **Tool Output**:
125
- {tool_output}
126
-
127
- **Instructions**:
128
- 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
129
- 2. Output a JSON object with the following format:
130
- {{
131
- "thoughts": str # your thought process for choosing the best plan
132
- "best_plan": str # the best plan you have chosen
133
- }}
134
- """
135
-
136
- CODE = """
137
- **Role**: You are a software programmer.
138
-
139
- **Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code; a test will be run after the code is submitted.
140
-
141
- **Documentation**:
142
- This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
143
-
144
- {docstring}
145
-
146
- **Input Code Snippet**:
147
- ```python
148
- # Your code here
149
- ```
150
-
151
- **User Instructions**:
152
- {question}
153
-
154
- **Tool Output**:
155
- {tool_output}
156
-
157
- **Previous Feedback**:
158
- {feedback}
159
-
160
- **Instructions**:
161
- 1. **Understand and Clarify**: Make sure you understand the task.
162
- 2. **Algorithm/Method Selection**: Decide on the most efficient way.
163
- 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
164
- 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use the correct arguments; remember, coordinates returned from `vision_agent.tools` are always normalized. All images from `vision_agent.tools` are in RGB format: red is (255, 0, 0) and blue is (0, 0, 255).
165
- """
166
-
167
- TEST = """
168
- **Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass basic and edge-case scenarios to ensure the code's robustness and reliability where possible.
6
+ <execute_python>
7
+ print("Hello World!")
8
+ </execute_python>
169
9
 
170
10
  **Documentation**:
171
- This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions. Test only the code provided by the user.
172
-
173
- {docstring}
11
+ This is the documentation for the different actions you can take:
174
12
 
175
- **User Instructions**:
176
- {question}
177
-
178
- **Input Code Snippet**:
179
- ```python
180
- ### Please decide how you want to generate test cases, based on the incomplete code or the completed version.
181
- {code}
182
- ```
183
-
184
- **Instructions**:
185
- 1. Verify the fundamental functionality under normal conditions.
186
- 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
187
- 3. DO NOT use any files that are not provided by the user's instructions; your test will be run and will crash if it tries to load a non-existent file.
188
- 4. DO NOT mock any functions; you must test their functionality as is.
189
-
190
- You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
191
- ```python
192
- # You can run assertions to ensure the function is working as expected
193
- assert function(input) == expected_output, "Test case description"
194
-
195
- # You can simply call the function to ensure it runs
196
- function(input)
197
-
198
- # Or you can visualize the output
199
- output = function(input)
200
- visualize(output)
201
- ```
13
+ {documentation}
202
14
 
203
15
  **Examples**:
204
- ## Prompt 1:
205
- ```python
206
- def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]:
207
- \""" Detects cats and dogs in an image. Returns a dictionary with
208
- {{
209
- "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...]
210
- }}
211
- \"""
212
- ```
213
-
214
- ## Completion 1:
215
- ```python
216
- # We can test to ensure the output has the correct structure but we cannot test the
217
- # content of the output without knowing the image. We can test on "image.jpg" because
218
- # it is provided by the user so we know it exists.
219
- output = detect_cats_and_dogs("image.jpg")
220
- assert "cats" in output, "The output should contain 'cats'
221
- assert "dogs" in output, "The output should contain 'dogs'
222
- ```
223
-
224
- ## Prompt 2:
225
- ```python
226
- def find_text(image_path: str, text: str) -> str:
227
- \""" Finds the text in the image and returns the text. \"""
228
- ```
229
- ## Completion 2:
230
- ```python
231
- # Because we do not know ahead of time what text is in the image, we can only run the
232
- # code and print the results. We can test on "image.jpg" because it is provided by the
233
- # user so we know it exists.
234
- found_text = find_text("image.jpg", "Hello World")
235
- print(found_text)
236
- ```
237
- """
238
-
239
- SIMPLE_TEST = """
240
- **Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.
241
-
242
- **Documentation**:
243
- This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions, only the code provided by the user.
244
-
245
- {docstring}
246
-
247
- **User Instructions**:
248
- {question}
249
-
250
- **Input Code Snippet**:
251
- ```python
252
- ### Please decide how you want to generate test cases, based on the incomplete code or the completed version.
253
- {code}
254
- ```
255
-
256
- **Previous Feedback**:
257
- {feedback}
16
+ Here is an example of how you can interact with a user and use the available actions to complete a task:
17
+ --- START EXAMPLES ---
18
+ [Current directory: /workspace/test]
19
+ {examples}
20
+ --- END EXAMPLES ---
258
21
 
259
22
  **Instructions**:
260
- 1. Verify the fundamental functionality under normal conditions.
261
- 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
262
- 3. Your test case MUST run only on the given images which are {media}
263
- 4. Your test case MUST run only with the given values which are available in the question - {question}
264
- 5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions.
265
- 6. DO NOT mock any functions; you must test their functionality as is.
266
- 7. DO NOT assert the output value; run the code and assert only the output format or data structure.
267
- 8. DO NOT use a try/except block to handle the error; let the error be raised if the code is incorrect.
268
- 9. DO NOT import the testing function as it will be available in the testing environment.
269
- 10. Print the output of the function that is being tested.
270
- 11. Use the output of the function that is being tested as the return value of the testing function.
271
- 12. Run the testing function at the end and do not assign its output to a variable.
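A minimal sketch of a test that satisfies the twelve removed rules above: it runs only on the user-provided media, asserts format rather than values, prints and returns the tested function's output, and calls the test last without assigning its result. `detect_dogs` and `dog.jpg` are hypothetical stand-ins:

```python
# Stand-in for the user's generated function; in the real testing environment it
# is already defined, so the test would not import or redefine it (rule 9).
def detect_dogs(image_path: str) -> list:
    return [{"score": 0.99, "label": "dog", "box": [0.1, 0.2, 0.3, 0.4]}]

def test_detect_dogs():
    # Rules 3 and 5: run only on the user-provided image.
    output = detect_dogs("dog.jpg")
    # Rule 7: assert the output format, not its values.
    assert isinstance(output, list), "output should be a list of detections"
    # Rules 10 and 11: print the tested function's output and return it.
    print(output)
    return output

# Rule 12: call the testing function last, without assigning its output.
test_detect_dogs()
```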
23
+ 1. **Understand and Clarify**: Make sure you understand the task; ask clarifying questions if the task is not clear.
24
+ 2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean indicating whether or not to let the user respond>}}.
25
+
26
+ **Conversation**:
27
+ Here is the current conversation so far:
28
+ --- START CONVERSATION ---
29
+ [Current directory: {dir}]
30
+ {conversation}
272
31
  """
273
32
 
33
+ EXAMPLES_CODE1 = """
34
+ USER: Can you detect the dogs in this image? Media name: dog.jpg
274
35
 
275
- FIX_BUG = """
276
- **Role**: As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting, so you can run `!pip install` to install missing packages.
36
+ AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
277
37
 
278
- **Instructions**:
279
- Please revise the code to fix the error. Here is the previous version:
280
- ```python
281
- {code}
282
- ```
38
+ OBSERVATION:
39
+ [File /workspace/test/dog_detector.py]
40
+ 0|from vision_agent.tools import load_image, owl_v2
41
+ 1|def detect_dogs(image_path: str):
42
+ 2| image = load_image(image_path)
43
+ 3| dogs = owl_v2("dog", image)
44
+ 4| return dogs
45
+ [End of file]
283
46
 
284
- When we run this test code:
285
- ```python
286
- {tests}
287
- ```
47
+ AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
288
48
 
289
- It raises this error:
290
- {result}
49
+ OBSERVATION:
50
+ ----- stdout -----
51
+ [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
291
52
 
292
- This is previous feedback provided on the code:
293
- {feedback}
294
53
 
295
- Please fix the bug by following the error information, and return a JSON object with the following format:
296
- {{
297
- "reflections": str # any thoughts you have about the bug and how you fixed it
298
- "code": str # the fixed code if any, else an empty string
299
- "test": str # the fixed test code if any, else an empty string
300
- }}
301
- """
54
+ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
302
55
 
56
+ USER: The image only has one dog, can you fix this?
303
57
 
304
- REFLECT = """
305
- **Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only with whether the code meets the user request, not with whether the code is good or bad.
58
+ AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
306
59
 
307
- **Context**:
308
- {context}
60
+ OBSERVATION:
61
+ [File /workspace/test/dog_detector.py]
62
+ 0|from vision_agent.tools import load_image, owl_v2
63
+ 1|def detect_dogs(image_path: str):
64
+ 2| image = load_image(image_path)
65
+ 3| dogs = owl_v2("dog", image, threshold=0.24)
66
+ 4| return dogs
67
+ [End of file]
309
68
 
310
- **Plan**:
311
- {plan}
69
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
312
70
 
313
- **Code**:
314
- {code}
71
+ OBSERVATION:
72
+ ----- stdout -----
73
+ [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
315
74
 
316
- **Instructions**:
317
- 1. **Understand the User Request**: Read the user request and understand what the user is asking for.
318
- 2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request.
319
- 3. **Review the Code**: Check the code to see if it solves the user request.
320
- 4. DO NOT add any reflections for test cases; these are taken care of.
75
+ AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
76
+ """
321
77
 
322
- Respond in JSON format with the following structure:
323
- {{
324
- "feedback": str # the feedback you would give to the coder and tester
325
- "success": bool # whether the code and tests meet the user request
326
- }}
78
+ EXAMPLES_CODE2 = """
79
+ USER: Can you create a function to count workers with helmets?
80
+
81
+ AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
82
+
83
+ USER: Yes you can use workers.png
84
+
85
+ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
86
+
87
+ OBSERVATION:
88
+ [File /workspace/test/code.py]
89
+ 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
90
+ 1|def count_workers_with_helmets(image_path: str):
91
+ 2| image = load_image(image_path)
92
+ 3| workers = owl_v2("worker", image)
93
+ 4| helmets = owl_v2("helmet", image)
94
+ 5| count = 0
95
+ 6| for worker in workers:
96
+ 7| person_box = worker['box']
97
+ 8| person_has_helmet = False
98
+ 9| for helmet in helmets:
99
+ 10| if closest_box_distance(person_box, helmet['box']) < 0.01:
100
+ 11| person_has_helmet = True
101
+ 12| break
102
+ 13| if person_has_helmet:
103
+ 14| count += 1
104
+ 15| return count
105
+ [End of file]
106
+
107
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
108
+
109
+ OBSERVATION:
110
+ ----- stdout -----
111
+ 2
112
+
113
+ AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
327
114
  """
@@ -1 +1,2 @@
1
- from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, Message, OllamaLMM, OpenAILMM
1
+ from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, OllamaLMM, OpenAILMM
2
+ from .types import Message
vision_agent/lmm/lmm.py CHANGED
@@ -16,6 +16,8 @@ from PIL import Image
16
16
  import vision_agent.tools as T
17
17
  from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
18
18
 
19
+ from .types import Message
20
+
19
21
  _LOGGER = logging.getLogger(__name__)
20
22
 
21
23
 
@@ -53,10 +55,6 @@ def encode_media(media: Union[str, Path]) -> str:
53
55
  return encode_image_bytes(image_bytes)
54
56
 
55
57
 
56
- TextOrImage = Union[str, List[Union[str, Path]]]
57
- Message = Dict[str, TextOrImage]
58
-
59
-
60
58
  class LMM(ABC):
61
59
  @abstractmethod
62
60
  def generate(
@@ -136,7 +134,7 @@ class OpenAILMM(LMM):
136
134
  {
137
135
  "type": "image_url",
138
136
  "image_url": {
139
- "url": f"data:image/png;base64,{encoded_media}", # type: ignore
137
+ "url": f"data:image/png;base64,{encoded_media}",
140
138
  "detail": "low",
141
139
  },
142
140
  },
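For context on the hunk above: `encoded_media` is a base64 string embedded in a data URL for the OpenAI chat payload. A minimal sketch of building such a content entry, assuming a PNG file on disk; the helper name is illustrative:

```python
import base64
from pathlib import Path

def image_content(path: Path) -> dict:
    # Base64-encode the file and wrap it in a data URL, mirroring the hunk above.
    encoded_media = base64.b64encode(path.read_bytes()).decode("utf-8")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_media}", "detail": "low"},
    }

# payload = image_content(Path("dog.jpg"))  # ready to append to a chat message's content list
```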
@@ -0,0 +1,5 @@
1
+ from pathlib import Path
2
+ from typing import Dict, Sequence, Union
3
+
4
+ TextOrImage = Union[str, Sequence[Union[str, Path]]]
5
+ Message = Dict[str, TextOrImage]
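The `Message` alias now lives in its own `types` module and is a plain dict whose values are text or a sequence of strings/`Path`s. A minimal sketch of a value that satisfies the alias; the `role`/`content`/`media` keys follow the usage implied by the examples above but are an assumption:

```python
from pathlib import Path
from typing import Dict, Sequence, Union

TextOrImage = Union[str, Sequence[Union[str, Path]]]
Message = Dict[str, TextOrImage]

# The keys below are assumptions based on the examples in this diff; the alias
# itself only constrains values to text or a sequence of strings/paths.
msg: Message = {
    "role": "user",
    "content": "Can you detect the dogs in this image?",
    "media": [Path("/workspace/test/dog.jpg")],
}
print(msg)
```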
@@ -1,5 +1,6 @@
1
1
  from typing import Callable, List, Optional
2
2
 
3
+ from .meta_tools import META_TOOL_DOCSTRING
3
4
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
4
5
  from .tools import (
5
6
  TOOL_DESCRIPTIONS,