vision-agent 0.2.30__py3-none-any.whl → 0.2.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -2
- vision_agent/agent/agent.py +1 -1
- vision_agent/agent/agent_coder.py +7 -7
- vision_agent/agent/{vision_agent_v2.py → data_interpreter.py} +12 -12
- vision_agent/agent/{vision_agent_v2_prompts.py → data_interpreter_prompts.py} +3 -3
- vision_agent/agent/easytool.py +8 -8
- vision_agent/agent/easytool_v2.py +778 -0
- vision_agent/agent/easytool_v2_prompts.py +152 -0
- vision_agent/agent/reflexion.py +8 -8
- vision_agent/agent/vision_agent.py +360 -691
- vision_agent/agent/vision_agent_prompts.py +231 -149
- vision_agent/llm/llm.py +3 -4
- vision_agent/lmm/lmm.py +6 -6
- vision_agent/tools/__init__.py +21 -22
- vision_agent/tools/easytool_tools.py +1242 -0
- vision_agent/tools/tools.py +533 -1090
- vision_agent-0.2.31.dist-info/METADATA +175 -0
- vision_agent-0.2.31.dist-info/RECORD +36 -0
- vision_agent/agent/vision_agent_v3.py +0 -394
- vision_agent/agent/vision_agent_v3_prompts.py +0 -234
- vision_agent/tools/tools_v2.py +0 -685
- vision_agent-0.2.30.dist-info/METADATA +0 -226
- vision_agent-0.2.30.dist-info/RECORD +0 -36
- {vision_agent-0.2.30.dist-info → vision_agent-0.2.31.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.30.dist-info → vision_agent-0.2.31.dist-info}/WHEEL +0 -0
@@ -1,152 +1,234 @@
|
|
1
|
-
|
1
|
+
USER_REQ = """
|
2
|
+
## User Request
|
3
|
+
{user_request}
|
4
|
+
"""
|
2
5
|
|
3
|
-
|
4
|
-
|
5
|
-
{
|
6
|
-
2. You must utilize the image with the visualized bounding boxes or masks and determine if the tools were used correctly or if the tools were used incorrectly or the wrong tools were used.
|
7
|
-
3. If the agent's answer was incorrect, you must diagnose the reason for failure and devise a new concise and concrete plan that aims to mitigate the same failure with the tools available. An example output looks like:
|
8
|
-
{{"Finish": false, "Reflection": "I can see from the visualized bounding boxes that the agent's answer was incorrect because the grounding_dino_ tool produced false positive predictions. The agent should use the following tools with the following parameters:
|
9
|
-
Step 1: Use 'grounding_dino_' with a 'prompt' of 'baby. bed' and a 'box_threshold' of 0.7 to reduce the false positives.
|
10
|
-
Step 2: Use 'box_iou_' with the baby bounding box and the bed bounding box to determine if the baby is on the bed or not."}}
|
11
|
-
4. If the task cannot be completed with the existing tools or by adjusting the parameters, set "Finish" to true.
|
6
|
+
FULL_TASK = """
|
7
|
+
## User Request
|
8
|
+
{user_request}
|
12
9
|
|
13
|
-
|
10
|
+
## Subtasks
|
11
|
+
{subtasks}
|
12
|
+
"""
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
{{
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
{
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
Please
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
14
|
+
FEEDBACK = """
|
15
|
+
## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again.
|
16
|
+
|
17
|
+
{feedback}
|
18
|
+
"""
|
19
|
+
|
20
|
+
|
21
|
+
PLAN = """
|
22
|
+
**Context**
|
23
|
+
{context}
|
24
|
+
|
25
|
+
**Tools Available**:
|
26
|
+
{tool_desc}
|
27
|
+
|
28
|
+
**Previous Feedback**:
|
29
|
+
{feedback}
|
30
|
+
|
31
|
+
**Instructions**:
|
32
|
+
Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
|
33
|
+
|
34
|
+
```json
|
35
|
+
{{
|
36
|
+
"plan":
|
37
|
+
[
|
38
|
+
{{
|
39
|
+
"instructions": str # what you should do in this task, one short phrase or sentence
|
40
|
+
}}
|
41
|
+
]
|
42
|
+
}}
|
43
|
+
```
|
44
|
+
"""
|
45
|
+
|
46
|
+
CODE = """
|
47
|
+
**Role**: You are a software programmer.
|
48
|
+
|
49
|
+
**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code, a test will be run after the code is submitted.
|
50
|
+
|
51
|
+
**Documentation**:
|
52
|
+
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
|
53
|
+
|
54
|
+
{docstring}
|
55
|
+
|
56
|
+
**Input Code Snippet**:
|
57
|
+
```python
|
58
|
+
# Your code here
|
59
|
+
```
|
60
|
+
|
61
|
+
**User Instructions**:
|
62
|
+
{question}
|
63
|
+
|
64
|
+
**Previous Feedback**:
|
65
|
+
{feedback}
|
66
|
+
|
67
|
+
**Instructions**:
|
68
|
+
1. **Understand and Clarify**: Make sure you understand the task.
|
69
|
+
2. **Algorithm/Method Selection**: Decide on the most efficient way.
|
70
|
+
3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
|
71
|
+
4. **Code Generation**: Translate your pseudocode into executable Python code.
|
72
|
+
5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools import *`. Use a debug flag in the function parameters to toggle logging on and off.
|
73
|
+
"""
|
74
|
+
|
75
|
+
TEST = """
|
76
|
+
**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible.
|
77
|
+
|
78
|
+
**Documentation**:
|
79
|
+
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions. Test only the code provided by the user.
|
80
|
+
|
81
|
+
{docstring}
|
82
|
+
|
83
|
+
**User Instructions**:
|
84
|
+
{question}
|
85
|
+
|
86
|
+
**Input Code Snippet**:
|
87
|
+
```python
|
88
|
+
### Please decide how would you want to generate test cases. Based on incomplete code or completed version.
|
89
|
+
{code}
|
90
|
+
```
|
91
|
+
|
92
|
+
**Instructions**:
|
93
|
+
1. Verify the fundamental functionality under normal conditions.
|
94
|
+
2. Ensure each test case is well-documented with comments explaining the scenario it covers.
|
95
|
+
3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
|
96
|
+
4. DO NOT mock any functions, you must test their functionality as is.
|
97
|
+
|
98
|
+
You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
|
99
|
+
```python
|
100
|
+
# You can run assertions to ensure the function is working as expected
|
101
|
+
assert function(input) == expected_output, "Test case description"
|
102
|
+
|
103
|
+
# You can simply call the function to ensure it runs
|
104
|
+
function(input)
|
105
|
+
|
106
|
+
# Or you can visualize the output
|
107
|
+
output = function(input)
|
108
|
+
visualize(output)
|
109
|
+
```
|
110
|
+
|
111
|
+
**Examples**:
|
112
|
+
## Prompt 1:
|
113
|
+
```python
|
114
|
+
def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]:
|
115
|
+
\""" Detects cats and dogs in an image. Returns a dictionary with
|
116
|
+
{{
|
117
|
+
"cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...]
|
118
|
+
}}
|
119
|
+
\"""
|
120
|
+
```
|
121
|
+
|
122
|
+
## Completion 1:
|
123
|
+
```python
|
124
|
+
# We can test to ensure the output has the correct structure but we cannot test the
|
125
|
+
# content of the output without knowing the image. We can test on "image.jpg" because
|
126
|
+
# it is provided by the user so we know it exists.
|
127
|
+
output = detect_cats_and_dogs("image.jpg")
|
128
|
+
assert "cats" in output, "The output should contain 'cats'"
|
129
|
+
assert "dogs" in output, "The output should contain 'dogs'"
|
130
|
+
```
|
131
|
+
|
132
|
+
## Prompt 2:
|
133
|
+
```python
|
134
|
+
def find_text(image_path: str, text: str) -> str:
|
135
|
+
\""" Finds the text in the image and returns the text. \"""
|
136
|
+
|
137
|
+
## Completion 2:
|
138
|
+
```python
|
139
|
+
# Because we do not know ahead of time what text is in the image, we can only run the
|
140
|
+
# code and print the results. We can test on "image.jpg" because it is provided by the
|
141
|
+
# user so we know it exists.
|
142
|
+
found_text = find_text("image.jpg", "Hello World")
|
143
|
+
print(found_text)
|
144
|
+
```
|
145
|
+
"""
|
146
|
+
|
147
|
+
|
148
|
+
SIMPLE_TEST = """
|
149
|
+
**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.
|
150
|
+
|
151
|
+
**Documentation**:
|
152
|
+
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions, only the code provided by the user.
|
153
|
+
|
154
|
+
{docstring}
|
155
|
+
|
156
|
+
**User Instructions**:
|
157
|
+
{question}
|
158
|
+
|
159
|
+
**Input Code Snippet**:
|
160
|
+
```python
|
161
|
+
### Please decide how would you want to generate test cases. Based on incomplete code or completed version.
|
162
|
+
{code}
|
163
|
+
```
|
164
|
+
|
165
|
+
**Previous Feedback**:
|
166
|
+
{feedback}
|
167
|
+
|
168
|
+
**Instructions**:
|
169
|
+
1. Verify the fundamental functionality under normal conditions.
|
170
|
+
2. Ensure each test case is well-documented with comments explaining the scenario it covers.
|
171
|
+
3. Your test case MUST run only on the given image which is {media}
|
172
|
+
4. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions.
|
173
|
+
5. DO NOT mock any functions, you must test their functionality as is.
|
174
|
+
6. DO NOT assert the output value, run the code and verify it runs without any errors and assert only the output format or data structure.
|
175
|
+
7. DO NOT import the testing function as it will be available in the testing environment.
|
176
|
+
8. Print the output of the function that is being tested.
|
177
|
+
"""
|
178
|
+
|
179
|
+
|
180
|
+
FIX_BUG = """
|
181
|
+
**Role**: As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages.
|
182
|
+
|
183
|
+
**Instructions**:
|
184
|
+
Please re-complete the code to fix the error message. Here is the previous version:
|
185
|
+
```python
|
186
|
+
{code}
|
187
|
+
```
|
188
|
+
|
189
|
+
When we run this test code:
|
190
|
+
```python
|
191
|
+
{tests}
|
192
|
+
```
|
193
|
+
|
194
|
+
It raises this error:
|
195
|
+
```python
|
196
|
+
{result}
|
197
|
+
```
|
198
|
+
|
199
|
+
This is previous feedback provided on the code:
|
200
|
+
{feedback}
|
201
|
+
|
202
|
+
Please fix the bug by following the error information and return a JSON object with the following format:
|
203
|
+
{{
|
204
|
+
"reflections": str # any thoughts you have about the bug and how you fixed it
|
205
|
+
"code": str # the fixed code if any, else an empty string
|
206
|
+
"test": str # the fixed test code if any, else an empty string
|
207
|
+
}}
|
208
|
+
"""
|
209
|
+
|
210
|
+
|
211
|
+
REFLECT = """
|
212
|
+
**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad.
|
213
|
+
|
214
|
+
**Context**:
|
215
|
+
{context}
|
216
|
+
|
217
|
+
**Plan**:
|
218
|
+
{plan}
|
219
|
+
|
220
|
+
**Code**:
|
221
|
+
{code}
|
222
|
+
|
223
|
+
**Instructions**:
|
224
|
+
1. **Understand the User Request**: Read the user request and understand what the user is asking for.
|
225
|
+
2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request.
|
226
|
+
3. **Review the Code**: Check the code to see if it solves the user request.
|
227
|
+
4. DO NOT add any reflections for test cases, these are taken care of.
|
228
|
+
|
229
|
+
Respond in JSON format with the following structure:
|
230
|
+
{{
|
231
|
+
"feedback": str # the feedback you would give to the coder and tester
|
232
|
+
"success": bool # whether the code and tests meet the user request
|
233
|
+
}}
|
234
|
+
"""
|
vision_agent/llm/llm.py
CHANGED
@@ -6,14 +6,13 @@ from typing import Any, Callable, Dict, List, Mapping, Optional, Union, cast
|
|
6
6
|
from langsmith.wrappers import wrap_openai
|
7
7
|
from openai import AzureOpenAI, OpenAI
|
8
8
|
|
9
|
-
from vision_agent.tools import (
|
10
|
-
CHOOSE_PARAMS,
|
9
|
+
from vision_agent.tools.easytool_tools import (
|
11
10
|
CLIP,
|
12
|
-
SYSTEM_PROMPT,
|
13
11
|
GroundingDINO,
|
14
12
|
GroundingSAM,
|
15
13
|
ZeroShotCounting,
|
16
14
|
)
|
15
|
+
from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
17
16
|
|
18
17
|
|
19
18
|
class LLM(ABC):
|
@@ -141,7 +140,7 @@ class OpenAILLM(LLM):
|
|
141
140
|
return lambda x: ZeroShotCounting()(**{"image": x})
|
142
141
|
|
143
142
|
def generate_image_qa_tool(self, question: str) -> Callable:
|
144
|
-
from vision_agent.tools import ImageQuestionAnswering
|
143
|
+
from vision_agent.tools.easytool_tools import ImageQuestionAnswering
|
145
144
|
|
146
145
|
return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
|
147
146
|
|
vision_agent/lmm/lmm.py
CHANGED
@@ -9,7 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, cast
|
|
9
9
|
import requests
|
10
10
|
from openai import AzureOpenAI, OpenAI
|
11
11
|
|
12
|
-
from vision_agent.tools import CHOOSE_PARAMS, SYSTEM_PROMPT
|
12
|
+
from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
13
13
|
|
14
14
|
_LOGGER = logging.getLogger(__name__)
|
15
15
|
|
@@ -198,7 +198,7 @@ class OpenAILMM(LMM):
|
|
198
198
|
return cast(str, response.choices[0].message.content)
|
199
199
|
|
200
200
|
def generate_classifier(self, question: str) -> Callable:
|
201
|
-
from vision_agent.tools import CLIP
|
201
|
+
from vision_agent.tools.easytool_tools import CLIP
|
202
202
|
|
203
203
|
api_doc = CLIP.description + "\n" + str(CLIP.usage)
|
204
204
|
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
|
@@ -223,7 +223,7 @@ class OpenAILMM(LMM):
|
|
223
223
|
return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x})
|
224
224
|
|
225
225
|
def generate_detector(self, question: str) -> Callable:
|
226
|
-
from vision_agent.tools import GroundingDINO
|
226
|
+
from vision_agent.tools.easytool_tools import GroundingDINO
|
227
227
|
|
228
228
|
api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
|
229
229
|
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
|
@@ -248,7 +248,7 @@ class OpenAILMM(LMM):
|
|
248
248
|
return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x})
|
249
249
|
|
250
250
|
def generate_segmentor(self, question: str) -> Callable:
|
251
|
-
from vision_agent.tools import GroundingSAM
|
251
|
+
from vision_agent.tools.easytool_tools import GroundingSAM
|
252
252
|
|
253
253
|
api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
|
254
254
|
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
|
@@ -273,12 +273,12 @@ class OpenAILMM(LMM):
|
|
273
273
|
return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
|
274
274
|
|
275
275
|
def generate_zero_shot_counter(self, question: str) -> Callable:
|
276
|
-
from vision_agent.tools import ZeroShotCounting
|
276
|
+
from vision_agent.tools.easytool_tools import ZeroShotCounting
|
277
277
|
|
278
278
|
return lambda x: ZeroShotCounting()(**{"image": x})
|
279
279
|
|
280
280
|
def generate_image_qa_tool(self, question: str) -> Callable:
|
281
|
-
from vision_agent.tools import ImageQuestionAnswering
|
281
|
+
from vision_agent.tools.easytool_tools import ImageQuestionAnswering
|
282
282
|
|
283
283
|
return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
|
284
284
|
|
vision_agent/tools/__init__.py
CHANGED
@@ -1,25 +1,24 @@
|
|
1
1
|
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
2
|
-
from .tools import (
|
3
|
-
|
4
|
-
|
2
|
+
from .tools import (
|
3
|
+
TOOL_DESCRIPTIONS,
|
4
|
+
TOOL_DOCSTRING,
|
5
5
|
TOOLS,
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
register_tool,
|
6
|
+
TOOLS_DF,
|
7
|
+
UTILITIES_DOCSTRING,
|
8
|
+
clip,
|
9
|
+
closest_box_distance,
|
10
|
+
closest_mask_distance,
|
11
|
+
extract_frames,
|
12
|
+
grounding_dino,
|
13
|
+
grounding_sam,
|
14
|
+
image_caption,
|
15
|
+
image_question_answering,
|
16
|
+
load_image,
|
17
|
+
ocr,
|
18
|
+
overlay_bounding_boxes,
|
19
|
+
overlay_segmentation_masks,
|
20
|
+
save_image,
|
21
|
+
save_json,
|
22
|
+
visual_prompt_counting,
|
23
|
+
zero_shot_counting,
|
25
24
|
)
|