vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +76 -2
- vision_agent/agent/vision_agent.py +49 -17
- vision_agent/agent/vision_agent_coder.py +163 -489
- vision_agent/agent/vision_agent_coder_prompts.py +0 -203
- vision_agent/agent/vision_agent_planner.py +553 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +84 -3
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0
@@ -1,8 +1,3 @@
|
|
1
|
-
USER_REQ = """
|
2
|
-
## User Request
|
3
|
-
{user_request}
|
4
|
-
"""
|
5
|
-
|
6
1
|
FULL_TASK = """
|
7
2
|
## User Request
|
8
3
|
{user_request}
|
@@ -18,204 +13,6 @@ FEEDBACK = """
|
|
18
13
|
"""
|
19
14
|
|
20
15
|
|
21
|
-
PLAN = """
|
22
|
-
**Context**:
|
23
|
-
{context}
|
24
|
-
|
25
|
-
**Tools Available**:
|
26
|
-
{tool_desc}
|
27
|
-
|
28
|
-
**Previous Feedback**:
|
29
|
-
{feedback}
|
30
|
-
|
31
|
-
**Instructions**:
|
32
|
-
1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
|
33
|
-
2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
|
34
|
-
3. Output three different plans, each utilizing a different strategy or set of tools, ordered from most likely to least likely to succeed.
|
35
|
-
|
36
|
-
Output a list of jsons in the following format:
|
37
|
-
|
38
|
-
```json
|
39
|
-
{{
|
40
|
-
"plan1":
|
41
|
-
{{
|
42
|
-
"thoughts": str # your thought process for choosing this plan
|
43
|
-
"instructions": [
|
44
|
-
str # what you should do in this task associated with a tool
|
45
|
-
]
|
46
|
-
}},
|
47
|
-
"plan2": ...,
|
48
|
-
"plan3": ...
|
49
|
-
}}
|
50
|
-
```
|
51
|
-
"""
|
52
|
-
|
53
|
-
|
54
|
-
TEST_PLANS = """
|
55
|
-
**Role**: You are a software programmer responsible for testing different tools.
|
56
|
-
|
57
|
-
**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
|
58
|
-
|
59
|
-
**Documentation**:
|
60
|
-
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
|
61
|
-
|
62
|
-
{docstring}
|
63
|
-
|
64
|
-
**Plans**:
|
65
|
-
{plans}
|
66
|
-
|
67
|
-
**Previous Attempts**:
|
68
|
-
{previous_attempts}
|
69
|
-
|
70
|
-
**Examples**:
|
71
|
-
--- EXAMPLE1 ---
|
72
|
-
plan1:
|
73
|
-
- Load the image from the provided file path 'image.jpg'.
|
74
|
-
- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
|
75
|
-
plan2:
|
76
|
-
- Load the image from the provided file path 'image.jpg'.
|
77
|
-
- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
|
78
|
-
- Count the number of detected objects labeled as 'person'.
|
79
|
-
plan3:
|
80
|
-
- Load the image from the provided file path 'image.jpg'.
|
81
|
-
- Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
|
82
|
-
|
83
|
-
```python
|
84
|
-
from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
|
85
|
-
image = load_image("image.jpg")
|
86
|
-
owl_v2_out = owl_v2_image("person", image)
|
87
|
-
|
88
|
-
f2s2_out = florence2_sam2_image("person", image)
|
89
|
-
# strip out the masks from the output because they don't provide useful information when printed
|
90
|
-
f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
|
91
|
-
|
92
|
-
cgd_out = countgd_counting(image)
|
93
|
-
|
94
|
-
final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
|
95
|
-
print(final_out)
|
96
|
-
--- END EXAMPLE1 ---
|
97
|
-
|
98
|
-
--- EXAMPLE2 ---
|
99
|
-
plan1:
|
100
|
-
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
101
|
-
- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
|
102
|
-
plan2:
|
103
|
-
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
104
|
-
- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
|
105
|
-
plan3:
|
106
|
-
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
107
|
-
- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
|
108
|
-
|
109
|
-
|
110
|
-
```python
|
111
|
-
import numpy as np
|
112
|
-
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
|
113
|
-
|
114
|
-
# sample at 1 FPS and use the first 10 frames to reduce processing time
|
115
|
-
frames = extract_frames_and_timestamps("video.mp4", 1)
|
116
|
-
frames = [f["frame"] for f in frames][:10]
|
117
|
-
|
118
|
-
# strip arrays from the output to make it easier to read
|
119
|
-
def remove_arrays(o):
|
120
|
-
if isinstance(o, list):
|
121
|
-
return [remove_arrays(e) for e in o]
|
122
|
-
elif isinstance(o, dict):
|
123
|
-
return {{k: remove_arrays(v) for k, v in o.items()}}
|
124
|
-
elif isinstance(o, np.ndarray):
|
125
|
-
return "array: " + str(o.shape)
|
126
|
-
else:
|
127
|
-
return o
|
128
|
-
|
129
|
-
# return the counts of each label per frame to help determine the stability of the model results
|
130
|
-
def get_counts(preds):
|
131
|
-
counts = {{}}
|
132
|
-
for i, pred_frame in enumerate(preds):
|
133
|
-
counts_i = {{}}
|
134
|
-
for pred in pred_frame:
|
135
|
-
label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
|
136
|
-
counts_i[label] = counts_i.get(label, 0) + 1
|
137
|
-
counts[f"frame_{{i}}"] = counts_i
|
138
|
-
return counts
|
139
|
-
|
140
|
-
|
141
|
-
# plan1
|
142
|
-
owl_v2_out = owl_v2_video("person", frames)
|
143
|
-
owl_v2_counts = get_counts(owl_v2_out)
|
144
|
-
|
145
|
-
# plan2
|
146
|
-
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
|
147
|
-
florence2_counts = get_counts(florence2_out)
|
148
|
-
|
149
|
-
# plan3
|
150
|
-
f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
|
151
|
-
remove_arrays(f2s2_tracking_out)
|
152
|
-
f2s2_counts = get_counts(f2s2_tracking_out)
|
153
|
-
|
154
|
-
final_out = {{
|
155
|
-
"owl_v2_video": owl_v2_out,
|
156
|
-
"florence2_phrase_grounding": florence2_out,
|
157
|
-
"florence2_sam2_video_tracking": f2s2_out,
|
158
|
-
}}
|
159
|
-
|
160
|
-
counts = {{
|
161
|
-
"owl_v2_video": owl_v2_counts,
|
162
|
-
"florence2_phrase_grounding": florence2_counts,
|
163
|
-
"florence2_sam2_video_tracking": f2s2_counts,
|
164
|
-
}}
|
165
|
-
|
166
|
-
print(final_out)
|
167
|
-
print(labels_and_scores)
|
168
|
-
print(counts)
|
169
|
-
```
|
170
|
-
--- END EXAMPLE2 ---
|
171
|
-
|
172
|
-
**Instructions**:
|
173
|
-
1. Write a program to load the media, call each tool, and print its output along with other relevant information.
|
174
|
-
2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
|
175
|
-
3. Your test case MUST run only on the given images which are {media}
|
176
|
-
4. Print this final dictionary.
|
177
|
-
5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
|
178
|
-
"""
|
179
|
-
|
180
|
-
|
181
|
-
PREVIOUS_FAILED = """
|
182
|
-
**Previous Failed Attempts**:
|
183
|
-
You previously ran this code:
|
184
|
-
```python
|
185
|
-
{code}
|
186
|
-
```
|
187
|
-
|
188
|
-
But got the following error or no stdout:
|
189
|
-
{error}
|
190
|
-
"""
|
191
|
-
|
192
|
-
|
193
|
-
PICK_PLAN = """
|
194
|
-
**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
|
195
|
-
|
196
|
-
**Task**: Your responsibility is to pick the best plan from the three plans provided.
|
197
|
-
|
198
|
-
**Context**:
|
199
|
-
{context}
|
200
|
-
|
201
|
-
**Plans**:
|
202
|
-
{plans}
|
203
|
-
|
204
|
-
**Tool Output**:
|
205
|
-
{tool_output}
|
206
|
-
|
207
|
-
**Instructions**:
|
208
|
-
1. Re-read the user request, plans, tool outputs and examine the image.
|
209
|
-
2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
|
210
|
-
3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
|
211
|
-
4. Output a JSON object with the following format:
|
212
|
-
{{
|
213
|
-
"predicted_answer": str # the answer you would expect from the best plan
|
214
|
-
"thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
|
215
|
-
"best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
|
216
|
-
}}
|
217
|
-
"""
|
218
|
-
|
219
16
|
CODE = """
|
220
17
|
**Role**: You are a software programmer.
|
221
18
|
|