vision-agent 0.2.232__py3-none-any.whl → 0.2.234__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent_utils.py +1 -1
- vision_agent/agent/vision_agent_coder_v2.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts_v2.py +127 -64
- vision_agent/agent/vision_agent_planner_v2.py +2 -3
- vision_agent/agent/vision_agent_v2.py +7 -7
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.234.dist-info}/METADATA +1 -1
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.234.dist-info}/RECORD +9 -9
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.234.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.232.dist-info → vision_agent-0.2.234.dist-info}/WHEEL +0 -0
@@ -160,7 +160,7 @@ def format_conversation(chat: List[AgentMessage]) -> str:
|
|
160
160
|
prompt = ""
|
161
161
|
for chat_i in chat:
|
162
162
|
if chat_i.role == "user" or chat_i.role == "coder":
|
163
|
-
if "<final_code>" in chat_i.
|
163
|
+
if "<final_code>" in chat_i.content:
|
164
164
|
prompt += f"OBSERVATION: {chat_i.content}\n\n"
|
165
165
|
elif chat_i.role == "user":
|
166
166
|
prompt += f"USER: {chat_i.content}\n\n"
|
@@ -443,7 +443,7 @@ class VisionAgentCoderV2(AgentCoder):
|
|
443
443
|
|
444
444
|
# we don't need the user_interaction response for generating code since it's
|
445
445
|
# already in the plan context
|
446
|
-
while chat[-1].role != "user":
|
446
|
+
while len(chat) > 0 and chat[-1].role != "user":
|
447
447
|
chat.pop()
|
448
448
|
|
449
449
|
if not chat:
|
@@ -20,7 +20,7 @@ PLAN = """
|
|
20
20
|
3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
|
21
21
|
4. Only load/save files from {media_list} unless you specifically saved the file previously.
|
22
22
|
5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
|
23
|
-
6. Calling `plt.imshow` or `save_image` will display the image to you
|
23
|
+
6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
|
24
24
|
7. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
|
25
25
|
8. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool.
|
26
26
|
9. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
|
@@ -50,7 +50,7 @@ From this aerial view of a busy urban street, it's difficult to clearly see or c
|
|
50
50
|
[suggestion 0]
|
51
51
|
The image is very large and the items you need to detect are small.
|
52
52
|
|
53
|
-
Step 1: You should start by splitting the image into sections and running the detection algorithm on each section:
|
53
|
+
Step 1: You should start by splitting the image into overlapping sections and running the detection algorithm on each section:
|
54
54
|
|
55
55
|
def subdivide_image(image):
|
56
56
|
height, width, _ = image.shape
|
@@ -66,41 +66,96 @@ def subdivide_image(image):
|
|
66
66
|
|
67
67
|
get_tool_for_task('<your prompt here>', subdivide_image(image))
|
68
68
|
|
69
|
-
Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions:
|
70
|
-
|
71
|
-
def
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
69
|
+
Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions; be sure to translate the offset back to the original image:
|
70
|
+
|
71
|
+
def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
|
72
|
+
# Calculate intersection coordinates
|
73
|
+
x1 = max(b1[0], b2[0])
|
74
|
+
y1 = max(b1[1], b2[1])
|
75
|
+
x2 = min(b1[2], b2[2])
|
76
|
+
y2 = min(b1[3], b2[3])
|
77
|
+
|
78
|
+
# Calculate intersection area
|
79
|
+
if x2 < x1 or y2 < y1:
|
80
|
+
return False # No overlap
|
81
|
+
|
82
|
+
intersection = (x2 - x1) * (y2 - y1)
|
83
|
+
|
84
|
+
# Calculate union area
|
85
|
+
area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
|
86
|
+
area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
|
87
|
+
union = area1 + area2 - intersection
|
88
|
+
|
89
|
+
# Calculate IoU
|
90
|
+
iou = intersection / union if union > 0 else 0
|
91
|
+
|
92
|
+
return iou >= iou_threshold
|
93
|
+
|
94
|
+
def merge_bounding_box_list(detections):
|
95
|
+
merged_detections = []
|
96
|
+
for detection in detections:
|
97
|
+
matching_box = None
|
98
|
+
for i, other in enumerate(merged_detections):
|
99
|
+
if bounding_box_match(detection["bbox"], other["bbox"]):
|
100
|
+
matching_box = i
|
97
101
|
break
|
98
|
-
if not overlap_found:
|
99
|
-
p
|
100
|
-
merged_bboxes.append(bbox)
|
101
|
-
return merged_bboxes
|
102
102
|
|
103
|
-
|
103
|
+
if matching_box is not None:
|
104
|
+
# Keep the box with higher confidence score
|
105
|
+
if detection["score"] > merged_detections[matching_box]["score"]:
|
106
|
+
merged_detections[matching_box] = detection
|
107
|
+
else:
|
108
|
+
merged_detections.append(detection)
|
109
|
+
|
110
|
+
def sub_image_to_original(elt, sub_image_position, original_size):
|
111
|
+
offset_x, offset_y = sub_image_position
|
112
|
+
return {
|
113
|
+
"label": elt["label"],
|
114
|
+
"score": elt["score"],
|
115
|
+
"bbox": [
|
116
|
+
(elt["bbox"][0] + offset_x) / original_size[1],
|
117
|
+
(elt["bbox"][1] + offset_y) / original_size[0],
|
118
|
+
(elt["bbox"][2] + offset_x) / original_size[1],
|
119
|
+
(elt["bbox"][3] + offset_y) / original_size[0],
|
120
|
+
],
|
121
|
+
}
|
122
|
+
|
123
|
+
def normalized_to_unnormalized(elt, image_size):
|
124
|
+
return {
|
125
|
+
"label": elt["label"],
|
126
|
+
"score": elt["score"],
|
127
|
+
"bbox": [
|
128
|
+
elt["bbox"][0] * image_size[1],
|
129
|
+
elt["bbox"][1] * image_size[0],
|
130
|
+
elt["bbox"][2] * image_size[1],
|
131
|
+
elt["bbox"][3] * image_size[0],
|
132
|
+
],
|
133
|
+
}
|
134
|
+
|
135
|
+
height, width, _ = image.shape
|
136
|
+
mid_width = width // 2
|
137
|
+
mid_height = height // 2
|
138
|
+
|
139
|
+
detection_from_subdivided_images = []
|
140
|
+
for i, sub_image in enumerate(subdivided_images):
|
141
|
+
detections = <your detection function here>("pedestrian", sub_image)
|
142
|
+
unnorm_detections = [
|
143
|
+
normalized_to_unnormalized(
|
144
|
+
detection, (sub_image.shape[0], sub_image.shape[1])
|
145
|
+
)
|
146
|
+
for detection in detections
|
147
|
+
]
|
148
|
+
offset_x = i % 2 * (mid_width - int(mid_width * 0.1))
|
149
|
+
offset_y = i // 2 * (mid_height - int(mid_height * 0.1))
|
150
|
+
offset_detections = [
|
151
|
+
sub_image_to_original(
|
152
|
+
unnorm_detection, (offset_x, offset_y), (height, width)
|
153
|
+
)
|
154
|
+
for unnorm_detection in unnorm_detections
|
155
|
+
]
|
156
|
+
detection_from_subdivided_images.extend(offset_detections)
|
157
|
+
|
158
|
+
detections = merge_bounding_box_list(detection_from_subdivided_images)
|
104
159
|
[end of suggestion 0]
|
105
160
|
[end of suggestion]
|
106
161
|
<count>9</count>
|
@@ -164,36 +219,44 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float
|
|
164
219
|
|
165
220
|
AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
|
166
221
|
<execute_python>
|
167
|
-
def
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
222
|
+
def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
|
223
|
+
# Calculate intersection coordinates
|
224
|
+
x1 = max(b1[0], b2[0])
|
225
|
+
y1 = max(b1[1], b2[1])
|
226
|
+
x2 = min(b1[2], b2[2])
|
227
|
+
y2 = min(b1[3], b2[3])
|
228
|
+
|
229
|
+
# Calculate intersection area
|
230
|
+
if x2 < x1 or y2 < y1:
|
231
|
+
return False # No overlap
|
232
|
+
|
233
|
+
intersection = (x2 - x1) * (y2 - y1)
|
234
|
+
|
235
|
+
# Calculate union area
|
236
|
+
area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
|
237
|
+
area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
|
238
|
+
union = area1 + area2 - intersection
|
239
|
+
|
240
|
+
# Calculate IoU
|
241
|
+
iou = intersection / union if union > 0 else 0
|
242
|
+
|
243
|
+
return iou >= iou_threshold
|
244
|
+
|
245
|
+
def merge_bounding_box_list(detections):
|
246
|
+
merged_detections = []
|
247
|
+
for detection in detections:
|
248
|
+
matching_box = None
|
249
|
+
for i, other in enumerate(merged_detections):
|
250
|
+
if bounding_box_match(detection["bbox"], other["bbox"]):
|
251
|
+
matching_box = i
|
193
252
|
break
|
194
|
-
|
195
|
-
|
196
|
-
|
253
|
+
|
254
|
+
if matching_box is not None:
|
255
|
+
# Keep the box with higher confidence score
|
256
|
+
if detection["score"] > merged_detections[matching_box]["score"]:
|
257
|
+
merged_detections[matching_box] = detection
|
258
|
+
else:
|
259
|
+
merged_detections.append(detection)
|
197
260
|
|
198
261
|
detections = []
|
199
262
|
for region in subdivide_image(image):
|
@@ -97,8 +97,7 @@ def run_planning(
|
|
97
97
|
media_list: List[Union[str, Path]],
|
98
98
|
model: LMM,
|
99
99
|
) -> str:
|
100
|
-
|
101
|
-
planning = get_planning(chat[-10:])
|
100
|
+
planning = get_planning(chat)
|
102
101
|
prompt = PLAN.format(
|
103
102
|
tool_desc=PLANNING_TOOLS_DOCSTRING,
|
104
103
|
examples=f"{EXAMPLE_PLAN1}\n{EXAMPLE_PLAN2}",
|
@@ -372,7 +371,7 @@ def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]
|
|
372
371
|
function_name = response["function_name"]
|
373
372
|
tool_doc = get_tool_documentation(function_name)
|
374
373
|
if "box_threshold" in response:
|
375
|
-
tool_doc = f"Use the following function with box_threshold={response['box_threshold']}
|
374
|
+
tool_doc = f"Use the following function with box_threshold={response['box_threshold']}. This tool and its parameters were chosen by the user so do not change them in your planning.\n\n{tool_doc}."
|
376
375
|
new_chat.append(AgentMessage(role="observation", content=tool_doc))
|
377
376
|
except (json.JSONDecodeError, KeyError):
|
378
377
|
raise ValueError(f"Invalid JSON in interaction response: {chat_i}")
|
@@ -27,7 +27,7 @@ CONFIG = Config()
|
|
27
27
|
|
28
28
|
|
29
29
|
def extract_conversation(
|
30
|
-
chat: List[AgentMessage],
|
30
|
+
chat: List[AgentMessage], include_conv: bool = False
|
31
31
|
) -> Tuple[List[AgentMessage], Optional[str]]:
|
32
32
|
chat = copy.deepcopy(chat)
|
33
33
|
|
@@ -43,6 +43,8 @@ def extract_conversation(
|
|
43
43
|
elif chat_i.role == "coder":
|
44
44
|
if "<final_code>" in chat_i.content:
|
45
45
|
extracted_chat.append(chat_i)
|
46
|
+
elif include_conv and chat_i.role == "conversation":
|
47
|
+
extracted_chat.append(chat_i)
|
46
48
|
|
47
49
|
# only keep the last <final_code> and <final_test>
|
48
50
|
final_code = None
|
@@ -64,10 +66,9 @@ def extract_conversation(
|
|
64
66
|
|
65
67
|
|
66
68
|
def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
|
67
|
-
extracted_chat, _ = extract_conversation(chat)
|
68
|
-
extracted_chat = extracted_chat[-10:]
|
69
|
+
extracted_chat, _ = extract_conversation(chat, include_conv=True)
|
69
70
|
|
70
|
-
conv = format_conversation(
|
71
|
+
conv = format_conversation(extracted_chat)
|
71
72
|
prompt = CONVERSATION.format(
|
72
73
|
conversation=conv,
|
73
74
|
)
|
@@ -90,8 +91,6 @@ def maybe_run_action(
|
|
90
91
|
code_interpreter: Optional[CodeInterpreter] = None,
|
91
92
|
) -> Optional[List[AgentMessage]]:
|
92
93
|
extracted_chat, final_code = extract_conversation(chat)
|
93
|
-
# only keep last 5 messages to keep context recent and not overwhelm LLM
|
94
|
-
extracted_chat = extracted_chat[-5:]
|
95
94
|
if action == "generate_or_edit_vision_code":
|
96
95
|
# there's an issue here because coder.generate_code will send its code_context
|
97
96
|
# to the outside user via its update_callback, but we don't necessarily have
|
@@ -124,6 +123,7 @@ def maybe_run_action(
|
|
124
123
|
],
|
125
124
|
code="",
|
126
125
|
)
|
126
|
+
|
127
127
|
context = coder.generate_code_from_plan(
|
128
128
|
extracted_chat, plan_context, code_interpreter=code_interpreter
|
129
129
|
)
|
@@ -263,7 +263,7 @@ class VisionAgentV2(Agent):
|
|
263
263
|
# do not append updated_chat to return_chat because the observation
|
264
264
|
# from running the action will have already been added via the callbacks
|
265
265
|
obs_response_context = run_conversation(
|
266
|
-
self.agent, return_chat + updated_chat
|
266
|
+
self.agent, int_chat + return_chat + updated_chat
|
267
267
|
)
|
268
268
|
return_chat.append(
|
269
269
|
AgentMessage(role="conversation", content=obs_response_context)
|
@@ -4,20 +4,20 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
4
4
|
vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
|
5
5
|
vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
|
6
6
|
vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1565
|
7
|
-
vision_agent/agent/agent_utils.py,sha256=
|
7
|
+
vision_agent/agent/agent_utils.py,sha256=4RgG8SUEGuMFHkIt0jCFkRQF6G1PZp3Ub4LuVYKF7Ic,14092
|
8
8
|
vision_agent/agent/types.py,sha256=dIdxATH_PP76pD5Wfo0oofWt6iPQh0vpf48QbEQSzhs,2472
|
9
9
|
vision_agent/agent/vision_agent.py,sha256=fH9NOLk7twL1fPr9vLSqkaYhah-gfDWfTOVF2FfMyzI,23461
|
10
10
|
vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
|
11
11
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=_kkPLezUVnBXieNPlxMQab_6J6P7F-aa6ItF5NhZZsM,12281
|
12
12
|
vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=NUMWq-Lxq5JmmyWs3C5O_1Hm-zCbf9I_yPK5UtWGspE,4871
|
13
|
-
vision_agent/agent/vision_agent_coder_v2.py,sha256=
|
13
|
+
vision_agent/agent/vision_agent_coder_v2.py,sha256=yQYcO0s4BI9pWaAQQAVtkwWa3UF5w0iLKvwpeJ6iegM,17077
|
14
14
|
vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
|
15
15
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
|
16
|
-
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
|
17
|
-
vision_agent/agent/vision_agent_planner_v2.py,sha256=
|
16
|
+
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=JLiFZvwQFLYukgj4l-SzxSIjmJHTEkL2HSZbkjU529w,35591
|
17
|
+
vision_agent/agent/vision_agent_planner_v2.py,sha256=wISmdTN-W1pjgZg3_aneGowI3lRQRHTSbpyeTJ79O5A,20645
|
18
18
|
vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
|
19
19
|
vision_agent/agent/vision_agent_prompts_v2.py,sha256=Wyxa15NOe75PefAfw3_RRwvgjg8YVqCrU7WvvWoYJpk,2733
|
20
|
-
vision_agent/agent/vision_agent_v2.py,sha256=
|
20
|
+
vision_agent/agent/vision_agent_v2.py,sha256=Q96YsLovCUOd6m2Cg7EGCiHshDq65vxHsfVj7IToyls,10936
|
21
21
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
23
23
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -46,7 +46,7 @@ vision_agent/utils/sim.py,sha256=DYya76dYVtifFyXilMLxBzGgyfyeqhEwU4RJ4894lCI,979
|
|
46
46
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
47
47
|
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
48
48
|
vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
|
49
|
-
vision_agent-0.2.
|
50
|
-
vision_agent-0.2.
|
51
|
-
vision_agent-0.2.
|
52
|
-
vision_agent-0.2.
|
49
|
+
vision_agent-0.2.234.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
50
|
+
vision_agent-0.2.234.dist-info/METADATA,sha256=HE_nkLFPBpXjw0zU3A0K3c-c6jt3qsdZvT9FhQy0bw8,5760
|
51
|
+
vision_agent-0.2.234.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
52
|
+
vision_agent-0.2.234.dist-info/RECORD,,
|
File without changes
|
File without changes
|