vision-agent 0.2.231__py3-none-any.whl → 0.2.233__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +12 -10
- vision_agent/agent/agent_utils.py +5 -3
- vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
- vision_agent/agent/vision_agent_coder_v2.py +16 -2
- vision_agent/agent/vision_agent_planner_prompts_v2.py +138 -71
- vision_agent/agent/vision_agent_planner_v2.py +1 -0
- vision_agent/agent/vision_agent_prompts_v2.py +4 -1
- vision_agent/agent/vision_agent_v2.py +11 -7
- vision_agent/tools/planner_tools.py +33 -13
- vision_agent/tools/tools.py +44 -18
- {vision_agent-0.2.231.dist-info → vision_agent-0.2.233.dist-info}/METADATA +1 -1
- {vision_agent-0.2.231.dist-info → vision_agent-0.2.233.dist-info}/RECORD +14 -14
- {vision_agent-0.2.231.dist-info → vision_agent-0.2.233.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.231.dist-info → vision_agent-0.2.233.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -514,7 +514,7 @@ desc,doc,name
|
|
514
514
|
>>> vit_nsfw_classification(image)
|
515
515
|
{""label"": ""normal"", ""scores"": 0.68},
|
516
516
|
",vit_nsfw_classification
|
517
|
-
'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames:
|
517
|
+
'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
|
518
518
|
'video_temporal_localization' will run qwen2vl on each chunk_length_frames
|
519
519
|
value selected for the video. It can detect multiple objects independently per
|
520
520
|
chunk_length_frames given a text prompt such as a referring expression
|
@@ -527,7 +527,7 @@ desc,doc,name
|
|
527
527
|
frames (List[np.ndarray]): The reference frames used for the question
|
528
528
|
model (str): The model to use for the inference. Valid values are
|
529
529
|
'qwen2vl', 'gpt4o'.
|
530
|
-
chunk_length_frames (
|
530
|
+
chunk_length_frames (int): length of each chunk in frames
|
531
531
|
|
532
532
|
Returns:
|
533
533
|
List[float]: A list of floats with a value of 1.0 if the objects to be found
|
@@ -540,16 +540,18 @@ desc,doc,name
|
|
540
540
|
",video_temporal_localization
|
541
541
|
"'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
|
542
542
|
'flux_image_inpainting' performs image inpainting to fill the masked regions,
|
543
|
-
given by mask, in the image, given image based on the text prompt and surrounding
|
544
|
-
It can be used to edit regions of an image according to the prompt
|
543
|
+
given by mask, in the image, given image based on the text prompt and surrounding
|
544
|
+
image context. It can be used to edit regions of an image according to the prompt
|
545
|
+
given.
|
545
546
|
|
546
547
|
Parameters:
|
547
548
|
prompt (str): A detailed text description guiding what should be generated
|
548
|
-
in the masked area. More detailed and specific prompts typically yield
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
549
|
+
in the masked area. More detailed and specific prompts typically yield
|
550
|
+
better results.
|
551
|
+
image (np.ndarray): The source image to be inpainted. The image will serve as
|
552
|
+
the base context for the inpainting process.
|
553
|
+
mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
|
554
|
+
areas to be inpainted and 0 indicates areas to be preserved.
|
553
555
|
|
554
556
|
Returns:
|
555
557
|
np.ndarray: The generated image(s) as a numpy array in RGB format with values
|
@@ -658,7 +660,7 @@ desc,doc,name
|
|
658
660
|
-------
|
659
661
|
>>> save_image(image)
|
660
662
|
",save_image
|
661
|
-
'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float =
|
663
|
+
'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
|
662
664
|
'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
|
663
665
|
|
664
666
|
Parameters:
|
@@ -148,8 +148,10 @@ def format_plan_v2(plan: PlanContext) -> str:
|
|
148
148
|
plan_str += "Instructions:\n"
|
149
149
|
for v in plan.instructions:
|
150
150
|
plan_str += f" - {v}\n"
|
151
|
-
|
152
|
-
|
151
|
+
|
152
|
+
if plan.code:
|
153
|
+
plan_str += "Code:\n"
|
154
|
+
plan_str += plan.code
|
153
155
|
return plan_str
|
154
156
|
|
155
157
|
|
@@ -158,7 +160,7 @@ def format_conversation(chat: List[AgentMessage]) -> str:
|
|
158
160
|
prompt = ""
|
159
161
|
for chat_i in chat:
|
160
162
|
if chat_i.role == "user" or chat_i.role == "coder":
|
161
|
-
if "<final_code>" in chat_i.
|
163
|
+
if "<final_code>" in chat_i.content:
|
162
164
|
prompt += f"OBSERVATION: {chat_i.content}\n\n"
|
163
165
|
elif chat_i.role == "user":
|
164
166
|
prompt += f"USER: {chat_i.content}\n\n"
|
@@ -6,7 +6,7 @@ FEEDBACK = """
|
|
6
6
|
|
7
7
|
|
8
8
|
CODE = """
|
9
|
-
**Role**: You are an
|
9
|
+
**Role**: You are an expert software programmer.
|
10
10
|
|
11
11
|
**Task**: You are given a plan by a planning agent that solves a vision problem posed by the user. You are also given code snippets that the planning agent used to solve the task. Your job is to organize the code so that it can be easily called by the user to solve the task.
|
12
12
|
|
@@ -425,6 +425,8 @@ class VisionAgentCoderV2(AgentCoder):
|
|
425
425
|
chat (List[AgentMessage]): The input to the agent. This should be a list of
|
426
426
|
AgentMessage objects.
|
427
427
|
plan_context (PlanContext): The plan context that was previously generated.
|
428
|
+
If plan_context.code is not provided, then the code will be generated
|
429
|
+
from the chat messages.
|
428
430
|
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
429
431
|
|
430
432
|
Returns:
|
@@ -441,7 +443,7 @@ class VisionAgentCoderV2(AgentCoder):
|
|
441
443
|
|
442
444
|
# we don't need the user_interaction response for generating code since it's
|
443
445
|
# already in the plan context
|
444
|
-
while chat[-1].role != "user":
|
446
|
+
while len(chat) > 0 and chat[-1].role != "user":
|
445
447
|
chat.pop()
|
446
448
|
|
447
449
|
if not chat:
|
@@ -455,12 +457,24 @@ class VisionAgentCoderV2(AgentCoder):
|
|
455
457
|
int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
|
456
458
|
tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
|
457
459
|
|
460
|
+
# If code is not provided from the plan_context then generate it, else use
|
461
|
+
# the provided code and start with testing
|
462
|
+
if not plan_context.code.strip():
|
463
|
+
code = write_code(
|
464
|
+
coder=self.coder,
|
465
|
+
chat=int_chat,
|
466
|
+
tool_docs=tool_docs,
|
467
|
+
plan=format_plan_v2(plan_context),
|
468
|
+
)
|
469
|
+
else:
|
470
|
+
code = plan_context.code
|
471
|
+
|
458
472
|
code_context = test_code(
|
459
473
|
tester=self.tester,
|
460
474
|
debugger=self.debugger,
|
461
475
|
chat=int_chat,
|
462
476
|
plan=format_plan_v2(plan_context),
|
463
|
-
code=
|
477
|
+
code=code,
|
464
478
|
tool_docs=tool_docs,
|
465
479
|
code_interpreter=code_interpreter,
|
466
480
|
media_list=media_list,
|
@@ -50,7 +50,7 @@ From this aerial view of a busy urban street, it's difficult to clearly see or c
|
|
50
50
|
[suggestion 0]
|
51
51
|
The image is very large and the items you need to detect are small.
|
52
52
|
|
53
|
-
Step 1: You should start by splitting the image into sections and runing the detection algorithm on each section:
|
53
|
+
Step 1: You should start by splitting the image into overlapping sections and runing the detection algorithm on each section:
|
54
54
|
|
55
55
|
def subdivide_image(image):
|
56
56
|
height, width, _ = image.shape
|
@@ -66,41 +66,96 @@ def subdivide_image(image):
|
|
66
66
|
|
67
67
|
get_tool_for_task('<your prompt here>', subdivide_image(image))
|
68
68
|
|
69
|
-
Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions:
|
70
|
-
|
71
|
-
def
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
69
|
+
Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions, be sure to tranlate the offset back to the original image:
|
70
|
+
|
71
|
+
def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
|
72
|
+
# Calculate intersection coordinates
|
73
|
+
x1 = max(b1[0], b2[0])
|
74
|
+
y1 = max(b1[1], b2[1])
|
75
|
+
x2 = min(b1[2], b2[2])
|
76
|
+
y2 = min(b1[3], b2[3])
|
77
|
+
|
78
|
+
# Calculate intersection area
|
79
|
+
if x2 < x1 or y2 < y1:
|
80
|
+
return False # No overlap
|
81
|
+
|
82
|
+
intersection = (x2 - x1) * (y2 - y1)
|
83
|
+
|
84
|
+
# Calculate union area
|
85
|
+
area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
|
86
|
+
area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
|
87
|
+
union = area1 + area2 - intersection
|
88
|
+
|
89
|
+
# Calculate IoU
|
90
|
+
iou = intersection / union if union > 0 else 0
|
91
|
+
|
92
|
+
return iou >= iou_threshold
|
93
|
+
|
94
|
+
def merge_bounding_box_list(detections):
|
95
|
+
merged_detections = []
|
96
|
+
for detection in detections:
|
97
|
+
matching_box = None
|
98
|
+
for i, other in enumerate(merged_detections):
|
99
|
+
if bounding_box_match(detection["bbox"], other["bbox"]):
|
100
|
+
matching_box = i
|
97
101
|
break
|
98
|
-
if not overlap_found:
|
99
|
-
p
|
100
|
-
merged_bboxes.append(bbox)
|
101
|
-
return merged_bboxes
|
102
102
|
|
103
|
-
|
103
|
+
if matching_box is not None:
|
104
|
+
# Keep the box with higher confidence score
|
105
|
+
if detection["score"] > merged_detections[matching_box]["score"]:
|
106
|
+
merged_detections[matching_box] = detection
|
107
|
+
else:
|
108
|
+
merged_detections.append(detection)
|
109
|
+
|
110
|
+
def sub_image_to_original(elt, sub_image_position, original_size):
|
111
|
+
offset_x, offset_y = sub_image_position
|
112
|
+
return {
|
113
|
+
"label": elt["label"],
|
114
|
+
"score": elt["score"],
|
115
|
+
"bbox": [
|
116
|
+
(elt["bbox"][0] + offset_x) / original_size[1],
|
117
|
+
(elt["bbox"][1] + offset_y) / original_size[0],
|
118
|
+
(elt["bbox"][2] + offset_x) / original_size[1],
|
119
|
+
(elt["bbox"][3] + offset_y) / original_size[0],
|
120
|
+
],
|
121
|
+
}
|
122
|
+
|
123
|
+
def normalized_to_unnormalized(elt, image_size):
|
124
|
+
return {
|
125
|
+
"label": elt["label"],
|
126
|
+
"score": elt["score"],
|
127
|
+
"bbox": [
|
128
|
+
elt["bbox"][0] * image_size[1],
|
129
|
+
elt["bbox"][1] * image_size[0],
|
130
|
+
elt["bbox"][2] * image_size[1],
|
131
|
+
elt["bbox"][3] * image_size[0],
|
132
|
+
],
|
133
|
+
}
|
134
|
+
|
135
|
+
height, width, _ = image.shape
|
136
|
+
mid_width = width // 2
|
137
|
+
mid_height = height // 2
|
138
|
+
|
139
|
+
detection_from_subdivided_images = []
|
140
|
+
for i, sub_image in enumerate(subdivided_images):
|
141
|
+
detections = <your detection function here>("pedestrian", sub_image)
|
142
|
+
unnorm_detections = [
|
143
|
+
normalized_to_unnormalized(
|
144
|
+
detection, (sub_image.shape[0], sub_image.shape[1])
|
145
|
+
)
|
146
|
+
for detection in detections
|
147
|
+
]
|
148
|
+
offset_x = i % 2 * (mid_width - int(mid_width * 0.1))
|
149
|
+
offset_y = i // 2 * (mid_height - int(mid_height * 0.1))
|
150
|
+
offset_detections = [
|
151
|
+
sub_image_to_original(
|
152
|
+
unnorm_detection, (offset_x, offset_y), (height, width)
|
153
|
+
)
|
154
|
+
for unnorm_detection in unnorm_detections
|
155
|
+
]
|
156
|
+
detection_from_subdivided_images.extend(offset_detections)
|
157
|
+
|
158
|
+
detections = merge_bounding_box_list(detection_from_subdivided_images)
|
104
159
|
[end of suggestion 0]
|
105
160
|
[end of suggestion]
|
106
161
|
<count>9</count>
|
@@ -164,36 +219,44 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float
|
|
164
219
|
|
165
220
|
AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
|
166
221
|
<execute_python>
|
167
|
-
def
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
222
|
+
def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
|
223
|
+
# Calculate intersection coordinates
|
224
|
+
x1 = max(b1[0], b2[0])
|
225
|
+
y1 = max(b1[1], b2[1])
|
226
|
+
x2 = min(b1[2], b2[2])
|
227
|
+
y2 = min(b1[3], b2[3])
|
228
|
+
|
229
|
+
# Calculate intersection area
|
230
|
+
if x2 < x1 or y2 < y1:
|
231
|
+
return False # No overlap
|
232
|
+
|
233
|
+
intersection = (x2 - x1) * (y2 - y1)
|
234
|
+
|
235
|
+
# Calculate union area
|
236
|
+
area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
|
237
|
+
area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
|
238
|
+
union = area1 + area2 - intersection
|
239
|
+
|
240
|
+
# Calculate IoU
|
241
|
+
iou = intersection / union if union > 0 else 0
|
242
|
+
|
243
|
+
return iou >= iou_threshold
|
244
|
+
|
245
|
+
def merge_bounding_box_list(detections):
|
246
|
+
merged_detections = []
|
247
|
+
for detection in detections:
|
248
|
+
matching_box = None
|
249
|
+
for i, other in enumerate(merged_detections):
|
250
|
+
if bounding_box_match(detection["bbox"], other["bbox"]):
|
251
|
+
matching_box = i
|
193
252
|
break
|
194
|
-
|
195
|
-
|
196
|
-
|
253
|
+
|
254
|
+
if matching_box is not None:
|
255
|
+
# Keep the box with higher confidence score
|
256
|
+
if detection["score"] > merged_detections[matching_box]["score"]:
|
257
|
+
merged_detections[matching_box] = detection
|
258
|
+
else:
|
259
|
+
merged_detections.append(detection)
|
197
260
|
|
198
261
|
detections = []
|
199
262
|
for region in subdivide_image(image):
|
@@ -458,6 +521,8 @@ You are given a task: "{task}" from the user. You must extract the type of categ
|
|
458
521
|
- "DocQA" - answering questions about a document or extracting information from a document.
|
459
522
|
- "video object tracking" - tracking objects in a video.
|
460
523
|
- "depth and pose estimation" - estimating the depth or pose of objects in an image.
|
524
|
+
- "temporal localization" - localizing the time period an event occurs in a video.
|
525
|
+
- "inpainting" - filling in masked parts of an image.
|
461
526
|
|
462
527
|
Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
|
463
528
|
"""
|
@@ -651,22 +716,24 @@ PICK_TOOL = """
|
|
651
716
|
"""
|
652
717
|
|
653
718
|
FINALIZE_PLAN = """
|
654
|
-
**
|
719
|
+
**Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent to write a program to accomplish the user request.
|
655
720
|
|
656
|
-
**
|
721
|
+
**Documentation**: You can use these tools to help you visualize or save the output:
|
722
|
+
{tool_desc}
|
657
723
|
|
658
724
|
**Planning**: Here is chain of thoughts, executions and observations from the planning agent:
|
659
725
|
{planning}
|
660
726
|
|
661
727
|
**Instructions**:
|
662
728
|
1. Summarize the plan that the planning agent found.
|
663
|
-
2. Write a single function that solves the problem based on what the planner found.
|
664
|
-
3.
|
729
|
+
2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
|
730
|
+
3. Only use tools obtained from calling `get_tool_for_task`.
|
665
731
|
4. Do not include {excluded_tools} tools in your instructions.
|
666
|
-
5.
|
667
|
-
6.
|
668
|
-
7.
|
669
|
-
8.
|
732
|
+
5. Ensure the function is well documented and easy to understand.
|
733
|
+
6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
|
734
|
+
7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
|
735
|
+
8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
|
736
|
+
9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
|
670
737
|
|
671
738
|
<json>
|
672
739
|
{{
|
@@ -326,6 +326,7 @@ def create_finalize_plan(
|
|
326
326
|
return [], PlanContext(plan="", instructions=[], code="")
|
327
327
|
|
328
328
|
prompt = FINALIZE_PLAN.format(
|
329
|
+
tool_desc=UTIL_DOCSTRING,
|
329
330
|
planning=get_planning(chat),
|
330
331
|
excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
|
331
332
|
)
|
@@ -42,6 +42,8 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
|
|
42
42
|
- Understanding documents
|
43
43
|
- Pose estimation
|
44
44
|
- Visual question answering for both images and videos
|
45
|
+
- Action recognition in videos
|
46
|
+
- Image inpainting
|
45
47
|
|
46
48
|
How can I help you?</response>
|
47
49
|
--- END EXAMPLE2 ---
|
@@ -54,7 +56,8 @@ Here is the current conversation so far:
|
|
54
56
|
|
55
57
|
**Instructions**:
|
56
58
|
1. Only respond with a single <response> tag and a single <action> tag.
|
57
|
-
2.
|
59
|
+
2. You can only take one action at a time in response to the user's message. Do not offer to fix code on the user's behalf, only if they have directly asked you to.
|
60
|
+
3. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
|
58
61
|
|
59
62
|
<response>Your response to the user's message</response>
|
60
63
|
<action>The action you want to take from **Actions**</action>
|
@@ -27,7 +27,7 @@ CONFIG = Config()
|
|
27
27
|
|
28
28
|
|
29
29
|
def extract_conversation(
|
30
|
-
chat: List[AgentMessage],
|
30
|
+
chat: List[AgentMessage], include_conv: bool = False
|
31
31
|
) -> Tuple[List[AgentMessage], Optional[str]]:
|
32
32
|
chat = copy.deepcopy(chat)
|
33
33
|
|
@@ -43,6 +43,8 @@ def extract_conversation(
|
|
43
43
|
elif chat_i.role == "coder":
|
44
44
|
if "<final_code>" in chat_i.content:
|
45
45
|
extracted_chat.append(chat_i)
|
46
|
+
elif include_conv and chat_i.role == "conversation":
|
47
|
+
extracted_chat.append(chat_i)
|
46
48
|
|
47
49
|
# only keep the last <final_code> and <final_test>
|
48
50
|
final_code = None
|
@@ -64,10 +66,9 @@ def extract_conversation(
|
|
64
66
|
|
65
67
|
|
66
68
|
def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
|
67
|
-
extracted_chat, _ = extract_conversation(chat)
|
68
|
-
extracted_chat = extracted_chat[-10:]
|
69
|
+
extracted_chat, _ = extract_conversation(chat, include_conv=True)
|
69
70
|
|
70
|
-
conv = format_conversation(
|
71
|
+
conv = format_conversation(extracted_chat)
|
71
72
|
prompt = CONVERSATION.format(
|
72
73
|
conversation=conv,
|
73
74
|
)
|
@@ -112,14 +113,17 @@ def maybe_run_action(
|
|
112
113
|
)
|
113
114
|
]
|
114
115
|
elif action == "edit_code":
|
116
|
+
# We don't want to pass code in plan_context.code so the coder will generate
|
117
|
+
# new code from plan_context.plan
|
115
118
|
plan_context = PlanContext(
|
116
|
-
plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
|
119
|
+
plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
|
120
|
+
+ ("<code>\n" + final_code + "\n</code>" if final_code is not None else ""),
|
117
121
|
instructions=[
|
118
122
|
chat_i.content
|
119
123
|
for chat_i in extracted_chat
|
120
124
|
if chat_i.role == "user" and "<final_code>" not in chat_i.content
|
121
125
|
],
|
122
|
-
code=
|
126
|
+
code="",
|
123
127
|
)
|
124
128
|
context = coder.generate_code_from_plan(
|
125
129
|
extracted_chat, plan_context, code_interpreter=code_interpreter
|
@@ -260,7 +264,7 @@ class VisionAgentV2(Agent):
|
|
260
264
|
# do not append updated_chat to return_chat becuase the observation
|
261
265
|
# from running the action will have already been added via the callbacks
|
262
266
|
obs_response_context = run_conversation(
|
263
|
-
self.agent, return_chat + updated_chat
|
267
|
+
self.agent, int_chat + return_chat + updated_chat
|
264
268
|
)
|
265
269
|
return_chat.append(
|
266
270
|
AgentMessage(role="conversation", content=obs_response_context)
|
@@ -2,7 +2,7 @@ import inspect
|
|
2
2
|
import logging
|
3
3
|
import tempfile
|
4
4
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
5
|
-
from typing import Any, Callable, Dict, List, Optional, Tuple, cast
|
5
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
6
6
|
|
7
7
|
import libcst as cst
|
8
8
|
import numpy as np
|
@@ -235,7 +235,9 @@ def run_tool_testing(
|
|
235
235
|
|
236
236
|
|
237
237
|
def get_tool_for_task(
|
238
|
-
task: str,
|
238
|
+
task: str,
|
239
|
+
images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
|
240
|
+
exclude_tools: Optional[List[str]] = None,
|
239
241
|
) -> None:
|
240
242
|
"""Given a task and one or more images this function will find a tool to accomplish
|
241
243
|
the jobs. It prints the tool documentation and thoughts on why it chose the tool.
|
@@ -248,6 +250,8 @@ def get_tool_for_task(
|
|
248
250
|
- VQA
|
249
251
|
- Depth and pose estimation
|
250
252
|
- Video object tracking
|
253
|
+
- Video temporal localization (action recognition)
|
254
|
+
- Image inpainting
|
251
255
|
|
252
256
|
Only ask for one type of task at a time, for example a task needing to identify
|
253
257
|
text is one OCR task while needing to identify non-text objects is an OD task. Wait
|
@@ -256,7 +260,8 @@ def get_tool_for_task(
|
|
256
260
|
|
257
261
|
Parameters:
|
258
262
|
task: str: The task to accomplish.
|
259
|
-
images: List[np.ndarray]: The images to use
|
263
|
+
images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]]: The images to use
|
264
|
+
for the task. If a key is provided, it is used as the file name.
|
260
265
|
exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
|
261
266
|
recommendations. This is helpful if you are calling get_tool_for_task twice
|
262
267
|
and do not want the same tool recommended.
|
@@ -266,20 +271,29 @@ def get_tool_for_task(
|
|
266
271
|
|
267
272
|
Examples
|
268
273
|
--------
|
269
|
-
>>> get_tool_for_task(
|
274
|
+
>>> get_tool_for_task(
|
275
|
+
>>> "Give me an OCR model that can find 'hot chocolate' in the image",
|
276
|
+
>>> {"image": [image]})
|
277
|
+
>>> get_tool_for_taks(
|
278
|
+
>>> "I need a tool that can paint a background for this image and maks",
|
279
|
+
>>> {"image": [image], "mask": [mask]})
|
270
280
|
"""
|
271
281
|
tool_tester = CONFIG.create_tool_tester()
|
272
282
|
tool_chooser = CONFIG.create_tool_chooser()
|
273
283
|
|
284
|
+
if isinstance(images, list):
|
285
|
+
images = {"image": images}
|
286
|
+
|
274
287
|
with (
|
275
288
|
tempfile.TemporaryDirectory() as tmpdirname,
|
276
289
|
CodeInterpreterFactory.new_instance() as code_interpreter,
|
277
290
|
):
|
278
291
|
image_paths = []
|
279
|
-
for
|
280
|
-
|
281
|
-
|
282
|
-
|
292
|
+
for k in images.keys():
|
293
|
+
for i, image in enumerate(images[k]):
|
294
|
+
image_path = f"{tmpdirname}/{k}_{i}.png"
|
295
|
+
Image.fromarray(image).save(image_path)
|
296
|
+
image_paths.append(image_path)
|
283
297
|
|
284
298
|
code, tool_docs_str, tool_output = run_tool_testing(
|
285
299
|
task, image_paths, tool_tester, exclude_tools, code_interpreter
|
@@ -300,20 +314,26 @@ def get_tool_documentation(tool_name: str) -> str:
|
|
300
314
|
|
301
315
|
|
302
316
|
def get_tool_for_task_human_reviewer(
|
303
|
-
task: str,
|
317
|
+
task: str,
|
318
|
+
images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
|
319
|
+
exclude_tools: Optional[List[str]] = None,
|
304
320
|
) -> None:
|
305
321
|
# NOTE: this will have the same documentation as get_tool_for_task
|
306
322
|
tool_tester = CONFIG.create_tool_tester()
|
307
323
|
|
324
|
+
if isinstance(images, list):
|
325
|
+
images = {"image": images}
|
326
|
+
|
308
327
|
with (
|
309
328
|
tempfile.TemporaryDirectory() as tmpdirname,
|
310
329
|
CodeInterpreterFactory.new_instance() as code_interpreter,
|
311
330
|
):
|
312
331
|
image_paths = []
|
313
|
-
for
|
314
|
-
|
315
|
-
|
316
|
-
|
332
|
+
for k in images.keys():
|
333
|
+
for i, image in enumerate(images[k]):
|
334
|
+
image_path = f"{tmpdirname}/{k}_{i}.png"
|
335
|
+
Image.fromarray(image).save(image_path)
|
336
|
+
image_paths.append(image_path)
|
317
337
|
|
318
338
|
tools = [
|
319
339
|
t.__name__
|
vision_agent/tools/tools.py
CHANGED
@@ -1727,22 +1727,46 @@ def video_temporal_localization(
|
|
1727
1727
|
}
|
1728
1728
|
payload["chunk_length_frames"] = chunk_length_frames
|
1729
1729
|
|
1730
|
-
|
1731
|
-
|
1732
|
-
|
1730
|
+
segments = split_frames_into_segments(frames, segment_size=50, overlap=0)
|
1731
|
+
|
1732
|
+
def _apply_temporal_localization(
|
1733
|
+
segment: List[np.ndarray],
|
1734
|
+
) -> List[float]:
|
1735
|
+
segment_buffer_bytes = [("video", frames_to_bytes(segment))]
|
1736
|
+
data = send_inference_request(
|
1737
|
+
payload, "video-temporal-localization", files=segment_buffer_bytes, v2=True
|
1738
|
+
)
|
1739
|
+
chunked_data = [cast(float, value) for value in data]
|
1740
|
+
|
1741
|
+
full_data = []
|
1742
|
+
for value in chunked_data:
|
1743
|
+
full_data.extend([value] * chunk_length_frames)
|
1744
|
+
|
1745
|
+
return full_data[: len(segment)]
|
1746
|
+
|
1747
|
+
with ThreadPoolExecutor() as executor:
|
1748
|
+
futures = {
|
1749
|
+
executor.submit(_apply_temporal_localization, segment): segment_index
|
1750
|
+
for segment_index, segment in enumerate(segments)
|
1751
|
+
}
|
1752
|
+
|
1753
|
+
localization_per_segment = []
|
1754
|
+
for future in as_completed(futures):
|
1755
|
+
segment_index = futures[future]
|
1756
|
+
localization_per_segment.append((segment_index, future.result()))
|
1757
|
+
|
1758
|
+
localization_per_segment = [
|
1759
|
+
x[1] for x in sorted(localization_per_segment, key=lambda x: x[0]) # type: ignore
|
1760
|
+
]
|
1761
|
+
localizations = cast(List[float], [e for o in localization_per_segment for e in o])
|
1762
|
+
|
1733
1763
|
_display_tool_trace(
|
1734
1764
|
video_temporal_localization.__name__,
|
1735
1765
|
payload,
|
1736
|
-
|
1766
|
+
localization_per_segment,
|
1737
1767
|
files,
|
1738
1768
|
)
|
1739
|
-
|
1740
|
-
|
1741
|
-
full_data = []
|
1742
|
-
for value in chunked_data:
|
1743
|
-
full_data.extend([value] * chunk_length_frames)
|
1744
|
-
|
1745
|
-
return full_data[: len(frames)]
|
1769
|
+
return localizations
|
1746
1770
|
|
1747
1771
|
|
1748
1772
|
def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
|
@@ -2028,16 +2052,18 @@ def flux_image_inpainting(
|
|
2028
2052
|
mask: np.ndarray,
|
2029
2053
|
) -> np.ndarray:
|
2030
2054
|
"""'flux_image_inpainting' performs image inpainting to fill the masked regions,
|
2031
|
-
given by mask, in the image, given image based on the text prompt and surrounding
|
2032
|
-
It can be used to edit regions of an image according to the prompt
|
2055
|
+
given by mask, in the image, given image based on the text prompt and surrounding
|
2056
|
+
image context. It can be used to edit regions of an image according to the prompt
|
2057
|
+
given.
|
2033
2058
|
|
2034
2059
|
Parameters:
|
2035
2060
|
prompt (str): A detailed text description guiding what should be generated
|
2036
|
-
in the masked area. More detailed and specific prompts typically yield
|
2037
|
-
|
2038
|
-
|
2039
|
-
|
2040
|
-
|
2061
|
+
in the masked area. More detailed and specific prompts typically yield
|
2062
|
+
better results.
|
2063
|
+
image (np.ndarray): The source image to be inpainted. The image will serve as
|
2064
|
+
the base context for the inpainting process.
|
2065
|
+
mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
|
2066
|
+
areas to be inpainted and 0 indicates areas to be preserved.
|
2041
2067
|
|
2042
2068
|
Returns:
|
2043
2069
|
np.ndarray: The generated image(s) as a numpy array in RGB format with values
|
@@ -1,23 +1,23 @@
|
|
1
|
-
vision_agent/.sim_tools/df.csv,sha256=
|
1
|
+
vision_agent/.sim_tools/df.csv,sha256=oVUuyoVTCnayorbGUAvWed8l1YA_-rF9rSF78fMtvuU,42468
|
2
2
|
vision_agent/.sim_tools/embs.npy,sha256=YJe8EcKVNmeX_75CS2T1sbY-sUS_1HQAMT-34zc18a0,254080
|
3
3
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
4
4
|
vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
|
5
5
|
vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
|
6
6
|
vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1565
|
7
|
-
vision_agent/agent/agent_utils.py,sha256=
|
7
|
+
vision_agent/agent/agent_utils.py,sha256=4RgG8SUEGuMFHkIt0jCFkRQF6G1PZp3Ub4LuVYKF7Ic,14092
|
8
8
|
vision_agent/agent/types.py,sha256=dIdxATH_PP76pD5Wfo0oofWt6iPQh0vpf48QbEQSzhs,2472
|
9
9
|
vision_agent/agent/vision_agent.py,sha256=fH9NOLk7twL1fPr9vLSqkaYhah-gfDWfTOVF2FfMyzI,23461
|
10
10
|
vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
|
11
11
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=_kkPLezUVnBXieNPlxMQab_6J6P7F-aa6ItF5NhZZsM,12281
|
12
|
-
vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=
|
13
|
-
vision_agent/agent/vision_agent_coder_v2.py,sha256=
|
12
|
+
vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=NUMWq-Lxq5JmmyWs3C5O_1Hm-zCbf9I_yPK5UtWGspE,4871
|
13
|
+
vision_agent/agent/vision_agent_coder_v2.py,sha256=yQYcO0s4BI9pWaAQQAVtkwWa3UF5w0iLKvwpeJ6iegM,17077
|
14
14
|
vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
|
15
15
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
|
16
|
-
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
|
17
|
-
vision_agent/agent/vision_agent_planner_v2.py,sha256=
|
16
|
+
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=U88z1Y7CifFs7t53aUrl8qjWtBYs0f_F5vyg_0VYJko,35528
|
17
|
+
vision_agent/agent/vision_agent_planner_v2.py,sha256=NUyi57zxCmOO004_cJcCCDa4UgcKSWB1WCGuyOhhXQE,20602
|
18
18
|
vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
|
19
|
-
vision_agent/agent/vision_agent_prompts_v2.py,sha256=
|
20
|
-
vision_agent/agent/vision_agent_v2.py,sha256=
|
19
|
+
vision_agent/agent/vision_agent_prompts_v2.py,sha256=Wyxa15NOe75PefAfw3_RRwvgjg8YVqCrU7WvvWoYJpk,2733
|
20
|
+
vision_agent/agent/vision_agent_v2.py,sha256=86_pPdkkMBk08TTFZ7zu9QG37Iz9uI8Nmt79wwm_EIA,11053
|
21
21
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
23
23
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -33,10 +33,10 @@ vision_agent/lmm/lmm.py,sha256=arwfYPWme_RxCxSpEQ0ZkpHO22GFPCwVeoSvXqLPOAk,19288
|
|
33
33
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
34
34
|
vision_agent/tools/__init__.py,sha256=zopUrANPx7p0NGy6BxmEaYhDrj8DX8w7BLfgmCbz-mU,2897
|
35
35
|
vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
|
36
|
-
vision_agent/tools/planner_tools.py,sha256=
|
36
|
+
vision_agent/tools/planner_tools.py,sha256=8pJZCGGOGIqGiV2or52BjyRP6eDlporuQ2hXCIHfLTQ,15382
|
37
37
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
38
38
|
vision_agent/tools/tool_utils.py,sha256=xJRWF96Ge9RvhhVHrOtifjUYoc4HIJ2y7c2VOQ2Lp8s,10152
|
39
|
-
vision_agent/tools/tools.py,sha256=
|
39
|
+
vision_agent/tools/tools.py,sha256=Eb2paiXjik0HyGeZzXctTpJCLG0V3NnNL9awtaB8HN4,107011
|
40
40
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
41
41
|
vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
|
42
42
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -46,7 +46,7 @@ vision_agent/utils/sim.py,sha256=DYya76dYVtifFyXilMLxBzGgyfyeqhEwU4RJ4894lCI,979
|
|
46
46
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
47
47
|
vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
|
48
48
|
vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
|
49
|
-
vision_agent-0.2.
|
50
|
-
vision_agent-0.2.
|
51
|
-
vision_agent-0.2.
|
52
|
-
vision_agent-0.2.
|
49
|
+
vision_agent-0.2.233.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
50
|
+
vision_agent-0.2.233.dist-info/METADATA,sha256=EoNuerRth0lHRC7TK2Xh7w6V__YtUJraKk9yN8AMx2U,5760
|
51
|
+
vision_agent-0.2.233.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
52
|
+
vision_agent-0.2.233.dist-info/RECORD,,
|
File without changes
|
File without changes
|