vision-agent 0.2.224__tar.gz → 0.2.226__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.224 → vision_agent-0.2.226}/PKG-INFO +1 -1
- {vision_agent-0.2.224 → vision_agent-0.2.226}/pyproject.toml +1 -1
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/.sim_tools/df.csv +49 -91
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/agent_utils.py +13 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder_v2.py +6 -1
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner_prompts_v2.py +42 -33
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_v2.py +30 -22
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/tools/planner_tools.py +4 -2
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/tools/tools.py +119 -123
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/utils/sim.py +6 -0
- vision_agent-0.2.226/vision_agent/utils/video_tracking.py +305 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/LICENSE +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/README.md +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/types.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/utils/video.py +0 -0
@@ -65,25 +65,30 @@ desc,doc,name
|
|
65
65
|
},
|
66
66
|
]
|
67
67
|
",owlv2_sam2_instance_segmentation
|
68
|
-
"'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names,
|
69
|
-
'owlv2_sam2_video_tracking' is a tool that can segment multiple
|
70
|
-
prompt such as category names or referring
|
71
|
-
prompt are separated by commas. It returns
|
72
|
-
|
68
|
+
"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
|
69
|
+
'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
|
70
|
+
objects in a video given a text prompt such as category names or referring
|
71
|
+
expressions. The categories in the text prompt are separated by commas. It returns
|
72
|
+
a list of bounding boxes, label names, masks and associated probability scores and
|
73
|
+
is useful for tracking and counting without duplicating counts.
|
73
74
|
|
74
75
|
Parameters:
|
75
76
|
prompt (str): The prompt to ground to the video.
|
76
|
-
|
77
|
+
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
78
|
+
chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
|
79
|
+
new objects.
|
77
80
|
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
78
81
|
fine-tuned model ID here to use it.
|
79
82
|
|
80
83
|
Returns:
|
81
|
-
List[Dict[str, Any]]: A list of dictionaries containing the
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
the
|
84
|
+
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
85
|
+
label, segmentation mask and bounding boxes. The outer list represents each
|
86
|
+
frame and the inner list is the entities per frame. The detected objects
|
87
|
+
have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
|
88
|
+
and ymin are the coordinates of the top-left and xmax and ymax are the
|
89
|
+
coordinates of the bottom-right of the bounding box. The mask is binary 2D
|
90
|
+
numpy array where 1 indicates the object and 0 indicates the background.
|
91
|
+
The label names are prefixed with their ID, which represents the total count.
|
87
92
|
|
88
93
|
Example
|
89
94
|
-------
|
@@ -170,25 +175,28 @@ desc,doc,name
|
|
170
175
|
},
|
171
176
|
]
|
172
177
|
",countgd_sam2_instance_segmentation
|
173
|
-
"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names,
|
174
|
-
'countgd_sam2_video_tracking' is a tool that can segment multiple
|
175
|
-
prompt such as category names or referring
|
176
|
-
prompt are separated by commas. It returns
|
177
|
-
|
178
|
+
"'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
|
179
|
+
'countgd_sam2_video_tracking' is a tool that can track and segment multiple
|
180
|
+
objects in a video given a text prompt such as category names or referring
|
181
|
+
expressions. The categories in the text prompt are separated by commas. It returns
|
182
|
+
a list of bounding boxes, label names, masks and associated probability scores and
|
183
|
+
is useful for tracking and counting without duplicating counts.
|
178
184
|
|
179
185
|
Parameters:
|
180
186
|
prompt (str): The prompt to ground to the video.
|
181
|
-
|
182
|
-
chunk_length (Optional[int]): The number of frames to re-run
|
187
|
+
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
188
|
+
chunk_length (Optional[int]): The number of frames to re-run countgd to find
|
183
189
|
new objects.
|
184
190
|
|
185
191
|
Returns:
|
186
|
-
List[Dict[str, Any]]: A list of dictionaries containing the
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
the
|
192
|
+
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
193
|
+
label, segmentation mask and bounding boxes. The outer list represents each
|
194
|
+
frame and the inner list is the entities per frame. The detected objects
|
195
|
+
have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
|
196
|
+
and ymin are the coordinates of the top-left and xmax and ymax are the
|
197
|
+
coordinates of the bottom-right of the bounding box. The mask is binary 2D
|
198
|
+
numpy array where 1 indicates the object and 0 indicates the background.
|
199
|
+
The label names are prefixed with their ID, which represents the total count.
|
192
200
|
|
193
201
|
Example
|
194
202
|
-------
|
@@ -265,12 +273,12 @@ desc,doc,name
|
|
265
273
|
},
|
266
274
|
]
|
267
275
|
",florence2_sam2_instance_segmentation
|
268
|
-
'florence2_sam2_video_tracking' is a tool that can
|
269
|
-
'florence2_sam2_video_tracking' is a tool that can
|
270
|
-
|
271
|
-
expressions.
|
272
|
-
|
273
|
-
|
276
|
+
"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
|
277
|
+
'florence2_sam2_video_tracking' is a tool that can track and segment multiple
|
278
|
+
objects in a video given a text prompt such as category names or referring
|
279
|
+
expressions. The categories in the text prompt are separated by commas. It returns
|
280
|
+
a list of bounding boxes, label names, masks and associated probability scores and
|
281
|
+
is useful for tracking and counting without duplicating counts.
|
274
282
|
|
275
283
|
Parameters:
|
276
284
|
prompt (str): The prompt to ground to the video.
|
@@ -282,10 +290,13 @@ desc,doc,name
|
|
282
290
|
|
283
291
|
Returns:
|
284
292
|
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
293
|
+
label, segmentation mask and bounding boxes. The outer list represents each
|
294
|
+
frame and the inner list is the entities per frame. The detected objects
|
295
|
+
have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
|
296
|
+
and ymin are the coordinates of the top-left and xmax and ymax are the
|
297
|
+
coordinates of the bottom-right of the bounding box. The mask is binary 2D
|
298
|
+
numpy array where 1 indicates the object and 0 indicates the background.
|
299
|
+
The label names are prefixed with their ID, which represents the total count.
|
289
300
|
|
290
301
|
Example
|
291
302
|
-------
|
@@ -445,43 +456,6 @@ desc,doc,name
|
|
445
456
|
>>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
|
446
457
|
'Lionel Messi'
|
447
458
|
",qwen2_vl_video_vqa
|
448
|
-
"'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects as labels, their regions as masks and their scores.","detr_segmentation(image: numpy.ndarray) -> List[Dict[str, Any]]:
|
449
|
-
'detr_segmentation' is a tool that can segment common objects in an
|
450
|
-
image without any text prompt. It returns a list of detected objects
|
451
|
-
as labels, their regions as masks and their scores.
|
452
|
-
|
453
|
-
Parameters:
|
454
|
-
image (np.ndarray): The image used to segment things and objects
|
455
|
-
|
456
|
-
Returns:
|
457
|
-
List[Dict[str, Any]]: A list of dictionaries containing the score, label
|
458
|
-
and mask of the detected objects. The mask is binary 2D numpy array where 1
|
459
|
-
indicates the object and 0 indicates the background.
|
460
|
-
|
461
|
-
Example
|
462
|
-
-------
|
463
|
-
>>> detr_segmentation(image)
|
464
|
-
[
|
465
|
-
{
|
466
|
-
'score': 0.45,
|
467
|
-
'label': 'window',
|
468
|
-
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
469
|
-
[0, 0, 0, ..., 0, 0, 0],
|
470
|
-
...,
|
471
|
-
[0, 0, 0, ..., 0, 0, 0],
|
472
|
-
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
473
|
-
},
|
474
|
-
{
|
475
|
-
'score': 0.70,
|
476
|
-
'label': 'bird',
|
477
|
-
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
478
|
-
[0, 0, 0, ..., 0, 0, 0],
|
479
|
-
...,
|
480
|
-
[0, 0, 0, ..., 0, 0, 0],
|
481
|
-
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
482
|
-
},
|
483
|
-
]
|
484
|
-
",detr_segmentation
|
485
459
|
'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
|
486
460
|
'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
|
487
461
|
depth image from a given RGB image. The returned depth image is monochrome and
|
@@ -522,22 +496,6 @@ desc,doc,name
|
|
522
496
|
[10, 11, 15, ..., 202, 202, 205],
|
523
497
|
[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
|
524
498
|
",generate_pose_image
|
525
|
-
'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
|
526
|
-
'vit_image_classification' is a tool that can classify an image. It returns a
|
527
|
-
list of classes and their probability scores based on image content.
|
528
|
-
|
529
|
-
Parameters:
|
530
|
-
image (np.ndarray): The image to classify or tag
|
531
|
-
|
532
|
-
Returns:
|
533
|
-
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
534
|
-
contains a list of labels and other a list of scores.
|
535
|
-
|
536
|
-
Example
|
537
|
-
-------
|
538
|
-
>>> vit_image_classification(image)
|
539
|
-
{""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
|
540
|
-
",vit_image_classification
|
541
499
|
'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
|
542
500
|
'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
|
543
501
|
It returns the predicted label and their probability scores based on image content.
|
@@ -566,7 +524,7 @@ desc,doc,name
|
|
566
524
|
prompt (str): The question about the video
|
567
525
|
frames (List[np.ndarray]): The reference frames used for the question
|
568
526
|
model (str): The model to use for the inference. Valid values are
|
569
|
-
'qwen2vl', 'gpt4o'
|
527
|
+
'qwen2vl', 'gpt4o'.
|
570
528
|
chunk_length_frames (Optional[int]): length of each chunk in frames
|
571
529
|
|
572
530
|
Returns:
|
@@ -641,7 +599,7 @@ desc,doc,name
|
|
641
599
|
>>> closest_distance(det1, det2, image_size)
|
642
600
|
141.42
|
643
601
|
",minimum_distance
|
644
|
-
"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float =
|
602
|
+
"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 5) -> List[Dict[str, Union[numpy.ndarray, float]]]:
|
645
603
|
'extract_frames_and_timestamps' extracts frames and timestamps from a video
|
646
604
|
which can be a file path, url or youtube link, returns a list of dictionaries
|
647
605
|
with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is
|
@@ -651,7 +609,7 @@ desc,doc,name
|
|
651
609
|
Parameters:
|
652
610
|
video_uri (Union[str, Path]): The path to the video file, url or youtube link
|
653
611
|
fps (float, optional): The frame rate per second to extract the frames. Defaults
|
654
|
-
to
|
612
|
+
to 5.
|
655
613
|
|
656
614
|
Returns:
|
657
615
|
List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
|
Binary file
|
@@ -153,6 +153,19 @@ def format_plan_v2(plan: PlanContext) -> str:
|
|
153
153
|
return plan_str
|
154
154
|
|
155
155
|
|
156
|
+
def format_conversation(chat: List[AgentMessage]) -> str:
|
157
|
+
chat = copy.deepcopy(chat)
|
158
|
+
prompt = ""
|
159
|
+
for chat_i in chat:
|
160
|
+
if chat_i.role == "user":
|
161
|
+
prompt += f"USER: {chat_i.content}\n\n"
|
162
|
+
elif chat_i.role == "observation" or chat_i.role == "coder":
|
163
|
+
prompt += f"OBSERVATION: {chat_i.content}\n\n"
|
164
|
+
elif chat_i.role == "conversation":
|
165
|
+
prompt += f"AGENT: {chat_i.content}\n\n"
|
166
|
+
return prompt
|
167
|
+
|
168
|
+
|
156
169
|
def format_plans(plans: Dict[str, Any]) -> str:
|
157
170
|
plan_str = ""
|
158
171
|
for k, v in plans.items():
|
{vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_coder_prompts_v2.py
RENAMED
@@ -65,7 +65,7 @@ This is the documentation for the functions you have access to. You may call any
|
|
65
65
|
7. DO NOT assert the output value, run the code and assert only the output format or data structure.
|
66
66
|
8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect.
|
67
67
|
9. DO NOT import the testing function as it will be available in the testing environment.
|
68
|
-
10. Print the output of the function that is being tested.
|
68
|
+
10. Print the output of the function that is being tested and ensure it is not empty.
|
69
69
|
11. Use the output of the function that is being tested as the return value of the testing function.
|
70
70
|
12. Run the testing function in the end and don't assign a variable to its output.
|
71
71
|
13. Output your test code using <code> tags:
|
@@ -202,7 +202,12 @@ def write_and_test_code(
|
|
202
202
|
tool_docs=tool_docs,
|
203
203
|
plan=plan,
|
204
204
|
)
|
205
|
-
|
205
|
+
try:
|
206
|
+
code = strip_function_calls(code)
|
207
|
+
except Exception:
|
208
|
+
# the code may be malformed, this will fail in the exec call and the agent
|
209
|
+
# will attempt to debug it
|
210
|
+
pass
|
206
211
|
test = write_test(
|
207
212
|
tester=tester,
|
208
213
|
chat=chat,
|
{vision_agent-0.2.224 → vision_agent-0.2.226}/vision_agent/agent/vision_agent_planner_prompts_v2.py
RENAMED
@@ -136,8 +136,9 @@ Tool Documentation:
|
|
136
136
|
countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
137
137
|
'countgd_object_detection' is a tool that can detect multiple instances of an
|
138
138
|
object given a text prompt. It is particularly useful when trying to detect and
|
139
|
-
count a large number of objects.
|
140
|
-
|
139
|
+
count a large number of objects. You can optionally separate object names in the
|
140
|
+
prompt with commas. It returns a list of bounding boxes with normalized
|
141
|
+
coordinates, label names and associated confidence scores.
|
141
142
|
|
142
143
|
Parameters:
|
143
144
|
prompt (str): The object that needs to be counted.
|
@@ -272,40 +273,47 @@ OBSERVATION:
|
|
272
273
|
[get_tool_for_task output]
|
273
274
|
For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor.
|
274
275
|
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
276
|
+
Tool Documentation:
|
277
|
+
def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
|
278
|
+
'florence2_sam2_video_tracking' is a tool that can track and segment multiple
|
279
|
+
objects in a video given a text prompt such as category names or referring
|
280
|
+
expressions. The categories in the text prompt are separated by commas. It returns
|
281
|
+
a list of bounding boxes, label names, masks and associated probability scores and
|
282
|
+
is useful for tracking and counting without duplicating counts.
|
280
283
|
|
281
|
-
Parameters:
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
284
|
+
Parameters:
|
285
|
+
prompt (str): The prompt to ground to the video.
|
286
|
+
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
287
|
+
chunk_length (Optional[int]): The number of frames to re-run florence2 to find
|
288
|
+
new objects.
|
289
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
290
|
+
fine-tuned model ID here to use it.
|
286
291
|
|
287
|
-
Returns:
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
292
|
+
Returns:
|
293
|
+
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
294
|
+
label, segmentation mask and bounding boxes. The outer list represents each
|
295
|
+
frame and the inner list is the entities per frame. The detected objects
|
296
|
+
have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
|
297
|
+
and ymin are the coordinates of the top-left and xmax and ymax are the
|
298
|
+
coordinates of the bottom-right of the bounding box. The mask is binary 2D
|
299
|
+
numpy array where 1 indicates the object and 0 indicates the background.
|
300
|
+
The label names are prefixed with their ID, which represents the total count.
|
293
301
|
|
294
|
-
Example
|
295
|
-
-------
|
296
|
-
|
297
|
-
[
|
302
|
+
Example
|
303
|
+
-------
|
304
|
+
>>> florence2_sam2_video_tracking("car, dinosaur", frames)
|
298
305
|
[
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
...,
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
306
|
+
[
|
307
|
+
{
|
308
|
+
'label': '0: dinosaur',
|
309
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
310
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
311
|
+
...,
|
312
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
313
|
+
},
|
314
|
+
],
|
315
|
+
...
|
316
|
+
]
|
309
317
|
[end of get_tool_for_task output]
|
310
318
|
<count>8</count>
|
311
319
|
|
@@ -691,7 +699,8 @@ FINALIZE_PLAN = """
|
|
691
699
|
4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
|
692
700
|
5. Do not include {excluded_tools} tools in your instructions.
|
693
701
|
6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
|
694
|
-
|
702
|
+
7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
|
703
|
+
8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
|
695
704
|
|
696
705
|
<json>
|
697
706
|
{{
|
@@ -1,13 +1,14 @@
|
|
1
1
|
import copy
|
2
2
|
import json
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
4
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
5
5
|
|
6
6
|
from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
|
7
7
|
from vision_agent.agent.agent_utils import (
|
8
8
|
add_media_to_chat,
|
9
9
|
convert_message_to_agentmessage,
|
10
10
|
extract_tag,
|
11
|
+
format_conversation,
|
11
12
|
)
|
12
13
|
from vision_agent.agent.types import (
|
13
14
|
AgentMessage,
|
@@ -22,19 +23,6 @@ from vision_agent.lmm.types import Message
|
|
22
23
|
from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
|
23
24
|
|
24
25
|
|
25
|
-
def format_conversation(chat: List[AgentMessage]) -> str:
|
26
|
-
chat = copy.deepcopy(chat)
|
27
|
-
prompt = ""
|
28
|
-
for chat_i in chat:
|
29
|
-
if chat_i.role == "user":
|
30
|
-
prompt += f"USER: {chat_i.content}\n\n"
|
31
|
-
elif chat_i.role == "observation" or chat_i.role == "coder":
|
32
|
-
prompt += f"OBSERVATION: {chat_i.content}\n\n"
|
33
|
-
elif chat_i.role == "conversation":
|
34
|
-
prompt += f"AGENT: {chat_i.content}\n\n"
|
35
|
-
return prompt
|
36
|
-
|
37
|
-
|
38
26
|
def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
|
39
27
|
# only keep last 10 messages
|
40
28
|
conv = format_conversation(chat[-10:])
|
@@ -55,23 +43,39 @@ def check_for_interaction(chat: List[AgentMessage]) -> bool:
|
|
55
43
|
|
56
44
|
def extract_conversation_for_generate_code(
|
57
45
|
chat: List[AgentMessage],
|
58
|
-
) -> List[AgentMessage]:
|
46
|
+
) -> Tuple[List[AgentMessage], Optional[str]]:
|
59
47
|
chat = copy.deepcopy(chat)
|
60
48
|
|
61
49
|
# if we are in the middle of an interaction, return all the intermediate planning
|
62
50
|
# steps
|
63
51
|
if check_for_interaction(chat):
|
64
|
-
return chat
|
52
|
+
return chat, None
|
65
53
|
|
66
54
|
extracted_chat = []
|
67
55
|
for chat_i in chat:
|
68
56
|
if chat_i.role == "user":
|
69
57
|
extracted_chat.append(chat_i)
|
70
58
|
elif chat_i.role == "coder":
|
71
|
-
if "<final_code>" in chat_i.content
|
59
|
+
if "<final_code>" in chat_i.content:
|
72
60
|
extracted_chat.append(chat_i)
|
73
61
|
|
74
|
-
|
62
|
+
# only keep the last <final_code> and <final_test>
|
63
|
+
final_code = None
|
64
|
+
extracted_chat_strip_code: List[AgentMessage] = []
|
65
|
+
for chat_i in reversed(extracted_chat):
|
66
|
+
if "<final_code>" in chat_i.content and final_code is None:
|
67
|
+
extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
|
68
|
+
final_code = extract_tag(chat_i.content, "final_code")
|
69
|
+
if final_code is not None:
|
70
|
+
test_code = extract_tag(chat_i.content, "final_test")
|
71
|
+
final_code += "\n" + test_code if test_code is not None else ""
|
72
|
+
|
73
|
+
if "<final_code>" in chat_i.content and final_code is not None:
|
74
|
+
continue
|
75
|
+
|
76
|
+
extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
|
77
|
+
|
78
|
+
return extracted_chat_strip_code[-5:], final_code
|
75
79
|
|
76
80
|
|
77
81
|
def maybe_run_action(
|
@@ -81,7 +85,7 @@ def maybe_run_action(
|
|
81
85
|
code_interpreter: Optional[CodeInterpreter] = None,
|
82
86
|
) -> Optional[List[AgentMessage]]:
|
83
87
|
if action == "generate_or_edit_vision_code":
|
84
|
-
extracted_chat = extract_conversation_for_generate_code(chat)
|
88
|
+
extracted_chat, _ = extract_conversation_for_generate_code(chat)
|
85
89
|
# there's an issue here because coder.generate_code will send its code_context
|
86
90
|
# to the outside user via its update_callback, but we don't necessarily have
|
87
91
|
# access to that update_callback here, so we re-create the message using
|
@@ -101,11 +105,15 @@ def maybe_run_action(
|
|
101
105
|
)
|
102
106
|
]
|
103
107
|
elif action == "edit_code":
|
104
|
-
extracted_chat = extract_conversation_for_generate_code(chat)
|
108
|
+
extracted_chat, final_code = extract_conversation_for_generate_code(chat)
|
105
109
|
plan_context = PlanContext(
|
106
110
|
plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
|
107
|
-
instructions=[
|
108
|
-
|
111
|
+
instructions=[
|
112
|
+
chat_i.content
|
113
|
+
for chat_i in extracted_chat
|
114
|
+
if chat_i.role == "user" and "<final_code>" not in chat_i.content
|
115
|
+
],
|
116
|
+
code=final_code if final_code is not None else "",
|
109
117
|
)
|
110
118
|
context = coder.generate_code_from_plan(
|
111
119
|
extracted_chat, plan_context, code_interpreter=code_interpreter
|
@@ -193,8 +193,10 @@ def get_tool_for_task(
|
|
193
193
|
- Depth and pose estimation
|
194
194
|
- Video object tracking
|
195
195
|
|
196
|
-
|
197
|
-
|
196
|
+
Only ask for one type of task at a time, for example a task needing to identify
|
197
|
+
text is one OCR task while needing to identify non-text objects is an OD task. Wait
|
198
|
+
until the documentation is printed to use the function so you know what the input
|
199
|
+
and output signatures are.
|
198
200
|
|
199
201
|
Parameters:
|
200
202
|
task: str: The task to accomplish.
|