vision-agent 0.2.225__py3-none-any.whl → 0.2.227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +49 -91
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/agent_utils.py +13 -0
- vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
- vision_agent/agent/vision_agent_coder_v2.py +6 -1
- vision_agent/agent/vision_agent_planner_prompts_v2.py +42 -33
- vision_agent/agent/vision_agent_v2.py +30 -22
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/planner_tools.py +4 -2
- vision_agent/tools/tools.py +186 -37
- vision_agent/utils/sim.py +6 -0
- vision_agent/utils/video_tracking.py +1 -0
- {vision_agent-0.2.225.dist-info → vision_agent-0.2.227.dist-info}/METADATA +1 -1
- {vision_agent-0.2.225.dist-info → vision_agent-0.2.227.dist-info}/RECORD +16 -16
- {vision_agent-0.2.225.dist-info → vision_agent-0.2.227.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.225.dist-info → vision_agent-0.2.227.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -65,25 +65,30 @@ desc,doc,name
     },
 ]
 ",owlv2_sam2_instance_segmentation
-"'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names,
-    'owlv2_sam2_video_tracking' is a tool that can segment multiple
-    prompt such as category names or referring
-    prompt are separated by commas. It returns
-
+"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+    'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
+            new objects.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
             fine-tuned model ID here to use it.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the
-
-
-
-
-            the
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -170,25 +175,28 @@ desc,doc,name
     },
 ]
 ",countgd_sam2_instance_segmentation
-"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names,
-    'countgd_sam2_video_tracking' is a tool that can segment multiple
-    prompt such as category names or referring
-    prompt are separated by commas. It returns
-
+"'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+    'countgd_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-
-        chunk_length (Optional[int]): The number of frames to re-run
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run countgd to find
             new objects.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the
-
-
-
-
-            the
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -265,12 +273,12 @@ desc,doc,name
     },
 ]
 ",florence2_sam2_instance_segmentation
-'florence2_sam2_video_tracking' is a tool that can
-    'florence2_sam2_video_tracking' is a tool that can
-
-    expressions.
-
-
+"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+    'florence2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -282,10 +290,13 @@ desc,doc,name
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-
-
-
-
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -445,43 +456,6 @@ desc,doc,name
     >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
     'Lionel Messi'
 ",qwen2_vl_video_vqa
-"'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects as labels, their regions as masks and their scores.","detr_segmentation(image: numpy.ndarray) -> List[Dict[str, Any]]:
-    'detr_segmentation' is a tool that can segment common objects in an
-    image without any text prompt. It returns a list of detected objects
-    as labels, their regions as masks and their scores.
-
-    Parameters:
-        image (np.ndarray): The image used to segment things and objects
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label
-        and mask of the detected objects. The mask is binary 2D numpy array where 1
-        indicates the object and 0 indicates the background.
-
-    Example
-    -------
-    >>> detr_segmentation(image)
-    [
-        {
-            'score': 0.45,
-            'label': 'window',
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-        {
-            'score': 0.70,
-            'label': 'bird',
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-    ]
-    ",detr_segmentation
 'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intesities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
     'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
     depth image from a given RGB image. The returned depth image is monochrome and
@@ -522,22 +496,6 @@ desc,doc,name
         [10, 11, 15, ..., 202, 202, 205],
         [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
 ",generate_pose_image
-'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
-    'vit_image_classification' is a tool that can classify an image. It returns a
-    list of classes and their probability scores based on image content.
-
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-        contains a list of labels and other a list of scores.
-
-    Example
-    -------
-    >>> vit_image_classification(image)
-    {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-    ",vit_image_classification
 'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
     'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
     It returns the predicted label and their probability scores based on image content.
@@ -566,7 +524,7 @@ desc,doc,name
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
-            'qwen2vl', 'gpt4o'
+            'qwen2vl', 'gpt4o'.
         chunk_length_frames (Optional[int]): length of each chunk in frames
 
     Returns:
@@ -641,7 +599,7 @@ desc,doc,name
     >>> closest_distance(det1, det2, image_size)
     141.42
 ",minimum_distance
-"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float =
+"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 5) -> List[Dict[str, Union[numpy.ndarray, float]]]:
     'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries
     with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is
@@ -651,7 +609,7 @@ desc,doc,name
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to
+            to 5.
 
     Returns:
         List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
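
The repaired df.csv entries above describe the tool APIs the planner retrieves by similarity search. For orientation, a minimal usage sketch of the corrected video-tracking signature, assuming the tools are re-exported from vision_agent.tools as in this package's layout ("workers.mp4" is a hypothetical input):

```python
from vision_agent.tools import extract_frames_and_timestamps, owlv2_sam2_video_tracking

# fps defaults to 5 per the fixed extract_frames_and_timestamps docstring
frames = [f["frame"] for f in extract_frames_and_timestamps("workers.mp4")]

# one inner list of detections per frame; labels carry an "ID:" prefix for tracking
detections = owlv2_sam2_video_tracking("person, helmet", frames, chunk_length=10)
for frame_dets in detections:
    for det in frame_dets:
        print(det["label"], det["bbox"])  # bbox normalized to (xmin, ymin, xmax, ymax)
```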
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file

vision_agent/agent/agent_utils.py
CHANGED
@@ -153,6 +153,19 @@ def format_plan_v2(plan: PlanContext) -> str:
     return plan_str
 
 
+def format_conversation(chat: List[AgentMessage]) -> str:
+    chat = copy.deepcopy(chat)
+    prompt = ""
+    for chat_i in chat:
+        if chat_i.role == "user":
+            prompt += f"USER: {chat_i.content}\n\n"
+        elif chat_i.role == "observation" or chat_i.role == "coder":
+            prompt += f"OBSERVATION: {chat_i.content}\n\n"
+        elif chat_i.role == "conversation":
+            prompt += f"AGENT: {chat_i.content}\n\n"
+    return prompt
+
+
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
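
The format_conversation helper (moved here from vision_agent_v2.py, see below) flattens an AgentMessage list into USER/OBSERVATION/AGENT blocks. A minimal sketch, assuming AgentMessage can be built from just role and content (any extra required fields are omitted here):

```python
from vision_agent.agent.agent_utils import format_conversation
from vision_agent.agent.types import AgentMessage

chat = [
    AgentMessage(role="user", content="Count the boxes in conveyor.mp4"),
    AgentMessage(role="conversation", content="I will track the boxes first."),
    AgentMessage(role="observation", content="florence2_sam2_video_tracking found 1 box."),
]
# roles map to USER / AGENT / OBSERVATION prefixes, each block separated by a blank line
print(format_conversation(chat))
```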

vision_agent/agent/vision_agent_coder_prompts_v2.py
CHANGED
@@ -65,7 +65,7 @@ This is the documentation for the functions you have access to. You may call any
 7. DO NOT assert the output value, run the code and assert only the output format or data structure.
 8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect.
 9. DO NOT import the testing function as it will available in the testing environment.
-10. Print the output of the function that is being tested.
+10. Print the output of the function that is being tested and ensure it is not empty.
 11. Use the output of the function that is being tested as the return value of the testing function.
 12. Run the testing function in the end and don't assign a variable to its output.
 13. Output your test code using <code> tags:
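
A toy test in the shape these rules ask for (detect_objects and its output are hypothetical stand-ins for the function under test):

```python
def test_detect_objects():
    dets = detect_objects("cars.jpg")  # hypothetical function under test
    print(dets)                        # rule 10: print the output, ensure it is not empty
    assert isinstance(dets, list)      # rule 7: assert format, not values
    return dets                        # rule 11: return the tested output

test_detect_objects()                  # rule 12: run it without assigning the result
```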

vision_agent/agent/vision_agent_coder_v2.py
CHANGED
@@ -202,7 +202,12 @@ def write_and_test_code(
         tool_docs=tool_docs,
         plan=plan,
     )
-
+    try:
+        code = strip_function_calls(code)
+    except Exception:
+        # the code may be malformatted, this will fail in the exec call and the agent
+        # will attempt to debug it
+        pass
     test = write_test(
         tester=tester,
         chat=chat,
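
The new try/except keeps malformed generated code moving toward execution, where the agent's existing debug loop handles the failure. The same defensive pattern in isolation (this strip_function_calls is a stand-in for the package helper shown in the diff, whose exact behavior is not reproduced here):

```python
import ast

def strip_function_calls(code: str) -> str:
    # stand-in: parsing raises SyntaxError on malformed code, as the real helper may
    ast.parse(code)
    return code

def prepare_code(code: str) -> str:
    try:
        return strip_function_calls(code)
    except Exception:
        # malformed code falls through unchanged; the exec step will surface the error
        return code
```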

vision_agent/agent/vision_agent_planner_prompts_v2.py
CHANGED
@@ -136,8 +136,9 @@ Tool Documentation:
 countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
     'countgd_object_detection' is a tool that can detect multiple instances of an
     object given a text prompt. It is particularly useful when trying to detect and
-    count a large number of objects.
-
+    count a large number of objects. You can optionally separate object names in the
+    prompt with commas. It returns a list of bounding boxes with normalized
+    coordinates, label names and associated confidence scores.
 
     Parameters:
         prompt (str): The object that needs to be counted.
@@ -272,40 +273,47 @@ OBSERVATION:
 [get_tool_for_task output]
 For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor.
 
-
-
-
-
-
+Tool Documentation:
+def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+    'florence2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
-    Parameters:
-
-
-
-
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
-    Returns:
-
-
-
-
-
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
 
-    Example
-    -------
-
-    [
+    Example
+    -------
+    >>> florence2_sam2_video_tracking("car, dinosaur", frames)
     [
-
-
-
-
-        ...,
-
-
-
-
-
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
 [end of get_tool_for_task output]
 <count>8</count>
 
@@ -691,7 +699,8 @@ FINALIZE_PLAN = """
 4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
 5. Do not include {excluded_tools} tools in your instructions.
 6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
-
+7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
 
 <json>
 {{
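
The expanded countgd_object_detection documentation above is enough to call the tool directly. A hedged sketch, assuming it is importable from vision_agent.tools and that "street.jpg" is a hypothetical local file (the "label"/"score"/"bbox" keys follow the detection examples elsewhere in this diff):

```python
import cv2
from vision_agent.tools import countgd_object_detection

image = cv2.imread("street.jpg")  # hypothetical input image as a numpy array
detections = countgd_object_detection("car, truck", image)  # box_threshold defaults to 0.23
for det in detections:
    print(det["label"], det["score"], det["bbox"])  # normalized coordinates
```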

vision_agent/agent/vision_agent_v2.py
CHANGED
@@ -1,13 +1,14 @@
 import copy
 import json
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
 from vision_agent.agent.agent_utils import (
     add_media_to_chat,
     convert_message_to_agentmessage,
     extract_tag,
+    format_conversation,
 )
 from vision_agent.agent.types import (
     AgentMessage,
@@ -22,19 +23,6 @@ from vision_agent.lmm.types import Message
 from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
 
 
-def format_conversation(chat: List[AgentMessage]) -> str:
-    chat = copy.deepcopy(chat)
-    prompt = ""
-    for chat_i in chat:
-        if chat_i.role == "user":
-            prompt += f"USER: {chat_i.content}\n\n"
-        elif chat_i.role == "observation" or chat_i.role == "coder":
-            prompt += f"OBSERVATION: {chat_i.content}\n\n"
-        elif chat_i.role == "conversation":
-            prompt += f"AGENT: {chat_i.content}\n\n"
-    return prompt
-
-
 def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
     # only keep last 10 messages
     conv = format_conversation(chat[-10:])
@@ -55,23 +43,39 @@ def check_for_interaction(chat: List[AgentMessage]) -> bool:
 
 def extract_conversation_for_generate_code(
     chat: List[AgentMessage],
-) -> List[AgentMessage]:
+) -> Tuple[List[AgentMessage], Optional[str]]:
     chat = copy.deepcopy(chat)
 
     # if we are in the middle of an interaction, return all the intermediate planning
     # steps
     if check_for_interaction(chat):
-        return chat
+        return chat, None
 
     extracted_chat = []
     for chat_i in chat:
         if chat_i.role == "user":
             extracted_chat.append(chat_i)
         elif chat_i.role == "coder":
-            if "<final_code>" in chat_i.content
+            if "<final_code>" in chat_i.content:
                 extracted_chat.append(chat_i)
 
-
+    # only keep the last <final_code> and <final_test>
+    final_code = None
+    extracted_chat_strip_code: List[AgentMessage] = []
+    for chat_i in reversed(extracted_chat):
+        if "<final_code>" in chat_i.content and final_code is None:
+            extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
+            final_code = extract_tag(chat_i.content, "final_code")
+            if final_code is not None:
+                test_code = extract_tag(chat_i.content, "final_test")
+                final_code += "\n" + test_code if test_code is not None else ""
+
+        if "<final_code>" in chat_i.content and final_code is not None:
+            continue
+
+        extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
+
+    return extracted_chat_strip_code[-5:], final_code
 
 
 def maybe_run_action(
@@ -81,7 +85,7 @@ def maybe_run_action(
     code_interpreter: Optional[CodeInterpreter] = None,
 ) -> Optional[List[AgentMessage]]:
     if action == "generate_or_edit_vision_code":
-        extracted_chat = extract_conversation_for_generate_code(chat)
+        extracted_chat, _ = extract_conversation_for_generate_code(chat)
         # there's an issue here because coder.generate_code will send it's code_context
         # to the outside user via it's update_callback, but we don't necessarily have
         # access to that update_callback here, so we re-create the message using
@@ -101,11 +105,15 @@ def maybe_run_action(
             )
         ]
     elif action == "edit_code":
-        extracted_chat = extract_conversation_for_generate_code(chat)
+        extracted_chat, final_code = extract_conversation_for_generate_code(chat)
         plan_context = PlanContext(
             plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
-            instructions=[
-
+            instructions=[
+                chat_i.content
+                for chat_i in extracted_chat
+                if chat_i.role == "user" and "<final_code>" not in chat_i.content
+            ],
+            code=final_code if final_code is not None else "",
         )
         context = coder.generate_code_from_plan(
             extracted_chat, plan_context, code_interpreter=code_interpreter
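
The reworked extract_conversation_for_generate_code now also hands back the most recent final code so the edit_code branch can seed PlanContext.code. A standalone sketch of the same "keep only the newest <final_code>" filtering, using plain dicts in place of AgentMessage:

```python
from typing import Dict, List, Optional, Tuple

def keep_last_final_code(chat: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], Optional[str]]:
    final_code: Optional[str] = None
    kept: List[Dict[str, str]] = []
    for msg in reversed(chat):
        has_code = "<final_code>" in msg["content"]
        if has_code and final_code is None:
            kept.insert(0, msg)          # keep only the newest coder message
            final_code = msg["content"]  # the real code extracts the tag body here
        elif has_code:
            continue                     # older <final_code> messages are dropped
        else:
            kept.insert(0, msg)
    return kept[-5:], final_code
```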
vision_agent/tools/__init__.py
CHANGED

vision_agent/tools/planner_tools.py
CHANGED
@@ -193,8 +193,10 @@ def get_tool_for_task(
     - Depth and pose estimation
     - Video object tracking
 
-
-
+    Only ask for one type of task at a time, for example a task needing to identify
+    text is one OCR task while needing to identify non-text objects is an OD task. Wait
+    until the documentation is printed to use the function so you know what the input
+    and output signatures are.
 
     Parameters:
         task: str: The task to accomplish.

vision_agent/tools/tools.py
CHANGED
@@ -290,6 +290,13 @@ def od_sam2_video_tracking(
             )
             function_name = "florence2_object_detection"
 
+        elif od_model == ODModels.CUSTOM:
+            segment_results = custom_object_detection(
+                deployment_id=fine_tune_id,
+                image=segment_frames[frame_number],
+            )
+            function_name = "custom_object_detection"
+
         else:
             raise NotImplementedError(
                 f"Object detection model '{od_model}' is not implemented."
@@ -515,24 +522,29 @@ def owlv2_sam2_video_tracking(
     chunk_length: Optional[int] = 10,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
-    """'owlv2_sam2_video_tracking' is a tool that can segment multiple
-    prompt such as category names or referring
-    prompt are separated by commas. It returns
-
+    """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
+            new objects.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
             fine-tuned model ID here to use it.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the
-
-
-
-
-            the
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -742,11 +754,11 @@ def florence2_sam2_video_tracking(
     chunk_length: Optional[int] = 10,
     fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
-    """'florence2_sam2_video_tracking' is a tool that can
-
-    expressions.
-
-
+    """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -758,10 +770,13 @@ def florence2_sam2_video_tracking(
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
-
-
-
-
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -1076,24 +1091,27 @@ def countgd_sam2_video_tracking(
     frames: List[np.ndarray],
     chunk_length: Optional[int] = 10,
 ) -> List[List[Dict[str, Any]]]:
-    """'countgd_sam2_video_tracking' is a tool that can segment multiple
-    prompt such as category names or referring
-    prompt are separated by commas. It returns
-
+    """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
-
-        chunk_length (Optional[int]): The number of frames to re-run
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run countgd to find
            new objects.
 
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the
-
-
-
-
-            the
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID represent the total count.
 
     Example
     -------
@@ -1206,6 +1224,139 @@ def countgd_visual_prompt_object_detection(
     return bboxes_formatted
 
 
+def custom_object_detection(
+    deployment_id: str,
+    image: np.ndarray,
+    box_threshold: float = 0.1,
+) -> List[Dict[str, Any]]:
+    """'custom_object_detection' is a tool that can detect instances of an
+    object given a deployment_id of a previously finetuned object detection model.
+    It is particularly useful when trying to detect objects that are not well detected by generalist models.
+    It returns a list of bounding boxes with normalized
+    coordinates, label names and associated confidence scores.
+
+    Parameters:
+        deployment_id (str): The id of the finetuned model.
+        image (np.ndarray): The image that contains instances of the object.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.1.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+    >>> custom_object_detection("abcd1234-5678efg", image)
+    [
+        {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+        {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
+        {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
+    ]
+    """
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    files = [("image", numpy_to_bytes(image))]
+    payload = {
+        "deployment_id": deployment_id,
+        "confidence": box_threshold,
+    }
+    detections: List[List[Dict[str, Any]]] = send_inference_request(
+        payload, "custom-object-detection", files=files, v2=True
+    )
+
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+
+    _display_tool_trace(
+        custom_object_detection.__name__,
+        payload,
+        display_data,
+        files,
+    )
+    return bboxes_formatted
+
+
+def custom_od_sam2_video_tracking(
+    deployment_id: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'custom_od_sam2_video_tracking' is a tool that can segment multiple objects given a
+    custom model with predefined category names.
+    It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        deployment_id (str): The id of the deployed custom model.
+        image (np.ndarray): The image to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+    >>> custom_od_sam2_video_tracking("abcd1234-5678efg", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.CUSTOM,
+        prompt="",
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=deployment_id,
+    )
+    _display_tool_trace(
+        custom_od_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
     """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
     images including regular images or images of documents or presentations. It can be
@@ -1533,7 +1684,7 @@ def video_temporal_localization(
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
-            'qwen2vl', 'gpt4o'
+            'qwen2vl', 'gpt4o'.
         chunk_length_frames (Optional[int]): length of each chunk in frames
 
     Returns:
@@ -2102,7 +2253,7 @@ def closest_box_distance(
 
 
 def extract_frames_and_timestamps(
-    video_uri: Union[str, Path], fps: float =
+    video_uri: Union[str, Path], fps: float = 5
 ) -> List[Dict[str, Union[np.ndarray, float]]]:
     """'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries
@@ -2113,7 +2264,7 @@ def extract_frames_and_timestamps(
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to
+            to 5.
 
     Returns:
         List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
@@ -2636,10 +2787,8 @@ FUNCTION_TOOLS = [
     ocr,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
-    detr_segmentation,
     depth_anything_v2,
     generate_pose_image,
-    vit_image_classification,
     vit_nsfw_classification,
     video_temporal_localization,
     flux_image_inpainting,
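
Taken together, the new custom-model tools mirror the generalist trackers. A hedged end-to-end sketch (the deployment id and video path are placeholders; import paths assume the tools are re-exported from vision_agent.tools):

```python
from vision_agent.tools import extract_frames_and_timestamps, custom_od_sam2_video_tracking

frames = [f["frame"] for f in extract_frames_and_timestamps("conveyor.mp4")]  # fps defaults to 5

# "abcd1234-5678efg" is the placeholder deployment id from the docstring examples
tracked = custom_od_sam2_video_tracking("abcd1234-5678efg", frames, chunk_length=10)

# labels look like "0: box"; distinct ID prefixes give a duplicate-free object count
unique_ids = {det["label"].split(":")[0] for frame_dets in tracked for det in frame_dets}
print(f"{len(unique_ids)} distinct objects tracked")
```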
vision_agent/utils/sim.py
CHANGED
@@ -133,6 +133,12 @@ class Sim:
         df: pd.DataFrame,
     ) -> bool:
         load_dir = Path(load_dir)
+        if (
+            not Path(load_dir / "df.csv").exists()
+            or not Path(load_dir / "embs.npy").exists()
+        ):
+            return False
+
         df_load = pd.read_csv(load_dir / "df.csv")
         if platform.system() == "Windows":
             df_load["doc"] = df_load["doc"].apply(lambda x: x.replace("\r", ""))
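
The added guard makes Sim refuse a cache directory that is missing either artifact instead of raising inside pd.read_csv. The equivalent check in isolation:

```python
from pathlib import Path

def cache_is_complete(load_dir: Path) -> bool:
    # both the docs table and the embeddings matrix must exist to reuse the cache
    return (load_dir / "df.csv").exists() and (load_dir / "embs.npy").exists()
```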

{vision_agent-0.2.225.dist-info → vision_agent-0.2.227.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
-vision_agent/.sim_tools/df.csv,sha256=
-vision_agent/.sim_tools/embs.npy,sha256=
+vision_agent/.sim_tools/df.csv,sha256=Vamicw8MiSGildK1r3-HXY4cKiq17GZxsgBsHbk7jpM,42158
+vision_agent/.sim_tools/embs.npy,sha256=YJe8EcKVNmeX_75CS2T1sbY-sUS_1HQAMT-34zc18a0,254080
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
 vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
 vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1565
-vision_agent/agent/agent_utils.py,sha256=
+vision_agent/agent/agent_utils.py,sha256=pP4u5tiami7C3ChgjgYLqJITnmkTI1_GsUj6g5czSRk,13994
 vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
 vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
 vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
-vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=
-vision_agent/agent/vision_agent_coder_v2.py,sha256=
+vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=idmSMfxebPULqqvllz3gqRzGDchEvS5dkGngvBs4PGo,4872
+vision_agent/agent/vision_agent_coder_v2.py,sha256=i1qgXp5YsWVRoA_qO429Ef-aKZBakveCl1F_2ZbSzk8,16287
 vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
 vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
-vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
+vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=YgemW2PRPYd8o8XpmwSJBUOJSxMUXMNr2DZNQnS4jEI,34988
 vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
 vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
 vision_agent/agent/vision_agent_prompts_v2.py,sha256=-vCWat-ARlCOOOeIDIFhg-kcwRRwjTXYEwsvvqPeaCs,1972
-vision_agent/agent/vision_agent_v2.py,sha256=
+vision_agent/agent/vision_agent_v2.py,sha256=1wu_vH_onic2kLYPKW2nAF2e6Zz5vmUt5Acv4Seq3sQ,10796
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -26,22 +26,22 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=fcucnAzr5Hue9xSqpBgA7RcRJP2CgAgQJ31p_R5lg-I,2794
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
-vision_agent/tools/planner_tools.py,sha256=
+vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=36f0qAhQfA5lDhYv5BKpHfHgBVEBgOD-XNVHG5K4HLY,96619
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
 vision_agent/utils/execute.py,sha256=vOEP5Ys7S2lc0_7pOJbgk7OaWi85hrCNu9_8Bo3zk6I,29356
 vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16NeeH0,12188
-vision_agent/utils/sim.py,sha256=
+vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent/utils/video_tracking.py,sha256=
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent/utils/video_tracking.py,sha256=7ZiFBqQRTid5ytPmkrAGQUiVMr-twzib8Ha2hN3JsR0,9474
+vision_agent-0.2.227.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.227.dist-info/METADATA,sha256=qFefkLzCo7G98LyhIPqYzPOUv5nyvOK84DJvUWmeqcc,20039
+vision_agent-0.2.227.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.227.dist-info/RECORD,,

{vision_agent-0.2.225.dist-info → vision_agent-0.2.227.dist-info}/LICENSE
File without changes

{vision_agent-0.2.225.dist-info → vision_agent-0.2.227.dist-info}/WHEEL
File without changes