vision-agent 0.2.193__py3-none-any.whl → 0.2.196__py3-none-any.whl
- vision_agent/.sim_tools/df.csv +640 -0
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/__init__.py +2 -0
- vision_agent/agent/agent_utils.py +211 -3
- vision_agent/agent/vision_agent_coder.py +5 -113
- vision_agent/agent/vision_agent_coder_prompts_v2.py +119 -0
- vision_agent/agent/vision_agent_coder_v2.py +341 -0
- vision_agent/agent/vision_agent_planner.py +2 -2
- vision_agent/agent/vision_agent_planner_prompts.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts_v2.py +748 -0
- vision_agent/agent/vision_agent_planner_v2.py +432 -0
- vision_agent/lmm/lmm.py +4 -0
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/planner_tools.py +246 -0
- vision_agent/tools/tool_utils.py +65 -1
- vision_agent/tools/tools.py +76 -22
- vision_agent/utils/image_utils.py +12 -6
- vision_agent/utils/sim.py +65 -14
- {vision_agent-0.2.193.dist-info → vision_agent-0.2.196.dist-info}/METADATA +2 -1
- vision_agent-0.2.196.dist-info/RECORD +42 -0
- vision_agent-0.2.193.dist-info/RECORD +0 -35
- {vision_agent-0.2.193.dist-info → vision_agent-0.2.196.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.193.dist-info → vision_agent-0.2.196.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
ADDED
@@ -0,0 +1,640 @@
+desc,doc,name
+"'owl_v2_image' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in the text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owl_v2_image(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+    'owl_v2_image' is a tool that can detect and count multiple objects given a text
+    prompt such as category names or referring expressions on images. The categories in
+    the text prompt are separated by commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.10.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+        bounding box of the detected objects with normalized coordinates between 0
+        and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+        top-left and xmax and ymax are the coordinates of the bottom-right of the
+        bounding box.
+
+    Example
+    -------
+    >>> owl_v2_image(""car, dinosaur"", image)
+    [
+        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+    ]
+",owl_v2_image
"'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple objects independently per frame given a text prompt such as a category name or referring expression but does not track objects across frames. The categories in text prompt are separated by commas. It returns a list of lists where each inner list contains the score, label, and bounding box of the detections for that frame.","owl_v2_video(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
|
32
|
+
'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
|
33
|
+
objects independently per frame given a text prompt such as a category name or
|
34
|
+
referring expression but does not track objects across frames. The categories in
|
35
|
+
text prompt are separated by commas. It returns a list of lists where each inner
|
36
|
+
list contains the score, label, and bounding box of the detections for that frame.
|
37
|
+
|
38
|
+
Parameters:
|
39
|
+
prompt (str): The prompt to ground to the video.
|
40
|
+
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
41
|
+
box_threshold (float, optional): The threshold for the box detection. Defaults
|
42
|
+
to 0.30.
|
43
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
44
|
+
fine-tuned model ID here to use it.
|
45
|
+
|
46
|
+
Returns:
|
47
|
+
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
48
|
+
score, label, and bounding box of the detected objects with normalized
|
49
|
+
coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
|
50
|
+
coordinates of the top-left and xmax and ymax are the coordinates of the
|
51
|
+
bottom-right of the bounding box.
|
52
|
+
|
53
|
+
Example
|
54
|
+
-------
|
55
|
+
>>> owl_v2_video(""car, dinosaur"", frames)
|
56
|
+
[
|
57
|
+
[
|
58
|
+
{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
59
|
+
{'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
|
60
|
+
],
|
61
|
+
...
|
62
|
+
]
|
63
|
+
",owl_v2_video
|
64
|
+
"'ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
|
65
|
+
'ocr' extracts text from an image. It returns a list of detected text, bounding
|
66
|
+
boxes with normalized coordinates, and confidence scores. The results are sorted
|
67
|
+
from top-left to bottom right.
|
68
|
+
|
69
|
+
Parameters:
|
70
|
+
image (np.ndarray): The image to extract text from.
|
71
|
+
|
72
|
+
Returns:
|
73
|
+
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
74
|
+
with normalized coordinates, and confidence score.
|
75
|
+
|
76
|
+
Example
|
77
|
+
-------
|
78
|
+
>>> ocr(image)
|
79
|
+
[
|
80
|
+
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
|
81
|
+
]
|
82
|
+
",ocr
|
83
|
+
+'clip' is a tool that can classify an image or a cropped detection given a list of input classes or tags. It returns the same list of the input classes along with their probability scores based on image content.,"clip(image: numpy.ndarray, classes: List[str]) -> Dict[str, Any]:
+    'clip' is a tool that can classify an image or a cropped detection given a list
+    of input classes or tags. It returns the same list of the input classes along with
+    their probability scores based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+        classes (List[str]): The list of classes or tags that are associated with the image
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One key
+        contains the list of given labels and the other a list of scores.
+
+    Example
+    -------
+    >>> clip(image, ['dog', 'cat', 'bird'])
+    {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
+",clip
+'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
+    'vit_image_classification' is a tool that can classify an image. It returns a
+    list of classes and their probability scores based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One key
+        contains a list of labels and the other a list of scores.
+
+    Example
+    -------
+    >>> vit_image_classification(image)
+    {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
+",vit_image_classification
+'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and its probability score based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
+    'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
+    It returns the predicted label and its probability score based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the predicted label and its
+        probability score.
+
+    Example
+    -------
+    >>> vit_nsfw_classification(image)
+    {""label"": ""normal"", ""scores"": 0.68},
+",vit_nsfw_classification
"'countgd_counting' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_counting(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
134
|
+
'countgd_counting' is a tool that can detect multiple instances of an object
|
135
|
+
given a text prompt. It is particularly useful when trying to detect and count a
|
136
|
+
large number of objects. It returns a list of bounding boxes with normalized
|
137
|
+
coordinates, label names and associated confidence scores.
|
138
|
+
|
139
|
+
Parameters:
|
140
|
+
prompt (str): The object that needs to be counted.
|
141
|
+
image (np.ndarray): The image that contains multiple instances of the object.
|
142
|
+
box_threshold (float, optional): The threshold for detection. Defaults
|
143
|
+
to 0.23.
|
144
|
+
|
145
|
+
Returns:
|
146
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
147
|
+
bounding box of the detected objects with normalized coordinates between 0
|
148
|
+
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
149
|
+
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
150
|
+
bounding box.
|
151
|
+
|
152
|
+
Example
|
153
|
+
-------
|
154
|
+
>>> countgd_counting(""flower"", image)
|
155
|
+
[
|
156
|
+
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
157
|
+
{'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
|
158
|
+
{'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
|
159
|
+
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
|
160
|
+
]
|
161
|
+
",countgd_counting
|
162
|
+
"'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","florence2_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
|
163
|
+
'florence2_ocr' is a tool that can detect text and text regions in an image.
|
164
|
+
Each text region contains one line of text. It returns a list of detected text,
|
165
|
+
the text region as a bounding box with normalized coordinates, and confidence
|
166
|
+
scores. The results are sorted from top-left to bottom right.
|
167
|
+
|
168
|
+
Parameters:
|
169
|
+
image (np.ndarray): The image to extract text from.
|
170
|
+
|
171
|
+
Returns:
|
172
|
+
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
173
|
+
with normalized coordinates, and confidence score.
|
174
|
+
|
175
|
+
Example
|
176
|
+
-------
|
177
|
+
>>> florence2_ocr(image)
|
178
|
+
[
|
179
|
+
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
|
180
|
+
]
|
181
|
+
",florence2_ocr
|
182
|
+
"'florence2_sam2_image' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_image(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
183
|
+
'florence2_sam2_image' is a tool that can segment multiple objects given a text
|
184
|
+
prompt such as category names or referring expressions. The categories in the text
|
185
|
+
prompt are separated by commas. It returns a list of bounding boxes, label names,
|
186
|
+
mask file names and associated probability scores of 1.0.
|
187
|
+
|
188
|
+
Parameters:
|
189
|
+
prompt (str): The prompt to ground to the image.
|
190
|
+
image (np.ndarray): The image to ground the prompt to.
|
191
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
192
|
+
fine-tuned model ID here to use it.
|
193
|
+
|
194
|
+
Returns:
|
195
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
196
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
197
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
198
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
199
|
+
The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
|
200
|
+
the background.
|
201
|
+
|
202
|
+
Example
|
203
|
+
-------
|
204
|
+
>>> florence2_sam2_image(""car, dinosaur"", image)
|
205
|
+
[
|
206
|
+
{
|
207
|
+
'score': 1.0,
|
208
|
+
'label': 'dinosaur',
|
209
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
210
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
211
|
+
[0, 0, 0, ..., 0, 0, 0],
|
212
|
+
...,
|
213
|
+
[0, 0, 0, ..., 0, 0, 0],
|
214
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
215
|
+
},
|
216
|
+
]
|
217
|
+
",florence2_sam2_image
|
218
|
+
+'florence2_sam2_video_tracking' is a tool that can segment and track multiple entities in a video given a text prompt such as category names or referring expressions. You can optionally separate the categories in the text with commas. It can find new objects every 'chunk_length' frames and is useful for tracking and counting without duplicating counts and always outputs scores of 1.0.,"florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+    'florence2_sam2_video_tracking' is a tool that can segment and track multiple
+    entities in a video given a text prompt such as category names or referring
+    expressions. You can optionally separate the categories in the text with commas. It
+    can find new objects every 'chunk_length' frames and is useful for tracking and
+    counting without duplicating counts and always outputs scores of 1.0.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+        label, segment mask and bounding boxes. The outer list represents each frame and
+        the inner list is the entities per frame. The label contains the object ID
+        followed by the label name. The objects are only identified in the first frame
+        and tracked throughout the video.
+
+    Example
+    -------
+    >>> florence2_sam2_video_tracking(""car, dinosaur"", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+",florence2_sam2_video_tracking
"'florence2_phrase_grounding' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores of 1.0.","florence2_phrase_grounding(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
259
|
+
'florence2_phrase_grounding' is a tool that can detect multiple
|
260
|
+
objects given a text prompt which can be object names or caption. You
|
261
|
+
can optionally separate the object names in the text with commas. It returns a list
|
262
|
+
of bounding boxes with normalized coordinates, label names and associated
|
263
|
+
probability scores of 1.0.
|
264
|
+
|
265
|
+
Parameters:
|
266
|
+
prompt (str): The prompt to ground to the image.
|
267
|
+
image (np.ndarray): The image to used to detect objects
|
268
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
269
|
+
fine-tuned model ID here to use it.
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
273
|
+
bounding box of the detected objects with normalized coordinates between 0
|
274
|
+
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
275
|
+
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
276
|
+
bounding box. The scores are always 1.0 and cannot be thresholded
|
277
|
+
|
278
|
+
Example
|
279
|
+
-------
|
280
|
+
>>> florence2_phrase_grounding('person looking at a coyote', image)
|
281
|
+
[
|
282
|
+
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
283
|
+
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
284
|
+
]
|
285
|
+
",florence2_phrase_grounding
|
286
|
+
+'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
+    'claude35_text_extraction' is a tool that can extract text from an image. It
+    returns the extracted text as a string and can be used as an alternative to OCR if
+    you do not need to know the exact bounding box of the text.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        str: The extracted text from the image.
+",claude35_text_extraction
"'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects as labels, their regions as masks and their scores.","detr_segmentation(image: numpy.ndarray) -> List[Dict[str, Any]]:
|
298
|
+
'detr_segmentation' is a tool that can segment common objects in an
|
299
|
+
image without any text prompt. It returns a list of detected objects
|
300
|
+
as labels, their regions as masks and their scores.
|
301
|
+
|
302
|
+
Parameters:
|
303
|
+
image (np.ndarray): The image used to segment things and objects
|
304
|
+
|
305
|
+
Returns:
|
306
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label
|
307
|
+
and mask of the detected objects. The mask is binary 2D numpy array where 1
|
308
|
+
indicates the object and 0 indicates the background.
|
309
|
+
|
310
|
+
Example
|
311
|
+
-------
|
312
|
+
>>> detr_segmentation(image)
|
313
|
+
[
|
314
|
+
{
|
315
|
+
'score': 0.45,
|
316
|
+
'label': 'window',
|
317
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
318
|
+
[0, 0, 0, ..., 0, 0, 0],
|
319
|
+
...,
|
320
|
+
[0, 0, 0, ..., 0, 0, 0],
|
321
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
322
|
+
},
|
323
|
+
{
|
324
|
+
'score': 0.70,
|
325
|
+
'label': 'bird',
|
326
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
327
|
+
[0, 0, 0, ..., 0, 0, 0],
|
328
|
+
...,
|
329
|
+
[0, 0, 0, ..., 0, 0, 0],
|
330
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
331
|
+
},
|
332
|
+
]
|
333
|
+
",detr_segmentation
|
334
|
+
+'depth_anything_v2' is a tool that runs the depth_anything_v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
+    'depth_anything_v2' is a tool that runs the depth_anything_v2 model to generate a
+    depth image from a given RGB image. The returned depth image is monochrome and
+    represents depth values as pixel intensities with pixel values ranging from 0 to 255.
+
+    Parameters:
+        image (np.ndarray): The image used to generate the depth image
+
+    Returns:
+        np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255.
+
+    Example
+    -------
+    >>> depth_anything_v2(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+",depth_anything_v2
+'generate_pose_image' is a tool that generates an open pose bone/stick image from a given RGB image. The returned bone image is RGB with the pose and keypoints colored and background as black.,"generate_pose_image(image: numpy.ndarray) -> numpy.ndarray:
+    'generate_pose_image' is a tool that generates an open pose bone/stick image from
+    a given RGB image. The returned bone image is RGB with the pose and keypoints colored
+    and background as black.
+
+    Parameters:
+        image (np.ndarray): The image used to generate the pose image
+
+    Returns:
+        np.ndarray: A bone or pose image indicating the pose and keypoints
+
+    Example
+    -------
+    >>> generate_pose_image(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+",generate_pose_image
+'closest_mask_distance' calculates the closest distance between two masks.,"closest_mask_distance(mask1: numpy.ndarray, mask2: numpy.ndarray) -> float:
+    'closest_mask_distance' calculates the closest distance between two masks.
+
+    Parameters:
+        mask1 (np.ndarray): The first mask.
+        mask2 (np.ndarray): The second mask.
+
+    Returns:
+        float: The closest distance between the two masks.
+
+    Example
+    -------
+    >>> closest_mask_distance(mask1, mask2)
+    0.5
+",closest_mask_distance
+'closest_box_distance' calculates the closest distance between two bounding boxes.,"closest_box_distance(box1: List[float], box2: List[float], image_size: Tuple[int, int]) -> float:
+    'closest_box_distance' calculates the closest distance between two bounding boxes.
+
+    Parameters:
+        box1 (List[float]): The first bounding box.
+        box2 (List[float]): The second bounding box.
+        image_size (Tuple[int, int]): The size of the image given as (height, width).
+
+    Returns:
+        float: The closest distance between the two bounding boxes.
+
+    Example
+    -------
+    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+    141.42
+",closest_box_distance
+'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
+    'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
+    images including regular images or images of documents or presentations. It can be
+    very useful for document QA or OCR text extraction. It returns text as an answer to
+    the question.
+
+    Parameters:
+        prompt (str): The question about the document image
+        images (List[np.ndarray]): The reference images used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> qwen2_vl_images_vqa('Give a summary of the document', images)
+    'The document talks about the history of the United States of America and its...'
+",qwen2_vl_images_vqa
+'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen2_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
+    'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
+    'Lionel Messi'
+",qwen2_vl_video_vqa
+'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
+    'video_temporal_localization' will run qwen2vl on each chunk_length_frames
+    value selected for the video. It can detect multiple objects independently per
+    chunk_length_frames given a text prompt such as a referring expression
+    but does not track objects across frames.
+    It returns a list of floats with a value of 1.0 if the objects are found in a given
+    chunk_length_frames of the video.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+        model (str): The model to use for the inference. Valid values are
+            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
+        chunk_length_frames (Optional[int]): The length of each chunk in frames
+
+    Returns:
+        List[float]: A list of floats with a value of 1.0 if the objects to be found
+        are present in the chunk_length_frames of the video.
+
+    Example
+    -------
+    >>> video_temporal_localization('Did a goal happen?', frames)
+    [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
+",video_temporal_localization
"'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
|
465
|
+
'flux_image_inpainting' performs image inpainting to fill the masked regions,
|
466
|
+
given by mask, in the image, given image based on the text prompt and surrounding image context.
|
467
|
+
It can be used to edit regions of an image according to the prompt given.
|
468
|
+
|
469
|
+
Parameters:
|
470
|
+
prompt (str): A detailed text description guiding what should be generated
|
471
|
+
in the masked area. More detailed and specific prompts typically yield better results.
|
472
|
+
image (np.ndarray): The source image to be inpainted.
|
473
|
+
The image will serve as the base context for the inpainting process.
|
474
|
+
mask (np.ndarray): A binary mask image with 0's and 1's,
|
475
|
+
where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
|
476
|
+
|
477
|
+
Returns:
|
478
|
+
np.ndarray: The generated image(s) as a numpy array in RGB format with values
|
479
|
+
ranging from 0 to 255.
|
480
|
+
|
481
|
+
-------
|
482
|
+
Example:
|
483
|
+
>>> # Generate inpainting
|
484
|
+
>>> result = flux_image_inpainting(
|
485
|
+
... prompt=""a modern black leather sofa with white pillows"",
|
486
|
+
... image=image,
|
487
|
+
... mask=mask,
|
488
|
+
... )
|
489
|
+
>>> save_image(result, ""inpainted_room.png"")
|
490
|
+
",flux_image_inpainting
|
491
|
+
"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
|
492
|
+
'extract_frames_and_timestamps' extracts frames and timestamps from a video
|
493
|
+
which can be a file path, url or youtube link, returns a list of dictionaries
|
494
|
+
with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is
|
495
|
+
the relative time in seconds where the frame was captured. The frame is a numpy
|
496
|
+
array.
|
497
|
+
|
498
|
+
Parameters:
|
499
|
+
video_uri (Union[str, Path]): The path to the video file, url or youtube link
|
500
|
+
fps (float, optional): The frame rate per second to extract the frames. Defaults
|
501
|
+
to 1.
|
502
|
+
|
503
|
+
Returns:
|
504
|
+
List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
|
505
|
+
extracted frame as a numpy array and the timestamp in seconds.
|
506
|
+
|
507
|
+
Example
|
508
|
+
-------
|
509
|
+
>>> extract_frames(""path/to/video.mp4"")
|
510
|
+
[{""frame"": np.ndarray, ""timestamp"": 0.0}, ...]
|
511
|
+
",extract_frames_and_timestamps
|
512
|
+
+'save_json' is a utility function that saves data as a JSON file. It is helpful for saving data that contains NumPy arrays which are not JSON serializable.,"save_json(data: Any, file_path: str) -> None:
+    'save_json' is a utility function that saves data as a JSON file. It is helpful
+    for saving data that contains NumPy arrays which are not JSON serializable.
+
+    Parameters:
+        data (Any): The data to save.
+        file_path (str): The path to save the JSON file.
+
+    Example
+    -------
+    >>> save_json(data, ""path/to/file.json"")
+",save_json
+'load_image' is a utility function that loads an image from the given file path string or a URL.,"load_image(image_path: str) -> numpy.ndarray:
+    'load_image' is a utility function that loads an image from the given file path string or a URL.
+
+    Parameters:
+        image_path (str): The path or URL to the image.
+
+    Returns:
+        np.ndarray: The image as a NumPy array.
+
+    Example
+    -------
+    >>> load_image(""path/to/image.jpg"")
+",load_image
+'save_image' is a utility function that saves an image to a file path.,"save_image(image: numpy.ndarray, file_path: str) -> None:
+    'save_image' is a utility function that saves an image to a file path.
+
+    Parameters:
+        image (np.ndarray): The image to save.
+        file_path (str): The path to save the image file.
+
+    Example
+    -------
+    >>> save_image(image, ""path/to/image.jpg"")
+",save_image
+'save_video' is a utility function that saves a list of frames as an mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 1) -> str:
+    'save_video' is a utility function that saves a list of frames as an mp4 video file on disk.
+
+    Parameters:
+        frames (list[np.ndarray]): A list of frames to save.
+        output_video_path (str): The path to save the video file. If not provided, a temporary file will be created.
+        fps (float): The number of frames per second of the video.
+
+    Returns:
+        str: The path to the saved video file.
+
+    Example
+    -------
+    >>> save_video(frames)
+    ""/tmp/tmpvideo123.mp4""
+",save_video
+'overlay_bounding_boxes' is a utility function that displays bounding boxes on an image.,"overlay_bounding_boxes(medias: Union[numpy.ndarray, List[numpy.ndarray]], bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]) -> Union[numpy.ndarray, List[numpy.ndarray]]:
+    'overlay_bounding_boxes' is a utility function that displays bounding boxes on
+    an image.
+
+    Parameters:
+        medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display the
+            bounding boxes on.
+        bboxes (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+            dictionaries or a list of lists of dictionaries containing the bounding
+            boxes.
+
+    Returns:
+        np.ndarray: The image with the bounding boxes, labels and scores displayed.
+
+    Example
+    -------
+    >>> image_with_bboxes = overlay_bounding_boxes(
+        image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+    )
+",overlay_bounding_boxes
+'overlay_segmentation_masks' is a utility function that displays segmentation masks.,"overlay_segmentation_masks(medias: Union[numpy.ndarray, List[numpy.ndarray]], masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], draw_label: bool = True, secondary_label_key: str = 'tracking_label') -> Union[numpy.ndarray, List[numpy.ndarray]]:
+    'overlay_segmentation_masks' is a utility function that displays segmentation
+    masks.
+
+    Parameters:
+        medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
+            the masks on.
+        masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+            dictionaries or a list of lists of dictionaries containing the masks, labels
+            and scores.
+        draw_label (bool, optional): If True, the labels will be displayed on the image.
+        secondary_label_key (str, optional): The key to use for the secondary
+            tracking label which is needed in videos to display tracking information.
+
+    Returns:
+        np.ndarray: The image with the masks displayed.
+
+    Example
+    -------
+    >>> image_with_masks = overlay_segmentation_masks(
+        image,
+        [{
+            'score': 0.99,
+            'label': 'dinosaur',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        }],
+    )
+",overlay_segmentation_masks
+'overlay_heat_map' is a utility function that displays a heat map on an image.,"overlay_heat_map(image: numpy.ndarray, heat_map: Dict[str, Any], alpha: float = 0.8) -> numpy.ndarray:
+    'overlay_heat_map' is a utility function that displays a heat map on an image.
+
+    Parameters:
+        image (np.ndarray): The image to display the heat map on.
+        heat_map (Dict[str, Any]): A dictionary containing the heat map under the key
+            'heat_map'.
+        alpha (float, optional): The transparency of the overlay. Defaults to 0.8.
+
+    Returns:
+        np.ndarray: The image with the heat map displayed.
+
+    Example
+    -------
+    >>> image_with_heat_map = overlay_heat_map(
+        image,
+        {
+            'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
+        },
+    )
+",overlay_heat_map
vision_agent/.sim_tools/embs.npy
ADDED
Binary file
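For orientation, here is a minimal usage sketch of how the tools documented in df.csv above compose into a script. It is not part of the diff; it assumes only the signatures shown in df.csv and that these names are importable from vision_agent.tools (the module updated in this release).

    # Hypothetical sketch based on the df.csv signatures above.
    from vision_agent.tools import (
        load_image,
        owl_v2_image,
        overlay_bounding_boxes,
        save_image,
    )

    image = load_image("path/to/image.jpg")            # np.ndarray, per load_image docs
    detections = owl_v2_image("car, dinosaur", image)  # [{'score', 'label', 'bbox'}, ...]
    visualized = overlay_bounding_boxes(image, detections)  # boxes use normalized coords
    save_image(visualized, "detections.jpg")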
vision_agent/agent/__init__.py
CHANGED
@@ -7,6 +7,7 @@ from .vision_agent_coder import (
     OpenAIVisionAgentCoder,
     VisionAgentCoder,
 )
+from .vision_agent_coder_v2 import VisionAgentCoderV2
 from .vision_agent_planner import (
     AnthropicVisionAgentPlanner,
     AzureVisionAgentPlanner,
@@ -15,3 +16,4 @@ from .vision_agent_planner import (
     PlanContext,
     VisionAgentPlanner,
 )
+from .vision_agent_planner_v2 import VisionAgentPlannerV2
|