vision-agent 0.2.221__py3-none-any.whl → 0.2.222__py3-none-any.whl
- vision_agent/.sim_tools/df.csv +253 -244
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +28 -23
- vision_agent/tools/__init__.py +6 -10
- vision_agent/tools/tools.py +639 -787
- vision_agent/utils/sim.py +24 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/METADATA +1 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/RECORD +10 -10
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/WHEEL +0 -0
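The bulk of this release is the rewritten tool index in vision_agent/.sim_tools/df.csv: detection and segmentation entries are renamed and their docstrings filled out (owlv2_object_detection, owlv2_sam2_instance_segmentation, countgd_sam2_instance_segmentation, florence2_object_detection, and so on), and document_extraction, document_qa, and ocr entries are added. As orientation, here is a minimal sketch of calling two of the renamed tools with the signatures quoted verbatim in the diff below; it assumes the tools are re-exported from vision_agent.tools (whose __init__.py also changes in this release) and uses a hypothetical input image.

    import numpy as np
    from PIL import Image

    # Assumption: the renamed tools are importable from vision_agent.tools,
    # per the tools/__init__.py change listed above.
    from vision_agent.tools import (
        owlv2_object_detection,
        owlv2_sam2_instance_segmentation,
    )

    image = np.array(Image.open("street.jpg"))  # hypothetical input image

    # Detection: returns dicts with 'score', 'label', and a normalized
    # 'bbox' [xmin, ymin, xmax, ymax], per the df.csv docstring.
    detections = owlv2_object_detection("car, person", image, box_threshold=0.1)

    # Instance segmentation: same fields plus a binary 2D uint8 'mask'.
    instances = owlv2_sam2_instance_segmentation("car, person", image)

    for det in detections:
        print(det["label"], det["score"], det["bbox"])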
vision_agent/.sim_tools/df.csv
CHANGED
@@ -1,9 +1,9 @@
 desc,doc,name
-"'
-'
-    prompt such as category names or referring expressions on images. The
-    text prompt are separated by commas. It returns a list of bounding
-    normalized coordinates, label names and associated probability scores.
+"'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+    'owlv2_object_detection' is a tool that can detect and count multiple objects
+    given a text prompt such as category names or referring expressions on images. The
+    categories in text prompt are separated by commas. It returns a list of bounding
+    boxes with normalized coordinates, label names and associated probability scores.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -22,96 +22,87 @@ desc,doc,name

     Example
     -------
-    >>>
+    >>> owlv2_object_detection(""car, dinosaur"", image)
     [
         {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
     ]
-",
-"'
-'
-
-
-
-
+",owlv2_object_detection
+"'owlv2_sam2_instance_segmentation' is a tool that can detect and count multiple instances of objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names, masks and associated probability scores.","owlv2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1) -> List[Dict[str, Any]]:
+    'owlv2_sam2_instance_segmentation' is a tool that can detect and count multiple
+    instances of objects given a text prompt such as category names or referring
+    expressions on images. The categories in text prompt are separated by commas. It
+    returns a list of bounding boxes with normalized coordinates, label names, masks
+    and associated probability scores.

     Parameters:
-        prompt (str): The
-
-        box_threshold (float, optional): The threshold for
-            to 0.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.10.

     Returns:
-        List[
-
-
-
-
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.

     Example
     -------
-    >>>
+    >>> owlv2_sam2_instance_segmentation(""flower"", image)
     [
-
-
-
-
-
+        {
+            'score': 0.49,
+            'label': 'flower',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
     ]
-",
-"'
-'
-
-
+",owlv2_sam2_instance_segmentation
+"'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+    'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.

     Parameters:
-
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.

     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the
-
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.

     Example
     -------
-    >>>
+    >>> owlv2_sam2_video_tracking(""car, dinosaur"", frames)
     [
-
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
     ]
-",
-'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
-    'vit_image_classification' is a tool that can classify an image. It returns a
-    list of classes and their probability scores based on image content.
-
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-        contains a list of labels and other a list of scores.
-
-    Example
-    -------
-    >>> vit_image_classification(image)
-    {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-",vit_image_classification
-'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
-    'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
-    It returns the predicted label and their probability scores based on image content.
-
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-        contains a list of labels and other a list of scores.
-
-    Example
-    -------
-    >>> vit_nsfw_classification(image)
-    {""label"": ""normal"", ""scores"": 0.68},
-",vit_nsfw_classification
+",owlv2_sam2_video_tracking
 "'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
     'countgd_object_detection' is a tool that can detect multiple instances of an
     object given a text prompt. It is particularly useful when trying to detect and
@@ -142,12 +133,12 @@ desc,doc,name
         {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
     ]
 ",countgd_object_detection
-"'
-'
-    an object given a text prompt. It is particularly useful when trying
-    count a large number of objects. You can optionally separate object
-    prompt with commas. It returns a list of bounding boxes with
-    label names, masks associated confidence scores.
+"'countgd_sam2_instance_segmentation' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names, masks associated confidence scores.","countgd_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
+    'countgd_sam2_instance_segmentation' is a tool that can detect multiple
+    instances of an object given a text prompt. It is particularly useful when trying
+    to detect and count a large number of objects. You can optionally separate object
+    names in the prompt with commas. It returns a list of bounding boxes with
+    normalized coordinates, label names, masks associated confidence scores.

     Parameters:
         prompt (str): The object that needs to be counted.
@@ -165,7 +156,7 @@ desc,doc,name

     Example
     -------
-    >>>
+    >>> countgd_sam2_instance_segmentation(""flower"", image)
     [
         {
             'score': 0.49,
@@ -178,7 +169,45 @@ desc,doc,name
                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
         },
     ]
-",
+",countgd_sam2_instance_segmentation
+"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+    'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.
+
+    Example
+    -------
+    >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+",countgd_sam2_video_tracking
 "'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","florence2_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
     'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
@@ -199,11 +228,12 @@ desc,doc,name
         {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
     ]
 ",florence2_ocr
-"'
-'
-    prompt such as category names or referring expressions. The
-    prompt are separated by commas. It returns a list of
-    mask file names and associated probability scores of
+"'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+    'florence2_sam2_instance_segmentation' is a tool that can segment multiple
+    objects given a text prompt such as category names or referring expressions. The
+    categories in the text prompt are separated by commas. It returns a list of
+    bounding boxes, label names, mask file names and associated probability scores of
+    1.0.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -221,7 +251,7 @@ desc,doc,name

     Example
     -------
-    >>>
+    >>> florence2_sam2_instance_segmentation(""car, dinosaur"", image)
     [
         {
             'score': 1.0,
@@ -234,7 +264,7 @@ desc,doc,name
                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
         },
     ]
-",
+",florence2_sam2_instance_segmentation
 'florence2_sam2_video_tracking' is a tool that can segment and track multiple entities in a video given a text prompt such as category names or referring expressions. You can optionally separate the categories in the text with commas. It can find new objects every 'chunk_length' frames and is useful for tracking and counting without duplicating counts and always outputs scores of 1.0.,"florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
     'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
@@ -259,7 +289,7 @@ desc,doc,name

     Example
     -------
-    >>>
+    >>> florence2_sam2_video_tracking(""car, dinosaur"", frames)
     [
         [
             {
@@ -275,8 +305,8 @@ desc,doc,name
         ...
     ]
 ",florence2_sam2_video_tracking
-"'
-'
+"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
+    'florence2_object_detection' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list
     of bounding boxes with normalized coordinates, label names and associated
@@ -297,12 +327,12 @@ desc,doc,name

     Example
     -------
-    >>>
+    >>> florence2_object_detection('person looking at a coyote', image)
     [
         {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
     ]
-",
+",florence2_object_detection
 'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
     'claude35_text_extraction' is a tool that can extract text from an image. It
     returns the extracted text as a string and can be used as an alternative to OCR if
@@ -314,6 +344,107 @@ desc,doc,name
     Returns:
         str: The extracted text from the image.
 ",claude35_text_extraction
+"'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
+    'document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, pictures, charts, and other
+    information.
+
+    Parameters:
+        image (np.ndarray): The document image to analyze
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the extracted information.
+
+    Example
+    -------
+    >>> document_analysis(image)
+    {'pages':
+        [{'bbox': [0, 0, 1.0, 1.0],
+            'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
+                'label': 'page_header',
+                'order': 75
+                'caption': 'Annual Report 2024',
+                'summary': 'This annual report summarizes ...' },
+            {'bbox': [0.2, 0.9, 0.9, 1.0],
+                'label': 'table',
+                'order': 1119,
+                'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                'summary': 'This table illustrates a trend of ...'},
+        ],
+",document_extraction
+"'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
+    'document_qa' is a tool that can answer any questions about arbitrary documents,
+    presentations, or tables. It's very useful for document QA tasks, you can ask it a
+    specific question or ask it to return a JSON object answering multiple questions
+    about the document.
+
+    Parameters:
+        prompt (str): The question to be answered about the document image.
+        image (np.ndarray): The document image to analyze.
+
+    Returns:
+        str: The answer to the question based on the document's context.
+
+    Example
+    -------
+    >>> document_qa(image, question)
+    'The answer to the question ...'
+",document_qa
+"'ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
+    'ocr' extracts text from an image. It returns a list of detected text, bounding
+    boxes with normalized coordinates, and confidence scores. The results are sorted
+    from top-left to bottom right.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+        with normalized coordinates, and confidence score.
+
+    Example
+    -------
+    >>> ocr(image)
+    [
+        {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+    ]
+",ocr
+'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
+    'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
+    images including regular images or images of documents or presentations. It can be
+    very useful for document QA or OCR text extraction. It returns text as an answer to
+    the question.
+
+    Parameters:
+        prompt (str): The question about the document image
+        images (List[np.ndarray]): The reference images used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> qwen2_vl_images_vqa('Give a summary of the document', images)
+    'The document talks about the history of the United States of America and its...'
+",qwen2_vl_images_vqa
+'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen2_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
+    'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
+    'Lionel Messi'
+",qwen2_vl_video_vqa
 "'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects as labels, their regions as masks and their scores.","detr_segmentation(image: numpy.ndarray) -> List[Dict[str, Any]]:
     'detr_segmentation' is a tool that can segment common objects in an
     image without any text prompt. It returns a list of detected objects
@@ -391,106 +522,38 @@ desc,doc,name
         [10, 11, 15, ..., 202, 202, 205],
         [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
 ",generate_pose_image
-
-'
-
-    between the objects, not the distance between the centers of the objects.
-
-    Parameters:
-        det1 (Dict[str, Any]): The first detection of boxes or masks.
-        det2 (Dict[str, Any]): The second detection of boxes or masks.
-        image_size (Tuple[int, int]): The size of the image given as (height, width).
-
-    Returns:
-        float: The closest distance between the two detections.
-
-    Example
-    -------
-    >>> closest_distance(det1, det2, image_size)
-    141.42
-",minimum_distance
-'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
-    'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
-    images including regular images or images of documents or presentations. It can be
-    very useful for document QA or OCR text extraction. It returns text as an answer to
-    the question.
-
-    Parameters:
-        prompt (str): The question about the document image
-        images (List[np.ndarray]): The reference images used for the question
-
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-    >>> qwen2_vl_images_vqa('Give a summary of the document', images)
-    'The document talks about the history of the United States of America and its...'
-",qwen2_vl_images_vqa
-'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen2_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
-    'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
-    including regular videos or videos of documents or presentations. It returns text
-    as an answer to the question.
-
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-    >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
-    'Lionel Messi'
-",qwen2_vl_video_vqa
-"'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
-    'document_extraction' is a tool that can extract structured information out of
-    documents with different layouts. It returns the extracted data in a structured
-    hierarchical format containing text, tables, pictures, charts, and other
-    information.
+'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
+    'vit_image_classification' is a tool that can classify an image. It returns a
+    list of classes and their probability scores based on image content.

     Parameters:
-        image (np.ndarray): The
+        image (np.ndarray): The image to classify or tag

     Returns:
-        Dict[str, Any]: A dictionary containing the
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+        contains a list of labels and other a list of scores.

     Example
     -------
-    >>>
-    {
-
-
-
-
-                'caption': 'Annual Report 2024',
-                'summary': 'This annual report summarizes ...' },
-            {'bbox': [0.2, 0.9, 0.9, 1.0],
-                'label': table',
-                'order': 1119,
-                'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
-                'summary': 'This table illustrates a trend of ...'},
-        ],
-",document_extraction
-"'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
-    'document_qa' is a tool that can answer any questions about arbitrary documents,
-    presentations, or tables. It's very useful for document QA tasks, you can ask it a
-    specific question or ask it to return a JSON object answering multiple questions
-    about the document.
+    >>> vit_image_classification(image)
+    {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
+",vit_image_classification
+'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
+    'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
+    It returns the predicted label and their probability scores based on image content.

     Parameters:
-
-        image (np.ndarray): The document image to analyze.
+        image (np.ndarray): The image to classify or tag

     Returns:
-        str:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+        contains a list of labels and other a list of scores.

     Example
     -------
-    >>>
-
-",
+    >>> vit_nsfw_classification(image)
+    {""label"": ""normal"", ""scores"": 0.68},
+",vit_nsfw_classification
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
     'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
@@ -560,78 +623,24 @@ desc,doc,name
     >>> siglip_classification(image, ['dog', 'cat', 'bird'])
     {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
 ",siglip_classification
-"'
-'
-
-
-    mask file names and associated probability scores.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
-
-    Example
-    -------
-    >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
-    [
-        [
-            {
-                'label': '0: dinosaur',
-                'bbox': [0.1, 0.11, 0.35, 0.4],
-                'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0],
-                    ...,
-                    [0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-            },
-        ],
-        ...
-    ]
-",owlv2_sam2_video_tracking
-"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
-    'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
+"'minimum_distance' calculates the minimum distance between two detections which can include bounding boxes and or masks. This will return the closest distance between the objects, not the distance between the centers of the objects.","minimum_distance(det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]) -> float:
+    'minimum_distance' calculates the minimum distance between two detections which
+    can include bounding boxes and or masks. This will return the closest distance
+    between the objects, not the distance between the centers of the objects.

     Parameters:
-
-
+        det1 (Dict[str, Any]): The first detection of boxes or masks.
+        det2 (Dict[str, Any]): The second detection of boxes or masks.
+        image_size (Tuple[int, int]): The size of the image given as (height, width).

     Returns:
-
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
+        float: The closest distance between the two detections.

     Example
     -------
-    >>>
-
-
-            {
-                'label': '0: dinosaur',
-                'bbox': [0.1, 0.11, 0.35, 0.4],
-                'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0],
-                    ...,
-                    [0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-            },
-        ],
-        ...
-    ]
-",countgd_sam2_video_tracking
+    >>> closest_distance(det1, det2, image_size)
+    141.42
+",minimum_distance
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
     'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries