vision-agent 0.2.56__py3-none-any.whl → 0.2.58__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- vision_agent/__init__.py +1 -2
- vision_agent/agent/agent.py +3 -1
- vision_agent/agent/vision_agent.py +110 -81
- vision_agent/agent/vision_agent_prompts.py +1 -1
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +54 -116
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/tools.py +3 -3
- {vision_agent-0.2.56.dist-info → vision_agent-0.2.58.dist-info}/METADATA +36 -7
- vision_agent-0.2.58.dist-info/RECORD +23 -0
- vision_agent/agent/agent_coder.py +0 -216
- vision_agent/agent/agent_coder_prompts.py +0 -135
- vision_agent/agent/data_interpreter.py +0 -475
- vision_agent/agent/data_interpreter_prompts.py +0 -186
- vision_agent/agent/easytool.py +0 -346
- vision_agent/agent/easytool_prompts.py +0 -89
- vision_agent/agent/easytool_v2.py +0 -781
- vision_agent/agent/easytool_v2_prompts.py +0 -152
- vision_agent/agent/reflexion.py +0 -299
- vision_agent/agent/reflexion_prompts.py +0 -100
- vision_agent/llm/__init__.py +0 -1
- vision_agent/llm/llm.py +0 -176
- vision_agent/tools/easytool_tools.py +0 -1242
- vision_agent-0.2.56.dist-info/RECORD +0 -36
- {vision_agent-0.2.56.dist-info → vision_agent-0.2.58.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.56.dist-info → vision_agent-0.2.58.dist-info}/WHEEL +0 -0
vision_agent/tools/easytool_tools.py
@@ -1,1242 +0,0 @@
-import io
-import logging
-import tempfile
-from abc import ABC
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Type, Union, cast
-
-import numpy as np
-import requests
-from PIL import Image
-from PIL.Image import Image as ImageType
-from scipy.spatial import distance  # type: ignore
-
-from vision_agent.lmm import OpenAILMM
-from vision_agent.tools.tool_utils import _send_inference_request
-from vision_agent.utils import extract_frames_from_video
-from vision_agent.utils.image_utils import (
-    b64_to_pil,
-    convert_to_b64,
-    denormalize_bbox,
-    get_image_size,
-    normalize_bbox,
-    rle_decode,
-)
-
-_LOGGER = logging.getLogger(__name__)
-
-
-class Tool(ABC):
-    name: str
-    description: str
-    usage: Dict
-
-    def __call__(self, *args: Any, **kwargs: Any) -> Any:
-        raise NotImplementedError
-
-
-class NoOp(Tool):
-    name = "noop_"
-    description = "'noop_' is a no-op tool that does nothing if you do not want answer the question directly and not use a tool."
-    usage = {
-        "required_parameters": [],
-        "examples": [
-            {
-                "scenario": "If you do not want to use a tool.",
-                "parameters": {},
-            }
-        ],
-    }
-
-    def __call__(self) -> None:
-        return None
-
-
-class CLIP(Tool):
-    r"""CLIP is a tool that can classify or tag any image given a set of input classes
-    or tags.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> clip = va.tools.CLIP()
-        >>> clip("red line, yellow dot", "ct_scan1.jpg"))
-        [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
-    """
-
-    name = "clip_"
-    description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores."
-    usage = {
-        "required_parameters": [
-            {"name": "prompt", "type": "str"},
-            {"name": "image", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you classify this image as a cat? Image name: cat.jpg",
-                "parameters": {"prompt": "cat", "image": "cat.jpg"},
-            },
-            {
-                "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg",
-                "parameters": {"prompt": "cat, dog", "image": "cat_dog.jpg"},
-            },
-            {
-                "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? Image name: shirts.jpg",
-                "parameters": {
-                    "prompt": "red shirt, green shirt, other",
-                    "image": "shirts.jpg",
-                },
-            },
-        ],
-    }
-
-    # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
-        """Invoke the CLIP model.
-
-        Parameters:
-            prompt: a string includes a list of classes or tags to classify the image.
-            image: the input image to classify.
-
-        Returns:
-            A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
-        """
-        image_b64 = convert_to_b64(image)
-        data = {
-            "prompt": prompt,
-            "image": image_b64,
-            "tool": "closed_set_image_classification",
-        }
-        resp_data = _send_inference_request(data, "tools")
-        resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
-        return resp_data
-
-
-class ImageCaption(Tool):
-    r"""ImageCaption is a tool that can caption an image based on its contents or tags.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> caption = va.tools.ImageCaption()
-        >>> caption("image1.jpg")
-        {'text': ['a box of orange and white socks']}
-    """
-
-    name = "image_caption_"
-    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
-    usage = {
-        "required_parameters": [
-            {"name": "image", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you describe this image? Image name: cat.jpg",
-                "parameters": {"image": "cat.jpg"},
-            },
-            {
-                "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
-                "parameters": {"image": "cat_dog.jpg"},
-            },
-        ],
-    }
-
-    # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, image: Union[str, ImageType]) -> Dict:
-        """Invoke the Image captioning model.
-
-        Parameters:
-            image: the input image to caption.
-
-        Returns:
-            A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
-        """
-        image_b64 = convert_to_b64(image)
-        data = {
-            "image": image_b64,
-            "tool": "image_captioning",
-        }
-        return _send_inference_request(data, "tools")
-
-
-class GroundingDINO(Tool):
-    r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
-    category names or referring expressions.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> t = va.tools.GroundingDINO()
-        >>> t("red line. yellow dot", "ct_scan1.jpg")
-        [{'labels': ['red line', 'yellow dot'],
-        'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]],
-        'scores': [0.98, 0.02]}]
-    """
-
-    name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
-    usage = {
-        "required_parameters": [
-            {"name": "prompt", "type": "str"},
-            {"name": "image", "type": "str"},
-        ],
-        "optional_parameters": [
-            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
-            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you detect and count the giraffes and zebras in this image? Image name: animal.jpg",
-                "parameters": {
-                    "prompt": "giraffe. zebra",
-                    "image": "person.jpg",
-                },
-            },
-            {
-                "scenario": "Can you build me a car detector?",
-                "parameters": {"prompt": "car", "image": ""},
-            },
-            {
-                "scenario": "Can you detect the person on the left and right? Image name: person.jpg",
-                "parameters": {
-                    "prompt": "left person. right person",
-                    "image": "person.jpg",
-                },
-            },
-            {
-                "scenario": "Detect the red shirts and green shirt. Image name: shirts.jpg",
-                "parameters": {
-                    "prompt": "red shirt. green shirt",
-                    "image": "shirts.jpg",
-                    "box_threshold": 0.20,
-                    "iou_threshold": 0.20,
-                },
-            },
-        ],
-    }
-
-    # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(
-        self,
-        prompt: str,
-        image: Union[str, Path, ImageType],
-        box_threshold: float = 0.20,
-        iou_threshold: float = 0.20,
-    ) -> Dict:
-        """Invoke the Grounding DINO model.
-
-        Parameters:
-            prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
-            image: the input image to run against.
-            box_threshold: the threshold to filter out the bounding boxes with low scores.
-            iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.
-
-        Returns:
-            A dictionary containing the labels, scores, and bboxes, which is the detection result for the input image.
-        """
-        image_size = get_image_size(image)
-        image_b64 = convert_to_b64(image)
-        request_data = {
-            "prompt": prompt,
-            "image": image_b64,
-            "tool": "visual_grounding",
-            "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
-        }
-        data: Dict[str, Any] = _send_inference_request(request_data, "tools")
-        if "bboxes" in data:
-            data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
-        if "scores" in data:
-            data["scores"] = [round(score, 2) for score in data["scores"]]
-        if "labels" in data:
-            data["labels"] = list(data["labels"])
-        data["image_size"] = image_size
-        return data
-
-
-class GroundingSAM(Tool):
-    r"""Grounding SAM is a tool that can detect and segment arbitrary objects with
-    inputs such as category names or referring expressions.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> t = va.tools.GroundingSAM()
-        >>> t("red line, yellow dot", "ct_scan1.jpg"])
-        [{'labels': ['yellow dot', 'red line'],
-        'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]],
-        'masks': [array([[0, 0, 0, ..., 0, 0, 0],
-        [0, 0, 0, ..., 0, 0, 0],
-        ...,
-        [0, 0, 0, ..., 0, 0, 0],
-        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)},
-        array([[0, 0, 0, ..., 0, 0, 0],
-        [0, 0, 0, ..., 0, 0, 0],
-        ...,
-        [1, 1, 1, ..., 1, 1, 1],
-        [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
-    """
-
-    name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
-    usage = {
-        "required_parameters": [
-            {"name": "prompt", "type": "str"},
-            {"name": "image", "type": "str"},
-        ],
-        "optional_parameters": [
-            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
-            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you segment the apples and grapes in this image? Image name: fruits.jpg",
-                "parameters": {
-                    "prompt": "apple. grape",
-                    "image": "fruits.jpg",
-                },
-            },
-            {
-                "scenario": "Can you build me a car segmentor?",
-                "parameters": {"prompt": "car", "image": ""},
-            },
-            {
-                "scenario": "Can you segment the person on the left and right? Image name: person.jpg",
-                "parameters": {
-                    "prompt": "left person. right person",
-                    "image": "person.jpg",
-                },
-            },
-            {
-                "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg",
-                "parameters": {
-                    "prompt": "red shirt, green shirt",
-                    "image": "shirts.jpg",
-                    "box_threshold": 0.20,
-                    "iou_threshold": 0.20,
-                },
-            },
-        ],
-    }
-
-    # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(
-        self,
-        prompt: str,
-        image: Union[str, ImageType],
-        box_threshold: float = 0.2,
-        iou_threshold: float = 0.2,
-    ) -> Dict:
-        """Invoke the Grounding SAM model.
-
-        Parameters:
-            prompt: a list of classes to segment.
-            image: the input image to segment.
-            box_threshold: the threshold to filter out the bounding boxes with low scores.
-            iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.
-
-        Returns:
-            A dictionary containing the labels, scores, bboxes and masks for the input image.
-        """
-        image_size = get_image_size(image)
-        image_b64 = convert_to_b64(image)
-        request_data = {
-            "prompt": prompt,
-            "image": image_b64,
-            "tool": "visual_grounding_segment",
-            "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
-        }
-        data: Dict[str, Any] = _send_inference_request(request_data, "tools")
-        if "bboxes" in data:
-            data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
-        if "masks" in data:
-            data["masks"] = [
-                rle_decode(mask_rle=mask, shape=data["mask_shape"])
-                for mask in data["masks"]
-            ]
-        data["image_size"] = image_size
-        data.pop("mask_shape", None)
-        return data
-
-
-class DINOv(Tool):
-    r"""DINOv is a tool that can detect and segment similar objects with the given input masks.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> t = va.tools.DINOv()
-        >>> t(prompt=[{"mask":"balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg"])
-        [{'scores': [0.512, 0.212],
-        'masks': [array([[0, 0, 0, ..., 0, 0, 0],
-        ...,
-        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)},
-        array([[0, 0, 0, ..., 0, 0, 0],
-        ...,
-        [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
-    """
-
-    name = "dinov_"
-    description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
-    usage = {
-        "required_parameters": [
-            {"name": "prompt", "type": "List[Dict[str, str]]"},
-            {"name": "image", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
-                "parameters": {
-                    "prompt": [
-                        {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
-                    ],
-                    "image": "input.jpg",
-                },
-            },
-            {
-                "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: mask.png Reference mask: background.png",
-                "parameters": {
-                    "prompt": [
-                        {"mask": "mask.png", "image": "background.png"},
-                    ],
-                    "image": "original.jpg",
-                },
-            },
-        ],
-    }
-
-    def __call__(
-        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
-    ) -> Dict:
-        """Invoke the DINOv model.
-
-        Parameters:
-            prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
-            image: the input image to segment.
-
-        Returns:
-            A dictionary of the below keys: 'scores', 'masks' and 'mask_shape', which stores a list of detected segmentation masks and its scores.
-        """
-        image_b64 = convert_to_b64(image)
-        for p in prompt:
-            p["mask"] = convert_to_b64(p["mask"])
-            p["image"] = convert_to_b64(p["image"])
-        request_data = {
-            "prompt": prompt,
-            "image": image_b64,
-        }
-        data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
-        if "bboxes" in data:
-            data["bboxes"] = [
-                normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
-            ]
-        if "masks" in data:
-            data["masks"] = [
-                rle_decode(mask_rle=mask, shape=data["mask_shape"])
-                for mask in data["masks"]
-            ]
-        data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
-        mask_shape = data.pop("mask_shape", None)
-        data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
-        return data
-
-
-class AgentDINOv(DINOv):
-    def __call__(
-        self,
-        prompt: List[Dict[str, str]],
-        image: Union[str, ImageType],
-    ) -> Dict:
-        rets = super().__call__(prompt, image)
-        mask_files = []
-        for mask in rets["masks"]:
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                file_name = Path(tmp.name).with_suffix(".mask.png")
-                Image.fromarray(mask * 255).save(file_name)
-            mask_files.append(str(file_name))
-        rets["masks"] = mask_files
-        return rets
-
-
-class AgentGroundingSAM(GroundingSAM):
-    r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
-    returns the file name. This makes it easier for agents to use.
-    """
-
-    def __call__(
-        self,
-        prompt: str,
-        image: Union[str, ImageType],
-        box_threshold: float = 0.2,
-        iou_threshold: float = 0.75,
-    ) -> Dict:
-        rets = super().__call__(prompt, image, box_threshold, iou_threshold)
-        mask_files = []
-        for mask in rets["masks"]:
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                file_name = Path(tmp.name).with_suffix(".mask.png")
-                Image.fromarray(mask * 255).save(file_name)
-            mask_files.append(str(file_name))
-        rets["masks"] = mask_files
-        return rets
-
-
-class ZeroShotCounting(Tool):
-    r"""ZeroShotCounting is a tool that can count total number of instances of an object
-    present in an image belonging to same class without a text or visual prompt.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> zshot_count = va.tools.ZeroShotCounting()
-        >>> zshot_count("image1.jpg")
-        {'count': 45}
-    """
-
-    name = "zero_shot_counting_"
-    description = "'zero_shot_counting_' is a tool that counts foreground items given only an image and no other information. It returns only the count of the objects in the image"
-
-    usage = {
-        "required_parameters": [
-            {"name": "image", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you count the items in the image? Image name: lids.jpg",
-                "parameters": {"image": "lids.jpg"},
-            },
-            {
-                "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
-                "parameters": {"image": "tray.jpg"},
-            },
-            {
-                "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
-                "parameters": {
-                    "image": "shirts.jpg",
-                },
-            },
-        ],
-    }
-
-    # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, image: Union[str, ImageType]) -> Dict:
-        """Invoke the Zero shot counting model.
-
-        Parameters:
-            image: the input image.
-
-        Returns:
-            A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
-        """
-        image_b64 = convert_to_b64(image)
-        data = {
-            "image": image_b64,
-            "tool": "zero_shot_counting",
-        }
-        resp_data = _send_inference_request(data, "tools")
-        resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
-        return resp_data
-
-
-class VisualPromptCounting(Tool):
-    r"""VisualPromptCounting is a tool that can count total number of instances of an object
-    present in an image belonging to same class with help of an visual prompt which is a bounding box.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> prompt_count = va.tools.VisualPromptCounting()
-        >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
-        {'count': 23}
-    """
-
-    name = "visual_prompt_counting_"
-    description = "'visual_prompt_counting_' is a tool that counts foreground items in an image given a visual prompt which is a bounding box describing the object. It returns only the count of the objects in the image."
-
-    usage = {
-        "required_parameters": [
-            {"name": "image", "type": "str"},
-            {"name": "prompt", "type": "Dict[str, List[float]"},
-        ],
-        "examples": [
-            {
-                "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg",
-                "parameters": {
-                    "image": "lids.jpg",
-                    "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]},
-                },
-            },
-            {
-                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}",
-                "parameters": {
-                    "image": "tray.jpg",
-                    "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]},
-                },
-            },
-            {
-                "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg",
-                "parameters": {
-                    "image": "shirts.jpg",
-                    "prompt": {"bbox": [100, 115, 200, 200]},
-                },
-            },
-            {
-                "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}",
-                "parameters": {
-                    "image": "shoes.jpg",
-                    "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]},
-                },
-            },
-        ],
-    }
-
-    def __call__(
-        self, image: Union[str, ImageType], prompt: Dict[str, List[float]]
-    ) -> Dict:
-        """Invoke the few shot counting model.
-
-        Parameters:
-            image: the input image.
-            prompt: the visual prompt which is a bounding box describing the object.
-
-        Returns:
-            A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
-        """
-        image_size = get_image_size(image)
-        bbox = prompt["bbox"]
-        bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
-        image_b64 = convert_to_b64(image)
-
-        data = {
-            "image": image_b64,
-            "prompt": bbox_str,
-            "tool": "few_shot_counting",
-        }
-        resp_data = _send_inference_request(data, "tools")
-        resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
-        return resp_data
-
-
-class VisualQuestionAnswering(Tool):
-    r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> vqa_tool = va.tools.VisualQuestionAnswering()
-        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
-        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
-    """
-
-    name = "visual_question_answering_"
-    description = "'visual_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question"
-
-    usage = {
-        "required_parameters": [
-            {"name": "image", "type": "str"},
-            {"name": "prompt", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Describe this image in detail. Image name: cat.jpg",
-                "parameters": {
-                    "image": "cats.jpg",
-                    "prompt": "Describe this image in detail",
-                },
-            },
-            {
-                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
-                "parameters": {
-                    "image": "sign.jpg",
-                    "prompt": "Can you help me with this street sign ? What does it say ?",
-                },
-            },
-            {
-                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
-                "parameters": {
-                    "image": "weather.jpg",
-                    "prompt": "Describe the weather in the image for me ",
-                },
-            },
-            {
-                "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
-                "parameters": {
-                    "image": "chart.jpg",
-                    "prompt": "Which 2 are the least frequent bins in this histogram",
-                },
-            },
-        ],
-    }
-
-    def __call__(self, image: str, prompt: str) -> Dict:
-        """Invoke the visual question answering model.
-
-        Parameters:
-            image: the input image.
-
-        Returns:
-            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
-        """
-
-        gpt = OpenAILMM()
-        return {"text": gpt(input=prompt, images=[image])}
-
-
-class ImageQuestionAnswering(Tool):
-    r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
-    It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
-    It is also useful if the user wants the data to be not exposed to OpenAI endpoints
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> vqa_tool = va.tools.ImageQuestionAnswering()
-        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
-        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
-    """
-
-    name = "image_question_answering_"
-    description = "'image_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question"
-
-    usage = {
-        "required_parameters": [
-            {"name": "image", "type": "str"},
-            {"name": "prompt", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Describe this image in detail. Image name: cat.jpg",
-                "parameters": {
-                    "image": "cats.jpg",
-                    "prompt": "Describe this image in detail",
-                },
-            },
-            {
-                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
-                "parameters": {
-                    "image": "sign.jpg",
-                    "prompt": "Can you help me with this street sign ? What does it say ?",
-                },
-            },
-            {
-                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
-                "parameters": {
-                    "image": "weather.jpg",
-                    "prompt": "Describe the weather in the image for me ",
-                },
-            },
-            {
-                "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
-                "parameters": {
-                    "image": "chart.jpg",
-                    "prompt": "Which 2 are the least frequent bins in this histogram",
-                },
-            },
-        ],
-    }
-
-    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
-        """Invoke the visual question answering model.
-
-        Parameters:
-            image: the input image.
-
-        Returns:
-            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
-        """
-
-        image_b64 = convert_to_b64(image)
-        data = {
-            "image": image_b64,
-            "prompt": prompt,
-            "tool": "image_question_answering",
-        }
-
-        return _send_inference_request(data, "tools")
-
-
-class Crop(Tool):
-    r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
-
-    name = "crop_"
-    description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image. It returns a file with the cropped image."
-    usage = {
-        "required_parameters": [
-            {"name": "bbox", "type": "List[float]"},
-            {"name": "image", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you crop the image to the bounding box [0.1, 0.1, 0.9, 0.9]? Image name: image.jpg",
-                "parameters": {"bbox": [0.1, 0.1, 0.9, 0.9], "image": "image.jpg"},
-            },
-            {
-                "scenario": "Cut out the image to the bounding box [0.2, 0.2, 0.8, 0.8]. Image name: car.jpg",
-                "parameters": {"bbox": [0.2, 0.2, 0.8, 0.8], "image": "car.jpg"},
-            },
-        ],
-    }
-
-    def __call__(self, bbox: List[float], image: Union[str, Path]) -> Dict:
-        pil_image = Image.open(image)
-        width, height = pil_image.size
-        bbox = [
-            int(bbox[0] * width),
-            int(bbox[1] * height),
-            int(bbox[2] * width),
-            int(bbox[3] * height),
-        ]
-        cropped_image = pil_image.crop(bbox)  # type: ignore
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-            cropped_image.save(tmp.name)
-
-        return {"image": tmp.name}
-
-
-class BboxStats(Tool):
-    r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""
-
-    name = "bbox_stats_"
-    description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
-    usage = {
-        "required_parameters": [
-            {"name": "bboxes", "type": "List[int]"},
-            {"name": "image_size", "type": "Tuple[int]"},
-        ],
-        "examples": [
-            {
-                "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
-                "parameters": {
-                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
-                    "image_size": (500, 1200),
-                },
-            },
-            {
-                "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
-                "parameters": {
-                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
-                    "image_size": (640, 480),
-                },
-            },
-        ],
-    }
-
-    def __call__(
-        self, bboxes: List[List[int]], image_size: Tuple[int, int]
-    ) -> List[Dict]:
-        areas = []
-        height, width = image_size
-        for bbox in bboxes:
-            x1, y1, x2, y2 = bbox
-            areas.append(
-                {
-                    "width": round((x2 - x1) * width, 2),
-                    "height": round((y2 - y1) * height, 2),
-                    "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
-                }
-            )
-
-        return areas
-
-
-class SegArea(Tool):
-    r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places."""
-
-    name = "seg_area_"
-    description = "'seg_area_' returns the area of the given segmentation mask in pixels normalized to 2 decimal places."
-    usage = {
-        "required_parameters": [{"name": "masks", "type": "str"}],
-        "examples": [
-            {
-                "scenario": "If you want to calculate the area of the segmentation mask, pass the masks file name.",
-                "parameters": {"masks": "mask_file.jpg"},
-            },
-        ],
-    }
-
-    def __call__(self, masks: Union[str, Path]) -> float:
-        pil_mask = Image.open(str(masks))
-        np_mask = np.array(pil_mask)
-        np_mask = np.clip(np_mask, 0, 1)
-        return cast(float, round(np.sum(np_mask), 2))
-
-
-class BboxIoU(Tool):
-    name = "bbox_iou_"
-    description = "'bbox_iou_' returns the intersection over union of two bounding boxes. This is a good tool for determining if two objects are overlapping."
-    usage = {
-        "required_parameters": [
-            {"name": "bbox1", "type": "List[int]"},
-            {"name": "bbox2", "type": "List[int]"},
-        ],
-        "examples": [
-            {
-                "scenario": "If you want to calculate the intersection over union of the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
-                "parameters": {
-                    "bbox1": [0.2, 0.21, 0.34, 0.42],
-                    "bbox2": [0.3, 0.31, 0.44, 0.52],
-                },
-            }
-        ],
-    }
-
-    def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
-        x1, y1, x2, y2 = bbox1
-        x3, y3, x4, y4 = bbox2
-        xA = max(x1, x3)
-        yA = max(y1, y3)
-        xB = min(x2, x4)
-        yB = min(y2, y4)
-        inter_area = max(0, xB - xA) * max(0, yB - yA)
-        boxa_area = (x2 - x1) * (y2 - y1)
-        boxb_area = (x4 - x3) * (y4 - y3)
-        iou = inter_area / float(boxa_area + boxb_area - inter_area)
-        return round(iou, 2)
-
-
-class SegIoU(Tool):
-    name = "seg_iou_"
-    description = "'seg_iou_' returns the intersection over union of two segmentation masks given their segmentation mask files."
-    usage = {
-        "required_parameters": [
-            {"name": "mask1", "type": "str"},
-            {"name": "mask2", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
-                "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
-            }
-        ],
-    }
-
-    def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
-        pil_mask1 = Image.open(str(mask1))
-        pil_mask2 = Image.open(str(mask2))
-        np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
-        np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
-        intersection = np.logical_and(np_mask1, np_mask2)
-        union = np.logical_or(np_mask1, np_mask2)
-        iou = np.sum(intersection) / np.sum(union)
-        return cast(float, round(iou, 2))
-
-
-class BboxContains(Tool):
-    name = "bbox_contains_"
-    description = "Given two bounding boxes, a target bounding box and a region bounding box, 'bbox_contains_' returns the intersection of the two bounding boxes which is the percentage area of the target bounding box overlaps with the region bounding box. This is a good tool for determining if the region object contains the target object."
-    usage = {
-        "required_parameters": [
-            {"name": "target", "type": "List[int]"},
-            {"name": "target_class", "type": "str"},
-            {"name": "region", "type": "List[int]"},
-            {"name": "region_class", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Determine if the dog on the couch, bounding box of the dog: [0.2, 0.21, 0.34, 0.42], bounding box of the couch: [0.3, 0.31, 0.44, 0.52]",
-                "parameters": {
-                    "target": [0.2, 0.21, 0.34, 0.42],
-                    "target_class": "dog",
-                    "region": [0.3, 0.31, 0.44, 0.52],
-                    "region_class": "couch",
-                },
-            },
-            {
-                "scenario": "Check if the kid is in the pool? bounding box of the kid: [0.2, 0.21, 0.34, 0.42], bounding box of the pool: [0.3, 0.31, 0.44, 0.52]",
-                "parameters": {
-                    "target": [0.2, 0.21, 0.34, 0.42],
-                    "target_class": "kid",
-                    "region": [0.3, 0.31, 0.44, 0.52],
-                    "region_class": "pool",
-                },
-            },
-        ],
-    }
-
-    def __call__(
-        self, target: List[int], target_class: str, region: List[int], region_class: str
-    ) -> Dict[str, Union[str, float]]:
-        x1, y1, x2, y2 = target
-        x3, y3, x4, y4 = region
-        xA = max(x1, x3)
-        yA = max(y1, y3)
-        xB = min(x2, x4)
-        yB = min(y2, y4)
-        inter_area = max(0, xB - xA) * max(0, yB - yA)
-        boxa_area = (x2 - x1) * (y2 - y1)
-        iou = inter_area / float(boxa_area)
-        area = round(iou, 2)
-        return {
-            "target_class": target_class,
-            "region_class": region_class,
-            "intersection": area,
-        }
-
-
-class ObjectDistance(Tool):
-    name = "object_distance_"
-    description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects."
-    usage = {
-        "required_parameters": [
-            {"name": "object1", "type": "Dict[str, Any]"},
-            {"name": "object2", "type": "Dict[str, Any]"},
-        ],
-        "examples": [
-            {
-                "scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}",
-                "parameters": {
-                    "object1": {
-                        "bboxes": [0.2, 0.21, 0.34, 0.42],
-                        "scores": 0.54,
-                        "masks": "mask_file1.png",
-                    },
-                    "object2": {
-                        "bboxes": [0.3, 0.31, 0.44, 0.52],
-                        "scores": 0.66,
-                        "masks": "mask_file2.png",
-                    },
-                },
-            }
-        ],
-    }
-
-    def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float:
-        if "masks" in object1 and "masks" in object2:
-            mask1 = object1["masks"]
-            mask2 = object2["masks"]
-            return MaskDistance()(mask1, mask2)
-        elif "bboxes" in object1 and "bboxes" in object2:
-            bbox1 = object1["bboxes"]
-            bbox2 = object2["bboxes"]
-            return BoxDistance()(bbox1, bbox2)
-        else:
-            raise ValueError("Either of the objects should have masks or bboxes")
-
-
-class BoxDistance(Tool):
-    name = "box_distance_"
-    description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes"
-    usage = {
-        "required_parameters": [
-            {"name": "bbox1", "type": "List[int]"},
-            {"name": "bbox2", "type": "List[int]"},
-        ],
-        "examples": [
-            {
-                "scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
-                "parameters": {
-                    "bbox1": [0.2, 0.21, 0.34, 0.42],
-                    "bbox2": [0.3, 0.31, 0.44, 0.52],
-                },
-            }
-        ],
-    }
-
-    def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
-        x11, y11, x12, y12 = bbox1
-        x21, y21, x22, y22 = bbox2
-
-        horizontal_dist = np.max([0, x21 - x12, x11 - x22])
-        vertical_dist = np.max([0, y21 - y12, y11 - y22])
-
-        return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))
-
-
-class MaskDistance(Tool):
-    name = "mask_distance_"
-    description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. It returns the minumum distance between the given masks"
-    usage = {
-        "required_parameters": [
-            {"name": "mask1", "type": "str"},
-            {"name": "mask2", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg",
-                "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
-            }
-        ],
-    }
-
-    def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
-        pil_mask1 = Image.open(str(mask1))
-        pil_mask2 = Image.open(str(mask2))
-        np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
-        np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
-
-        mask1_points = np.transpose(np.nonzero(np_mask1))
-        mask2_points = np.transpose(np.nonzero(np_mask2))
-        dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
-        return cast(float, np.round(np.min(dist_matrix), 2))
-
-
-class ExtractFrames(Tool):
-    r"""Extract frames from a video."""
-
-    name = "extract_frames_"
-    description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
-    usage = {
-        "required_parameters": [{"name": "video_uri", "type": "str"}],
-        "optional_parameters": [{"name": "frames_every", "type": "float"}],
-        "examples": [
-            {
-                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
-                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
-            },
-            {
-                "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
-                "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
-            },
-        ],
-    }
-
-    def __call__(
-        self, video_uri: str, frames_every: float = 2
-    ) -> List[Tuple[str, float]]:
-        """Extract frames from a video.
-
-
-        Parameters:
-            video_uri: the path to the video file or a url points to the video data
-
-        Returns:
-            a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
-        """
-        frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
-        result = []
-        _LOGGER.info(
-            f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
-        )
-        for frame, ts in frames:
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                file_name = Path(tmp.name).with_suffix(".frame.png")
-                Image.fromarray(frame).save(file_name)
-            result.append((str(file_name), ts))
-        return result
-
-
-class OCR(Tool):
-    name = "ocr_"
-    description = "'ocr_' extracts text from an image. It returns a list of detected text, bounding boxes, and confidence scores."
-    usage = {
-        "required_parameters": [
-            {"name": "image", "type": "str"},
-        ],
-        "examples": [
-            {
-                "scenario": "Can you extract the text from this image? Image name: image.png",
-                "parameters": {"image": "image.png"},
-            },
-        ],
-    }
-    _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
-    _URL = "https://app.landing.ai/ocr/v1/detect-text"
-
-    def __call__(self, image: str) -> dict:
-        pil_image = Image.open(image).convert("RGB")
-        image_size = pil_image.size[::-1]
-        image_buffer = io.BytesIO()
-        pil_image.save(image_buffer, format="PNG")
-        buffer_bytes = image_buffer.getvalue()
-        image_buffer.close()
-
-        res = requests.post(
-            self._URL,
-            files={"images": buffer_bytes},
-            data={"language": "en"},
-            headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
-        )
-        if res.status_code != 200:
-            _LOGGER.error(f"Request failed: {res.text}")
-            raise ValueError(f"Request failed: {res.text}")
-
-        data = res.json()
-        output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
-        for det in data[0]:
-            output["labels"].append(det["text"])
-            box = [
-                det["location"][0]["x"],
-                det["location"][0]["y"],
-                det["location"][2]["x"],
-                det["location"][2]["y"],
-            ]
-            box = normalize_bbox(box, image_size)
-            output["bboxes"].append(box)
-            output["scores"].append(round(det["score"], 2))
-        return output
-
-
-class Calculator(Tool):
-    r"""Calculator is a tool that can perform basic arithmetic operations."""
-
-    name = "calculator_"
-    description = (
-        "'calculator_' is a tool that can perform basic arithmetic operations."
-    )
-    usage = {
-        "required_parameters": [{"name": "equation", "type": "str"}],
-        "examples": [
-            {
-                "scenario": "If you want to calculate (2 * 3) + 4",
-                "parameters": {"equation": "2 + 4"},
-            },
-            {
-                "scenario": "If you want to calculate (4 + 2.5) / 2.1",
-                "parameters": {"equation": "(4 + 2.5) / 2.1"},
-            },
-        ],
-    }
-
-    def __call__(self, equation: str) -> float:
-        return cast(float, round(eval(equation), 2))
-
-
-TOOLS = {
-    i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
-    for i, c in enumerate(
-        [
-            NoOp,
-            CLIP,
-            GroundingDINO,
-            AgentGroundingSAM,
-            ZeroShotCounting,
-            VisualPromptCounting,
-            VisualQuestionAnswering,
-            AgentDINOv,
-            ExtractFrames,
-            Crop,
-            BboxStats,
-            SegArea,
-            ObjectDistance,
-            BboxContains,
-            SegIoU,
-            OCR,
-            Calculator,
-        ]
-    )
-    if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
-}
-
-
-def register_tool(tool: Type[Tool]) -> Type[Tool]:
-    r"""Add a tool to the list of available tools.
-
-    Parameters:
-        tool: The tool to add.
-    """
-
-    if (
-        not hasattr(tool, "name")
-        or not hasattr(tool, "description")
-        or not hasattr(tool, "usage")
-    ):
-        raise ValueError(
-            "The tool must have 'name', 'description' and 'usage' attributes."
-        )
-
-    TOOLS[len(TOOLS)] = {
-        "name": tool.name,
-        "description": tool.description,
-        "usage": tool.usage,
-        "class": tool,
-    }
-    return tool