vision-agent 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- vision_agent/agent/__init__.py +2 -2
- vision_agent/agent/agent.py +2 -2
- vision_agent/agent/agent_coder.py +8 -8
- vision_agent/agent/{vision_agent_v2.py → data_interpreter.py} +12 -12
- vision_agent/agent/{vision_agent_v2_prompts.py → data_interpreter_prompts.py} +3 -3
- vision_agent/agent/easytool.py +8 -8
- vision_agent/agent/easytool_v2.py +778 -0
- vision_agent/agent/easytool_v2_prompts.py +152 -0
- vision_agent/agent/reflexion.py +8 -8
- vision_agent/agent/vision_agent.py +360 -691
- vision_agent/agent/vision_agent_prompts.py +231 -149
- vision_agent/llm/llm.py +3 -4
- vision_agent/lmm/lmm.py +6 -6
- vision_agent/tools/__init__.py +21 -22
- vision_agent/tools/easytool_tools.py +1242 -0
- vision_agent/tools/tools.py +533 -1090
- vision_agent-0.2.31.dist-info/METADATA +175 -0
- vision_agent-0.2.31.dist-info/RECORD +36 -0
- vision_agent/agent/vision_agent_v3.py +0 -386
- vision_agent/agent/vision_agent_v3_prompts.py +0 -226
- vision_agent/tools/tools_v2.py +0 -685
- vision_agent-0.2.29.dist-info/METADATA +0 -226
- vision_agent-0.2.29.dist-info/RECORD +0 -36
- {vision_agent-0.2.29.dist-info → vision_agent-0.2.31.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.29.dist-info → vision_agent-0.2.31.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -1,17 +1,18 @@
+import inspect
 import io
+import json
 import logging
 import tempfile
-from
+from importlib import resources
 from pathlib import Path
-from typing import Any, Dict, List, Tuple,
+from typing import Any, Callable, Dict, List, Tuple, Union, cast

 import numpy as np
+import pandas as pd
 import requests
-from PIL import Image
-from PIL.Image import Image as ImageType
+from PIL import Image, ImageDraw, ImageFont
 from scipy.spatial import distance  # type: ignore

-from vision_agent.lmm import OpenAILMM
 from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.image_utils import (
@@ -23,1220 +24,662 @@ from vision_agent.utils.image_utils import (

This hunk rewrites the body of tools.py, replacing the class-based Tool hierarchy with plain functions that take and return NumPy arrays.

New module-level definitions:
- COLORS: a palette of 20 RGB tuples used by the overlay helpers.
- _API_KEY (a hard-coded Landing AI API key) and _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text", used by the new ocr function.
- logging.basicConfig(level=logging.INFO) alongside the existing _LOGGER.

New tool functions, each with a full docstring (Parameters, Returns, Example):
- grounding_dino(prompt: str, image: np.ndarray, box_threshold: float = 0.20, iou_threshold: float = 0.20) -> List[Dict[str, Any]]: detects and counts objects given a text prompt such as category names or referring expressions; returns scores, labels and bounding boxes with normalized (xmin, ymin, xmax, ymax) coordinates.
- grounding_sam(prompt: str, image: np.ndarray, box_threshold: float = 0.20, iou_threshold: float = 0.20) -> List[Dict[str, Any]]: detects and segments objects given a text prompt; each result additionally includes a binary 2D mask.
- extract_frames(video_uri: Union[str, Path], fps: float = 0.5) -> List[Tuple[np.ndarray, float]]: extracts (frame, timestamp) tuples from a video via extract_frames_from_video.
- ocr(image: np.ndarray) -> List[Dict[str, Any]]: extracts text with bounding boxes and confidence scores by posting the image to _OCR_URL.
- zero_shot_counting(image: np.ndarray) -> Dict[str, Any]: counts the dominant foreground object with no other input; returns a 'count' plus a decoded heat map.
- visual_prompt_counting(image: np.ndarray, visual_prompt: Dict[str, List[float]]) -> Dict[str, Any]: counts the dominant foreground object given a bounding-box visual prompt; also returns a heat map.
- image_question_answering(image: np.ndarray, prompt: str) -> str: answers a question about the visual contents of an image.
- clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]: classifies an image against a list of classes or tags; returns the labels with their probability scores.
- image_caption(image: np.ndarray) -> str: returns a text caption describing the image.
- closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float: the closest distance between two masks.
- closest_box_distance(box1: List[float], box2: List[float], image_size: Tuple[int, int]) -> float: the closest distance between two bounding boxes.

New utility and visualization functions:
- save_json(data: Any, file_path: str) -> None: saves data as JSON, using a NumpyEncoder to handle NumPy arrays and booleans.
- load_image(image_path: str) -> np.ndarray: loads an image as an RGB NumPy array.
- save_image(image: np.ndarray) -> str: saves an image to a temporary PNG file and returns its path.
- overlay_bounding_boxes(image: np.ndarray, bboxes: List[Dict[str, Any]]) -> np.ndarray: draws boxes, labels and scores on the image, assigning one COLORS entry per label and warning when there are more labels than colors.
- overlay_segmentation_masks(image: np.ndarray, masks: List[Dict[str, Any]]) -> np.ndarray: alpha-composites colored masks onto the image.
- get_tool_documentation, get_tool_descriptions and get_tools_df: build a full docstring dump, a one-line-per-tool description string, and a pandas DataFrame of descriptions and docs from a list of tool functions.

The tool functions that call hosted models continue to go through _send_inference_request(request_data, "tools").

The module now exposes a TOOLS list (grounding_dino, grounding_sam, extract_frames, ocr, clip, zero_shot_counting, visual_prompt_counting, image_question_answering, image_caption, closest_mask_distance, closest_box_distance, save_json, load_image, save_image, overlay_bounding_boxes, overlay_segmentation_masks) plus TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING and UTILITIES_DOCSTRING derived from it.

Removed: the class-based Tool implementations and their usage/example dictionaries, including CLIP, the image-captioning tool, GroundingDINO, GroundingSAM, DINOv, AgentDINOv, AgentGroundingSAM, ZeroShotCounting, VisualPromptCounting, VisualQuestionAnswering, ExtractFrames, Crop, BboxStats, SegArea, BboxIoU, SegIoU, BboxContains, ObjectDistance, BoxDistance, MaskDistance, OCR and Calculator, together with the old TOOLS registry built from those classes.