vision-agent 0.2.30__py3-none-any.whl → 0.2.31__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -1,17 +1,18 @@
1
+ import inspect
1
2
  import io
3
+ import json
2
4
  import logging
3
5
  import tempfile
4
- from abc import ABC
6
+ from importlib import resources
5
7
  from pathlib import Path
6
- from typing import Any, Dict, List, Tuple, Type, Union, cast
8
+ from typing import Any, Callable, Dict, List, Tuple, Union, cast
7
9
 
8
10
  import numpy as np
11
+ import pandas as pd
9
12
  import requests
10
- from PIL import Image
11
- from PIL.Image import Image as ImageType
13
+ from PIL import Image, ImageDraw, ImageFont
12
14
  from scipy.spatial import distance # type: ignore
13
15
 
14
- from vision_agent.lmm import OpenAILMM
15
16
  from vision_agent.tools.tool_utils import _send_inference_request
16
17
  from vision_agent.utils import extract_frames_from_video
17
18
  from vision_agent.utils.image_utils import (
@@ -23,1220 +24,662 @@ from vision_agent.utils.image_utils import (
23
24
  rle_decode,
24
25
  )
25
26
 
27
+ COLORS = [
28
+ (158, 218, 229),
29
+ (219, 219, 141),
30
+ (23, 190, 207),
31
+ (188, 189, 34),
32
+ (199, 199, 199),
33
+ (247, 182, 210),
34
+ (127, 127, 127),
35
+ (227, 119, 194),
36
+ (196, 156, 148),
37
+ (197, 176, 213),
38
+ (140, 86, 75),
39
+ (148, 103, 189),
40
+ (255, 152, 150),
41
+ (152, 223, 138),
42
+ (214, 39, 40),
43
+ (44, 160, 44),
44
+ (255, 187, 120),
45
+ (174, 199, 232),
46
+ (255, 127, 14),
47
+ (31, 119, 180),
48
+ ]
49
+ _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
50
+ _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
51
+ logging.basicConfig(level=logging.INFO)
26
52
  _LOGGER = logging.getLogger(__name__)
27
53
 
28
54
 
29
- class Tool(ABC):
30
- name: str
31
- description: str
32
- usage: Dict
33
-
34
- def __call__(self, *args: Any, **kwargs: Any) -> Any:
35
- raise NotImplementedError
55
+ def grounding_dino(
56
+ prompt: str,
57
+ image: np.ndarray,
58
+ box_threshold: float = 0.20,
59
+ iou_threshold: float = 0.20,
60
+ ) -> List[Dict[str, Any]]:
61
+ """'grounding_dino' is a tool that can detect and count objects given a text prompt
62
+ such as category names or referring expressions. It returns a list and count of
63
+ bounding boxes, label names and associated probability scores.
36
64
 
65
+ Parameters:
66
+ prompt (str): The prompt to ground to the image.
67
+ image (np.ndarray): The image to ground the prompt to.
68
+ box_threshold (float, optional): The threshold for the box detection. Defaults
69
+ to 0.20.
70
+ iou_threshold (float, optional): The threshold for the Intersection over Union
71
+ (IoU). Defaults to 0.20.
72
+
73
+ Returns:
74
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
75
+ bounding box of the detected objects with normalized coordinates
76
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
77
+ xmax and ymax are the coordinates of the bottom-right of the bounding box.
37
78
 
38
- class NoOp(Tool):
39
- name = "noop_"
40
- description = "'noop_' is a no-op tool that does nothing if you do not want answer the question directly and not use a tool."
41
- usage = {
42
- "required_parameters": [],
43
- "examples": [
44
- {
45
- "scenario": "If you do not want to use a tool.",
46
- "parameters": {},
47
- }
48
- ],
79
+ Example
80
+ -------
81
+ >>> grounding_dino("car. dinosaur", image)
82
+ [
83
+ {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
84
+ {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
85
+ ]
86
+ """
87
+ image_size = image.shape[:2]
88
+ image_b64 = convert_to_b64(image)
89
+ request_data = {
90
+ "prompt": prompt,
91
+ "image": image_b64,
92
+ "tool": "visual_grounding",
93
+ "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
49
94
  }
95
+ data: Dict[str, Any] = _send_inference_request(request_data, "tools")
96
+ return_data = []
97
+ for i in range(len(data["bboxes"])):
98
+ return_data.append(
99
+ {
100
+ "score": round(data["scores"][i], 2),
101
+ "label": data["labels"][i],
102
+ "bbox": normalize_bbox(data["bboxes"][i], image_size),
103
+ }
104
+ )
105
+ return return_data
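
Since the returned boxes are normalized to the image size, a common next step is converting them back to pixel coordinates, for example before cropping or drawing. A minimal sketch, using an illustrative detection list in the documented format rather than real model output:

```python
# Illustrative detections in the documented format; real values would come
# from grounding_dino(prompt, image).
detections = [
    {"score": 0.99, "label": "dinosaur", "bbox": [0.1, 0.11, 0.35, 0.4]},
    {"score": 0.98, "label": "car", "bbox": [0.2, 0.21, 0.45, 0.5]},
]
height, width = 480, 640  # image.shape[:2] of the image that was passed in

for det in detections:
    xmin, ymin, xmax, ymax = det["bbox"]
    # Scale the normalized box back to pixel coordinates.
    pixel_box = [int(xmin * width), int(ymin * height),
                 int(xmax * width), int(ymax * height)]
    print(det["label"], det["score"], pixel_box)
```
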
50
106
 
51
- def __call__(self) -> None:
52
- return None
53
107
 
108
+ def grounding_sam(
109
+ prompt: str,
110
+ image: np.ndarray,
111
+ box_threshold: float = 0.20,
112
+ iou_threshold: float = 0.20,
113
+ ) -> List[Dict[str, Any]]:
114
+ """'grounding_sam' is a tool that can detect and segment objects given a text
115
+ prompt such as category names or referring expressions. It returns a list of
116
+ bounding boxes, label names, masks and associated probability scores.
54
117
 
55
- class CLIP(Tool):
56
- r"""CLIP is a tool that can classify or tag any image given a set of input classes
57
- or tags.
118
+ Parameters:
119
+ prompt (str): The prompt to ground to the image.
120
+ image (np.ndarray): The image to ground the prompt to.
121
+ box_threshold (float, optional): The threshold for the box detection. Defaults
122
+ to 0.20.
123
+ iou_threshold (float, optional): The threshold for the Intersection over Union
124
+ (IoU). Defaults to 0.20.
125
+
126
+ Returns:
127
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
128
+ bounding box, and mask of the detected objects with normalized coordinates
129
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
130
+ xmax and ymax are the coordinates of the bottom-right of the bounding box.
131
+ The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
132
+ the background.
58
133
 
59
134
  Example
60
135
  -------
61
- >>> import vision_agent as va
62
- >>> clip = va.tools.CLIP()
63
- >>> clip("red line, yellow dot", "ct_scan1.jpg"))
64
- [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
136
+ >>> grounding_sam("car. dinosaur", image)
137
+ [
138
+ {
139
+ 'score': 0.99,
140
+ 'label': 'dinosaur',
141
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
142
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
143
+ [0, 0, 0, ..., 0, 0, 0],
144
+ ...,
145
+ [0, 0, 0, ..., 0, 0, 0],
146
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
147
+ },
148
+ ]
65
149
  """
66
-
67
- name = "clip_"
68
- description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores."
69
- usage = {
70
- "required_parameters": [
71
- {"name": "prompt", "type": "str"},
72
- {"name": "image", "type": "str"},
73
- ],
74
- "examples": [
75
- {
76
- "scenario": "Can you classify this image as a cat? Image name: cat.jpg",
77
- "parameters": {"prompt": "cat", "image": "cat.jpg"},
78
- },
79
- {
80
- "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg",
81
- "parameters": {"prompt": "cat, dog", "image": "cat_dog.jpg"},
82
- },
83
- {
84
- "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? Image name: shirts.jpg",
85
- "parameters": {
86
- "prompt": "red shirt, green shirt, other",
87
- "image": "shirts.jpg",
88
- },
89
- },
90
- ],
150
+ image_size = image.shape[:2]
151
+ image_b64 = convert_to_b64(image)
152
+ request_data = {
153
+ "prompt": prompt,
154
+ "image": image_b64,
155
+ "tool": "visual_grounding_segment",
156
+ "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
91
157
  }
158
+ data: Dict[str, Any] = _send_inference_request(request_data, "tools")
159
+ return_data = []
160
+ for i in range(len(data["bboxes"])):
161
+ return_data.append(
162
+ {
163
+ "score": round(data["scores"][i], 2),
164
+ "label": data["labels"][i],
165
+ "bbox": normalize_bbox(data["bboxes"][i], image_size),
166
+ "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
167
+ }
168
+ )
169
+ return return_data
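
Because each mask comes back as a binary array at full image resolution, simple pixel arithmetic gives per-object statistics. A small sketch over an illustrative result of the documented shape (a real call would be `grounding_sam("car. dinosaur", image)`):

```python
import numpy as np

# Illustrative result in the documented shape, not real model output.
result = [
    {"score": 0.99, "label": "dinosaur", "bbox": [0.1, 0.11, 0.35, 0.4],
     "mask": np.ones((480, 640), dtype=np.uint8)},
]

for det in result:
    area = int(det["mask"].sum())        # foreground pixels in the mask
    coverage = area / det["mask"].size   # fraction of the image covered
    print(f"{det['label']}: {area} px ({coverage:.1%} of the image)")
```
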
92
170
 
93
- # TODO: Add support for input multiple images, which aligns with the output type.
94
- def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
95
- """Invoke the CLIP model.
96
-
97
- Parameters:
98
- prompt: a string includes a list of classes or tags to classify the image.
99
- image: the input image to classify.
100
171
 
101
- Returns:
102
- A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
103
- """
104
- image_b64 = convert_to_b64(image)
105
- data = {
106
- "prompt": prompt,
107
- "image": image_b64,
108
- "tool": "closed_set_image_classification",
109
- }
110
- resp_data = _send_inference_request(data, "tools")
111
- resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
112
- return resp_data
172
+ def extract_frames(
173
+ video_uri: Union[str, Path], fps: float = 0.5
174
+ ) -> List[Tuple[np.ndarray, float]]:
175
+ """'extract_frames' extracts frames from a video, returns a list of tuples (frame,
176
+ timestamp), where timestamp is the relative time in seconds where the frame was
177
+ captured. The frame is a numpy array.
113
178
 
179
+ Parameters:
180
+ video_uri (Union[str, Path]): The path to the video file.
181
+ fps (float, optional): The frame rate per second to extract the frames. Defaults
182
+ to 0.5.
114
183
 
115
- class ImageCaption(Tool):
116
- r"""ImageCaption is a tool that can caption an image based on its contents or tags.
184
+ Returns:
185
+ List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
186
+ and the timestamp in seconds.
117
187
 
118
188
  Example
119
189
  -------
120
- >>> import vision_agent as va
121
- >>> caption = va.tools.ImageCaption()
122
- >>> caption("image1.jpg")
123
- {'text': ['a box of orange and white socks']}
190
+ >>> extract_frames("path/to/video.mp4")
191
+ [(frame1, 0.0), (frame2, 0.5), ...]
124
192
  """
125
193
 
126
- name = "image_caption_"
127
- description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
128
- usage = {
129
- "required_parameters": [
130
- {"name": "image", "type": "str"},
131
- ],
132
- "examples": [
133
- {
134
- "scenario": "Can you describe this image? Image name: cat.jpg",
135
- "parameters": {"image": "cat.jpg"},
136
- },
137
- {
138
- "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
139
- "parameters": {"image": "cat_dog.jpg"},
140
- },
141
- ],
142
- }
143
-
144
- # TODO: Add support for input multiple images, which aligns with the output type.
145
- def __call__(self, image: Union[str, ImageType]) -> Dict:
146
- """Invoke the Image captioning model.
194
+ return extract_frames_from_video(str(video_uri), fps)
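
The fps argument is a sampling rate, so fps=0.5 yields one frame every two seconds. A usage sketch, assuming the function is re-exported from vision_agent.tools (adjust the import to wherever this module is exposed in the package); the video path is hypothetical:

```python
from vision_agent.tools import extract_frames  # assumed export; adjust if needed

frames = extract_frames("path/to/video.mp4", fps=0.5)  # one frame every 2 seconds
for frame, timestamp in frames:
    # frame is an HxWxC numpy array, timestamp is seconds from the start of the video
    print(f"t={timestamp:.1f}s shape={frame.shape}")
```
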
147
195
 
148
- Parameters:
149
- image: the input image to caption.
150
196
 
151
- Returns:
152
- A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
153
- """
154
- image_b64 = convert_to_b64(image)
155
- data = {
156
- "image": image_b64,
157
- "tool": "image_captioning",
158
- }
159
- return _send_inference_request(data, "tools")
197
+ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
198
+ """'ocr' extracts text from an image. It returns a list of detected text, bounding
199
+ boxes, and confidence scores.
160
200
 
201
+ Parameters:
202
+ image (np.ndarray): The image to extract text from.
161
203
 
162
- class GroundingDINO(Tool):
163
- r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
164
- category names or referring expressions.
204
+ Returns:
205
+ List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
206
+ and confidence score.
165
207
 
166
208
  Example
167
209
  -------
168
- >>> import vision_agent as va
169
- >>> t = va.tools.GroundingDINO()
170
- >>> t("red line. yellow dot", "ct_scan1.jpg")
171
- [{'labels': ['red line', 'yellow dot'],
172
- 'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]],
173
- 'scores': [0.98, 0.02]}]
210
+ >>> ocr(image)
211
+ [
212
+ {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
213
+ ]
174
214
  """
175
215
 
176
- name = "grounding_dino_"
177
- description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
178
- usage = {
179
- "required_parameters": [
180
- {"name": "prompt", "type": "str"},
181
- {"name": "image", "type": "str"},
182
- ],
183
- "optional_parameters": [
184
- {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
185
- {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
186
- ],
187
- "examples": [
188
- {
189
- "scenario": "Can you detect and count the giraffes and zebras in this image? Image name: animal.jpg",
190
- "parameters": {
191
- "prompt": "giraffe. zebra",
192
- "image": "person.jpg",
193
- },
194
- },
195
- {
196
- "scenario": "Can you build me a car detector?",
197
- "parameters": {"prompt": "car", "image": ""},
198
- },
199
- {
200
- "scenario": "Can you detect the person on the left and right? Image name: person.jpg",
201
- "parameters": {
202
- "prompt": "left person. right person",
203
- "image": "person.jpg",
204
- },
205
- },
206
- {
207
- "scenario": "Detect the red shirts and green shirt. Image name: shirts.jpg",
208
- "parameters": {
209
- "prompt": "red shirt. green shirt",
210
- "image": "shirts.jpg",
211
- "box_threshold": 0.20,
212
- "iou_threshold": 0.20,
213
- },
214
- },
215
- ],
216
- }
216
+ pil_image = Image.fromarray(image).convert("RGB")
217
+ image_size = pil_image.size[::-1]
218
+ image_buffer = io.BytesIO()
219
+ pil_image.save(image_buffer, format="PNG")
220
+ buffer_bytes = image_buffer.getvalue()
221
+ image_buffer.close()
222
+
223
+ res = requests.post(
224
+ _OCR_URL,
225
+ files={"images": buffer_bytes},
226
+ data={"language": "en"},
227
+ headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
228
+ )
217
229
 
218
- # TODO: Add support for input multiple images, which aligns with the output type.
219
- def __call__(
220
- self,
221
- prompt: str,
222
- image: Union[str, Path, ImageType],
223
- box_threshold: float = 0.20,
224
- iou_threshold: float = 0.20,
225
- ) -> Dict:
226
- """Invoke the Grounding DINO model.
227
-
228
- Parameters:
229
- prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
230
- image: the input image to run against.
231
- box_threshold: the threshold to filter out the bounding boxes with low scores.
232
- iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.
233
-
234
- Returns:
235
- A dictionary containing the labels, scores, and bboxes, which is the detection result for the input image.
236
- """
237
- image_size = get_image_size(image)
238
- image_b64 = convert_to_b64(image)
239
- request_data = {
240
- "prompt": prompt,
241
- "image": image_b64,
242
- "tool": "visual_grounding",
243
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
244
- }
245
- data: Dict[str, Any] = _send_inference_request(request_data, "tools")
246
- if "bboxes" in data:
247
- data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
248
- if "scores" in data:
249
- data["scores"] = [round(score, 2) for score in data["scores"]]
250
- if "labels" in data:
251
- data["labels"] = list(data["labels"])
252
- data["image_size"] = image_size
253
- return data
254
-
255
-
256
- class GroundingSAM(Tool):
257
- r"""Grounding SAM is a tool that can detect and segment arbitrary objects with
258
- inputs such as category names or referring expressions.
230
+ if res.status_code != 200:
231
+ raise ValueError(f"OCR request failed with status code {res.status_code}")
232
+
233
+ data = res.json()
234
+ output = []
235
+ for det in data[0]:
236
+ label = det["text"]
237
+ box = [
238
+ det["location"][0]["x"],
239
+ det["location"][0]["y"],
240
+ det["location"][2]["x"],
241
+ det["location"][2]["y"],
242
+ ]
243
+ box = normalize_bbox(box, image_size)
244
+ output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
245
+
246
+ return output
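
Since each detection carries its own normalized box, the results can be reassembled into rough reading order with plain sorting. A minimal sketch over an illustrative OCR result of the documented shape (real values would come from `ocr(image)`):

```python
# Illustrative OCR output, not real model output.
detections = [
    {"label": "world", "bbox": [0.40, 0.10, 0.55, 0.15], "score": 0.98},
    {"label": "hello", "bbox": [0.10, 0.10, 0.30, 0.15], "score": 0.99},
]

# Sort top-to-bottom, then left-to-right, and join the detected text.
ordered = sorted(detections, key=lambda d: (round(d["bbox"][1], 2), d["bbox"][0]))
print(" ".join(d["label"] for d in ordered))  # "hello world"
```
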
247
+
248
+
249
+ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
250
+ """'zero_shot_counting' is a tool that counts the dominant foreground object given an image and no other information about the content.
251
+ It returns only the count of the objects in the image.
252
+
253
+ Parameters:
254
+ image (np.ndarray): The image that contains lots of instances of a single object
255
+
256
+ Returns:
257
+ Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
259
258
 
260
259
  Example
261
260
  -------
262
- >>> import vision_agent as va
263
- >>> t = va.tools.GroundingSAM()
264
- >>> t("red line, yellow dot", "ct_scan1.jpg"])
265
- [{'labels': ['yellow dot', 'red line'],
266
- 'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]],
267
- 'masks': [array([[0, 0, 0, ..., 0, 0, 0],
268
- [0, 0, 0, ..., 0, 0, 0],
269
- ...,
270
- [0, 0, 0, ..., 0, 0, 0],
271
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)},
272
- array([[0, 0, 0, ..., 0, 0, 0],
273
- [0, 0, 0, ..., 0, 0, 0],
274
- ...,
275
- [1, 1, 1, ..., 1, 1, 1],
276
- [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
261
+ >>> zero_shot_counting(image)
262
+ {'count': 45},
263
+
277
264
  """
278
265
 
279
- name = "grounding_sam_"
280
- description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
281
- usage = {
282
- "required_parameters": [
283
- {"name": "prompt", "type": "str"},
284
- {"name": "image", "type": "str"},
285
- ],
286
- "optional_parameters": [
287
- {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
288
- {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
289
- ],
290
- "examples": [
291
- {
292
- "scenario": "Can you segment the apples and grapes in this image? Image name: fruits.jpg",
293
- "parameters": {
294
- "prompt": "apple. grape",
295
- "image": "fruits.jpg",
296
- },
297
- },
298
- {
299
- "scenario": "Can you build me a car segmentor?",
300
- "parameters": {"prompt": "car", "image": ""},
301
- },
302
- {
303
- "scenario": "Can you segment the person on the left and right? Image name: person.jpg",
304
- "parameters": {
305
- "prompt": "left person. right person",
306
- "image": "person.jpg",
307
- },
308
- },
309
- {
310
- "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg",
311
- "parameters": {
312
- "prompt": "red shirt, green shirt",
313
- "image": "shirts.jpg",
314
- "box_threshold": 0.20,
315
- "iou_threshold": 0.20,
316
- },
317
- },
318
- ],
266
+ image_b64 = convert_to_b64(image)
267
+ data = {
268
+ "image": image_b64,
269
+ "tool": "zero_shot_counting",
319
270
  }
271
+ resp_data = _send_inference_request(data, "tools")
272
+ resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
273
+ return resp_data
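
Besides the count, the response carries a heat map that the tool has already decoded to a numpy array, which is handy for sanity-checking what was counted. A sketch using an illustrative response of the documented shape (a real call would be `zero_shot_counting(image)`):

```python
import numpy as np
from PIL import Image

# Illustrative response, not real model output.
resp = {"count": 45, "heat_map": np.random.rand(480, 640)}

print("count:", resp["count"])

# Rescale the heat map to 0-255 and save it for visual inspection.
hm = resp["heat_map"].astype(np.float32)
hm = (255 * (hm - hm.min()) / max(hm.max() - hm.min(), 1e-6)).astype(np.uint8)
Image.fromarray(hm).save("heat_map.png")
```
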
274
+
275
+
276
+ def visual_prompt_counting(
277
+ image: np.ndarray, visual_prompt: Dict[str, List[float]]
278
+ ) -> Dict[str, Any]:
279
+ """'visual_prompt_counting' is a tool that counts the dominant foreground object given an image and a visual prompt which is a bounding box describing the object.
280
+ It returns only the count of the objects in the image.
281
+
282
+ Parameters:
283
+ image (np.ndarray): The image that contains lots of instances of a single object
320
284
 
321
- # TODO: Add support for input multiple images, which aligns with the output type.
322
- def __call__(
323
- self,
324
- prompt: str,
325
- image: Union[str, ImageType],
326
- box_threshold: float = 0.2,
327
- iou_threshold: float = 0.2,
328
- ) -> Dict:
329
- """Invoke the Grounding SAM model.
330
-
331
- Parameters:
332
- prompt: a list of classes to segment.
333
- image: the input image to segment.
334
- box_threshold: the threshold to filter out the bounding boxes with low scores.
335
- iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold.
336
-
337
- Returns:
338
- A dictionary containing the labels, scores, bboxes and masks for the input image.
339
- """
340
- image_size = get_image_size(image)
341
- image_b64 = convert_to_b64(image)
342
- request_data = {
343
- "prompt": prompt,
344
- "image": image_b64,
345
- "tool": "visual_grounding_segment",
346
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
347
- }
348
- data: Dict[str, Any] = _send_inference_request(request_data, "tools")
349
- if "bboxes" in data:
350
- data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
351
- if "masks" in data:
352
- data["masks"] = [
353
- rle_decode(mask_rle=mask, shape=data["mask_shape"])
354
- for mask in data["masks"]
355
- ]
356
- data["image_size"] = image_size
357
- data.pop("mask_shape", None)
358
- return data
359
-
360
-
361
- class DINOv(Tool):
362
- r"""DINOv is a tool that can detect and segment similar objects with the given input masks.
285
+ Returns:
286
+ Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
363
287
 
364
288
  Example
365
289
  -------
366
- >>> import vision_agent as va
367
- >>> t = va.tools.DINOv()
368
- >>> t(prompt=[{"mask":"balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg"])
369
- [{'scores': [0.512, 0.212],
370
- 'masks': [array([[0, 0, 0, ..., 0, 0, 0],
371
- ...,
372
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)},
373
- array([[0, 0, 0, ..., 0, 0, 0],
374
- ...,
375
- [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
290
+ >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
291
+ {'count': 45},
292
+
376
293
  """
377
294
 
378
- name = "dinov_"
379
- description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
380
- usage = {
381
- "required_parameters": [
382
- {"name": "prompt", "type": "List[Dict[str, str]]"},
383
- {"name": "image", "type": "str"},
384
- ],
385
- "examples": [
386
- {
387
- "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
388
- "parameters": {
389
- "prompt": [
390
- {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
391
- ],
392
- "image": "input.jpg",
393
- },
394
- },
395
- {
396
- "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: mask.png Reference mask: background.png",
397
- "parameters": {
398
- "prompt": [
399
- {"mask": "mask.png", "image": "background.png"},
400
- ],
401
- "image": "original.jpg",
402
- },
403
- },
404
- ],
295
+ image_size = get_image_size(image)
296
+ bbox = visual_prompt["bbox"]
297
+ bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
298
+ image_b64 = convert_to_b64(image)
299
+
300
+ data = {
301
+ "image": image_b64,
302
+ "prompt": bbox_str,
303
+ "tool": "few_shot_counting",
405
304
  }
305
+ resp_data = _send_inference_request(data, "tools")
306
+ resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
307
+ return resp_data
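
The visual prompt is a single normalized reference box around one example of the object; internally the tool converts it to pixel coordinates (via denormalize_bbox) before querying the model. A small sketch of that normalized-to-pixel mapping with illustrative sizes, assuming the usual convention that x is scaled by width and y by height (the exact string formatting inside the tool may differ):

```python
height, width = 480, 640                 # image.shape[:2]
bbox = [0.1, 0.1, 0.4, 0.42]             # normalized xmin, ymin, xmax, ymax

pixel_bbox = [bbox[0] * width, bbox[1] * height, bbox[2] * width, bbox[3] * height]
print(", ".join(str(round(v, 1)) for v in pixel_bbox))  # "64.0, 48.0, 256.0, 201.6"
```
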
406
308
 
407
- def __call__(
408
- self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
409
- ) -> Dict:
410
- """Invoke the DINOv model.
411
-
412
- Parameters:
413
- prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
414
- image: the input image to segment.
415
-
416
- Returns:
417
- A dictionary of the below keys: 'scores', 'masks' and 'mask_shape', which stores a list of detected segmentation masks and its scores.
418
- """
419
- image_b64 = convert_to_b64(image)
420
- for p in prompt:
421
- p["mask"] = convert_to_b64(p["mask"])
422
- p["image"] = convert_to_b64(p["image"])
423
- request_data = {
424
- "prompt": prompt,
425
- "image": image_b64,
426
- }
427
- data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
428
- if "bboxes" in data:
429
- data["bboxes"] = [
430
- normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
431
- ]
432
- if "masks" in data:
433
- data["masks"] = [
434
- rle_decode(mask_rle=mask, shape=data["mask_shape"])
435
- for mask in data["masks"]
436
- ]
437
- data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
438
- mask_shape = data.pop("mask_shape", None)
439
- data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
440
- return data
441
-
442
-
443
- class AgentDINOv(DINOv):
444
- def __call__(
445
- self,
446
- prompt: List[Dict[str, str]],
447
- image: Union[str, ImageType],
448
- ) -> Dict:
449
- rets = super().__call__(prompt, image)
450
- mask_files = []
451
- for mask in rets["masks"]:
452
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
453
- file_name = Path(tmp.name).with_suffix(".mask.png")
454
- Image.fromarray(mask * 255).save(file_name)
455
- mask_files.append(str(file_name))
456
- rets["masks"] = mask_files
457
- return rets
458
-
459
-
460
- class AgentGroundingSAM(GroundingSAM):
461
- r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
462
- returns the file name. This makes it easier for agents to use.
463
- """
464
309
 
465
- def __call__(
466
- self,
467
- prompt: str,
468
- image: Union[str, ImageType],
469
- box_threshold: float = 0.2,
470
- iou_threshold: float = 0.75,
471
- ) -> Dict:
472
- rets = super().__call__(prompt, image, box_threshold, iou_threshold)
473
- mask_files = []
474
- for mask in rets["masks"]:
475
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
476
- file_name = Path(tmp.name).with_suffix(".mask.png")
477
- Image.fromarray(mask * 255).save(file_name)
478
- mask_files.append(str(file_name))
479
- rets["masks"] = mask_files
480
- return rets
481
-
482
-
483
- class ZeroShotCounting(Tool):
484
- r"""ZeroShotCounting is a tool that can count total number of instances of an object
485
- present in an image belonging to same class without a text or visual prompt.
310
+ def image_question_answering(image: np.ndarray, prompt: str) -> str:
311
+ """'image_question_answering_' is a tool that can answer questions about the visual contents of an image given a question and an image.
312
+ It returns an answer to the question
313
+
314
+ Parameters:
315
+ image (np.ndarray): The reference image used for the question
316
+ prompt (str): The question about the image
317
+
318
+ Returns:
319
+ str: A string which is the answer to the given prompt. E.g. 'This image contains a cat sitting on a table with a bowl of milk.'
486
320
 
487
321
  Example
488
322
  -------
489
- >>> import vision_agent as va
490
- >>> zshot_count = va.tools.ZeroShotCounting()
491
- >>> zshot_count("image1.jpg")
492
- {'count': 45}
493
- """
323
+ >>> image_question_answering(image, 'What is the cat doing ?')
324
+ 'drinking milk'
494
325
 
495
- name = "zero_shot_counting_"
496
- description = "'zero_shot_counting_' is a tool that counts foreground items given only an image and no other information. It returns only the count of the objects in the image"
326
+ """
497
327
 
498
- usage = {
499
- "required_parameters": [
500
- {"name": "image", "type": "str"},
501
- ],
502
- "examples": [
503
- {
504
- "scenario": "Can you count the items in the image? Image name: lids.jpg",
505
- "parameters": {"image": "lids.jpg"},
506
- },
507
- {
508
- "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
509
- "parameters": {"image": "tray.jpg"},
510
- },
511
- {
512
- "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
513
- "parameters": {
514
- "image": "shirts.jpg",
515
- },
516
- },
517
- ],
328
+ image_b64 = convert_to_b64(image)
329
+ data = {
330
+ "image": image_b64,
331
+ "prompt": prompt,
332
+ "tool": "image_question_answering",
518
333
  }
519
334
 
520
- # TODO: Add support for input multiple images, which aligns with the output type.
521
- def __call__(self, image: Union[str, ImageType]) -> Dict:
522
- """Invoke the Zero shot counting model.
335
+ answer = _send_inference_request(data, "tools")
336
+ return answer["text"][0] # type: ignore
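
An end-to-end usage sketch, assuming image_question_answering and load_image (defined later in this module) are re-exported from vision_agent.tools; the import path and file path are assumptions, adjust them to the actual package layout:

```python
from vision_agent.tools import image_question_answering, load_image  # assumed exports

image = load_image("path/to/cat.jpg")  # hypothetical path
answer = image_question_answering(image, "What is the cat doing?")
print(answer)  # e.g. "drinking milk"
```
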
523
337
 
524
- Parameters:
525
- image: the input image.
526
338
 
527
- Returns:
528
- A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
529
- """
530
- image_b64 = convert_to_b64(image)
531
- data = {
532
- "image": image_b64,
533
- "tool": "zero_shot_counting",
534
- }
535
- resp_data = _send_inference_request(data, "tools")
536
- resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
537
- return resp_data
339
+ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
340
+ """'clip' is a tool that can classify an image given a list of input classes or tags.
341
+ It returns the same list of the input classes along with their probability scores based on image content.
538
342
 
343
+ Parameters:
344
+ image (np.ndarray): The image to classify or tag
345
+ classes (List[str]): The list of classes or tags that is associated with the image
539
346
 
540
- class VisualPromptCounting(Tool):
541
- r"""VisualPromptCounting is a tool that can count total number of instances of an object
542
- present in an image belonging to same class with help of an visual prompt which is a bounding box.
347
+ Returns:
348
+ Dict[str, Any]: A dictionary containing the labels and scores. One key holds the list of given labels and the other the list of scores.
543
349
 
544
350
  Example
545
351
  -------
546
- >>> import vision_agent as va
547
- >>> prompt_count = va.tools.VisualPromptCounting()
548
- >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
549
- {'count': 23}
550
- """
352
+ >>> clip(image, ['dog', 'cat', 'bird'])
353
+ {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
551
354
 
552
- name = "visual_prompt_counting_"
553
- description = "'visual_prompt_counting_' is a tool that counts foreground items in an image given a visual prompt which is a bounding box describing the object. It returns only the count of the objects in the image."
355
+ """
554
356
 
555
- usage = {
556
- "required_parameters": [
557
- {"name": "image", "type": "str"},
558
- {"name": "prompt", "type": "Dict[str, List[float]"},
559
- ],
560
- "examples": [
561
- {
562
- "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg",
563
- "parameters": {
564
- "image": "lids.jpg",
565
- "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]},
566
- },
567
- },
568
- {
569
- "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}",
570
- "parameters": {
571
- "image": "tray.jpg",
572
- "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]},
573
- },
574
- },
575
- {
576
- "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg",
577
- "parameters": {
578
- "image": "shirts.jpg",
579
- "prompt": {"bbox": [100, 115, 200, 200]},
580
- },
581
- },
582
- {
583
- "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}",
584
- "parameters": {
585
- "image": "shoes.jpg",
586
- "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]},
587
- },
588
- },
589
- ],
357
+ image_b64 = convert_to_b64(image)
358
+ data = {
359
+ "prompt": ",".join(classes),
360
+ "image": image_b64,
361
+ "tool": "closed_set_image_classification",
590
362
  }
363
+ resp_data = _send_inference_request(data, "tools")
364
+ resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
365
+ return resp_data
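
The labels and scores come back as two parallel lists, so picking the top class is a one-liner. A minimal sketch over an illustrative response of the documented shape (a real call would be `clip(image, ["dog", "cat", "bird"])`):

```python
# Illustrative response, not real model output.
resp = {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]}

best_label, best_score = max(zip(resp["labels"], resp["scores"]), key=lambda p: p[1])
print(best_label, best_score)  # dog 0.68
```
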
591
366
 
592
- def __call__(
593
- self, image: Union[str, ImageType], prompt: Dict[str, List[float]]
594
- ) -> Dict:
595
- """Invoke the few shot counting model.
596
-
597
- Parameters:
598
- image: the input image.
599
- prompt: the visual prompt which is a bounding box describing the object.
600
-
601
- Returns:
602
- A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
603
- """
604
- image_size = get_image_size(image)
605
- bbox = prompt["bbox"]
606
- bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
607
- image_b64 = convert_to_b64(image)
608
367
 
609
- data = {
610
- "image": image_b64,
611
- "prompt": bbox_str,
612
- "tool": "few_shot_counting",
613
- }
614
- resp_data = _send_inference_request(data, "tools")
615
- resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
616
- return resp_data
368
+ def image_caption(image: np.ndarray) -> str:
369
+ """'image_caption' is a tool that can caption an image based on its contents.
370
+ It returns a text describing the image.
617
371
 
372
+ Parameters:
373
+ image (np.ndarray): The image to caption
618
374
 
619
- class VisualQuestionAnswering(Tool):
620
- r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
375
+ Returns:
376
+ str: A string which is the caption for the given image.
621
377
 
622
378
  Example
623
379
  -------
624
- >>> import vision_agent as va
625
- >>> vqa_tool = va.tools.VisualQuestionAnswering()
626
- >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
627
- {'text': "The image contains a cat sitting on a table with a bowl of milk."}
628
- """
380
+ >>> image_caption(image)
381
+ 'This image contains a cat sitting on a table with a bowl of milk.'
629
382
 
630
- name = "visual_question_answering_"
631
- description = "'visual_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question"
383
+ """
632
384
 
633
- usage = {
634
- "required_parameters": [
635
- {"name": "image", "type": "str"},
636
- {"name": "prompt", "type": "str"},
637
- ],
638
- "examples": [
639
- {
640
- "scenario": "Describe this image in detail. Image name: cat.jpg",
641
- "parameters": {
642
- "image": "cats.jpg",
643
- "prompt": "Describe this image in detail",
644
- },
645
- },
646
- {
647
- "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
648
- "parameters": {
649
- "image": "sign.jpg",
650
- "prompt": "Can you help me with this street sign ? What does it say ?",
651
- },
652
- },
653
- {
654
- "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
655
- "parameters": {
656
- "image": "weather.jpg",
657
- "prompt": "Describe the weather in the image for me ",
658
- },
659
- },
660
- {
661
- "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
662
- "parameters": {
663
- "image": "chart.jpg",
664
- "prompt": "Which 2 are the least frequent bins in this histogram",
665
- },
666
- },
667
- ],
385
+ image_b64 = convert_to_b64(image)
386
+ data = {
387
+ "image": image_b64,
388
+ "tool": "image_captioning",
668
389
  }
669
390
 
670
- def __call__(self, image: str, prompt: str) -> Dict:
671
- """Invoke the visual question answering model.
391
+ answer = _send_inference_request(data, "tools")
392
+ return answer["text"][0] # type: ignore
672
393
 
673
- Parameters:
674
- image: the input image.
675
394
 
676
- Returns:
677
- A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
678
- """
679
-
680
- gpt = OpenAILMM()
681
- return {"text": gpt(input=prompt, images=[image])}
395
+ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
396
+ """'closest_mask_distance' calculates the closest distance between two masks.
682
397
 
398
+ Parameters:
399
+ mask1 (np.ndarray): The first mask.
400
+ mask2 (np.ndarray): The second mask.
683
401
 
684
- class ImageQuestionAnswering(Tool):
685
- r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
686
- It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
687
- It is also useful if the user wants the data to be not exposed to OpenAI endpoints
402
+ Returns:
403
+ float: The closest distance between the two masks.
688
404
 
689
405
  Example
690
406
  -------
691
- >>> import vision_agent as va
692
- >>> vqa_tool = va.tools.ImageQuestionAnswering()
693
- >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
694
- {'text': "The image contains a cat sitting on a table with a bowl of milk."}
407
+ >>> closest_mask_distance(mask1, mask2)
408
+ 0.5
695
409
  """
696
410
 
697
- name = "image_question_answering_"
698
- description = "'image_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question"
699
-
700
- usage = {
701
- "required_parameters": [
702
- {"name": "image", "type": "str"},
703
- {"name": "prompt", "type": "str"},
704
- ],
705
- "examples": [
706
- {
707
- "scenario": "Describe this image in detail. Image name: cat.jpg",
708
- "parameters": {
709
- "image": "cats.jpg",
710
- "prompt": "Describe this image in detail",
711
- },
712
- },
713
- {
714
- "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
715
- "parameters": {
716
- "image": "sign.jpg",
717
- "prompt": "Can you help me with this street sign ? What does it say ?",
718
- },
719
- },
720
- {
721
- "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
722
- "parameters": {
723
- "image": "weather.jpg",
724
- "prompt": "Describe the weather in the image for me ",
725
- },
726
- },
727
- {
728
- "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
729
- "parameters": {
730
- "image": "chart.jpg",
731
- "prompt": "Which 2 are the least frequent bins in this histogram",
732
- },
733
- },
734
- ],
735
- }
411
+ mask1 = np.clip(mask1, 0, 1)
412
+ mask2 = np.clip(mask2, 0, 1)
413
+ mask1_points = np.transpose(np.nonzero(mask1))
414
+ mask2_points = np.transpose(np.nonzero(mask2))
415
+ dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
416
+ return cast(float, np.min(dist_matrix))
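
A tiny self-contained check of the computation: two masks whose nearest foreground pixels sit three columns apart give a distance of 3.0 (the distance is measured in pixels between mask coordinates):

```python
import numpy as np
from scipy.spatial import distance

# Two tiny binary masks whose nearest foreground pixels are 3 columns apart.
mask1 = np.zeros((5, 10), dtype=np.uint8)
mask2 = np.zeros((5, 10), dtype=np.uint8)
mask1[2, 2] = 1
mask2[2, 5] = 1

# Same computation as closest_mask_distance: pairwise distances between
# foreground pixel coordinates, then the minimum.
p1 = np.transpose(np.nonzero(mask1))
p2 = np.transpose(np.nonzero(mask2))
print(distance.cdist(p1, p2, "euclidean").min())  # 3.0
```
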
736
417
 
737
- def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
738
- """Invoke the visual question answering model.
739
418
 
740
- Parameters:
741
- image: the input image.
419
+ def closest_box_distance(
420
+ box1: List[float], box2: List[float], image_size: Tuple[int, int]
421
+ ) -> float:
422
+ """'closest_box_distance' calculates the closest distance between two bounding boxes.
742
423
 
743
- Returns:
744
- A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
745
- """
424
+ Parameters:
425
+ box1 (List[float]): The first bounding box.
426
+ box2 (List[float]): The second bounding box.
427
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
746
428
 
747
- image_b64 = convert_to_b64(image)
748
- data = {
749
- "image": image_b64,
750
- "prompt": prompt,
751
- "tool": "image_question_answering",
752
- }
429
+ Returns:
430
+ float: The closest distance between the two bounding boxes.
753
431
 
754
- return _send_inference_request(data, "tools")
432
+ Example
433
+ -------
434
+ >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
435
+ 141.42
436
+ """
755
437
 
438
+ x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
439
+ x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
756
440
 
757
- class Crop(Tool):
758
- r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
441
+ horizontal_distance = np.max([0, x21 - x12, x11 - x22])
442
+ vertical_distance = np.max([0, y21 - y12, y11 - y22])
443
+ return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
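
Note that the function itself expects normalized boxes plus the image size (it denormalizes them first); the docstring example shows only the pixel-space arithmetic. A worked check of that arithmetic for boxes whose nearest edges are 100 px apart in each direction:

```python
import numpy as np

x11, y11, x12, y12 = 100, 100, 200, 200   # first box (pixel coordinates)
x21, y21, x22, y22 = 300, 300, 400, 400   # second box

horizontal = max(0, x21 - x12, x11 - x22)  # 100
vertical = max(0, y21 - y12, y11 - y22)    # 100
print(round(float(np.sqrt(horizontal**2 + vertical**2)), 2))  # 141.42
```
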
759
444
 
760
- name = "crop_"
761
- description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image. It returns a file with the cropped image."
762
- usage = {
763
- "required_parameters": [
764
- {"name": "bbox", "type": "List[float]"},
765
- {"name": "image", "type": "str"},
766
- ],
767
- "examples": [
768
- {
769
- "scenario": "Can you crop the image to the bounding box [0.1, 0.1, 0.9, 0.9]? Image name: image.jpg",
770
- "parameters": {"bbox": [0.1, 0.1, 0.9, 0.9], "image": "image.jpg"},
771
- },
772
- {
773
- "scenario": "Cut out the image to the bounding box [0.2, 0.2, 0.8, 0.8]. Image name: car.jpg",
774
- "parameters": {"bbox": [0.2, 0.2, 0.8, 0.8], "image": "car.jpg"},
775
- },
776
- ],
777
- }
778
445
 
779
- def __call__(self, bbox: List[float], image: Union[str, Path]) -> Dict:
780
- pil_image = Image.open(image)
781
- width, height = pil_image.size
782
- bbox = [
783
- int(bbox[0] * width),
784
- int(bbox[1] * height),
785
- int(bbox[2] * width),
786
- int(bbox[3] * height),
787
- ]
788
- cropped_image = pil_image.crop(bbox) # type: ignore
789
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
790
- cropped_image.save(tmp.name)
446
+ # Utility and visualization functions
791
447
 
792
- return {"image": tmp.name}
793
448
 
449
+ def save_json(data: Any, file_path: str) -> None:
450
+ """'save_json' is a utility function that saves data as a JSON file. It is helpful
451
+ for saving data that contains NumPy arrays which are not JSON serializable.
794
452
 
795
- class BboxStats(Tool):
796
- r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""
453
+ Parameters:
454
+ data (Any): The data to save.
455
+ file_path (str): The path to save the JSON file.
797
456
 
798
- name = "bbox_stats_"
799
- description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
800
- usage = {
801
- "required_parameters": [
802
- {"name": "bboxes", "type": "List[int]"},
803
- {"name": "image_size", "type": "Tuple[int]"},
804
- ],
805
- "examples": [
806
- {
807
- "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
808
- "parameters": {
809
- "bboxes": [[0.2, 0.21, 0.34, 0.42]],
810
- "image_size": (500, 1200),
811
- },
812
- },
813
- {
814
- "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
815
- "parameters": {
816
- "bboxes": [[0.2, 0.21, 0.34, 0.42]],
817
- "image_size": (640, 480),
818
- },
819
- },
820
- ],
821
- }
457
+ Example
458
+ -------
459
+ >>> save_json(data, "path/to/file.json")
460
+ """
822
461
 
823
- def __call__(
824
- self, bboxes: List[List[int]], image_size: Tuple[int, int]
825
- ) -> List[Dict]:
826
- areas = []
827
- height, width = image_size
828
- for bbox in bboxes:
829
- x1, y1, x2, y2 = bbox
830
- areas.append(
831
- {
832
- "width": round((x2 - x1) * width, 2),
833
- "height": round((y2 - y1) * height, 2),
834
- "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
835
- }
836
- )
837
-
838
- return areas
839
-
840
-
841
- class SegArea(Tool):
842
- r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places."""
843
-
844
- name = "seg_area_"
845
- description = "'seg_area_' returns the area of the given segmentation mask in pixels normalized to 2 decimal places."
846
- usage = {
847
- "required_parameters": [{"name": "masks", "type": "str"}],
848
- "examples": [
849
- {
850
- "scenario": "If you want to calculate the area of the segmentation mask, pass the masks file name.",
851
- "parameters": {"masks": "mask_file.jpg"},
852
- },
853
- ],
854
- }
462
+ class NumpyEncoder(json.JSONEncoder):
463
+ def default(self, obj: Any): # type: ignore
464
+ if isinstance(obj, np.ndarray):
465
+ return obj.tolist()
466
+ elif isinstance(obj, np.bool_):
467
+ return bool(obj)
468
+ return json.JSONEncoder.default(self, obj)
855
469
 
856
- def __call__(self, masks: Union[str, Path]) -> float:
857
- pil_mask = Image.open(str(masks))
858
- np_mask = np.array(pil_mask)
859
- np_mask = np.clip(np_mask, 0, 1)
860
- return cast(float, round(np.sum(np_mask), 2))
861
-
862
-
863
- class BboxIoU(Tool):
864
- name = "bbox_iou_"
865
- description = "'bbox_iou_' returns the intersection over union of two bounding boxes. This is a good tool for determining if two objects are overlapping."
866
- usage = {
867
- "required_parameters": [
868
- {"name": "bbox1", "type": "List[int]"},
869
- {"name": "bbox2", "type": "List[int]"},
870
- ],
871
- "examples": [
872
- {
873
- "scenario": "If you want to calculate the intersection over union of the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
874
- "parameters": {
875
- "bbox1": [0.2, 0.21, 0.34, 0.42],
876
- "bbox2": [0.3, 0.31, 0.44, 0.52],
877
- },
878
- }
879
- ],
880
- }
470
+ with open(file_path, "w") as f:
471
+ json.dump(data, f, cls=NumpyEncoder)
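
A usage sketch showing why the custom encoder matters: the payload below would make plain json.dump raise a TypeError because of the numpy array and numpy bool. The import assumes save_json is exposed from vision_agent.tools; adjust it to the actual module path if needed:

```python
import numpy as np

from vision_agent.tools import save_json  # assumed export; adjust if needed

data = {
    "label": "dinosaur",
    "score": 0.99,
    "mask": np.zeros((4, 4), dtype=np.uint8),  # not JSON serializable by default
    "keep": np.bool_(True),
}
save_json(data, "detections.json")  # arrays are written as nested lists
```
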
881
472
 
882
- def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
883
- x1, y1, x2, y2 = bbox1
884
- x3, y3, x4, y4 = bbox2
885
- xA = max(x1, x3)
886
- yA = max(y1, y3)
887
- xB = min(x2, x4)
888
- yB = min(y2, y4)
889
- inter_area = max(0, xB - xA) * max(0, yB - yA)
890
- boxa_area = (x2 - x1) * (y2 - y1)
891
- boxb_area = (x4 - x3) * (y4 - y3)
892
- iou = inter_area / float(boxa_area + boxb_area - inter_area)
893
- return round(iou, 2)
894
-
895
-
896
- class SegIoU(Tool):
897
- name = "seg_iou_"
898
- description = "'seg_iou_' returns the intersection over union of two segmentation masks given their segmentation mask files."
899
- usage = {
900
- "required_parameters": [
901
- {"name": "mask1", "type": "str"},
902
- {"name": "mask2", "type": "str"},
903
- ],
904
- "examples": [
905
- {
906
- "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
907
- "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
908
- }
909
- ],
910
- }
911
473
 
912
- def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
913
- pil_mask1 = Image.open(str(mask1))
914
- pil_mask2 = Image.open(str(mask2))
915
- np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
916
- np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
917
- intersection = np.logical_and(np_mask1, np_mask2)
918
- union = np.logical_or(np_mask1, np_mask2)
919
- iou = np.sum(intersection) / np.sum(union)
920
- return cast(float, round(iou, 2))
921
-
922
-
923
- class BboxContains(Tool):
924
- name = "bbox_contains_"
925
- description = "Given two bounding boxes, a target bounding box and a region bounding box, 'bbox_contains_' returns the intersection of the two bounding boxes which is the percentage area of the target bounding box overlaps with the region bounding box. This is a good tool for determining if the region object contains the target object."
926
- usage = {
927
- "required_parameters": [
928
- {"name": "target", "type": "List[int]"},
929
- {"name": "target_class", "type": "str"},
930
- {"name": "region", "type": "List[int]"},
931
- {"name": "region_class", "type": "str"},
932
- ],
933
- "examples": [
934
- {
935
- "scenario": "Determine if the dog on the couch, bounding box of the dog: [0.2, 0.21, 0.34, 0.42], bounding box of the couch: [0.3, 0.31, 0.44, 0.52]",
936
- "parameters": {
937
- "target": [0.2, 0.21, 0.34, 0.42],
938
- "target_class": "dog",
939
- "region": [0.3, 0.31, 0.44, 0.52],
940
- "region_class": "couch",
941
- },
942
- },
943
- {
944
- "scenario": "Check if the kid is in the pool? bounding box of the kid: [0.2, 0.21, 0.34, 0.42], bounding box of the pool: [0.3, 0.31, 0.44, 0.52]",
945
- "parameters": {
946
- "target": [0.2, 0.21, 0.34, 0.42],
947
- "target_class": "kid",
948
- "region": [0.3, 0.31, 0.44, 0.52],
949
- "region_class": "pool",
950
- },
951
- },
952
- ],
953
- }
474
+ def load_image(image_path: str) -> np.ndarray:
475
+ """'load_image' is a utility function that loads an image from the given path.
954
476
 
955
- def __call__(
956
- self, target: List[int], target_class: str, region: List[int], region_class: str
957
- ) -> Dict[str, Union[str, float]]:
958
- x1, y1, x2, y2 = target
959
- x3, y3, x4, y4 = region
960
- xA = max(x1, x3)
961
- yA = max(y1, y3)
962
- xB = min(x2, x4)
963
- yB = min(y2, y4)
964
- inter_area = max(0, xB - xA) * max(0, yB - yA)
965
- boxa_area = (x2 - x1) * (y2 - y1)
966
- iou = inter_area / float(boxa_area)
967
- area = round(iou, 2)
968
- return {
969
- "target_class": target_class,
970
- "region_class": region_class,
971
- "intersection": area,
972
- }
973
-
974
-
975
- class ObjectDistance(Tool):
976
- name = "object_distance_"
977
- description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects."
978
- usage = {
979
- "required_parameters": [
980
- {"name": "object1", "type": "Dict[str, Any]"},
981
- {"name": "object2", "type": "Dict[str, Any]"},
982
- ],
983
- "examples": [
984
- {
985
- "scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}",
986
- "parameters": {
987
- "object1": {
988
- "bboxes": [0.2, 0.21, 0.34, 0.42],
989
- "scores": 0.54,
990
- "masks": "mask_file1.png",
991
- },
992
- "object2": {
993
- "bboxes": [0.3, 0.31, 0.44, 0.52],
994
- "scores": 0.66,
995
- "masks": "mask_file2.png",
996
- },
997
- },
998
- }
999
- ],
1000
- }
477
+ Parameters:
478
+ image_path (str): The path to the image.
1001
479
 
1002
- def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float:
1003
- if "masks" in object1 and "masks" in object2:
1004
- mask1 = object1["masks"]
1005
- mask2 = object2["masks"]
1006
- return MaskDistance()(mask1, mask2)
1007
- elif "bboxes" in object1 and "bboxes" in object2:
1008
- bbox1 = object1["bboxes"]
1009
- bbox2 = object2["bboxes"]
1010
- return BoxDistance()(bbox1, bbox2)
1011
- else:
1012
- raise ValueError("Either of the objects should have masks or bboxes")
1013
-
1014
-
1015
- class BoxDistance(Tool):
1016
- name = "box_distance_"
1017
- description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes"
1018
- usage = {
1019
- "required_parameters": [
1020
- {"name": "bbox1", "type": "List[int]"},
1021
- {"name": "bbox2", "type": "List[int]"},
1022
- ],
1023
- "examples": [
1024
- {
1025
- "scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
1026
- "parameters": {
1027
- "bbox1": [0.2, 0.21, 0.34, 0.42],
1028
- "bbox2": [0.3, 0.31, 0.44, 0.52],
1029
- },
1030
- }
1031
- ],
1032
- }
480
+ Returns:
481
+ np.ndarray: The image as a NumPy array.
1033
482
 
1034
- def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
1035
- x11, y11, x12, y12 = bbox1
1036
- x21, y21, x22, y22 = bbox2
483
+ Example
484
+ -------
485
+ >>> load_image("path/to/image.jpg")
486
+ """
1037
487
 
1038
- horizontal_dist = np.max([0, x21 - x12, x11 - x22])
1039
- vertical_dist = np.max([0, y21 - y12, y11 - y22])
488
+ image = Image.open(image_path).convert("RGB")
489
+ return np.array(image)
1040
490
 
1041
- return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))
1042
491
 
492
+ def save_image(image: np.ndarray) -> str:
493
+ """'save_image' is a utility function that saves an image as a temporary file.
1043
494
 
1044
- class MaskDistance(Tool):
1045
- name = "mask_distance_"
1046
- description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. It returns the minumum distance between the given masks"
1047
- usage = {
1048
- "required_parameters": [
1049
- {"name": "mask1", "type": "str"},
1050
- {"name": "mask2", "type": "str"},
1051
- ],
1052
- "examples": [
1053
- {
1054
- "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg",
1055
- "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
1056
- }
1057
- ],
1058
- }
495
+ Parameters:
496
+ image (np.ndarray): The image to save.
1059
497
 
1060
- def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
1061
- pil_mask1 = Image.open(str(mask1))
1062
- pil_mask2 = Image.open(str(mask2))
1063
- np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
1064
- np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
498
+ Returns:
499
+ str: The path to the saved image.
1065
500
 
1066
- mask1_points = np.transpose(np.nonzero(np_mask1))
1067
- mask2_points = np.transpose(np.nonzero(np_mask2))
1068
- dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
1069
- return cast(float, np.round(np.min(dist_matrix), 2))
501
+ Example
502
+ -------
503
+ >>> save_image(image)
504
+ "/tmp/tmpabc123.png"
505
+ """
1070
506
 
507
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
508
+ pil_image = Image.fromarray(image.astype(np.uint8))
509
+ pil_image.save(f, "PNG")
510
+ return f.name
1071
511
 
1072
- class ExtractFrames(Tool):
1073
- r"""Extract frames from a video."""
1074
512
 
1075
- name = "extract_frames_"
1076
- description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
1077
- usage = {
1078
- "required_parameters": [{"name": "video_uri", "type": "str"}],
1079
- "optional_parameters": [{"name": "frames_every", "type": "float"}],
1080
- "examples": [
1081
- {
1082
- "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
1083
- "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
1084
- },
1085
- {
1086
- "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
1087
- "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
1088
- },
1089
- ],
1090
- }
513
+ def overlay_bounding_boxes(
514
+ image: np.ndarray, bboxes: List[Dict[str, Any]]
515
+ ) -> np.ndarray:
516
+ """'display_bounding_boxes' is a utility function that displays bounding boxes on
517
+ an image.
1091
518
 
1092
- def __call__(
1093
- self, video_uri: str, frames_every: float = 2
1094
- ) -> List[Tuple[str, float]]:
1095
- """Extract frames from a video.
519
+ Parameters:
520
+ image (np.ndarray): The image to display the bounding boxes on.
521
+ bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
522
+ boxes.
1096
523
 
524
+ Returns:
525
+ np.ndarray: The image with the bounding boxes, labels and scores displayed.
1097
526
 
1098
- Parameters:
1099
- video_uri: the path to the video file or a url points to the video data
527
+ Example
528
+ -------
529
+ >>> image_with_bboxes = overlay_bounding_boxes(
530
+ image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
531
+ )
532
+ """
533
+ pil_image = Image.fromarray(image.astype(np.uint8))
1100
534
 
1101
- Returns:
1102
- a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
1103
- """
1104
- frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
1105
- result = []
1106
- _LOGGER.info(
1107
- f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
535
+     if len(set([box["label"] for box in bboxes])) > len(COLORS):
+         _LOGGER.warning(
+             "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
          )
-         for frame, ts in frames:
-             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                 file_name = Path(tmp.name).with_suffix(".frame.png")
-                 Image.fromarray(frame).save(file_name)
-                 result.append((str(file_name), ts))
-         return result
- 
- 
- class OCR(Tool):
-     name = "ocr_"
-     description = "'ocr_' extracts text from an image. It returns a list of detected text, bounding boxes, and confidence scores."
-     usage = {
-         "required_parameters": [
-             {"name": "image", "type": "str"},
-         ],
-         "examples": [
-             {
-                 "scenario": "Can you extract the text from this image? Image name: image.png",
-                 "parameters": {"image": "image.png"},
-             },
-         ],
+ 
+     color = {
+         label: COLORS[i % len(COLORS)]
+         for i, label in enumerate(set([box["label"] for box in bboxes]))
      }
-     _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
-     _URL = "https://app.landing.ai/ocr/v1/detect-text"
- 
-     def __call__(self, image: str) -> dict:
-         pil_image = Image.open(image).convert("RGB")
-         image_size = pil_image.size[::-1]
-         image_buffer = io.BytesIO()
-         pil_image.save(image_buffer, format="PNG")
-         buffer_bytes = image_buffer.getvalue()
-         image_buffer.close()
- 
-         res = requests.post(
-             self._URL,
-             files={"images": buffer_bytes},
-             data={"language": "en"},
-             headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
-         )
-         if res.status_code != 200:
-             _LOGGER.error(f"Request failed: {res.text}")
-             raise ValueError(f"Request failed: {res.text}")
- 
-         data = res.json()
-         output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
-         for det in data[0]:
-             output["labels"].append(det["text"])
-             box = [
-                 det["location"][0]["x"],
-                 det["location"][0]["y"],
-                 det["location"][2]["x"],
-                 det["location"][2]["y"],
-             ]
-             box = normalize_bbox(box, image_size)
-             output["bboxes"].append(box)
-             output["scores"].append(round(det["score"], 2))
-         return output
- 
- 
- class Calculator(Tool):
-     r"""Calculator is a tool that can perform basic arithmetic operations."""
- 
-     name = "calculator_"
-     description = (
-         "'calculator_' is a tool that can perform basic arithmetic operations."
+ 
+     width, height = pil_image.size
+     fontsize = max(12, int(min(width, height) / 40))
+     draw = ImageDraw.Draw(pil_image)
+     font = ImageFont.truetype(
+         str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+         fontsize,
      )
-     usage = {
-         "required_parameters": [{"name": "equation", "type": "str"}],
-         "examples": [
-             {
-                 "scenario": "If you want to calculate (2 * 3) + 4",
-                 "parameters": {"equation": "2 + 4"},
-             },
-             {
-                 "scenario": "If you want to calculate (4 + 2.5) / 2.1",
-                 "parameters": {"equation": "(4 + 2.5) / 2.1"},
-             },
-         ],
-     }
 
-     def __call__(self, equation: str) -> float:
-         return cast(float, round(eval(equation), 2))
- 
- 
- TOOLS = {
-     i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
-     for i, c in enumerate(
-         [
-             NoOp,
-             CLIP,
-             GroundingDINO,
-             AgentGroundingSAM,
-             ZeroShotCounting,
-             VisualPromptCounting,
-             VisualQuestionAnswering,
-             AgentDINOv,
-             ExtractFrames,
-             Crop,
-             BboxStats,
-             SegArea,
-             ObjectDistance,
-             BboxContains,
-             SegIoU,
-             OCR,
-             Calculator,
+     for elt in bboxes:
+         label = elt["label"]
+         box = elt["bbox"]
+         scores = elt["score"]
+ 
+         box = [
+             int(box[0] * width),
+             int(box[1] * height),
+             int(box[2] * width),
+             int(box[3] * height),
          ]
-     )
-     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
- }
+         draw.rectangle(box, outline=color[label], width=4)
+         text = f"{label}: {scores:.2f}"
+         text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+         draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
+         draw.text((box[0], box[1]), text, fill="black", font=font)
+     return np.array(pil_image.convert("RGB"))
 
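Taken together with the detection tools earlier in this module, a minimal end-to-end visualization sketch might look like the following; the import path and the input file name are assumptions, and grounding_dino calls a hosted inference endpoint, so this needs network access.

import numpy as np
from PIL import Image

from vision_agent.tools import (  # import path assumed
    grounding_dino,
    overlay_bounding_boxes,
    save_image,
)

image = np.array(Image.open("dinosaur.jpg"))  # hypothetical input image
dets = grounding_dino("dinosaur", image)      # [{'score': ..., 'label': ..., 'bbox': [...]}, ...]
vis = overlay_bounding_boxes(image, dets)     # draw boxes, labels and scores
print(save_image(vis))                        # path of the rendered visualization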
 
- def register_tool(tool: Type[Tool]) -> Type[Tool]:
-     r"""Add a tool to the list of available tools.
+ def overlay_segmentation_masks(
+     image: np.ndarray, masks: List[Dict[str, Any]]
+ ) -> np.ndarray:
+     """'overlay_segmentation_masks' is a utility function that displays segmentation
+     masks.
 
      Parameters:
-         tool: The tool to add.
+         image (np.ndarray): The image to display the masks on.
+         masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+ 
+     Returns:
+         np.ndarray: The image with the masks displayed.
+ 
+     Example
+     -------
+     >>> image_with_masks = overlay_segmentation_masks(
+         image,
+         [{
+             'score': 0.99,
+             'label': 'dinosaur',
+             'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                 [0, 0, 0, ..., 0, 0, 0],
+                 ...,
+                 [0, 0, 0, ..., 0, 0, 0],
+                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+         }],
+     )
      """
+     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
 
-     if (
-         not hasattr(tool, "name")
-         or not hasattr(tool, "description")
-         or not hasattr(tool, "usage")
-     ):
-         raise ValueError(
-             "The tool must have 'name', 'description' and 'usage' attributes."
+     if len(set([mask["label"] for mask in masks])) > len(COLORS):
+         _LOGGER.warning(
+             "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
          )
 
-     TOOLS[len(TOOLS)] = {
-         "name": tool.name,
-         "description": tool.description,
-         "usage": tool.usage,
-         "class": tool,
+     color = {
+         label: COLORS[i % len(COLORS)]
+         for i, label in enumerate(set([mask["label"] for mask in masks]))
      }
-     return tool
+ 
+     for elt in masks:
+         mask = elt["mask"]
+         label = elt["label"]
+         np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+         np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+         mask_img = Image.fromarray(np_mask.astype(np.uint8))
+         pil_image = Image.alpha_composite(pil_image, mask_img)
+     return np.array(pil_image.convert("RGB"))
+ 
+
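Because the masks are plain numpy arrays, overlay_segmentation_masks can also be exercised without any model call. A minimal sketch with a hand-built binary mask (import path assumed):

import numpy as np

from vision_agent.tools import overlay_segmentation_masks  # import path assumed

image = np.zeros((256, 256, 3), dtype=np.uint8)
mask = np.zeros((256, 256), dtype=np.uint8)
mask[64:192, 64:192] = 1  # binary mask with the same height/width as the image
vis = overlay_segmentation_masks(
    image, [{"score": 0.99, "label": "square", "mask": mask}]
)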
+ def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
+     docstrings = ""
+     for func in funcs:
+         docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
+ 
+     return docstrings
+ 
+ 
+ def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+     descriptions = ""
+     for func in funcs:
+         description = func.__doc__
+         if description is None:
+             description = ""
+ 
+         description = (
+             description[: description.find("Parameters:")].replace("\n", " ").strip()
+         )
+         description = " ".join(description.split())
+         descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+     return descriptions
+ 
+ 
+ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+     data: Dict[str, List[str]] = {"desc": [], "doc": []}
+ 
+     for func in funcs:
+         desc = func.__doc__
+         if desc is None:
+             desc = ""
+         desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+         desc = " ".join(desc.split())
+ 
+         doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+         data["desc"].append(desc)
+         data["doc"].append(doc)
+ 
+     return pd.DataFrame(data)  # type: ignore
+ 
+
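The three helpers above rely only on inspect.signature and the docstring layout used throughout this module (a summary followed by a "Parameters:" section). A self-contained sketch with a hypothetical dummy_tool shows the strings they produce; the import path is assumed.

from vision_agent.tools import get_tool_descriptions, get_tools_df  # import path assumed

def dummy_tool(x: int) -> int:
    """'dummy_tool' doubles its input.

    Parameters:
        x (int): The value to double.
    """
    return 2 * x

print(get_tool_descriptions([dummy_tool]))
# - dummy_tool(x: int) -> int: 'dummy_tool' doubles its input.
print(get_tools_df([dummy_tool]).columns.tolist())
# ['desc', 'doc']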
+ TOOLS = [
+     grounding_dino,
+     grounding_sam,
+     extract_frames,
+     ocr,
+     clip,
+     zero_shot_counting,
+     visual_prompt_counting,
+     image_question_answering,
+     image_caption,
+     closest_mask_distance,
+     closest_box_distance,
+     save_json,
+     load_image,
+     save_image,
+     overlay_bounding_boxes,
+     overlay_segmentation_masks,
+ ]
+ TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
+ TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
+ TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
+ UTILITIES_DOCSTRING = get_tool_documentation(
+     [save_json, load_image, save_image, overlay_bounding_boxes]
+ )
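TOOLS_DF keeps one row per tool with "desc" and "doc" columns, which makes simple retrieval experiments easy. The keyword filter below is only an illustration under that assumption, not how the agent itself selects tools, and the import path is assumed.

from vision_agent.tools import TOOLS_DF  # import path assumed

counting = TOOLS_DF[TOOLS_DF["desc"].str.contains("count", case=False)]
print(counting["desc"].tolist())  # descriptions of the counting-related tools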