vision-agent 0.2.210__py3-none-any.whl → 0.2.211__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -14
- vision_agent/tools/tool_utils.py +2 -2
- vision_agent/tools/tools.py +526 -757
- vision_agent/utils/image_utils.py +16 -0
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.211.dist-info}/METADATA +1 -1
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.211.dist-info}/RECORD +8 -8
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.211.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.211.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import tempfile
 import urllib.request
+from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import lru_cache
 from importlib import resources
@@ -14,6 +15,7 @@ from uuid import UUID
 import cv2
 import numpy as np
 import requests
+from IPython.display import display
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
@@ -21,8 +23,8 @@ from pytube import YouTube  # type: ignore
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.lmm import AnthropicLMM, OpenAILMM
 from vision_agent.tools.tool_utils import (
+    ToolCallTrace,
     add_bboxes_from_masks,
-    filter_bboxes_by_threshold,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
@@ -32,7 +34,7 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
     single_nms,
 )
-from vision_agent.tools.tools_types import JobStatus
+from vision_agent.tools.tools_types import JobStatus
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
@@ -41,7 +43,6 @@ from vision_agent.utils.image_utils import (
     convert_to_b64,
     denormalize_bbox,
     encode_image_bytes,
-    get_image_size,
     normalize_bbox,
     numpy_to_bytes,
     rle_decode,
@@ -88,66 +89,33 @@ def get_tool_recommender() -> Sim:
     return load_cached_sim(TOOLS_DF)
 
 
-def
-    Example
-    -------
-    >>> grounding_dino("car. dinosaur", image)
-    [
-        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
-    ]
-    """
-    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(image)
-    if model_size not in ["large", "tiny"]:
-        raise ValueError("model_size must be either 'large' or 'tiny'")
-    request_data = {
-        "prompt": prompt,
-        "image": image_b64,
-        "tool": (
-            "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
-        ),
-        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
-        "function_name": "grounding_dino",
-    }
-    data: Dict[str, Any] = send_inference_request(request_data, "tools")
-    return_data = []
-    for i in range(len(data["bboxes"])):
-        return_data.append(
-            {
-                "score": round(data["scores"][i], 2),
-                "label": data["labels"][i],
-                "bbox": normalize_bbox(data["bboxes"][i], image_size),
-            }
-        )
-    return return_data
+def _display_tool_trace(
+    function_name: str,
+    request: Dict[str, Any],
+    response: Any,
+    files: Union[List[Tuple[str, bytes]], str],
+) -> None:
+    # Sends data through IPython's display function so front-end can show them. We use
+    # a function here instead of a decarator becuase we do not want to re-calculate data
+    # such as video bytes, which can be slow. Since this is calculated inside the
+    # function we can't capture it with a decarator without adding it as a return value
+    # which would change the function signature and affect the agent.
+    files_in_b64: List[Tuple[str, str]]
+    if isinstance(files, str):
+        files_in_b64 = [("images", files)]
+    else:
+        files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
+
+    request["function_name"] = function_name
+    tool_call_trace = ToolCallTrace(
+        endpoint_url="",
+        type="tool_func_call",
+        request=request,
+        response={"data": response},
+        error=None,
+        files=files_in_b64,
+    )
+    display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
 
 
 def owl_v2_image(
@@ -223,14 +191,21 @@ def owl_v2_image(
     # get the first frame
     bboxes = detections[0]
     bboxes_formatted = [
-            label
-            bbox
-            score
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": round(bbox["score"], 2),
+        }
         for bbox in bboxes
     ]
+
+    _display_tool_trace(
+        owl_v2_image.__name__,
+        payload,
+        detections[0],
+        files,
+    )
+    return bboxes_formatted
 
 
 def owl_v2_video(
@@ -309,81 +284,21 @@ def owl_v2_video(
     bboxes_formatted = []
     for frame_data in detections:
         bboxes_formatted_per_frame = [
-                label
-                bbox
-                score
+            {
+                "label": bbox["label"],
+                "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+                "score": round(bbox["score"], 2),
+            }
             for bbox in frame_data
         ]
         bboxes_formatted.append(bboxes_formatted_per_frame)
-    iou_threshold: float = 0.20,
-) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can segment multiple objects given a text prompt
-    such as category names or referring expressions. The categories in text prompt are
-    separated by commas or periods. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-        box_threshold (float, optional): The threshold for the box detection. Defaults
-            to 0.20.
-        iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.20.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
-
-    Example
-    -------
-    >>> grounding_sam("car. dinosaur", image)
-    [
-        {
-            'score': 0.99,
-            'label': 'dinosaur',
-            'bbox': [0.1, 0.11, 0.35, 0.4],
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-    ]
-    """
-    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(image)
-    request_data = {
-        "prompt": prompt,
-        "image": image_b64,
-        "tool": "visual_grounding_segment",
-        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
-        "function_name": "grounding_sam",
-    }
-    data: Dict[str, Any] = send_inference_request(request_data, "tools")
-    return_data = []
-    for i in range(len(data["bboxes"])):
-        return_data.append(
-            {
-                "score": round(data["scores"][i], 2),
-                "label": data["labels"][i],
-                "bbox": normalize_bbox(data["bboxes"][i], image_size),
-                "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
-            }
-        )
-    return return_data
+    _display_tool_trace(
+        owl_v2_video.__name__,
+        payload,
+        detections[0],
+        files,
+    )
+    return bboxes_formatted
 
 
 def florence2_sam2_image(
@@ -460,6 +375,13 @@ def florence2_sam2_image(
         label = detection["label"]
         bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+
+    _display_tool_trace(
+        florence2_sam2_image.__name__,
+        payload,
+        detections[0],
+        files,
+    )
     return return_data
 
 
@@ -545,10 +467,36 @@ def florence2_sam2_video_tracking(
         for detection in frame:
             mask = rle_decode_array(detection["mask"])
             label = str(detection["id"]) + ": " + detection["label"]
-            return_frame_data.append(
+            return_frame_data.append(
+                {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
+            )
         return_data.append(return_frame_data)
     return_data = add_bboxes_from_masks(return_data)
+    return_data = nms(return_data, iou_threshold=0.95)
+
+    _display_tool_trace(
+        florence2_sam2_video_tracking.__name__,
+        payload,
+        [
+            [
+                {
+                    "label": e["label"],
+                    "score": e["score"],
+                    "bbox": denormalize_bbox(e["bbox"], frames[0].shape[:2]),
+                    "mask": e["rle"],
+                }
+                for e in lst
+            ]
+            for lst in return_data
+        ],
+        files,
+    )
+    # We save the RLE for display purposes, re-calculting RLE can get very expensive.
+    # Deleted here because we are returning the numpy masks instead
+    for frame in return_data:
+        for obj in frame:
+            del obj["rle"]
+    return return_data
 
 
 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -603,86 +551,175 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
         box = normalize_bbox(box, image_size)
         output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
 
+    _display_tool_trace(
+        ocr.__name__,
+        {},
+        data,
+        cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
+    )
+    return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
+
+
+def _sam2(
+    image: np.ndarray,
+    detections: List[Dict[str, Any]],
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
+
+    files = [("images", image_bytes)]
+    payload = {
+        "model": "sam2",
+        "bboxes": json.dumps(
+            [
+                {
+                    "labels": [d["label"] for d in detections],
+                    "bboxes": [
+                        denormalize_bbox(d["bbox"], image_size) for d in detections
+                    ],
+                }
+            ]
+        ),
+    }
+
+    metadata = {"function_name": "sam2"}
+    pred_detections = send_task_inference_request(
+        payload, "sam2", files=files, metadata=metadata
+    )
+    frame = pred_detections[0]
+    return_data = []
+    display_data = []
+    for inp_detection, detection in zip(detections, frame):
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
+        return_data.append(
+            {
+                "label": label,
+                "bbox": bbox,
+                "mask": mask,
+                "score": inp_detection["score"],
+            }
+        )
+        display_data.append(
+            {
+                "label": label,
+                "bbox": detection["bounding_box"],
+                "mask": detection["mask"],
+                "score": inp_detection["score"],
+            }
+        )
+    return {"files": files, "return_data": return_data, "display_data": display_data}
 
 
-def
+def sam2(
+    image: np.ndarray,
+    detections: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """'sam2' is a tool that can segment multiple objects given an input bounding box,
+    label and score. It returns a set of masks along with the corresponding bounding
+    boxes and labels.
 
     Parameters:
-        image (np.ndarray): The image that contains
+        image (np.ndarray): The image that contains multiple instances of the object.
+        detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
+            label, and bounding box of the detected objects with normalized coordinates
+            between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+            of the top-left and xmax and ymax are the coordinates of the bottom-right of
+            the bounding box.
 
     Returns:
-        Dict[str, Any]: A
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.
 
     Example
     -------
-    >>>
+    >>> sam2(image, [
+        {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+    ])
+    [
+        {
+            'score': 0.49,
+            'label': 'flower',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
     """
+    image_size = image.shape[:2]
+    ret = _sam2(image, detections, image_size)
+    _display_tool_trace(
+        sam2.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
 
-    data = {
-        "image": image_b64,
-        "function_name": "loca_zero_shot_counting",
-    }
-    resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
-    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
-    return resp_data
+    return ret["return_data"]  # type: ignore
 
 
-def
+def _countgd_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float,
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
 ) -> Dict[str, Any]:
-    It returns only the count of the objects in the image.
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
 
-        visual_prompt (Dict[str, List[float]]): Bounding box of the object in
-            format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
+    files = [("image", image_bytes)]
+    prompts = [p.strip() for p in prompt.split(", ")]
 
+    def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
+        payload = {
+            "prompts": [prompt],
+            "confidence": box_threshold,  # still not being used in the API
+            "model": "countgd",
+        }
+        metadata = {"function_name": "countgd_counting"}
 
-        [ 0, 0, 0, ..., 0, 0, 0],
-        [ 0, 0, 0, ..., 0, 0, 1],
-        ...,
-        [ 0, 0, 0, ..., 30, 35, 41],
-        [ 0, 0, 0, ..., 41, 47, 53],
-        [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
-    """
+        detections = send_task_inference_request(
+            payload, "text-to-object-detection", files=files, metadata=metadata
+        )
+        # get the first frame
+        return detections[0]  # type: ignore
 
+    bboxes = []
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
+        for future in as_completed(futures):
+            bboxes.extend(future.result())
 
+    return_data = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": round(bbox["score"], 2),
+        }
+        for bbox in bboxes
+    ]
+
+    return_data = single_nms(return_data, iou_threshold=0.80)
+    display_data = [
+        {
+            "label": e["label"],
+            "score": e["score"],
+            "bbox": denormalize_bbox(e["bbox"], image_size),
+        }
+        for e in return_data
+    ]
+    return {"files": files, "return_data": return_data, "display_data": display_data}
 
 
 def countgd_object_detection(
@@ -723,121 +760,17 @@ def countgd_object_detection(
     if image_size[0] < 1 or image_size[1] < 1:
         return []
 
-        detections = send_task_inference_request(
-            payload, "text-to-object-detection", files=files, metadata=metadata
-        )
-        # get the first frame
-        return detections[0]  # type: ignore
-
-    bboxes = []
-    with ThreadPoolExecutor() as executor:
-        futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
-        for future in as_completed(futures):
-            bboxes.extend(future.result())
-
-    bboxes_formatted = [
-        ODResponseData(
-            label=bbox["label"],
-            bbox=normalize_bbox(bbox["bounding_box"], image_size),
-            score=round(bbox["score"], 2),
-        )
-        for bbox in bboxes
-    ]
-    # TODO: remove this once we start to use the confidence on countgd
-    filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
-    return_data = [bbox.model_dump() for bbox in filtered_bboxes]
-    return single_nms(return_data, iou_threshold=0.80)
-
-
-def sam2(
-    image: np.ndarray,
-    detections: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """'sam2' is a tool that can segment multiple objects given an input bounding box,
-    label and score. It returns a set of masks along with the corresponding bounding
-    boxes and labels.
-
-    Parameters:
-        image (np.ndarray): The image that contains multiple instances of the object.
-        detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
-            label, and bounding box of the detected objects with normalized coordinates
-            between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
-            of the top-left and xmax and ymax are the coordinates of the bottom-right of
-            the bounding box.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
-
-    Example
-    -------
-    >>> sam2(image, [
-        {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-    ])
-    [
-        {
-            'score': 0.49,
-            'label': 'flower',
-            'bbox': [0.1, 0.11, 0.35, 0.4],
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-    ]
-    """
-    image_size = image.shape[:2]
-
-    files = [("images", numpy_to_bytes(image))]
-    payload = {
-        "model": "sam2",
-        "bboxes": json.dumps(
-            [
-                {
-                    "labels": [d["label"] for d in detections],
-                    "bboxes": [
-                        denormalize_bbox(d["bbox"], image_size) for d in detections
-                    ],
-                }
-            ]
-        ),
-    }
-    metadata = {"function_name": "sam2"}
-    pred_detections = send_task_inference_request(
-        payload, "sam2", files=files, metadata=metadata
-    )
-    frame = pred_detections[0]
-    return_data = []
-    for inp_detection, detection in zip(detections, frame):
-        mask = rle_decode_array(detection["mask"])
-        label = detection["label"]
-        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
-        return_data.append(
-            {
-                "label": label,
-                "bbox": bbox,
-                "mask": mask,
-                "score": inp_detection["score"],
-            }
-        )
-    return return_data
+    ret = _countgd_object_detection(prompt, image, box_threshold, image_size)
+    _display_tool_trace(
+        countgd_object_detection.__name__,
+        {
+            "prompts": prompt,
+            "confidence": box_threshold,
+        },
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
 
 
 def countgd_sam2_object_detection(
@@ -881,9 +814,23 @@ def countgd_sam2_object_detection(
         },
     ]
     """
+
+    od_ret = _countgd_object_detection(prompt, image, box_threshold, image.shape[:2])
+    seg_ret = _sam2(
+        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+    )
+
+    _display_tool_trace(
+        countgd_sam2_object_detection.__name__,
+        {
+            "prompts": prompt,
+            "confidence": box_threshold,
+        },
+        seg_ret["display_data"],
+        seg_ret["files"],
+    )
+
+    return seg_ret["return_data"]  # type: ignore
 
 
 def countgd_example_based_counting(
@@ -941,76 +888,28 @@ def countgd_example_based_counting(
     # get the first frame
     bboxes_per_frame = detections[0]
     bboxes_formatted = [
-            label
-            bbox
-            score
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": round(bbox["score"], 2),
+        }
         for bbox in bboxes_per_frame
     ]
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-    >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
-    'white tiger'
-    """
-
-    image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "question": prompt,
-        "function_name": "florence2_roberta_vqa",
-    }
-
-    answer = send_inference_request(data, "florence2-qa", v2=True)
-    return answer  # type: ignore
-
-
-def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
-    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
-    including regular images or images of documents or presentations. It returns text
-    as an answer to the question.
-
-    Parameters:
-        prompt (str): The question about the image
-        image (np.ndarray): The reference image used for the question
-
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-    >>> ixc25_image_vqa('What is the cat doing?', image)
-    'drinking milk'
-    """
-    if image.shape[0] < 1 or image.shape[1] < 1:
-        raise ValueError(f"Image is empty, image shape: {image.shape}")
-
-    buffer_bytes = numpy_to_bytes(image)
-    files = [("image", buffer_bytes)]
-    payload = {
-        "prompt": prompt,
-        "function_name": "ixc25_image_vqa",
-    }
-    data: Dict[str, Any] = send_inference_request(
-        payload, "internlm-xcomposer2", files=files, v2=True
+    _display_tool_trace(
+        countgd_example_based_counting.__name__,
+        payload,
+        [
+            {
+                "label": e["label"],
+                "score": e["score"],
+                "bbox": denormalize_bbox(e["bbox"], image_size),
+            }
+            for e in bboxes_formatted
+        ],
+        files,
     )
+
+    return bboxes_formatted
 
 
 def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
@@ -1047,61 +946,13 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
     data: Dict[str, Any] = send_inference_request(
         payload, "image-to-text", files=files, v2=True
     )
-    returns the extracted text as a string and can be used as an alternative to OCR if
-    you do not need to know the exact bounding box of the text.
-
-    Parameters:
-        image (np.ndarray): The image to extract text from.
-
-    Returns:
-        str: The extracted text from the image.
-    """
-
-    lmm = AnthropicLMM()
-    buffer = io.BytesIO()
-    Image.fromarray(image).save(buffer, format="PNG")
-    image_bytes = buffer.getvalue()
-    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
-    text = lmm.generate(
-        "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
-        [image_b64],
-    )
-    return cast(str, text)
-
-
-def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
-    """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
-    including regular videos or videos of documents or presentations. It returns text
-    as an answer to the question.
-
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-    >>> ixc25_video_vqa('Which football player made the goal?', frames)
-    'Lionel Messi'
-    """
-
-    buffer_bytes = frames_to_bytes(frames)
-    files = [("video", buffer_bytes)]
-    payload = {
-        "prompt": prompt,
-        "function_name": "ixc25_video_vqa",
-    }
-    data: Dict[str, Any] = send_inference_request(
-        payload, "internlm-xcomposer2", files=files, v2=True
+    _display_tool_trace(
+        qwen2_vl_images_vqa.__name__,
+        payload,
+        cast(str, data),
+        files,
     )
-    return cast(str, data
+    return cast(str, data)
 
 
 def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
@@ -1135,9 +986,39 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     data: Dict[str, Any] = send_inference_request(
         payload, "image-to-text", files=files, v2=True
     )
+    _display_tool_trace(
+        qwen2_vl_video_vqa.__name__,
+        payload,
+        cast(str, data),
+        files,
+    )
     return cast(str, data)
 
 
+def claude35_text_extraction(image: np.ndarray) -> str:
+    """'claude35_text_extraction' is a tool that can extract text from an image. It
+    returns the extracted text as a string and can be used as an alternative to OCR if
+    you do not need to know the exact bounding box of the text.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        str: The extracted text from the image.
+    """
+
+    lmm = AnthropicLMM()
+    buffer = io.BytesIO()
+    Image.fromarray(image).save(buffer, format="PNG")
+    image_bytes = buffer.getvalue()
+    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+    text = lmm.generate(
+        "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
+        [image_b64],
+    )
+    return cast(str, text)
+
+
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text
@@ -1201,36 +1082,6 @@ def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, resp)
 
 
-def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
-    """'git_vqa_v2' is a tool that can answer questions about the visual
-    contents of an image given a question and an image. It returns an answer to the
-    question
-
-    Parameters:
-        prompt (str): The question about the image
-        image (np.ndarray): The reference image used for the question
-
-    Returns:
-        str: A string which is the answer to the given prompt.
-
-    Example
-    -------
-    >>> git_vqa_v2('What is the cat doing ?', image)
-    'drinking milk'
-    """
-
-    image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "prompt": prompt,
-        "tool": "image_question_answering",
-        "function_name": "git_vqa_v2",
-    }
-
-    answer = send_inference_request(data, "tools")
-    return answer["text"][0]  # type: ignore
-
-
 def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
@@ -1274,70 +1125,48 @@ def video_temporal_localization(
     data = send_inference_request(
         payload, "video-temporal-localization", files=files, v2=True
     )
+    _display_tool_trace(
+        video_temporal_localization.__name__,
+        payload,
+        data,
+        files,
+    )
     return [cast(float, value) for value in data]
 
 
-def
-    """'
-    of
-    their probability scores based on image content.
+def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
+    """'vit_image_classification' is a tool that can classify an image. It returns a
+    list of classes and their probability scores based on image content.
 
     Parameters:
         image (np.ndarray): The image to classify or tag
-        classes (List[str]): The list of classes or tags that is associated with the image
 
     Returns:
         Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-        contains a list of
+        contains a list of labels and other a list of scores.
 
     Example
     -------
-    >>>
-    {"labels": ["
+    >>> vit_image_classification(image)
+    {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
     """
-
     if image.shape[0] < 1 or image.shape[1] < 1:
         return {"labels": [], "scores": []}
 
     image_b64 = convert_to_b64(image)
     data = {
-        "prompt": ",".join(classes),
         "image": image_b64,
-        "tool": "
-        "function_name": "
-    }
-    resp_data: dict[str, Any] = send_inference_request(data, "tools")
-    resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
-    return resp_data
-
-
-def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
-    """'vit_image_classification' is a tool that can classify an image. It returns a
-    list of classes and their probability scores based on image content.
-
-    Parameters:
-        image (np.ndarray): The image to classify or tag
-
-    Returns:
-        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
-        contains a list of labels and other a list of scores.
-
-    Example
-    -------
-    >>> vit_image_classification(image)
-    {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
-    """
-    if image.shape[0] < 1 or image.shape[1] < 1:
-        return {"labels": [], "scores": []}
-
-    image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "tool": "image_classification",
-        "function_name": "vit_image_classification",
+        "tool": "image_classification",
+        "function_name": "vit_image_classification",
     }
     resp_data: dict[str, Any] = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+    _display_tool_trace(
+        vit_image_classification.__name__,
+        data,
+        resp_data,
+        image_b64,
+    )
     return resp_data
 
 
@@ -1369,65 +1198,15 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
         data, "nsfw-classification", v2=True
     )
     resp_data["score"] = round(resp_data["score"], 4)
+    _display_tool_trace(
+        vit_nsfw_classification.__name__,
+        data,
+        resp_data,
+        image_b64,
+    )
     return resp_data
 
 
-def blip_image_caption(image: np.ndarray) -> str:
-    """'blip_image_caption' is a tool that can caption an image based on its contents. It
-    returns a text describing the image.
-
-    Parameters:
-        image (np.ndarray): The image to caption
-
-    Returns:
-        str: A string which is the caption for the given image.
-
-    Example
-    -------
-    >>> blip_image_caption(image)
-    'This image contains a cat sitting on a table with a bowl of milk.'
-    """
-
-    image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "tool": "image_captioning",
-        "function_name": "blip_image_caption",
-    }
-
-    answer = send_inference_request(data, "tools")
-    return answer["text"][0]  # type: ignore
-
-
-def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
-    """'florence2_image_caption' is a tool that can caption or describe an image based
-    on its contents. It returns a text describing the image.
-
-    Parameters:
-        image (np.ndarray): The image to caption
-        detail_caption (bool): If True, the caption will be as detailed as possible else
-            the caption will be a brief description.
-
-    Returns:
-        str: A string which is the caption for the given image.
-
-    Example
-    -------
-    >>> florence2_image_caption(image, False)
-    'This image contains a cat sitting on a table with a bowl of milk.'
-    """
-    image_b64 = convert_to_b64(image)
-    task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
-    data = {
-        "image": image_b64,
-        "task": task,
-        "function_name": "florence2_image_caption",
-    }
-
-    answer = send_inference_request(data, "florence2", v2=True)
-    return answer[task]  # type: ignore
-
-
 def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
@@ -1490,15 +1269,21 @@ def florence2_phrase_grounding(
     # get the first frame
     bboxes = detections[0]
     bboxes_formatted = [
-            label
-            bbox
-            score
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": round(bbox["score"], 2),
+        }
         for bbox in bboxes
     ]
 
+    _display_tool_trace(
+        florence2_phrase_grounding.__name__,
+        payload,
+        detections[0],
+        files,
+    )
+    return [bbox for bbox in bboxes_formatted]
 
 
 def florence2_phrase_grounding_video(
@@ -1566,15 +1351,21 @@ def florence2_phrase_grounding_video(
     bboxes_formatted = []
     for frame_data in detections:
         bboxes_formatted_per_frame = [
-                label
-                bbox
-                score
+            {
+                "label": bbox["label"],
+                "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+                "score": round(bbox["score"], 2),
+            }
            for bbox in frame_data
         ]
         bboxes_formatted.append(bboxes_formatted_per_frame)
+    _display_tool_trace(
+        florence2_phrase_grounding_video.__name__,
+        payload,
+        detections,
+        files,
+    )
+    return bboxes_formatted
 
 
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -1621,6 +1412,12 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
                 "score": 1.0,
             }
         )
+    _display_tool_trace(
+        florence2_ocr.__name__,
+        {},
+        detections,
+        image_b64,
+    )
     return return_data
 
 
@@ -1683,6 +1480,12 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
                 ),
             }
         )
+    _display_tool_trace(
+        detr_segmentation.__name__,
+        {},
+        return_data,
+        image_b64,
+    )
     return return_data
 
 
@@ -1721,74 +1524,15 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
         depth_map_np.max() - depth_map_np.min()
     )
     depth_map_np = (255 * depth_map_np).astype(np.uint8)
+    _display_tool_trace(
+        depth_anything_v2.__name__,
+        {},
+        depth_map,
+        image_b64,
+    )
     return depth_map_np
 
 
-def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
-    """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
-    to generate a soft edge image (HED) from a given RGB image. The returned image is
-    monochrome and represents object boundaries as soft white edges on black background
-
-    Parameters:
-        image (np.ndarray): The image to used to generate soft edge image
-
-    Returns:
-        np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
-
-    Example
-    -------
-    >>> generate_soft_edge_image(image)
-    array([[0, 0, 0, ..., 0, 0, 0],
-        [0, 20, 24, ..., 0, 100, 103],
-        ...,
-        [10, 11, 15, ..., 202, 202, 205],
-        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
-    """
-    image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "tool": "generate_hed",
-        "function_name": "generate_soft_edge_image",
-    }
-
-    answer = send_inference_request(data, "tools")
-    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
-    return return_data
-
-
-def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
-    """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
-    image. The returned RGB image is texture mapped image of the surface normals and the
-    RGB values represent the surface normals in the x, y, z directions.
-
-    Parameters:
-        image (np.ndarray): The image to used to generate normal image
-
-    Returns:
-        np.ndarray: A mapped normal image with RGB pixel values indicating surface
-        normals in x, y, z directions.
-
-    Example
-    -------
-    >>> dpt_hybrid_midas(image)
-    array([[0, 0, 0, ..., 0, 0, 0],
-        [0, 20, 24, ..., 0, 100, 103],
-        ...,
-        [10, 11, 15, ..., 202, 202, 205],
-        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
-    """
-    image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "tool": "generate_normal",
-        "function_name": "dpt_hybrid_midas",
-    }
-
-    answer = send_inference_request(data, "tools")
-    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
-    return return_data
-
-
 def generate_pose_image(image: np.ndarray) -> np.ndarray:
     """'generate_pose_image' is a tool that generates a open pose bone/stick image from
     a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
@@ -1817,6 +1561,12 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
 
     pos_img = send_inference_request(data, "pose-detector", v2=True)
     return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
+    _display_tool_trace(
+        generate_pose_image.__name__,
+        {},
+        pos_img,
+        image_b64,
+    )
     return return_data
 
 
@@ -1861,120 +1611,18 @@ def template_match(
     for i in range(len(answer["bboxes"])):
         return_data.append(
             {
+                "label": "match",
                 "score": round(answer["scores"][i], 2),
                 "bbox": normalize_bbox(answer["bboxes"][i], image_size),
             }
         )
-) -> float:
-    """'minimum_distance' calculates the minimum distance between two detections which
-    can include bounding boxes and or masks. This will return the closest distance
-    between the objects, not the distance between the centers of the objects.
-
-    Parameters:
-        det1 (Dict[str, Any]): The first detection of boxes or masks.
-        det2 (Dict[str, Any]): The second detection of boxes or masks.
-        image_size (Tuple[int, int]): The size of the image given as (height, width).
-
-    Returns:
-        float: The closest distance between the two detections.
-
-    Example
-    -------
-    >>> closest_distance(det1, det2, image_size)
-    141.42
-    """
-
-    if "mask" in det1 and "mask" in det2:
-        return closest_mask_distance(det1["mask"], det2["mask"])
-    elif "bbox" in det1 and "bbox" in det2:
-        return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
-    else:
-        raise ValueError("Both detections must have either bbox or mask")
-
-
-def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
-    """'closest_mask_distance' calculates the closest distance between two masks.
-
-    Parameters:
-        mask1 (np.ndarray): The first mask.
-        mask2 (np.ndarray): The second mask.
-
-    Returns:
-        float: The closest distance between the two masks.
-
-    Example
-    -------
-    >>> closest_mask_distance(mask1, mask2)
-    0.5
-    """
-
-    mask1 = np.clip(mask1, 0, 1)
-    mask2 = np.clip(mask2, 0, 1)
-    contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-    contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-    largest_contour1 = max(contours1, key=cv2.contourArea)
-    largest_contour2 = max(contours2, key=cv2.contourArea)
-    polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
-    polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
-    min_distance = np.inf
-
-    small_polygon, larger_contour = (
-        (polygon1, largest_contour2)
-        if len(largest_contour1) < len(largest_contour2)
-        else (polygon2, largest_contour1)
+    _display_tool_trace(
+        template_match.__name__,
+        {"template_image": template_image_b64},
+        return_data,
+        image_b64,
     )
-
-    # For each point in the first polygon
-    for point in small_polygon:
-        # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
-
-        distance = (
-            cv2.pointPolygonTest(
-                larger_contour, (point[0, 0].item(), point[0, 1].item()), True
-            )
-            * -1
-        )
-
-        # If the distance is negative, the point is inside the polygon, so the distance is 0
-        if distance < 0:
-            continue
-        else:
-            # Update the minimum distance if the point is outside the polygon
-            min_distance = min(min_distance, distance)
-
-    return min_distance if min_distance != np.inf else 0.0
-
-
-def closest_box_distance(
-    box1: List[float], box2: List[float], image_size: Tuple[int, int]
-) -> float:
-    """'closest_box_distance' calculates the closest distance between two bounding boxes.
-
-    Parameters:
-        box1 (List[float]): The first bounding box.
-        box2 (List[float]): The second bounding box.
-        image_size (Tuple[int, int]): The size of the image given as (height, width).
-
-    Returns:
-        float: The closest distance between the two bounding boxes.
-
-    Example
-    -------
-    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
-    141.42
-    """
-
-    x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
-    x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
-
-    horizontal_distance = np.max([0, x21 - x12, x11 - x22])
-    vertical_distance = np.max([0, y21 - y12, y11 - y22])
-    return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+    return return_data
 
 
 def flux_image_inpainting(
@@ -2064,6 +1712,12 @@ def flux_image_inpainting(
     )
 
     output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
+    _display_tool_trace(
+        flux_image_inpainting.__name__,
+        payload,
+        output_image,
+        files,
+    )
     return output_image
 
 
@@ -2106,9 +1760,124 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
         metadata_payload={"function_name": "siglip_classification"},
     )
 
+    _display_tool_trace(
+        siglip_classification.__name__,
+        payload,
+        response,
+        files,
+    )
     return response
 
 
+def minimum_distance(
+    det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
+) -> float:
+    """'minimum_distance' calculates the minimum distance between two detections which
+    can include bounding boxes and or masks. This will return the closest distance
+    between the objects, not the distance between the centers of the objects.
+
+    Parameters:
+        det1 (Dict[str, Any]): The first detection of boxes or masks.
+        det2 (Dict[str, Any]): The second detection of boxes or masks.
+        image_size (Tuple[int, int]): The size of the image given as (height, width).
+
+    Returns:
+        float: The closest distance between the two detections.
+
+    Example
+    -------
+    >>> closest_distance(det1, det2, image_size)
+    141.42
+    """
+
+    if "mask" in det1 and "mask" in det2:
+        return closest_mask_distance(det1["mask"], det2["mask"])
+    elif "bbox" in det1 and "bbox" in det2:
+        return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
+    else:
+        raise ValueError("Both detections must have either bbox or mask")
+
+
+def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
+    """'closest_mask_distance' calculates the closest distance between two masks.
+
+    Parameters:
+        mask1 (np.ndarray): The first mask.
+        mask2 (np.ndarray): The second mask.
+
+    Returns:
+        float: The closest distance between the two masks.
+
+    Example
+    -------
+    >>> closest_mask_distance(mask1, mask2)
+    0.5
+    """
+
+    mask1 = np.clip(mask1, 0, 1)
+    mask2 = np.clip(mask2, 0, 1)
+    contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+    contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+    largest_contour1 = max(contours1, key=cv2.contourArea)
+    largest_contour2 = max(contours2, key=cv2.contourArea)
+    polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
+    polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
+    min_distance = np.inf
+
+    small_polygon, larger_contour = (
+        (polygon1, largest_contour2)
+        if len(largest_contour1) < len(largest_contour2)
+        else (polygon2, largest_contour1)
+    )
+
+    # For each point in the first polygon
+    for point in small_polygon:
+        # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
+
+        distance = (
+            cv2.pointPolygonTest(
+                larger_contour, (point[0, 0].item(), point[0, 1].item()), True
+            )
+            * -1
+        )
+
+        # If the distance is negative, the point is inside the polygon, so the distance is 0
+        if distance < 0:
+            continue
+        else:
+            # Update the minimum distance if the point is outside the polygon
+            min_distance = min(min_distance, distance)
+
+    return min_distance if min_distance != np.inf else 0.0
+
+
+def closest_box_distance(
+    box1: List[float], box2: List[float], image_size: Tuple[int, int]
+) -> float:
+    """'closest_box_distance' calculates the closest distance between two bounding boxes.
+
+    Parameters:
+        box1 (List[float]): The first bounding box.
+        box2 (List[float]): The second bounding box.
+        image_size (Tuple[int, int]): The size of the image given as (height, width).
+
+    Returns:
+        float: The closest distance between the two bounding boxes.
+
+    Example
+    -------
+    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+    141.42
+    """
+
+    x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
+    x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
+
+    horizontal_distance = np.max([0, x21 - x12, x11 - x22])
+    vertical_distance = np.max([0, y21 - y12, y11 - y22])
+    return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+
+
 # Utility and visualization functions
 
 