vision-agent 0.2.210__py3-none-any.whl → 0.2.212__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +3 -13
- vision_agent/tools/tool_utils.py +2 -2
- vision_agent/tools/tools.py +729 -768
- vision_agent/utils/image_utils.py +16 -0
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.212.dist-info}/METADATA +1 -1
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.212.dist-info}/RECORD +8 -8
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.212.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.210.dist-info → vision_agent-0.2.212.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -4,7 +4,9 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
6
|
import urllib.request
|
7
|
+
from base64 import b64encode
|
7
8
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9
|
+
from enum import Enum
|
8
10
|
from functools import lru_cache
|
9
11
|
from importlib import resources
|
10
12
|
from pathlib import Path
|
@@ -14,6 +16,7 @@ from uuid import UUID
|
|
14
16
|
import cv2
|
15
17
|
import numpy as np
|
16
18
|
import requests
|
19
|
+
from IPython.display import display
|
17
20
|
from PIL import Image, ImageDraw, ImageFont
|
18
21
|
from pillow_heif import register_heif_opener # type: ignore
|
19
22
|
from pytube import YouTube # type: ignore
|
@@ -21,8 +24,8 @@ from pytube import YouTube # type: ignore
|
|
21
24
|
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
22
25
|
from vision_agent.lmm.lmm import AnthropicLMM, OpenAILMM
|
23
26
|
from vision_agent.tools.tool_utils import (
|
27
|
+
ToolCallTrace,
|
24
28
|
add_bboxes_from_masks,
|
25
|
-
filter_bboxes_by_threshold,
|
26
29
|
get_tool_descriptions,
|
27
30
|
get_tool_documentation,
|
28
31
|
get_tools_df,
|
@@ -32,7 +35,7 @@ from vision_agent.tools.tool_utils import (
|
|
32
35
|
send_task_inference_request,
|
33
36
|
single_nms,
|
34
37
|
)
|
35
|
-
from vision_agent.tools.tools_types import JobStatus
|
38
|
+
from vision_agent.tools.tools_types import JobStatus
|
36
39
|
from vision_agent.utils.exceptions import FineTuneModelIsNotReady
|
37
40
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
38
41
|
from vision_agent.utils.image_utils import (
|
@@ -41,7 +44,6 @@ from vision_agent.utils.image_utils import (
|
|
41
44
|
convert_to_b64,
|
42
45
|
denormalize_bbox,
|
43
46
|
encode_image_bytes,
|
44
|
-
get_image_size,
|
45
47
|
normalize_bbox,
|
46
48
|
numpy_to_bytes,
|
47
49
|
rle_decode,
|
@@ -88,66 +90,33 @@ def get_tool_recommender() -> Sim:
|
|
88
90
|
return load_cached_sim(TOOLS_DF)
|
89
91
|
|
90
92
|
|
91
|
-
def
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
Example
|
120
|
-
-------
|
121
|
-
>>> grounding_dino("car. dinosaur", image)
|
122
|
-
[
|
123
|
-
{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
124
|
-
{'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
|
125
|
-
]
|
126
|
-
"""
|
127
|
-
image_size = image.shape[:2]
|
128
|
-
image_b64 = convert_to_b64(image)
|
129
|
-
if model_size not in ["large", "tiny"]:
|
130
|
-
raise ValueError("model_size must be either 'large' or 'tiny'")
|
131
|
-
request_data = {
|
132
|
-
"prompt": prompt,
|
133
|
-
"image": image_b64,
|
134
|
-
"tool": (
|
135
|
-
"visual_grounding" if model_size == "large" else "visual_grounding_tiny"
|
136
|
-
),
|
137
|
-
"kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
|
138
|
-
"function_name": "grounding_dino",
|
139
|
-
}
|
140
|
-
data: Dict[str, Any] = send_inference_request(request_data, "tools")
|
141
|
-
return_data = []
|
142
|
-
for i in range(len(data["bboxes"])):
|
143
|
-
return_data.append(
|
144
|
-
{
|
145
|
-
"score": round(data["scores"][i], 2),
|
146
|
-
"label": data["labels"][i],
|
147
|
-
"bbox": normalize_bbox(data["bboxes"][i], image_size),
|
148
|
-
}
|
149
|
-
)
|
150
|
-
return return_data
|
93
|
+
def _display_tool_trace(
|
94
|
+
function_name: str,
|
95
|
+
request: Dict[str, Any],
|
96
|
+
response: Any,
|
97
|
+
files: Union[List[Tuple[str, bytes]], str],
|
98
|
+
) -> None:
|
99
|
+
# Sends data through IPython's display function so front-end can show them. We use
|
100
|
+
# a function here instead of a decarator becuase we do not want to re-calculate data
|
101
|
+
# such as video bytes, which can be slow. Since this is calculated inside the
|
102
|
+
# function we can't capture it with a decarator without adding it as a return value
|
103
|
+
# which would change the function signature and affect the agent.
|
104
|
+
files_in_b64: List[Tuple[str, str]]
|
105
|
+
if isinstance(files, str):
|
106
|
+
files_in_b64 = [("images", files)]
|
107
|
+
else:
|
108
|
+
files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
|
109
|
+
|
110
|
+
request["function_name"] = function_name
|
111
|
+
tool_call_trace = ToolCallTrace(
|
112
|
+
endpoint_url="",
|
113
|
+
type="tool_func_call",
|
114
|
+
request=request,
|
115
|
+
response={"data": response},
|
116
|
+
error=None,
|
117
|
+
files=files_in_b64,
|
118
|
+
)
|
119
|
+
display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
|
151
120
|
|
152
121
|
|
153
122
|
def owl_v2_image(
|
@@ -223,14 +192,21 @@ def owl_v2_image(
|
|
223
192
|
# get the first frame
|
224
193
|
bboxes = detections[0]
|
225
194
|
bboxes_formatted = [
|
226
|
-
|
227
|
-
label
|
228
|
-
bbox
|
229
|
-
score
|
230
|
-
|
195
|
+
{
|
196
|
+
"label": bbox["label"],
|
197
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
198
|
+
"score": round(bbox["score"], 2),
|
199
|
+
}
|
231
200
|
for bbox in bboxes
|
232
201
|
]
|
233
|
-
|
202
|
+
|
203
|
+
_display_tool_trace(
|
204
|
+
owl_v2_image.__name__,
|
205
|
+
payload,
|
206
|
+
detections[0],
|
207
|
+
files,
|
208
|
+
)
|
209
|
+
return bboxes_formatted
|
234
210
|
|
235
211
|
|
236
212
|
def owl_v2_video(
|
@@ -309,81 +285,21 @@ def owl_v2_video(
|
|
309
285
|
bboxes_formatted = []
|
310
286
|
for frame_data in detections:
|
311
287
|
bboxes_formatted_per_frame = [
|
312
|
-
|
313
|
-
label
|
314
|
-
bbox
|
315
|
-
score
|
316
|
-
|
288
|
+
{
|
289
|
+
"label": bbox["label"],
|
290
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
291
|
+
"score": round(bbox["score"], 2),
|
292
|
+
}
|
317
293
|
for bbox in frame_data
|
318
294
|
]
|
319
295
|
bboxes_formatted.append(bboxes_formatted_per_frame)
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
iou_threshold: float = 0.20,
|
328
|
-
) -> List[Dict[str, Any]]:
|
329
|
-
"""'grounding_sam' is a tool that can segment multiple objects given a text prompt
|
330
|
-
such as category names or referring expressions. The categories in text prompt are
|
331
|
-
separated by commas or periods. It returns a list of bounding boxes, label names,
|
332
|
-
mask file names and associated probability scores.
|
333
|
-
|
334
|
-
Parameters:
|
335
|
-
prompt (str): The prompt to ground to the image.
|
336
|
-
image (np.ndarray): The image to ground the prompt to.
|
337
|
-
box_threshold (float, optional): The threshold for the box detection. Defaults
|
338
|
-
to 0.20.
|
339
|
-
iou_threshold (float, optional): The threshold for the Intersection over Union
|
340
|
-
(IoU). Defaults to 0.20.
|
341
|
-
|
342
|
-
Returns:
|
343
|
-
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
344
|
-
bounding box, and mask of the detected objects with normalized coordinates
|
345
|
-
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
346
|
-
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
347
|
-
The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
|
348
|
-
the background.
|
349
|
-
|
350
|
-
Example
|
351
|
-
-------
|
352
|
-
>>> grounding_sam("car. dinosaur", image)
|
353
|
-
[
|
354
|
-
{
|
355
|
-
'score': 0.99,
|
356
|
-
'label': 'dinosaur',
|
357
|
-
'bbox': [0.1, 0.11, 0.35, 0.4],
|
358
|
-
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
359
|
-
[0, 0, 0, ..., 0, 0, 0],
|
360
|
-
...,
|
361
|
-
[0, 0, 0, ..., 0, 0, 0],
|
362
|
-
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
363
|
-
},
|
364
|
-
]
|
365
|
-
"""
|
366
|
-
image_size = image.shape[:2]
|
367
|
-
image_b64 = convert_to_b64(image)
|
368
|
-
request_data = {
|
369
|
-
"prompt": prompt,
|
370
|
-
"image": image_b64,
|
371
|
-
"tool": "visual_grounding_segment",
|
372
|
-
"kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
|
373
|
-
"function_name": "grounding_sam",
|
374
|
-
}
|
375
|
-
data: Dict[str, Any] = send_inference_request(request_data, "tools")
|
376
|
-
return_data = []
|
377
|
-
for i in range(len(data["bboxes"])):
|
378
|
-
return_data.append(
|
379
|
-
{
|
380
|
-
"score": round(data["scores"][i], 2),
|
381
|
-
"label": data["labels"][i],
|
382
|
-
"bbox": normalize_bbox(data["bboxes"][i], image_size),
|
383
|
-
"mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
|
384
|
-
}
|
385
|
-
)
|
386
|
-
return return_data
|
296
|
+
_display_tool_trace(
|
297
|
+
owl_v2_video.__name__,
|
298
|
+
payload,
|
299
|
+
detections[0],
|
300
|
+
files,
|
301
|
+
)
|
302
|
+
return bboxes_formatted
|
387
303
|
|
388
304
|
|
389
305
|
def florence2_sam2_image(
|
@@ -460,6 +376,13 @@ def florence2_sam2_image(
|
|
460
376
|
label = detection["label"]
|
461
377
|
bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
|
462
378
|
return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
|
379
|
+
|
380
|
+
_display_tool_trace(
|
381
|
+
florence2_sam2_image.__name__,
|
382
|
+
payload,
|
383
|
+
detections[0],
|
384
|
+
files,
|
385
|
+
)
|
463
386
|
return return_data
|
464
387
|
|
465
388
|
|
@@ -545,10 +468,36 @@ def florence2_sam2_video_tracking(
|
|
545
468
|
for detection in frame:
|
546
469
|
mask = rle_decode_array(detection["mask"])
|
547
470
|
label = str(detection["id"]) + ": " + detection["label"]
|
548
|
-
return_frame_data.append(
|
471
|
+
return_frame_data.append(
|
472
|
+
{"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
|
473
|
+
)
|
549
474
|
return_data.append(return_frame_data)
|
550
475
|
return_data = add_bboxes_from_masks(return_data)
|
551
|
-
|
476
|
+
return_data = nms(return_data, iou_threshold=0.95)
|
477
|
+
|
478
|
+
_display_tool_trace(
|
479
|
+
florence2_sam2_video_tracking.__name__,
|
480
|
+
payload,
|
481
|
+
[
|
482
|
+
[
|
483
|
+
{
|
484
|
+
"label": e["label"],
|
485
|
+
"score": e["score"],
|
486
|
+
"bbox": denormalize_bbox(e["bbox"], frames[0].shape[:2]),
|
487
|
+
"mask": e["rle"],
|
488
|
+
}
|
489
|
+
for e in lst
|
490
|
+
]
|
491
|
+
for lst in return_data
|
492
|
+
],
|
493
|
+
files,
|
494
|
+
)
|
495
|
+
# We save the RLE for display purposes, re-calculting RLE can get very expensive.
|
496
|
+
# Deleted here because we are returning the numpy masks instead
|
497
|
+
for frame in return_data:
|
498
|
+
for obj in frame:
|
499
|
+
del obj["rle"]
|
500
|
+
return return_data
|
552
501
|
|
553
502
|
|
554
503
|
def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
@@ -603,128 +552,134 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
|
603
552
|
box = normalize_bbox(box, image_size)
|
604
553
|
output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
|
605
554
|
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
the objects in the image.
|
614
|
-
|
615
|
-
Parameters:
|
616
|
-
image (np.ndarray): The image that contains lot of instances of a single object
|
617
|
-
|
618
|
-
Returns:
|
619
|
-
Dict[str, Any]: A dictionary containing the key 'count' and the count as a
|
620
|
-
value, e.g. {count: 12} and a heat map for visualization purposes.
|
621
|
-
|
622
|
-
Example
|
623
|
-
-------
|
624
|
-
>>> loca_zero_shot_counting(image)
|
625
|
-
{'count': 83,
|
626
|
-
'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
|
627
|
-
[ 0, 0, 0, ..., 0, 0, 0],
|
628
|
-
[ 0, 0, 0, ..., 0, 0, 1],
|
629
|
-
...,
|
630
|
-
[ 0, 0, 0, ..., 30, 35, 41],
|
631
|
-
[ 0, 0, 0, ..., 41, 47, 53],
|
632
|
-
[ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
|
633
|
-
"""
|
634
|
-
|
635
|
-
image_b64 = convert_to_b64(image)
|
636
|
-
data = {
|
637
|
-
"image": image_b64,
|
638
|
-
"function_name": "loca_zero_shot_counting",
|
639
|
-
}
|
640
|
-
resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
|
641
|
-
resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
|
642
|
-
return resp_data
|
555
|
+
_display_tool_trace(
|
556
|
+
ocr.__name__,
|
557
|
+
{},
|
558
|
+
data,
|
559
|
+
cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
|
560
|
+
)
|
561
|
+
return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
|
643
562
|
|
644
563
|
|
645
|
-
def
|
646
|
-
image: np.ndarray,
|
564
|
+
def _sam2(
|
565
|
+
image: np.ndarray,
|
566
|
+
detections: List[Dict[str, Any]],
|
567
|
+
image_size: Tuple[int, ...],
|
568
|
+
image_bytes: Optional[bytes] = None,
|
647
569
|
) -> Dict[str, Any]:
|
648
|
-
|
649
|
-
|
650
|
-
It returns only the count of the objects in the image.
|
651
|
-
|
652
|
-
Parameters:
|
653
|
-
image (np.ndarray): The image that contains lot of instances of a single object
|
654
|
-
visual_prompt (Dict[str, List[float]]): Bounding box of the object in
|
655
|
-
format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
|
656
|
-
|
657
|
-
Returns:
|
658
|
-
Dict[str, Any]: A dictionary containing the key 'count' and the count as a
|
659
|
-
value, e.g. {count: 12} and a heat map for visualization purposes.
|
660
|
-
|
661
|
-
Example
|
662
|
-
-------
|
663
|
-
>>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
|
664
|
-
{'count': 83,
|
665
|
-
'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
|
666
|
-
[ 0, 0, 0, ..., 0, 0, 0],
|
667
|
-
[ 0, 0, 0, ..., 0, 0, 1],
|
668
|
-
...,
|
669
|
-
[ 0, 0, 0, ..., 30, 35, 41],
|
670
|
-
[ 0, 0, 0, ..., 41, 47, 53],
|
671
|
-
[ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
|
672
|
-
"""
|
673
|
-
|
674
|
-
image_size = get_image_size(image)
|
675
|
-
bbox = visual_prompt["bbox"]
|
676
|
-
image_b64 = convert_to_b64(image)
|
570
|
+
if image_bytes is None:
|
571
|
+
image_bytes = numpy_to_bytes(image)
|
677
572
|
|
678
|
-
|
679
|
-
|
680
|
-
"
|
681
|
-
"
|
573
|
+
files = [("images", image_bytes)]
|
574
|
+
payload = {
|
575
|
+
"model": "sam2",
|
576
|
+
"bboxes": json.dumps(
|
577
|
+
[
|
578
|
+
{
|
579
|
+
"labels": [d["label"] for d in detections],
|
580
|
+
"bboxes": [
|
581
|
+
denormalize_bbox(d["bbox"], image_size) for d in detections
|
582
|
+
],
|
583
|
+
}
|
584
|
+
]
|
585
|
+
),
|
682
586
|
}
|
683
|
-
|
684
|
-
|
685
|
-
|
587
|
+
|
588
|
+
metadata = {"function_name": "sam2"}
|
589
|
+
pred_detections = send_task_inference_request(
|
590
|
+
payload, "sam2", files=files, metadata=metadata
|
591
|
+
)
|
592
|
+
frame = pred_detections[0]
|
593
|
+
return_data = []
|
594
|
+
display_data = []
|
595
|
+
for inp_detection, detection in zip(detections, frame):
|
596
|
+
mask = rle_decode_array(detection["mask"])
|
597
|
+
label = detection["label"]
|
598
|
+
bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
|
599
|
+
return_data.append(
|
600
|
+
{
|
601
|
+
"label": label,
|
602
|
+
"bbox": bbox,
|
603
|
+
"mask": mask,
|
604
|
+
"score": inp_detection["score"],
|
605
|
+
}
|
606
|
+
)
|
607
|
+
display_data.append(
|
608
|
+
{
|
609
|
+
"label": label,
|
610
|
+
"bbox": detection["bounding_box"],
|
611
|
+
"mask": detection["mask"],
|
612
|
+
"score": inp_detection["score"],
|
613
|
+
}
|
614
|
+
)
|
615
|
+
return {"files": files, "return_data": return_data, "display_data": display_data}
|
686
616
|
|
687
617
|
|
688
|
-
def
|
689
|
-
prompt: str,
|
618
|
+
def sam2(
|
690
619
|
image: np.ndarray,
|
691
|
-
|
620
|
+
detections: List[Dict[str, Any]],
|
692
621
|
) -> List[Dict[str, Any]]:
|
693
|
-
"""'
|
694
|
-
|
695
|
-
|
696
|
-
prompt with commas. It returns a list of bounding boxes with normalized
|
697
|
-
coordinates, label names and associated confidence scores.
|
622
|
+
"""'sam2' is a tool that can segment multiple objects given an input bounding box,
|
623
|
+
label and score. It returns a set of masks along with the corresponding bounding
|
624
|
+
boxes and labels.
|
698
625
|
|
699
626
|
Parameters:
|
700
|
-
prompt (str): The object that needs to be counted.
|
701
627
|
image (np.ndarray): The image that contains multiple instances of the object.
|
702
|
-
|
703
|
-
|
628
|
+
detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
|
629
|
+
label, and bounding box of the detected objects with normalized coordinates
|
630
|
+
between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
|
631
|
+
of the top-left and xmax and ymax are the coordinates of the bottom-right of
|
632
|
+
the bounding box.
|
704
633
|
|
705
634
|
Returns:
|
706
|
-
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
707
|
-
bounding box of the detected objects with normalized coordinates
|
708
|
-
|
709
|
-
|
710
|
-
|
635
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
636
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
637
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
638
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
639
|
+
The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
|
640
|
+
the background.
|
711
641
|
|
712
642
|
Example
|
713
643
|
-------
|
714
|
-
>>>
|
644
|
+
>>> sam2(image, [
|
645
|
+
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
646
|
+
])
|
715
647
|
[
|
716
|
-
{
|
717
|
-
|
718
|
-
|
719
|
-
|
648
|
+
{
|
649
|
+
'score': 0.49,
|
650
|
+
'label': 'flower',
|
651
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
652
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
653
|
+
[0, 0, 0, ..., 0, 0, 0],
|
654
|
+
...,
|
655
|
+
[0, 0, 0, ..., 0, 0, 0],
|
656
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
657
|
+
},
|
720
658
|
]
|
721
659
|
"""
|
722
660
|
image_size = image.shape[:2]
|
723
|
-
|
724
|
-
|
661
|
+
ret = _sam2(image, detections, image_size)
|
662
|
+
_display_tool_trace(
|
663
|
+
sam2.__name__,
|
664
|
+
{},
|
665
|
+
ret["display_data"],
|
666
|
+
ret["files"],
|
667
|
+
)
|
725
668
|
|
726
|
-
|
727
|
-
|
669
|
+
return ret["return_data"] # type: ignore
|
670
|
+
|
671
|
+
|
672
|
+
def _countgd_object_detection(
|
673
|
+
prompt: str,
|
674
|
+
image: np.ndarray,
|
675
|
+
box_threshold: float,
|
676
|
+
image_size: Tuple[int, ...],
|
677
|
+
image_bytes: Optional[bytes] = None,
|
678
|
+
) -> Dict[str, Any]:
|
679
|
+
if image_bytes is None:
|
680
|
+
image_bytes = numpy_to_bytes(image)
|
681
|
+
|
682
|
+
files = [("image", image_bytes)]
|
728
683
|
prompts = [p.strip() for p in prompt.split(", ")]
|
729
684
|
|
730
685
|
def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
|
@@ -747,97 +702,76 @@ def countgd_object_detection(
|
|
747
702
|
for future in as_completed(futures):
|
748
703
|
bboxes.extend(future.result())
|
749
704
|
|
750
|
-
|
751
|
-
|
752
|
-
label
|
753
|
-
bbox
|
754
|
-
score
|
755
|
-
|
705
|
+
return_data = [
|
706
|
+
{
|
707
|
+
"label": bbox["label"],
|
708
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
709
|
+
"score": round(bbox["score"], 2),
|
710
|
+
}
|
756
711
|
for bbox in bboxes
|
757
712
|
]
|
758
|
-
# TODO: remove this once we start to use the confidence on countgd
|
759
|
-
filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
|
760
|
-
return_data = [bbox.model_dump() for bbox in filtered_bboxes]
|
761
|
-
return single_nms(return_data, iou_threshold=0.80)
|
762
713
|
|
714
|
+
return_data = single_nms(return_data, iou_threshold=0.80)
|
715
|
+
display_data = [
|
716
|
+
{
|
717
|
+
"label": e["label"],
|
718
|
+
"score": e["score"],
|
719
|
+
"bbox": denormalize_bbox(e["bbox"], image_size),
|
720
|
+
}
|
721
|
+
for e in return_data
|
722
|
+
]
|
723
|
+
return {"files": files, "return_data": return_data, "display_data": display_data}
|
763
724
|
|
764
|
-
|
725
|
+
|
726
|
+
def countgd_object_detection(
|
727
|
+
prompt: str,
|
765
728
|
image: np.ndarray,
|
766
|
-
|
729
|
+
box_threshold: float = 0.23,
|
767
730
|
) -> List[Dict[str, Any]]:
|
768
|
-
"""'
|
769
|
-
|
770
|
-
|
731
|
+
"""'countgd_object_detection' is a tool that can detect multiple instances of an
|
732
|
+
object given a text prompt. It is particularly useful when trying to detect and
|
733
|
+
count a large number of objects. You can optionally separate object names in the
|
734
|
+
prompt with commas. It returns a list of bounding boxes with normalized
|
735
|
+
coordinates, label names and associated confidence scores.
|
771
736
|
|
772
737
|
Parameters:
|
738
|
+
prompt (str): The object that needs to be counted.
|
773
739
|
image (np.ndarray): The image that contains multiple instances of the object.
|
774
|
-
|
775
|
-
|
776
|
-
between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
|
777
|
-
of the top-left and xmax and ymax are the coordinates of the bottom-right of
|
778
|
-
the bounding box.
|
740
|
+
box_threshold (float, optional): The threshold for detection. Defaults
|
741
|
+
to 0.23.
|
779
742
|
|
780
743
|
Returns:
|
781
|
-
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
782
|
-
bounding box
|
783
|
-
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
784
|
-
and xmax and ymax are the coordinates of the bottom-right of the
|
785
|
-
|
786
|
-
the background.
|
744
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
745
|
+
bounding box of the detected objects with normalized coordinates between 0
|
746
|
+
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
747
|
+
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
748
|
+
bounding box.
|
787
749
|
|
788
750
|
Example
|
789
751
|
-------
|
790
|
-
>>>
|
791
|
-
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
792
|
-
])
|
752
|
+
>>> countgd_object_detection("flower", image)
|
793
753
|
[
|
794
|
-
{
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
799
|
-
[0, 0, 0, ..., 0, 0, 0],
|
800
|
-
...,
|
801
|
-
[0, 0, 0, ..., 0, 0, 0],
|
802
|
-
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
803
|
-
},
|
754
|
+
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
755
|
+
{'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
|
756
|
+
{'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
|
757
|
+
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
|
804
758
|
]
|
805
759
|
"""
|
806
760
|
image_size = image.shape[:2]
|
761
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
762
|
+
return []
|
807
763
|
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
],
|
818
|
-
}
|
819
|
-
]
|
820
|
-
),
|
821
|
-
}
|
822
|
-
metadata = {"function_name": "sam2"}
|
823
|
-
pred_detections = send_task_inference_request(
|
824
|
-
payload, "sam2", files=files, metadata=metadata
|
764
|
+
ret = _countgd_object_detection(prompt, image, box_threshold, image_size)
|
765
|
+
_display_tool_trace(
|
766
|
+
countgd_object_detection.__name__,
|
767
|
+
{
|
768
|
+
"prompts": prompt,
|
769
|
+
"confidence": box_threshold,
|
770
|
+
},
|
771
|
+
ret["display_data"],
|
772
|
+
ret["files"],
|
825
773
|
)
|
826
|
-
|
827
|
-
return_data = []
|
828
|
-
for inp_detection, detection in zip(detections, frame):
|
829
|
-
mask = rle_decode_array(detection["mask"])
|
830
|
-
label = detection["label"]
|
831
|
-
bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
|
832
|
-
return_data.append(
|
833
|
-
{
|
834
|
-
"label": label,
|
835
|
-
"bbox": bbox,
|
836
|
-
"mask": mask,
|
837
|
-
"score": inp_detection["score"],
|
838
|
-
}
|
839
|
-
)
|
840
|
-
return return_data
|
774
|
+
return ret["return_data"] # type: ignore
|
841
775
|
|
842
776
|
|
843
777
|
def countgd_sam2_object_detection(
|
@@ -881,9 +815,23 @@ def countgd_sam2_object_detection(
|
|
881
815
|
},
|
882
816
|
]
|
883
817
|
"""
|
884
|
-
|
885
|
-
|
886
|
-
|
818
|
+
|
819
|
+
od_ret = _countgd_object_detection(prompt, image, box_threshold, image.shape[:2])
|
820
|
+
seg_ret = _sam2(
|
821
|
+
image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
|
822
|
+
)
|
823
|
+
|
824
|
+
_display_tool_trace(
|
825
|
+
countgd_sam2_object_detection.__name__,
|
826
|
+
{
|
827
|
+
"prompts": prompt,
|
828
|
+
"confidence": box_threshold,
|
829
|
+
},
|
830
|
+
seg_ret["display_data"],
|
831
|
+
seg_ret["files"],
|
832
|
+
)
|
833
|
+
|
834
|
+
return seg_ret["return_data"] # type: ignore
|
887
835
|
|
888
836
|
|
889
837
|
def countgd_example_based_counting(
|
@@ -941,76 +889,28 @@ def countgd_example_based_counting(
|
|
941
889
|
# get the first frame
|
942
890
|
bboxes_per_frame = detections[0]
|
943
891
|
bboxes_formatted = [
|
944
|
-
|
945
|
-
label
|
946
|
-
bbox
|
947
|
-
score
|
948
|
-
|
892
|
+
{
|
893
|
+
"label": bbox["label"],
|
894
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
895
|
+
"score": round(bbox["score"], 2),
|
896
|
+
}
|
949
897
|
for bbox in bboxes_per_frame
|
950
898
|
]
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
Returns:
|
965
|
-
str: A string which is the answer to the given prompt.
|
966
|
-
|
967
|
-
Example
|
968
|
-
-------
|
969
|
-
>>> florence2_roberta_vqa('What is the top left animal in this image?', image)
|
970
|
-
'white tiger'
|
971
|
-
"""
|
972
|
-
|
973
|
-
image_b64 = convert_to_b64(image)
|
974
|
-
data = {
|
975
|
-
"image": image_b64,
|
976
|
-
"question": prompt,
|
977
|
-
"function_name": "florence2_roberta_vqa",
|
978
|
-
}
|
979
|
-
|
980
|
-
answer = send_inference_request(data, "florence2-qa", v2=True)
|
981
|
-
return answer # type: ignore
|
982
|
-
|
983
|
-
|
984
|
-
def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
|
985
|
-
"""'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
|
986
|
-
including regular images or images of documents or presentations. It returns text
|
987
|
-
as an answer to the question.
|
988
|
-
|
989
|
-
Parameters:
|
990
|
-
prompt (str): The question about the image
|
991
|
-
image (np.ndarray): The reference image used for the question
|
992
|
-
|
993
|
-
Returns:
|
994
|
-
str: A string which is the answer to the given prompt.
|
995
|
-
|
996
|
-
Example
|
997
|
-
-------
|
998
|
-
>>> ixc25_image_vqa('What is the cat doing?', image)
|
999
|
-
'drinking milk'
|
1000
|
-
"""
|
1001
|
-
if image.shape[0] < 1 or image.shape[1] < 1:
|
1002
|
-
raise ValueError(f"Image is empty, image shape: {image.shape}")
|
1003
|
-
|
1004
|
-
buffer_bytes = numpy_to_bytes(image)
|
1005
|
-
files = [("image", buffer_bytes)]
|
1006
|
-
payload = {
|
1007
|
-
"prompt": prompt,
|
1008
|
-
"function_name": "ixc25_image_vqa",
|
1009
|
-
}
|
1010
|
-
data: Dict[str, Any] = send_inference_request(
|
1011
|
-
payload, "internlm-xcomposer2", files=files, v2=True
|
899
|
+
_display_tool_trace(
|
900
|
+
countgd_example_based_counting.__name__,
|
901
|
+
payload,
|
902
|
+
[
|
903
|
+
{
|
904
|
+
"label": e["label"],
|
905
|
+
"score": e["score"],
|
906
|
+
"bbox": denormalize_bbox(e["bbox"], image_size),
|
907
|
+
}
|
908
|
+
for e in bboxes_formatted
|
909
|
+
],
|
910
|
+
files,
|
1012
911
|
)
|
1013
|
-
|
912
|
+
|
913
|
+
return bboxes_formatted
|
1014
914
|
|
1015
915
|
|
1016
916
|
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
|
@@ -1047,61 +947,13 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
|
|
1047
947
|
data: Dict[str, Any] = send_inference_request(
|
1048
948
|
payload, "image-to-text", files=files, v2=True
|
1049
949
|
)
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1055
|
-
returns the extracted text as a string and can be used as an alternative to OCR if
|
1056
|
-
you do not need to know the exact bounding box of the text.
|
1057
|
-
|
1058
|
-
Parameters:
|
1059
|
-
image (np.ndarray): The image to extract text from.
|
1060
|
-
|
1061
|
-
Returns:
|
1062
|
-
str: The extracted text from the image.
|
1063
|
-
"""
|
1064
|
-
|
1065
|
-
lmm = AnthropicLMM()
|
1066
|
-
buffer = io.BytesIO()
|
1067
|
-
Image.fromarray(image).save(buffer, format="PNG")
|
1068
|
-
image_bytes = buffer.getvalue()
|
1069
|
-
image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
|
1070
|
-
text = lmm.generate(
|
1071
|
-
"Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
|
1072
|
-
[image_b64],
|
1073
|
-
)
|
1074
|
-
return cast(str, text)
|
1075
|
-
|
1076
|
-
|
1077
|
-
def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
1078
|
-
"""'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
|
1079
|
-
including regular videos or videos of documents or presentations. It returns text
|
1080
|
-
as an answer to the question.
|
1081
|
-
|
1082
|
-
Parameters:
|
1083
|
-
prompt (str): The question about the video
|
1084
|
-
frames (List[np.ndarray]): The reference frames used for the question
|
1085
|
-
|
1086
|
-
Returns:
|
1087
|
-
str: A string which is the answer to the given prompt.
|
1088
|
-
|
1089
|
-
Example
|
1090
|
-
-------
|
1091
|
-
>>> ixc25_video_vqa('Which football player made the goal?', frames)
|
1092
|
-
'Lionel Messi'
|
1093
|
-
"""
|
1094
|
-
|
1095
|
-
buffer_bytes = frames_to_bytes(frames)
|
1096
|
-
files = [("video", buffer_bytes)]
|
1097
|
-
payload = {
|
1098
|
-
"prompt": prompt,
|
1099
|
-
"function_name": "ixc25_video_vqa",
|
1100
|
-
}
|
1101
|
-
data: Dict[str, Any] = send_inference_request(
|
1102
|
-
payload, "internlm-xcomposer2", files=files, v2=True
|
950
|
+
_display_tool_trace(
|
951
|
+
qwen2_vl_images_vqa.__name__,
|
952
|
+
payload,
|
953
|
+
cast(str, data),
|
954
|
+
files,
|
1103
955
|
)
|
1104
|
-
return cast(str, data
|
956
|
+
return cast(str, data)
|
1105
957
|
|
1106
958
|
|
1107
959
|
def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
@@ -1135,9 +987,39 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
|
1135
987
|
data: Dict[str, Any] = send_inference_request(
|
1136
988
|
payload, "image-to-text", files=files, v2=True
|
1137
989
|
)
|
990
|
+
_display_tool_trace(
|
991
|
+
qwen2_vl_video_vqa.__name__,
|
992
|
+
payload,
|
993
|
+
cast(str, data),
|
994
|
+
files,
|
995
|
+
)
|
1138
996
|
return cast(str, data)
|
1139
997
|
|
1140
998
|
|
999
|
+
def claude35_text_extraction(image: np.ndarray) -> str:
|
1000
|
+
"""'claude35_text_extraction' is a tool that can extract text from an image. It
|
1001
|
+
returns the extracted text as a string and can be used as an alternative to OCR if
|
1002
|
+
you do not need to know the exact bounding box of the text.
|
1003
|
+
|
1004
|
+
Parameters:
|
1005
|
+
image (np.ndarray): The image to extract text from.
|
1006
|
+
|
1007
|
+
Returns:
|
1008
|
+
str: The extracted text from the image.
|
1009
|
+
"""
|
1010
|
+
|
1011
|
+
lmm = AnthropicLMM()
|
1012
|
+
buffer = io.BytesIO()
|
1013
|
+
Image.fromarray(image).save(buffer, format="PNG")
|
1014
|
+
image_bytes = buffer.getvalue()
|
1015
|
+
image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
|
1016
|
+
text = lmm.generate(
|
1017
|
+
"Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
|
1018
|
+
[image_b64],
|
1019
|
+
)
|
1020
|
+
return cast(str, text)
|
1021
|
+
|
1022
|
+
|
1141
1023
|
def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
|
1142
1024
|
"""'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
|
1143
1025
|
including regular images or images of documents or presentations. It returns text
|
@@ -1187,48 +1069,18 @@ def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
|
1187
1069
|
|
1188
1070
|
if len(frames) > 10:
|
1189
1071
|
step = len(frames) / 10
|
1190
|
-
frames = [frames[int(i * step)] for i in range(10)]
|
1191
|
-
|
1192
|
-
frames_b64 = []
|
1193
|
-
for frame in frames:
|
1194
|
-
buffer = io.BytesIO()
|
1195
|
-
Image.fromarray(frame).save(buffer, format="PNG")
|
1196
|
-
image_bytes = buffer.getvalue()
|
1197
|
-
image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
|
1198
|
-
frames_b64.append(image_b64)
|
1199
|
-
|
1200
|
-
resp = lmm.generate(prompt, frames_b64)
|
1201
|
-
return cast(str, resp)
|
1202
|
-
|
1203
|
-
|
1204
|
-
def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
|
1205
|
-
"""'git_vqa_v2' is a tool that can answer questions about the visual
|
1206
|
-
contents of an image given a question and an image. It returns an answer to the
|
1207
|
-
question
|
1208
|
-
|
1209
|
-
Parameters:
|
1210
|
-
prompt (str): The question about the image
|
1211
|
-
image (np.ndarray): The reference image used for the question
|
1212
|
-
|
1213
|
-
Returns:
|
1214
|
-
str: A string which is the answer to the given prompt.
|
1215
|
-
|
1216
|
-
Example
|
1217
|
-
-------
|
1218
|
-
>>> git_vqa_v2('What is the cat doing ?', image)
|
1219
|
-
'drinking milk'
|
1220
|
-
"""
|
1072
|
+
frames = [frames[int(i * step)] for i in range(10)]
|
1221
1073
|
|
1222
|
-
|
1223
|
-
|
1224
|
-
|
1225
|
-
"
|
1226
|
-
|
1227
|
-
"
|
1228
|
-
|
1074
|
+
frames_b64 = []
|
1075
|
+
for frame in frames:
|
1076
|
+
buffer = io.BytesIO()
|
1077
|
+
Image.fromarray(frame).save(buffer, format="PNG")
|
1078
|
+
image_bytes = buffer.getvalue()
|
1079
|
+
image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
|
1080
|
+
frames_b64.append(image_b64)
|
1229
1081
|
|
1230
|
-
|
1231
|
-
return
|
1082
|
+
resp = lmm.generate(prompt, frames_b64)
|
1083
|
+
return cast(str, resp)
|
1232
1084
|
|
1233
1085
|
|
1234
1086
|
def video_temporal_localization(
|
@@ -1274,43 +1126,15 @@ def video_temporal_localization(
|
|
1274
1126
|
data = send_inference_request(
|
1275
1127
|
payload, "video-temporal-localization", files=files, v2=True
|
1276
1128
|
)
|
1129
|
+
_display_tool_trace(
|
1130
|
+
video_temporal_localization.__name__,
|
1131
|
+
payload,
|
1132
|
+
data,
|
1133
|
+
files,
|
1134
|
+
)
|
1277
1135
|
return [cast(float, value) for value in data]
|
1278
1136
|
|
1279
1137
|
|
1280
|
-
def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
|
1281
|
-
"""'clip' is a tool that can classify an image or a cropped detection given a list
|
1282
|
-
of input classes or tags. It returns the same list of the input classes along with
|
1283
|
-
their probability scores based on image content.
|
1284
|
-
|
1285
|
-
Parameters:
|
1286
|
-
image (np.ndarray): The image to classify or tag
|
1287
|
-
classes (List[str]): The list of classes or tags that is associated with the image
|
1288
|
-
|
1289
|
-
Returns:
|
1290
|
-
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
1291
|
-
contains a list of given labels and other a list of scores.
|
1292
|
-
|
1293
|
-
Example
|
1294
|
-
-------
|
1295
|
-
>>> clip(image, ['dog', 'cat', 'bird'])
|
1296
|
-
{"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
|
1297
|
-
"""
|
1298
|
-
|
1299
|
-
if image.shape[0] < 1 or image.shape[1] < 1:
|
1300
|
-
return {"labels": [], "scores": []}
|
1301
|
-
|
1302
|
-
image_b64 = convert_to_b64(image)
|
1303
|
-
data = {
|
1304
|
-
"prompt": ",".join(classes),
|
1305
|
-
"image": image_b64,
|
1306
|
-
"tool": "closed_set_image_classification",
|
1307
|
-
"function_name": "clip",
|
1308
|
-
}
|
1309
|
-
resp_data: dict[str, Any] = send_inference_request(data, "tools")
|
1310
|
-
resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
|
1311
|
-
return resp_data
|
1312
|
-
|
1313
|
-
|
1314
1138
|
def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
|
1315
1139
|
"""'vit_image_classification' is a tool that can classify an image. It returns a
|
1316
1140
|
list of classes and their probability scores based on image content.
|
@@ -1338,6 +1162,12 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
|
|
1338
1162
|
}
|
1339
1163
|
resp_data: dict[str, Any] = send_inference_request(data, "tools")
|
1340
1164
|
resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
|
1165
|
+
_display_tool_trace(
|
1166
|
+
vit_image_classification.__name__,
|
1167
|
+
data,
|
1168
|
+
resp_data,
|
1169
|
+
image_b64,
|
1170
|
+
)
|
1341
1171
|
return resp_data
|
1342
1172
|
|
1343
1173
|
|
@@ -1369,65 +1199,15 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
|
|
1369
1199
|
data, "nsfw-classification", v2=True
|
1370
1200
|
)
|
1371
1201
|
resp_data["score"] = round(resp_data["score"], 4)
|
1202
|
+
_display_tool_trace(
|
1203
|
+
vit_nsfw_classification.__name__,
|
1204
|
+
data,
|
1205
|
+
resp_data,
|
1206
|
+
image_b64,
|
1207
|
+
)
|
1372
1208
|
return resp_data
|
1373
1209
|
|
1374
1210
|
|
1375
|
-
def blip_image_caption(image: np.ndarray) -> str:
|
1376
|
-
"""'blip_image_caption' is a tool that can caption an image based on its contents. It
|
1377
|
-
returns a text describing the image.
|
1378
|
-
|
1379
|
-
Parameters:
|
1380
|
-
image (np.ndarray): The image to caption
|
1381
|
-
|
1382
|
-
Returns:
|
1383
|
-
str: A string which is the caption for the given image.
|
1384
|
-
|
1385
|
-
Example
|
1386
|
-
-------
|
1387
|
-
>>> blip_image_caption(image)
|
1388
|
-
'This image contains a cat sitting on a table with a bowl of milk.'
|
1389
|
-
"""
|
1390
|
-
|
1391
|
-
image_b64 = convert_to_b64(image)
|
1392
|
-
data = {
|
1393
|
-
"image": image_b64,
|
1394
|
-
"tool": "image_captioning",
|
1395
|
-
"function_name": "blip_image_caption",
|
1396
|
-
}
|
1397
|
-
|
1398
|
-
answer = send_inference_request(data, "tools")
|
1399
|
-
return answer["text"][0] # type: ignore
|
1400
|
-
|
1401
|
-
|
1402
|
-
def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
|
1403
|
-
"""'florence2_image_caption' is a tool that can caption or describe an image based
|
1404
|
-
on its contents. It returns a text describing the image.
|
1405
|
-
|
1406
|
-
Parameters:
|
1407
|
-
image (np.ndarray): The image to caption
|
1408
|
-
detail_caption (bool): If True, the caption will be as detailed as possible else
|
1409
|
-
the caption will be a brief description.
|
1410
|
-
|
1411
|
-
Returns:
|
1412
|
-
str: A string which is the caption for the given image.
|
1413
|
-
|
1414
|
-
Example
|
1415
|
-
-------
|
1416
|
-
>>> florence2_image_caption(image, False)
|
1417
|
-
'This image contains a cat sitting on a table with a bowl of milk.'
|
1418
|
-
"""
|
1419
|
-
image_b64 = convert_to_b64(image)
|
1420
|
-
task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
|
1421
|
-
data = {
|
1422
|
-
"image": image_b64,
|
1423
|
-
"task": task,
|
1424
|
-
"function_name": "florence2_image_caption",
|
1425
|
-
}
|
1426
|
-
|
1427
|
-
answer = send_inference_request(data, "florence2", v2=True)
|
1428
|
-
return answer[task] # type: ignore
|
1429
|
-
|
1430
|
-
|
1431
1211
|
def florence2_phrase_grounding(
|
1432
1212
|
prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
|
1433
1213
|
) -> List[Dict[str, Any]]:
|
@@ -1490,15 +1270,21 @@ def florence2_phrase_grounding(
|
|
1490
1270
|
# get the first frame
|
1491
1271
|
bboxes = detections[0]
|
1492
1272
|
bboxes_formatted = [
|
1493
|
-
|
1494
|
-
label
|
1495
|
-
bbox
|
1496
|
-
score
|
1497
|
-
|
1273
|
+
{
|
1274
|
+
"label": bbox["label"],
|
1275
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
1276
|
+
"score": round(bbox["score"], 2),
|
1277
|
+
}
|
1498
1278
|
for bbox in bboxes
|
1499
1279
|
]
|
1500
1280
|
|
1501
|
-
|
1281
|
+
_display_tool_trace(
|
1282
|
+
florence2_phrase_grounding.__name__,
|
1283
|
+
payload,
|
1284
|
+
detections[0],
|
1285
|
+
files,
|
1286
|
+
)
|
1287
|
+
return [bbox for bbox in bboxes_formatted]
|
1502
1288
|
|
1503
1289
|
|
1504
1290
|
def florence2_phrase_grounding_video(
|
@@ -1566,15 +1352,21 @@ def florence2_phrase_grounding_video(
|
|
1566
1352
|
bboxes_formatted = []
|
1567
1353
|
for frame_data in detections:
|
1568
1354
|
bboxes_formatted_per_frame = [
|
1569
|
-
|
1570
|
-
label
|
1571
|
-
bbox
|
1572
|
-
score
|
1573
|
-
|
1355
|
+
{
|
1356
|
+
"label": bbox["label"],
|
1357
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
1358
|
+
"score": round(bbox["score"], 2),
|
1359
|
+
}
|
1574
1360
|
for bbox in frame_data
|
1575
1361
|
]
|
1576
1362
|
bboxes_formatted.append(bboxes_formatted_per_frame)
|
1577
|
-
|
1363
|
+
_display_tool_trace(
|
1364
|
+
florence2_phrase_grounding_video.__name__,
|
1365
|
+
payload,
|
1366
|
+
detections,
|
1367
|
+
files,
|
1368
|
+
)
|
1369
|
+
return bboxes_formatted
|
1578
1370
|
|
1579
1371
|
|
1580
1372
|
def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
@@ -1621,6 +1413,12 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
|
1621
1413
|
"score": 1.0,
|
1622
1414
|
}
|
1623
1415
|
)
|
1416
|
+
_display_tool_trace(
|
1417
|
+
florence2_ocr.__name__,
|
1418
|
+
{},
|
1419
|
+
detections,
|
1420
|
+
image_b64,
|
1421
|
+
)
|
1624
1422
|
return return_data
|
1625
1423
|
|
1626
1424
|
|
@@ -1683,6 +1481,12 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
|
|
1683
1481
|
),
|
1684
1482
|
}
|
1685
1483
|
)
|
1484
|
+
_display_tool_trace(
|
1485
|
+
detr_segmentation.__name__,
|
1486
|
+
{},
|
1487
|
+
return_data,
|
1488
|
+
image_b64,
|
1489
|
+
)
|
1686
1490
|
return return_data
|
1687
1491
|
|
1688
1492
|
|
@@ -1721,74 +1525,15 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
|
|
1721
1525
|
depth_map_np.max() - depth_map_np.min()
|
1722
1526
|
)
|
1723
1527
|
depth_map_np = (255 * depth_map_np).astype(np.uint8)
|
1528
|
+
_display_tool_trace(
|
1529
|
+
depth_anything_v2.__name__,
|
1530
|
+
{},
|
1531
|
+
depth_map,
|
1532
|
+
image_b64,
|
1533
|
+
)
|
1724
1534
|
return depth_map_np
|
1725
1535
|
|
1726
1536
|
|
1727
|
-
def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
|
1728
|
-
"""'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
|
1729
|
-
to generate a soft edge image (HED) from a given RGB image. The returned image is
|
1730
|
-
monochrome and represents object boundaries as soft white edges on black background
|
1731
|
-
|
1732
|
-
Parameters:
|
1733
|
-
image (np.ndarray): The image to used to generate soft edge image
|
1734
|
-
|
1735
|
-
Returns:
|
1736
|
-
np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
|
1737
|
-
|
1738
|
-
Example
|
1739
|
-
-------
|
1740
|
-
>>> generate_soft_edge_image(image)
|
1741
|
-
array([[0, 0, 0, ..., 0, 0, 0],
|
1742
|
-
[0, 20, 24, ..., 0, 100, 103],
|
1743
|
-
...,
|
1744
|
-
[10, 11, 15, ..., 202, 202, 205],
|
1745
|
-
[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
|
1746
|
-
"""
|
1747
|
-
image_b64 = convert_to_b64(image)
|
1748
|
-
data = {
|
1749
|
-
"image": image_b64,
|
1750
|
-
"tool": "generate_hed",
|
1751
|
-
"function_name": "generate_soft_edge_image",
|
1752
|
-
}
|
1753
|
-
|
1754
|
-
answer = send_inference_request(data, "tools")
|
1755
|
-
return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
|
1756
|
-
return return_data
|
1757
|
-
|
1758
|
-
|
1759
|
-
def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
|
1760
|
-
"""'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
|
1761
|
-
image. The returned RGB image is texture mapped image of the surface normals and the
|
1762
|
-
RGB values represent the surface normals in the x, y, z directions.
|
1763
|
-
|
1764
|
-
Parameters:
|
1765
|
-
image (np.ndarray): The image to used to generate normal image
|
1766
|
-
|
1767
|
-
Returns:
|
1768
|
-
np.ndarray: A mapped normal image with RGB pixel values indicating surface
|
1769
|
-
normals in x, y, z directions.
|
1770
|
-
|
1771
|
-
Example
|
1772
|
-
-------
|
1773
|
-
>>> dpt_hybrid_midas(image)
|
1774
|
-
array([[0, 0, 0, ..., 0, 0, 0],
|
1775
|
-
[0, 20, 24, ..., 0, 100, 103],
|
1776
|
-
...,
|
1777
|
-
[10, 11, 15, ..., 202, 202, 205],
|
1778
|
-
[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
|
1779
|
-
"""
|
1780
|
-
image_b64 = convert_to_b64(image)
|
1781
|
-
data = {
|
1782
|
-
"image": image_b64,
|
1783
|
-
"tool": "generate_normal",
|
1784
|
-
"function_name": "dpt_hybrid_midas",
|
1785
|
-
}
|
1786
|
-
|
1787
|
-
answer = send_inference_request(data, "tools")
|
1788
|
-
return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
|
1789
|
-
return return_data
|
1790
|
-
|
1791
|
-
|
1792
1537
|
def generate_pose_image(image: np.ndarray) -> np.ndarray:
|
1793
1538
|
"""'generate_pose_image' is a tool that generates a open pose bone/stick image from
|
1794
1539
|
a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
|
@@ -1817,6 +1562,12 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
|
|
1817
1562
|
|
1818
1563
|
pos_img = send_inference_request(data, "pose-detector", v2=True)
|
1819
1564
|
return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
|
1565
|
+
_display_tool_trace(
|
1566
|
+
generate_pose_image.__name__,
|
1567
|
+
{},
|
1568
|
+
pos_img,
|
1569
|
+
image_b64,
|
1570
|
+
)
|
1820
1571
|
return return_data
|
1821
1572
|
|
1822
1573
|
|
@@ -1851,130 +1602,28 @@ def template_match(
|
|
1851
1602
|
template_image_b64 = convert_to_b64(template_image)
|
1852
1603
|
data = {
|
1853
1604
|
"image": image_b64,
|
1854
|
-
"template": template_image_b64,
|
1855
|
-
"tool": "template_match",
|
1856
|
-
"function_name": "template_match",
|
1857
|
-
}
|
1858
|
-
|
1859
|
-
answer = send_inference_request(data, "tools")
|
1860
|
-
return_data = []
|
1861
|
-
for i in range(len(answer["bboxes"])):
|
1862
|
-
return_data.append(
|
1863
|
-
{
|
1864
|
-
"score": round(answer["scores"][i], 2),
|
1865
|
-
"bbox": normalize_bbox(answer["bboxes"][i], image_size),
|
1866
|
-
}
|
1867
|
-
)
|
1868
|
-
return return_data
|
1869
|
-
|
1870
|
-
|
1871
|
-
def minimum_distance(
|
1872
|
-
det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
|
1873
|
-
) -> float:
|
1874
|
-
"""'minimum_distance' calculates the minimum distance between two detections which
|
1875
|
-
can include bounding boxes and or masks. This will return the closest distance
|
1876
|
-
between the objects, not the distance between the centers of the objects.
|
1877
|
-
|
1878
|
-
Parameters:
|
1879
|
-
det1 (Dict[str, Any]): The first detection of boxes or masks.
|
1880
|
-
det2 (Dict[str, Any]): The second detection of boxes or masks.
|
1881
|
-
image_size (Tuple[int, int]): The size of the image given as (height, width).
|
1882
|
-
|
1883
|
-
Returns:
|
1884
|
-
float: The closest distance between the two detections.
|
1885
|
-
|
1886
|
-
Example
|
1887
|
-
-------
|
1888
|
-
>>> closest_distance(det1, det2, image_size)
|
1889
|
-
141.42
|
1890
|
-
"""
|
1891
|
-
|
1892
|
-
if "mask" in det1 and "mask" in det2:
|
1893
|
-
return closest_mask_distance(det1["mask"], det2["mask"])
|
1894
|
-
elif "bbox" in det1 and "bbox" in det2:
|
1895
|
-
return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
|
1896
|
-
else:
|
1897
|
-
raise ValueError("Both detections must have either bbox or mask")
|
1898
|
-
|
1899
|
-
|
1900
|
-
def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
|
1901
|
-
"""'closest_mask_distance' calculates the closest distance between two masks.
|
1902
|
-
|
1903
|
-
Parameters:
|
1904
|
-
mask1 (np.ndarray): The first mask.
|
1905
|
-
mask2 (np.ndarray): The second mask.
|
1906
|
-
|
1907
|
-
Returns:
|
1908
|
-
float: The closest distance between the two masks.
|
1909
|
-
|
1910
|
-
Example
|
1911
|
-
-------
|
1912
|
-
>>> closest_mask_distance(mask1, mask2)
|
1913
|
-
0.5
|
1914
|
-
"""
|
1915
|
-
|
1916
|
-
mask1 = np.clip(mask1, 0, 1)
|
1917
|
-
mask2 = np.clip(mask2, 0, 1)
|
1918
|
-
contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
1919
|
-
contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
1920
|
-
largest_contour1 = max(contours1, key=cv2.contourArea)
|
1921
|
-
largest_contour2 = max(contours2, key=cv2.contourArea)
|
1922
|
-
polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
|
1923
|
-
polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
|
1924
|
-
min_distance = np.inf
|
1925
|
-
|
1926
|
-
small_polygon, larger_contour = (
|
1927
|
-
(polygon1, largest_contour2)
|
1928
|
-
if len(largest_contour1) < len(largest_contour2)
|
1929
|
-
else (polygon2, largest_contour1)
|
1930
|
-
)
|
1931
|
-
|
1932
|
-
# For each point in the first polygon
|
1933
|
-
for point in small_polygon:
|
1934
|
-
# Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
|
1935
|
-
|
1936
|
-
distance = (
|
1937
|
-
cv2.pointPolygonTest(
|
1938
|
-
larger_contour, (point[0, 0].item(), point[0, 1].item()), True
|
1939
|
-
)
|
1940
|
-
* -1
|
1941
|
-
)
|
1942
|
-
|
1943
|
-
# If the distance is negative, the point is inside the polygon, so the distance is 0
|
1944
|
-
if distance < 0:
|
1945
|
-
continue
|
1946
|
-
else:
|
1947
|
-
# Update the minimum distance if the point is outside the polygon
|
1948
|
-
min_distance = min(min_distance, distance)
|
1949
|
-
|
1950
|
-
return min_distance if min_distance != np.inf else 0.0
|
1951
|
-
|
1952
|
-
|
1953
|
-
def closest_box_distance(
|
1954
|
-
box1: List[float], box2: List[float], image_size: Tuple[int, int]
|
1955
|
-
) -> float:
|
1956
|
-
"""'closest_box_distance' calculates the closest distance between two bounding boxes.
|
1957
|
-
|
1958
|
-
Parameters:
|
1959
|
-
box1 (List[float]): The first bounding box.
|
1960
|
-
box2 (List[float]): The second bounding box.
|
1961
|
-
image_size (Tuple[int, int]): The size of the image given as (height, width).
|
1962
|
-
|
1963
|
-
Returns:
|
1964
|
-
float: The closest distance between the two bounding boxes.
|
1965
|
-
|
1966
|
-
Example
|
1967
|
-
-------
|
1968
|
-
>>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
|
1969
|
-
141.42
|
1970
|
-
"""
|
1971
|
-
|
1972
|
-
x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
|
1973
|
-
x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
|
1605
|
+
"template": template_image_b64,
|
1606
|
+
"tool": "template_match",
|
1607
|
+
"function_name": "template_match",
|
1608
|
+
}
|
1974
1609
|
|
1975
|
-
|
1976
|
-
|
1977
|
-
|
1610
|
+
answer = send_inference_request(data, "tools")
|
1611
|
+
return_data = []
|
1612
|
+
for i in range(len(answer["bboxes"])):
|
1613
|
+
return_data.append(
|
1614
|
+
{
|
1615
|
+
"label": "match",
|
1616
|
+
"score": round(answer["scores"][i], 2),
|
1617
|
+
"bbox": normalize_bbox(answer["bboxes"][i], image_size),
|
1618
|
+
}
|
1619
|
+
)
|
1620
|
+
_display_tool_trace(
|
1621
|
+
template_match.__name__,
|
1622
|
+
{"template_image": template_image_b64},
|
1623
|
+
return_data,
|
1624
|
+
image_b64,
|
1625
|
+
)
|
1626
|
+
return return_data
|
1978
1627
|
|
1979
1628
|
|
1980
1629
|
def flux_image_inpainting(
|
@@ -2064,6 +1713,12 @@ def flux_image_inpainting(
|
|
2064
1713
|
)
|
2065
1714
|
|
2066
1715
|
output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
|
1716
|
+
_display_tool_trace(
|
1717
|
+
flux_image_inpainting.__name__,
|
1718
|
+
payload,
|
1719
|
+
output_image,
|
1720
|
+
files,
|
1721
|
+
)
|
2067
1722
|
return output_image
|
2068
1723
|
|
2069
1724
|
|
@@ -2106,9 +1761,124 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
|
|
2106
1761
|
metadata_payload={"function_name": "siglip_classification"},
|
2107
1762
|
)
|
2108
1763
|
|
1764
|
+
_display_tool_trace(
|
1765
|
+
siglip_classification.__name__,
|
1766
|
+
payload,
|
1767
|
+
response,
|
1768
|
+
files,
|
1769
|
+
)
|
2109
1770
|
return response
|
2110
1771
|
|
2111
1772
|
|
1773
|
+
def minimum_distance(
|
1774
|
+
det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
|
1775
|
+
) -> float:
|
1776
|
+
"""'minimum_distance' calculates the minimum distance between two detections which
|
1777
|
+
can include bounding boxes and or masks. This will return the closest distance
|
1778
|
+
between the objects, not the distance between the centers of the objects.
|
1779
|
+
|
1780
|
+
Parameters:
|
1781
|
+
det1 (Dict[str, Any]): The first detection of boxes or masks.
|
1782
|
+
det2 (Dict[str, Any]): The second detection of boxes or masks.
|
1783
|
+
image_size (Tuple[int, int]): The size of the image given as (height, width).
|
1784
|
+
|
1785
|
+
Returns:
|
1786
|
+
float: The closest distance between the two detections.
|
1787
|
+
|
1788
|
+
Example
|
1789
|
+
-------
|
1790
|
+
>>> closest_distance(det1, det2, image_size)
|
1791
|
+
141.42
|
1792
|
+
"""
|
1793
|
+
|
1794
|
+
if "mask" in det1 and "mask" in det2:
|
1795
|
+
return closest_mask_distance(det1["mask"], det2["mask"])
|
1796
|
+
elif "bbox" in det1 and "bbox" in det2:
|
1797
|
+
return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
|
1798
|
+
else:
|
1799
|
+
raise ValueError("Both detections must have either bbox or mask")
|
1800
|
+
|
1801
|
+
|
1802
|
+
def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
|
1803
|
+
"""'closest_mask_distance' calculates the closest distance between two masks.
|
1804
|
+
|
1805
|
+
Parameters:
|
1806
|
+
mask1 (np.ndarray): The first mask.
|
1807
|
+
mask2 (np.ndarray): The second mask.
|
1808
|
+
|
1809
|
+
Returns:
|
1810
|
+
float: The closest distance between the two masks.
|
1811
|
+
|
1812
|
+
Example
|
1813
|
+
-------
|
1814
|
+
>>> closest_mask_distance(mask1, mask2)
|
1815
|
+
0.5
|
1816
|
+
"""
|
1817
|
+
|
1818
|
+
mask1 = np.clip(mask1, 0, 1)
|
1819
|
+
mask2 = np.clip(mask2, 0, 1)
|
1820
|
+
contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
1821
|
+
contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
1822
|
+
largest_contour1 = max(contours1, key=cv2.contourArea)
|
1823
|
+
largest_contour2 = max(contours2, key=cv2.contourArea)
|
1824
|
+
polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
|
1825
|
+
polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
|
1826
|
+
min_distance = np.inf
|
1827
|
+
|
1828
|
+
small_polygon, larger_contour = (
|
1829
|
+
(polygon1, largest_contour2)
|
1830
|
+
if len(largest_contour1) < len(largest_contour2)
|
1831
|
+
else (polygon2, largest_contour1)
|
1832
|
+
)
|
1833
|
+
|
1834
|
+
# For each point in the first polygon
|
1835
|
+
for point in small_polygon:
|
1836
|
+
# Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
|
1837
|
+
|
1838
|
+
distance = (
|
1839
|
+
cv2.pointPolygonTest(
|
1840
|
+
larger_contour, (point[0, 0].item(), point[0, 1].item()), True
|
1841
|
+
)
|
1842
|
+
* -1
|
1843
|
+
)
|
1844
|
+
|
1845
|
+
# If the distance is negative, the point is inside the polygon, so the distance is 0
|
1846
|
+
if distance < 0:
|
1847
|
+
continue
|
1848
|
+
else:
|
1849
|
+
# Update the minimum distance if the point is outside the polygon
|
1850
|
+
min_distance = min(min_distance, distance)
|
1851
|
+
|
1852
|
+
return min_distance if min_distance != np.inf else 0.0
|
1853
|
+
|
1854
|
+
|
1855
|
+def closest_box_distance(
+    box1: List[float], box2: List[float], image_size: Tuple[int, int]
+) -> float:
+    """'closest_box_distance' calculates the closest distance between two bounding boxes.
+
+    Parameters:
+        box1 (List[float]): The first bounding box.
+        box2 (List[float]): The second bounding box.
+        image_size (Tuple[int, int]): The size of the image given as (height, width).
+
+    Returns:
+        float: The closest distance between the two bounding boxes.
+
+    Example
+    -------
+    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+    141.42
+    """
+
+    x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
+    x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
+
+    horizontal_distance = np.max([0, x21 - x12, x11 - x22])
+    vertical_distance = np.max([0, y21 - y12, y11 - y22])
+    return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+
+
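To make the docstring's 141.42 concrete: for pixel boxes [100, 100, 200, 200] and [300, 300, 400, 400] the horizontal and vertical gaps are both 100 px, so the closest distance is sqrt(100^2 + 100^2) ≈ 141.42. A standalone restatement of that gap formula (kept separate so it does not depend on denormalize_bbox):

import math

def box_gap(box1, box2):
    # Boxes are (x1, y1, x2, y2) in pixels; overlapping boxes give a gap of 0.
    x11, y11, x12, y12 = box1
    x21, y21, x22, y22 = box2
    horizontal = max(0, x21 - x12, x11 - x22)
    vertical = max(0, y21 - y12, y11 - y22)
    return math.hypot(horizontal, vertical)

print(round(box_gap((100, 100, 200, 200), (300, 300, 400, 400)), 2))  # 141.42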
# Utility and visualization functions


@@ -2625,6 +2395,197 @@ def _plot_counting(
    return image


+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def od_sam2_video_tracking(
+    od_model: ODModels,
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+
+    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+
+    if chunk_length is None:
+        step = 1  # Process every frame
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length  # Process frames with the specified step size
+
+    for idx in range(0, len(frames), step):
+        if od_model == ODModels.COUNTGD:
+            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+            function_name = "countgd_object_detection"
+        elif od_model == ODModels.OWLV2:
+            results[idx] = owl_v2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "owl_v2_image"
+        elif od_model == ODModels.FLORENCE2:
+            results[idx] = florence2_sam2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "florence2_sam2_image"
+        else:
+            raise NotImplementedError(
+                f"Object detection model '{od_model}' is not implemented."
+            )
+
+    image_size = frames[0].shape[:2]
+
+    def _transform_detections(
+        input_list: List[Optional[List[Dict[str, Any]]]]
+    ) -> List[Optional[Dict[str, Any]]]:
+        output_list: List[Optional[Dict[str, Any]]] = []
+
+        for idx, frame in enumerate(input_list):
+            if frame is not None:
+                labels = [detection["label"] for detection in frame]
+                bboxes = [
+                    denormalize_bbox(detection["bbox"], image_size)
+                    for detection in frame
+                ]
+
+                output_list.append(
+                    {
+                        "labels": labels,
+                        "bboxes": bboxes,
+                    }
+                )
+            else:
+                output_list.append(None)
+
+        return output_list
+
+    output = _transform_detections(results)
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+    metadata = {"function_name": function_name}
+
+    detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return_data = []
+    for frame in detections:
+        return_frame_data = []
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return_data = add_bboxes_from_masks(return_data)
+    return nms(return_data, iou_threshold=0.95)
+
+
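The core idea in od_sam2_video_tracking is that the detector runs only on every chunk_length-th frame, and the hosted SAM2 task propagates those seeds to the frames in between (the None slots in the serialized "bboxes" payload). A minimal, self-contained sketch of that sampling pattern; the function name and the detect callback are hypothetical stand-ins, not the package's API:

from typing import Any, Callable, Dict, List, Optional

import numpy as np


def sample_keyframe_detections(
    frames: List[np.ndarray],
    detect: Callable[[np.ndarray], List[Dict[str, Any]]],  # hypothetical per-frame detector
    chunk_length: Optional[int] = 10,
) -> List[Optional[List[Dict[str, Any]]]]:
    """Run `detect` only on keyframes; the other slots stay None for the tracker to fill."""
    step = 1 if chunk_length is None else chunk_length
    if step <= 0:
        raise ValueError("chunk_length must be a positive integer or None.")
    seeds: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
    for idx in range(0, len(frames), step):
        seeds[idx] = detect(frames[idx])
    return seeds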
+def countgd_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It
+    returns, for each frame, a list of bounding boxes, label names, masks and
+    associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames between object detection
+            runs; frames in between are tracked by SAM2.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list per frame of dictionaries containing the
+        score, label, bounding box, and mask of the detected objects with normalized
+        coordinates (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of
+        the top-left and xmax and ymax are the coordinates of the bottom-right of
+        the bounding box. The mask is a binary 2D numpy array where 1 indicates the
+        object and 0 indicates the background.
+
+    Example
+    -------
+    >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+    )
+
+
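A hedged usage sketch for the new tracking tool. The video path and prompt are placeholders, frames are read with plain OpenCV, and the import assumes the tool is exported from vision_agent.tools as the __init__.py changes in this diff suggest:

import cv2

from vision_agent.tools import countgd_sam2_video_tracking  # assumed export

# Read a handful of frames (any List[np.ndarray] of RGB frames works).
cap = cv2.VideoCapture("traffic.mp4")  # placeholder path
frames = []
while len(frames) < 50:
    ok, frame = cap.read()
    if not ok:
        break
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()

# Detect every 10th frame with CountGD and let SAM2 track the masks in between.
tracks = countgd_sam2_video_tracking("car, truck", frames, chunk_length=10)
print(len(tracks), "frames;", len(tracks[0]), "objects in the first frame")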
+def owlv2_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It
+    returns, for each frame, a list of bounding boxes, label names, masks and
+    associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames between object detection
+            runs; frames in between are tracked by SAM2.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass
+            the fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list per frame of dictionaries containing the
+        score, label, bounding box, and mask of the detected objects with normalized
+        coordinates (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of
+        the top-left and xmax and ymax are the coordinates of the bottom-right of
+        the bounding box. The mask is a binary 2D numpy array where 1 indicates the
+        object and 0 indicates the background.
+
+    Example
+    -------
+    >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+    """
+
+    return od_sam2_video_tracking(
+        ODModels.OWLV2,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+
+
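owlv2_sam2_video_tracking differs from the CountGD variant only in the detector that seeds SAM2 and in accepting a fine_tune_id. A hedged sketch of calling it with a fine-tuned OWLv2 checkpoint; the prompt and UUID are placeholders and `frames` is any List[np.ndarray] prepared as in the previous sketch:

from vision_agent.tools import owlv2_sam2_video_tracking  # assumed export

tracks = owlv2_sam2_video_tracking(
    "forklift",                                        # placeholder prompt
    frames,
    chunk_length=5,                                    # re-detect every 5th frame
    fine_tune_id="00000000-0000-0000-0000-000000000000",  # placeholder fine-tune ID
)
for frame_idx, detections in enumerate(tracks[:3]):
    print(frame_idx, [d["label"] for d in detections])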
FUNCTION_TOOLS = [
    owl_v2_image,
    owl_v2_video,