vision-agent 0.2.210__py3-none-any.whl → 0.2.212__py3-none-any.whl

@@ -4,7 +4,9 @@ import logging
4
4
  import os
5
5
  import tempfile
6
6
  import urllib.request
7
+ from base64 import b64encode
7
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from enum import Enum
8
10
  from functools import lru_cache
9
11
  from importlib import resources
10
12
  from pathlib import Path
@@ -14,6 +16,7 @@ from uuid import UUID
14
16
  import cv2
15
17
  import numpy as np
16
18
  import requests
19
+ from IPython.display import display
17
20
  from PIL import Image, ImageDraw, ImageFont
18
21
  from pillow_heif import register_heif_opener # type: ignore
19
22
  from pytube import YouTube # type: ignore
@@ -21,8 +24,8 @@ from pytube import YouTube # type: ignore
21
24
  from vision_agent.clients.landing_public_api import LandingPublicAPI
22
25
  from vision_agent.lmm.lmm import AnthropicLMM, OpenAILMM
23
26
  from vision_agent.tools.tool_utils import (
27
+ ToolCallTrace,
24
28
  add_bboxes_from_masks,
25
- filter_bboxes_by_threshold,
26
29
  get_tool_descriptions,
27
30
  get_tool_documentation,
28
31
  get_tools_df,
@@ -32,7 +35,7 @@ from vision_agent.tools.tool_utils import (
32
35
  send_task_inference_request,
33
36
  single_nms,
34
37
  )
35
- from vision_agent.tools.tools_types import JobStatus, ODResponseData
38
+ from vision_agent.tools.tools_types import JobStatus
36
39
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
37
40
  from vision_agent.utils.execute import FileSerializer, MimeType
38
41
  from vision_agent.utils.image_utils import (
@@ -41,7 +44,6 @@ from vision_agent.utils.image_utils import (
41
44
  convert_to_b64,
42
45
  denormalize_bbox,
43
46
  encode_image_bytes,
44
- get_image_size,
45
47
  normalize_bbox,
46
48
  numpy_to_bytes,
47
49
  rle_decode,
@@ -88,66 +90,33 @@ def get_tool_recommender() -> Sim:
88
90
  return load_cached_sim(TOOLS_DF)
89
91
 
90
92
 
91
- def grounding_dino(
92
- prompt: str,
93
- image: np.ndarray,
94
- box_threshold: float = 0.20,
95
- iou_threshold: float = 0.20,
96
- model_size: str = "large",
97
- ) -> List[Dict[str, Any]]:
98
- """'grounding_dino' is a tool that can detect and count multiple objects given a text
99
- prompt such as category names or referring expressions. The categories in text prompt
100
- are separated by commas or periods. It returns a list of bounding boxes with
101
- normalized coordinates, label names and associated probability scores.
102
-
103
- Parameters:
104
- prompt (str): The prompt to ground to the image.
105
- image (np.ndarray): The image to ground the prompt to.
106
- box_threshold (float, optional): The threshold for the box detection. Defaults
107
- to 0.20.
108
- iou_threshold (float, optional): The threshold for the Intersection over Union
109
- (IoU). Defaults to 0.20.
110
- model_size (str, optional): The size of the model to use.
111
-
112
- Returns:
113
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
114
- bounding box of the detected objects with normalized coordinates between 0
115
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
116
- top-left and xmax and ymax are the coordinates of the bottom-right of the
117
- bounding box.
118
-
119
- Example
120
- -------
121
- >>> grounding_dino("car. dinosaur", image)
122
- [
123
- {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
124
- {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
125
- ]
126
- """
127
- image_size = image.shape[:2]
128
- image_b64 = convert_to_b64(image)
129
- if model_size not in ["large", "tiny"]:
130
- raise ValueError("model_size must be either 'large' or 'tiny'")
131
- request_data = {
132
- "prompt": prompt,
133
- "image": image_b64,
134
- "tool": (
135
- "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
136
- ),
137
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
138
- "function_name": "grounding_dino",
139
- }
140
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
141
- return_data = []
142
- for i in range(len(data["bboxes"])):
143
- return_data.append(
144
- {
145
- "score": round(data["scores"][i], 2),
146
- "label": data["labels"][i],
147
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
148
- }
149
- )
150
- return return_data
93
+ def _display_tool_trace(
94
+ function_name: str,
95
+ request: Dict[str, Any],
96
+ response: Any,
97
+ files: Union[List[Tuple[str, bytes]], str],
98
+ ) -> None:
99
+ # Sends data through IPython's display function so the front-end can show it. We use
100
+ # a function here instead of a decorator because we do not want to re-calculate data
101
+ # such as video bytes, which can be slow. Since this is calculated inside the
102
+ # function we can't capture it with a decorator without adding it as a return value
103
+ # which would change the function signature and affect the agent.
104
+ files_in_b64: List[Tuple[str, str]]
105
+ if isinstance(files, str):
106
+ files_in_b64 = [("images", files)]
107
+ else:
108
+ files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
109
+
110
+ request["function_name"] = function_name
111
+ tool_call_trace = ToolCallTrace(
112
+ endpoint_url="",
113
+ type="tool_func_call",
114
+ request=request,
115
+ response={"data": response},
116
+ error=None,
117
+ files=files_in_b64,
118
+ )
119
+ display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
151
120
 
152
121
 
153
122
  def owl_v2_image(
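The hunk above removes grounding_dino and introduces _display_tool_trace, the helper every remaining hunk calls after its inference request so the front-end receives a ToolCallTrace payload. A minimal sketch of calling it directly, assuming this module is importable as vision_agent.tools.tools and an IPython front-end is attached (the tool name, payload and image are made up):

    import numpy as np

    from vision_agent.tools.tools import _display_tool_trace

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image, illustration only
    detections = [{"label": "car", "bbox": [0.1, 0.1, 0.5, 0.5], "score": 0.9}]

    _display_tool_trace(
        "my_custom_tool",               # hypothetical tool name shown in the trace
        {"prompts": "car"},             # request payload echoed to the front-end
        detections,                     # response data, wrapped as {"data": ...}
        [("image", image.tobytes())],   # files as (name, bytes) tuples, base64-encoded by the helper
    )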
@@ -223,14 +192,21 @@ def owl_v2_image(
223
192
  # get the first frame
224
193
  bboxes = detections[0]
225
194
  bboxes_formatted = [
226
- ODResponseData(
227
- label=bbox["label"],
228
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
229
- score=round(bbox["score"], 2),
230
- )
195
+ {
196
+ "label": bbox["label"],
197
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
198
+ "score": round(bbox["score"], 2),
199
+ }
231
200
  for bbox in bboxes
232
201
  ]
233
- return [bbox.model_dump() for bbox in bboxes_formatted]
202
+
203
+ _display_tool_trace(
204
+ owl_v2_image.__name__,
205
+ payload,
206
+ detections[0],
207
+ files,
208
+ )
209
+ return bboxes_formatted
234
210
 
235
211
 
236
212
  def owl_v2_video(
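Note the pattern this hunk establishes for the rest of the file: detections are returned as plain dictionaries instead of ODResponseData models followed by .model_dump(). The shape is unchanged, for example (values are illustrative):

    detections = [
        {"label": "car", "bbox": [0.12, 0.20, 0.45, 0.61], "score": 0.87},
        {"label": "person", "bbox": [0.55, 0.10, 0.70, 0.80], "score": 0.74},
    ]
    # bbox stays normalized to [0, 1] as (xmin, ymin, xmax, ymax), so downstream
    # tools such as sam2() can consume the list without any conversion.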
@@ -309,81 +285,21 @@ def owl_v2_video(
309
285
  bboxes_formatted = []
310
286
  for frame_data in detections:
311
287
  bboxes_formatted_per_frame = [
312
- ODResponseData(
313
- label=bbox["label"],
314
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
315
- score=round(bbox["score"], 2),
316
- )
288
+ {
289
+ "label": bbox["label"],
290
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
291
+ "score": round(bbox["score"], 2),
292
+ }
317
293
  for bbox in frame_data
318
294
  ]
319
295
  bboxes_formatted.append(bboxes_formatted_per_frame)
320
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
321
-
322
-
323
- def grounding_sam(
324
- prompt: str,
325
- image: np.ndarray,
326
- box_threshold: float = 0.20,
327
- iou_threshold: float = 0.20,
328
- ) -> List[Dict[str, Any]]:
329
- """'grounding_sam' is a tool that can segment multiple objects given a text prompt
330
- such as category names or referring expressions. The categories in text prompt are
331
- separated by commas or periods. It returns a list of bounding boxes, label names,
332
- mask file names and associated probability scores.
333
-
334
- Parameters:
335
- prompt (str): The prompt to ground to the image.
336
- image (np.ndarray): The image to ground the prompt to.
337
- box_threshold (float, optional): The threshold for the box detection. Defaults
338
- to 0.20.
339
- iou_threshold (float, optional): The threshold for the Intersection over Union
340
- (IoU). Defaults to 0.20.
341
-
342
- Returns:
343
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
344
- bounding box, and mask of the detected objects with normalized coordinates
345
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
346
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
347
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
348
- the background.
349
-
350
- Example
351
- -------
352
- >>> grounding_sam("car. dinosaur", image)
353
- [
354
- {
355
- 'score': 0.99,
356
- 'label': 'dinosaur',
357
- 'bbox': [0.1, 0.11, 0.35, 0.4],
358
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
359
- [0, 0, 0, ..., 0, 0, 0],
360
- ...,
361
- [0, 0, 0, ..., 0, 0, 0],
362
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
363
- },
364
- ]
365
- """
366
- image_size = image.shape[:2]
367
- image_b64 = convert_to_b64(image)
368
- request_data = {
369
- "prompt": prompt,
370
- "image": image_b64,
371
- "tool": "visual_grounding_segment",
372
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
373
- "function_name": "grounding_sam",
374
- }
375
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
376
- return_data = []
377
- for i in range(len(data["bboxes"])):
378
- return_data.append(
379
- {
380
- "score": round(data["scores"][i], 2),
381
- "label": data["labels"][i],
382
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
383
- "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
384
- }
385
- )
386
- return return_data
296
+ _display_tool_trace(
297
+ owl_v2_video.__name__,
298
+ payload,
299
+ detections[0],
300
+ files,
301
+ )
302
+ return bboxes_formatted
387
303
 
388
304
 
389
305
  def florence2_sam2_image(
@@ -460,6 +376,13 @@ def florence2_sam2_image(
460
376
  label = detection["label"]
461
377
  bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
462
378
  return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
379
+
380
+ _display_tool_trace(
381
+ florence2_sam2_image.__name__,
382
+ payload,
383
+ detections[0],
384
+ files,
385
+ )
463
386
  return return_data
464
387
 
465
388
 
@@ -545,10 +468,36 @@ def florence2_sam2_video_tracking(
545
468
  for detection in frame:
546
469
  mask = rle_decode_array(detection["mask"])
547
470
  label = str(detection["id"]) + ": " + detection["label"]
548
- return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
471
+ return_frame_data.append(
472
+ {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
473
+ )
549
474
  return_data.append(return_frame_data)
550
475
  return_data = add_bboxes_from_masks(return_data)
551
- return nms(return_data, iou_threshold=0.95)
476
+ return_data = nms(return_data, iou_threshold=0.95)
477
+
478
+ _display_tool_trace(
479
+ florence2_sam2_video_tracking.__name__,
480
+ payload,
481
+ [
482
+ [
483
+ {
484
+ "label": e["label"],
485
+ "score": e["score"],
486
+ "bbox": denormalize_bbox(e["bbox"], frames[0].shape[:2]),
487
+ "mask": e["rle"],
488
+ }
489
+ for e in lst
490
+ ]
491
+ for lst in return_data
492
+ ],
493
+ files,
494
+ )
495
+ # We save the RLE for display purposes, re-calculating the RLE can get very expensive.
496
+ # Deleted here because we are returning the numpy masks instead
497
+ for frame in return_data:
498
+ for obj in frame:
499
+ del obj["rle"]
500
+ return return_data
552
501
 
553
502
 
554
503
  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
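The florence2_sam2_video_tracking change above keeps the raw RLE alongside each decoded mask only so the trace can be rendered without re-encoding, then deletes it before returning; callers still get per-frame lists of label/bbox/mask/score dictionaries. A hedged usage sketch, assuming the function is re-exported from vision_agent.tools (placeholder frames stand in for decoded video, and a real call needs API credentials):

    import numpy as np

    from vision_agent.tools import florence2_sam2_video_tracking

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(4)]  # placeholder frames
    tracks = florence2_sam2_video_tracking("person", frames)
    for frame_idx, frame_dets in enumerate(tracks):
        for det in frame_dets:
            pixels = int(det["mask"].sum())  # binary mask, 1 marks object pixels
            print(frame_idx, det["label"], det["score"], pixels)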
@@ -603,128 +552,134 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
603
552
  box = normalize_bbox(box, image_size)
604
553
  output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
605
554
 
606
- ocr_results = sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
607
- return ocr_results
608
-
609
-
610
- def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
611
- """'loca_zero_shot_counting' is a tool that counts the dominant foreground object given
612
- an image and no other information about the content. It returns only the count of
613
- the objects in the image.
614
-
615
- Parameters:
616
- image (np.ndarray): The image that contains lot of instances of a single object
617
-
618
- Returns:
619
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
620
- value, e.g. {count: 12} and a heat map for visualization purposes.
621
-
622
- Example
623
- -------
624
- >>> loca_zero_shot_counting(image)
625
- {'count': 83,
626
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
627
- [ 0, 0, 0, ..., 0, 0, 0],
628
- [ 0, 0, 0, ..., 0, 0, 1],
629
- ...,
630
- [ 0, 0, 0, ..., 30, 35, 41],
631
- [ 0, 0, 0, ..., 41, 47, 53],
632
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
633
- """
634
-
635
- image_b64 = convert_to_b64(image)
636
- data = {
637
- "image": image_b64,
638
- "function_name": "loca_zero_shot_counting",
639
- }
640
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
641
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
642
- return resp_data
555
+ _display_tool_trace(
556
+ ocr.__name__,
557
+ {},
558
+ data,
559
+ cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
560
+ )
561
+ return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
643
562
 
644
563
 
645
- def loca_visual_prompt_counting(
646
- image: np.ndarray, visual_prompt: Dict[str, List[float]]
564
+ def _sam2(
565
+ image: np.ndarray,
566
+ detections: List[Dict[str, Any]],
567
+ image_size: Tuple[int, ...],
568
+ image_bytes: Optional[bytes] = None,
647
569
  ) -> Dict[str, Any]:
648
- """'loca_visual_prompt_counting' is a tool that counts the dominant foreground object
649
- given an image and a visual prompt which is a bounding box describing the object.
650
- It returns only the count of the objects in the image.
651
-
652
- Parameters:
653
- image (np.ndarray): The image that contains lot of instances of a single object
654
- visual_prompt (Dict[str, List[float]]): Bounding box of the object in
655
- format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
656
-
657
- Returns:
658
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
659
- value, e.g. {count: 12} and a heat map for visualization purposes.
660
-
661
- Example
662
- -------
663
- >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
664
- {'count': 83,
665
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
666
- [ 0, 0, 0, ..., 0, 0, 0],
667
- [ 0, 0, 0, ..., 0, 0, 1],
668
- ...,
669
- [ 0, 0, 0, ..., 30, 35, 41],
670
- [ 0, 0, 0, ..., 41, 47, 53],
671
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
672
- """
673
-
674
- image_size = get_image_size(image)
675
- bbox = visual_prompt["bbox"]
676
- image_b64 = convert_to_b64(image)
570
+ if image_bytes is None:
571
+ image_bytes = numpy_to_bytes(image)
677
572
 
678
- data = {
679
- "image": image_b64,
680
- "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
681
- "function_name": "loca_visual_prompt_counting",
573
+ files = [("images", image_bytes)]
574
+ payload = {
575
+ "model": "sam2",
576
+ "bboxes": json.dumps(
577
+ [
578
+ {
579
+ "labels": [d["label"] for d in detections],
580
+ "bboxes": [
581
+ denormalize_bbox(d["bbox"], image_size) for d in detections
582
+ ],
583
+ }
584
+ ]
585
+ ),
682
586
  }
683
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
684
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
685
- return resp_data
587
+
588
+ metadata = {"function_name": "sam2"}
589
+ pred_detections = send_task_inference_request(
590
+ payload, "sam2", files=files, metadata=metadata
591
+ )
592
+ frame = pred_detections[0]
593
+ return_data = []
594
+ display_data = []
595
+ for inp_detection, detection in zip(detections, frame):
596
+ mask = rle_decode_array(detection["mask"])
597
+ label = detection["label"]
598
+ bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
599
+ return_data.append(
600
+ {
601
+ "label": label,
602
+ "bbox": bbox,
603
+ "mask": mask,
604
+ "score": inp_detection["score"],
605
+ }
606
+ )
607
+ display_data.append(
608
+ {
609
+ "label": label,
610
+ "bbox": detection["bounding_box"],
611
+ "mask": detection["mask"],
612
+ "score": inp_detection["score"],
613
+ }
614
+ )
615
+ return {"files": files, "return_data": return_data, "display_data": display_data}
686
616
 
687
617
 
688
- def countgd_object_detection(
689
- prompt: str,
618
+ def sam2(
690
619
  image: np.ndarray,
691
- box_threshold: float = 0.23,
620
+ detections: List[Dict[str, Any]],
692
621
  ) -> List[Dict[str, Any]]:
693
- """'countgd_object_detection' is a tool that can detect multiple instances of an
694
- object given a text prompt. It is particularly useful when trying to detect and
695
- count a large number of objects. You can optionally separate object names in the
696
- prompt with commas. It returns a list of bounding boxes with normalized
697
- coordinates, label names and associated confidence scores.
622
+ """'sam2' is a tool that can segment multiple objects given an input bounding box,
623
+ label and score. It returns a set of masks along with the corresponding bounding
624
+ boxes and labels.
698
625
 
699
626
  Parameters:
700
- prompt (str): The object that needs to be counted.
701
627
  image (np.ndarray): The image that contains multiple instances of the object.
702
- box_threshold (float, optional): The threshold for detection. Defaults
703
- to 0.23.
628
+ detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
629
+ label, and bounding box of the detected objects with normalized coordinates
630
+ between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
631
+ of the top-left and xmax and ymax are the coordinates of the bottom-right of
632
+ the bounding box.
704
633
 
705
634
  Returns:
706
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
707
- bounding box of the detected objects with normalized coordinates between 0
708
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
709
- top-left and xmax and ymax are the coordinates of the bottom-right of the
710
- bounding box.
635
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
636
+ bounding box, and mask of the detected objects with normalized coordinates
637
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
638
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
639
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
640
+ the background.
711
641
 
712
642
  Example
713
643
  -------
714
- >>> countgd_object_detection("flower", image)
644
+ >>> sam2(image, [
645
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
646
+ ])
715
647
  [
716
- {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
717
- {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
718
- {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
719
- {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
648
+ {
649
+ 'score': 0.49,
650
+ 'label': 'flower',
651
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
652
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
653
+ [0, 0, 0, ..., 0, 0, 0],
654
+ ...,
655
+ [0, 0, 0, ..., 0, 0, 0],
656
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
657
+ },
720
658
  ]
721
659
  """
722
660
  image_size = image.shape[:2]
723
- if image_size[0] < 1 or image_size[1] < 1:
724
- return []
661
+ ret = _sam2(image, detections, image_size)
662
+ _display_tool_trace(
663
+ sam2.__name__,
664
+ {},
665
+ ret["display_data"],
666
+ ret["files"],
667
+ )
725
668
 
726
- buffer_bytes = numpy_to_bytes(image)
727
- files = [("image", buffer_bytes)]
669
+ return ret["return_data"] # type: ignore
670
+
671
+
672
+ def _countgd_object_detection(
673
+ prompt: str,
674
+ image: np.ndarray,
675
+ box_threshold: float,
676
+ image_size: Tuple[int, ...],
677
+ image_bytes: Optional[bytes] = None,
678
+ ) -> Dict[str, Any]:
679
+ if image_bytes is None:
680
+ image_bytes = numpy_to_bytes(image)
681
+
682
+ files = [("image", image_bytes)]
728
683
  prompts = [p.strip() for p in prompt.split(", ")]
729
684
 
730
685
  def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
@@ -747,97 +702,76 @@ def countgd_object_detection(
747
702
  for future in as_completed(futures):
748
703
  bboxes.extend(future.result())
749
704
 
750
- bboxes_formatted = [
751
- ODResponseData(
752
- label=bbox["label"],
753
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
754
- score=round(bbox["score"], 2),
755
- )
705
+ return_data = [
706
+ {
707
+ "label": bbox["label"],
708
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
709
+ "score": round(bbox["score"], 2),
710
+ }
756
711
  for bbox in bboxes
757
712
  ]
758
- # TODO: remove this once we start to use the confidence on countgd
759
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
760
- return_data = [bbox.model_dump() for bbox in filtered_bboxes]
761
- return single_nms(return_data, iou_threshold=0.80)
762
713
 
714
+ return_data = single_nms(return_data, iou_threshold=0.80)
715
+ display_data = [
716
+ {
717
+ "label": e["label"],
718
+ "score": e["score"],
719
+ "bbox": denormalize_bbox(e["bbox"], image_size),
720
+ }
721
+ for e in return_data
722
+ ]
723
+ return {"files": files, "return_data": return_data, "display_data": display_data}
763
724
 
764
- def sam2(
725
+
726
+ def countgd_object_detection(
727
+ prompt: str,
765
728
  image: np.ndarray,
766
- detections: List[Dict[str, Any]],
729
+ box_threshold: float = 0.23,
767
730
  ) -> List[Dict[str, Any]]:
768
- """'sam2' is a tool that can segment multiple objects given an input bounding box,
769
- label and score. It returns a set of masks along with the corresponding bounding
770
- boxes and labels.
731
+ """'countgd_object_detection' is a tool that can detect multiple instances of an
732
+ object given a text prompt. It is particularly useful when trying to detect and
733
+ count a large number of objects. You can optionally separate object names in the
734
+ prompt with commas. It returns a list of bounding boxes with normalized
735
+ coordinates, label names and associated confidence scores.
771
736
 
772
737
  Parameters:
738
+ prompt (str): The object that needs to be counted.
773
739
  image (np.ndarray): The image that contains multiple instances of the object.
774
- detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
775
- label, and bounding box of the detected objects with normalized coordinates
776
- between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
777
- of the top-left and xmax and ymax are the coordinates of the bottom-right of
778
- the bounding box.
740
+ box_threshold (float, optional): The threshold for detection. Defaults
741
+ to 0.23.
779
742
 
780
743
  Returns:
781
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
782
- bounding box, and mask of the detected objects with normalized coordinates
783
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
784
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
785
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
786
- the background.
744
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
745
+ bounding box of the detected objects with normalized coordinates between 0
746
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
747
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
748
+ bounding box.
787
749
 
788
750
  Example
789
751
  -------
790
- >>> sam2(image, [
791
- {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
792
- ])
752
+ >>> countgd_object_detection("flower", image)
793
753
  [
794
- {
795
- 'score': 0.49,
796
- 'label': 'flower',
797
- 'bbox': [0.1, 0.11, 0.35, 0.4],
798
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
799
- [0, 0, 0, ..., 0, 0, 0],
800
- ...,
801
- [0, 0, 0, ..., 0, 0, 0],
802
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
803
- },
754
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
755
+ {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
756
+ {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
757
+ {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
804
758
  ]
805
759
  """
806
760
  image_size = image.shape[:2]
761
+ if image_size[0] < 1 or image_size[1] < 1:
762
+ return []
807
763
 
808
- files = [("images", numpy_to_bytes(image))]
809
- payload = {
810
- "model": "sam2",
811
- "bboxes": json.dumps(
812
- [
813
- {
814
- "labels": [d["label"] for d in detections],
815
- "bboxes": [
816
- denormalize_bbox(d["bbox"], image_size) for d in detections
817
- ],
818
- }
819
- ]
820
- ),
821
- }
822
- metadata = {"function_name": "sam2"}
823
- pred_detections = send_task_inference_request(
824
- payload, "sam2", files=files, metadata=metadata
764
+ ret = _countgd_object_detection(prompt, image, box_threshold, image_size)
765
+ _display_tool_trace(
766
+ countgd_object_detection.__name__,
767
+ {
768
+ "prompts": prompt,
769
+ "confidence": box_threshold,
770
+ },
771
+ ret["display_data"],
772
+ ret["files"],
825
773
  )
826
- frame = pred_detections[0]
827
- return_data = []
828
- for inp_detection, detection in zip(detections, frame):
829
- mask = rle_decode_array(detection["mask"])
830
- label = detection["label"]
831
- bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
832
- return_data.append(
833
- {
834
- "label": label,
835
- "bbox": bbox,
836
- "mask": mask,
837
- "score": inp_detection["score"],
838
- }
839
- )
840
- return return_data
774
+ return ret["return_data"] # type: ignore
841
775
 
842
776
 
843
777
  def countgd_sam2_object_detection(
@@ -881,9 +815,23 @@ def countgd_sam2_object_detection(
881
815
  },
882
816
  ]
883
817
  """
884
- detections = countgd_object_detection(prompt, image, box_threshold)
885
- detections_with_masks = sam2(image, detections)
886
- return detections_with_masks
818
+
819
+ od_ret = _countgd_object_detection(prompt, image, box_threshold, image.shape[:2])
820
+ seg_ret = _sam2(
821
+ image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
822
+ )
823
+
824
+ _display_tool_trace(
825
+ countgd_sam2_object_detection.__name__,
826
+ {
827
+ "prompts": prompt,
828
+ "confidence": box_threshold,
829
+ },
830
+ seg_ret["display_data"],
831
+ seg_ret["files"],
832
+ )
833
+
834
+ return seg_ret["return_data"] # type: ignore
887
835
 
888
836
 
889
837
  def countgd_example_based_counting(
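With the refactor above, countgd_sam2_object_detection composes the private _countgd_object_detection and _sam2 helpers so the image bytes are encoded once and a single trace is emitted, instead of chaining the public countgd_object_detection and sam2 tools. The public call is unchanged; a sketch, assuming the tool is re-exported from vision_agent.tools and that the image path and API credentials exist:

    import cv2

    from vision_agent.tools import countgd_sam2_object_detection

    image = cv2.cvtColor(cv2.imread("flowers.jpg"), cv2.COLOR_BGR2RGB)  # hypothetical image path
    results = countgd_sam2_object_detection("flower", image, box_threshold=0.23)
    for det in results:
        print(det["label"], det["score"], det["bbox"], det["mask"].shape)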
@@ -941,76 +889,28 @@ def countgd_example_based_counting(
941
889
  # get the first frame
942
890
  bboxes_per_frame = detections[0]
943
891
  bboxes_formatted = [
944
- ODResponseData(
945
- label=bbox["label"],
946
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
947
- score=round(bbox["score"], 2),
948
- )
892
+ {
893
+ "label": bbox["label"],
894
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
895
+ "score": round(bbox["score"], 2),
896
+ }
949
897
  for bbox in bboxes_per_frame
950
898
  ]
951
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
952
- return [bbox.model_dump() for bbox in filtered_bboxes]
953
-
954
-
955
- def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
956
- """'florence2_roberta_vqa' is a tool that takes an image and analyzes
957
- its contents, generates detailed captions and then tries to answer the given
958
- question using the generated context. It returns text as an answer to the question.
959
-
960
- Parameters:
961
- prompt (str): The question about the image
962
- image (np.ndarray): The reference image used for the question
963
-
964
- Returns:
965
- str: A string which is the answer to the given prompt.
966
-
967
- Example
968
- -------
969
- >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
970
- 'white tiger'
971
- """
972
-
973
- image_b64 = convert_to_b64(image)
974
- data = {
975
- "image": image_b64,
976
- "question": prompt,
977
- "function_name": "florence2_roberta_vqa",
978
- }
979
-
980
- answer = send_inference_request(data, "florence2-qa", v2=True)
981
- return answer # type: ignore
982
-
983
-
984
- def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
985
- """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
986
- including regular images or images of documents or presentations. It returns text
987
- as an answer to the question.
988
-
989
- Parameters:
990
- prompt (str): The question about the image
991
- image (np.ndarray): The reference image used for the question
992
-
993
- Returns:
994
- str: A string which is the answer to the given prompt.
995
-
996
- Example
997
- -------
998
- >>> ixc25_image_vqa('What is the cat doing?', image)
999
- 'drinking milk'
1000
- """
1001
- if image.shape[0] < 1 or image.shape[1] < 1:
1002
- raise ValueError(f"Image is empty, image shape: {image.shape}")
1003
-
1004
- buffer_bytes = numpy_to_bytes(image)
1005
- files = [("image", buffer_bytes)]
1006
- payload = {
1007
- "prompt": prompt,
1008
- "function_name": "ixc25_image_vqa",
1009
- }
1010
- data: Dict[str, Any] = send_inference_request(
1011
- payload, "internlm-xcomposer2", files=files, v2=True
899
+ _display_tool_trace(
900
+ countgd_example_based_counting.__name__,
901
+ payload,
902
+ [
903
+ {
904
+ "label": e["label"],
905
+ "score": e["score"],
906
+ "bbox": denormalize_bbox(e["bbox"], image_size),
907
+ }
908
+ for e in bboxes_formatted
909
+ ],
910
+ files,
1012
911
  )
1013
- return cast(str, data["answer"])
912
+
913
+ return bboxes_formatted
1014
914
 
1015
915
 
1016
916
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
@@ -1047,61 +947,13 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
1047
947
  data: Dict[str, Any] = send_inference_request(
1048
948
  payload, "image-to-text", files=files, v2=True
1049
949
  )
1050
- return cast(str, data)
1051
-
1052
-
1053
- def claude35_text_extraction(image: np.ndarray) -> str:
1054
- """'claude35_text_extraction' is a tool that can extract text from an image. It
1055
- returns the extracted text as a string and can be used as an alternative to OCR if
1056
- you do not need to know the exact bounding box of the text.
1057
-
1058
- Parameters:
1059
- image (np.ndarray): The image to extract text from.
1060
-
1061
- Returns:
1062
- str: The extracted text from the image.
1063
- """
1064
-
1065
- lmm = AnthropicLMM()
1066
- buffer = io.BytesIO()
1067
- Image.fromarray(image).save(buffer, format="PNG")
1068
- image_bytes = buffer.getvalue()
1069
- image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1070
- text = lmm.generate(
1071
- "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
1072
- [image_b64],
1073
- )
1074
- return cast(str, text)
1075
-
1076
-
1077
- def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1078
- """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
1079
- including regular videos or videos of documents or presentations. It returns text
1080
- as an answer to the question.
1081
-
1082
- Parameters:
1083
- prompt (str): The question about the video
1084
- frames (List[np.ndarray]): The reference frames used for the question
1085
-
1086
- Returns:
1087
- str: A string which is the answer to the given prompt.
1088
-
1089
- Example
1090
- -------
1091
- >>> ixc25_video_vqa('Which football player made the goal?', frames)
1092
- 'Lionel Messi'
1093
- """
1094
-
1095
- buffer_bytes = frames_to_bytes(frames)
1096
- files = [("video", buffer_bytes)]
1097
- payload = {
1098
- "prompt": prompt,
1099
- "function_name": "ixc25_video_vqa",
1100
- }
1101
- data: Dict[str, Any] = send_inference_request(
1102
- payload, "internlm-xcomposer2", files=files, v2=True
950
+ _display_tool_trace(
951
+ qwen2_vl_images_vqa.__name__,
952
+ payload,
953
+ cast(str, data),
954
+ files,
1103
955
  )
1104
- return cast(str, data["answer"])
956
+ return cast(str, data)
1105
957
 
1106
958
 
1107
959
  def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
@@ -1135,9 +987,39 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1135
987
  data: Dict[str, Any] = send_inference_request(
1136
988
  payload, "image-to-text", files=files, v2=True
1137
989
  )
990
+ _display_tool_trace(
991
+ qwen2_vl_video_vqa.__name__,
992
+ payload,
993
+ cast(str, data),
994
+ files,
995
+ )
1138
996
  return cast(str, data)
1139
997
 
1140
998
 
999
+ def claude35_text_extraction(image: np.ndarray) -> str:
1000
+ """'claude35_text_extraction' is a tool that can extract text from an image. It
1001
+ returns the extracted text as a string and can be used as an alternative to OCR if
1002
+ you do not need to know the exact bounding box of the text.
1003
+
1004
+ Parameters:
1005
+ image (np.ndarray): The image to extract text from.
1006
+
1007
+ Returns:
1008
+ str: The extracted text from the image.
1009
+ """
1010
+
1011
+ lmm = AnthropicLMM()
1012
+ buffer = io.BytesIO()
1013
+ Image.fromarray(image).save(buffer, format="PNG")
1014
+ image_bytes = buffer.getvalue()
1015
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1016
+ text = lmm.generate(
1017
+ "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
1018
+ [image_b64],
1019
+ )
1020
+ return cast(str, text)
1021
+
1022
+
1141
1023
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
1142
1024
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
1143
1025
  including regular images or images of documents or presentations. It returns text
@@ -1187,48 +1069,18 @@ def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1187
1069
 
1188
1070
  if len(frames) > 10:
1189
1071
  step = len(frames) / 10
1190
- frames = [frames[int(i * step)] for i in range(10)]
1191
-
1192
- frames_b64 = []
1193
- for frame in frames:
1194
- buffer = io.BytesIO()
1195
- Image.fromarray(frame).save(buffer, format="PNG")
1196
- image_bytes = buffer.getvalue()
1197
- image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1198
- frames_b64.append(image_b64)
1199
-
1200
- resp = lmm.generate(prompt, frames_b64)
1201
- return cast(str, resp)
1202
-
1203
-
1204
- def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
1205
- """'git_vqa_v2' is a tool that can answer questions about the visual
1206
- contents of an image given a question and an image. It returns an answer to the
1207
- question
1208
-
1209
- Parameters:
1210
- prompt (str): The question about the image
1211
- image (np.ndarray): The reference image used for the question
1212
-
1213
- Returns:
1214
- str: A string which is the answer to the given prompt.
1215
-
1216
- Example
1217
- -------
1218
- >>> git_vqa_v2('What is the cat doing ?', image)
1219
- 'drinking milk'
1220
- """
1072
+ frames = [frames[int(i * step)] for i in range(10)]
1221
1073
 
1222
- image_b64 = convert_to_b64(image)
1223
- data = {
1224
- "image": image_b64,
1225
- "prompt": prompt,
1226
- "tool": "image_question_answering",
1227
- "function_name": "git_vqa_v2",
1228
- }
1074
+ frames_b64 = []
1075
+ for frame in frames:
1076
+ buffer = io.BytesIO()
1077
+ Image.fromarray(frame).save(buffer, format="PNG")
1078
+ image_bytes = buffer.getvalue()
1079
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1080
+ frames_b64.append(image_b64)
1229
1081
 
1230
- answer = send_inference_request(data, "tools")
1231
- return answer["text"][0] # type: ignore
1082
+ resp = lmm.generate(prompt, frames_b64)
1083
+ return cast(str, resp)
1232
1084
 
1233
1085
 
1234
1086
  def video_temporal_localization(
@@ -1274,43 +1126,15 @@ def video_temporal_localization(
1274
1126
  data = send_inference_request(
1275
1127
  payload, "video-temporal-localization", files=files, v2=True
1276
1128
  )
1129
+ _display_tool_trace(
1130
+ video_temporal_localization.__name__,
1131
+ payload,
1132
+ data,
1133
+ files,
1134
+ )
1277
1135
  return [cast(float, value) for value in data]
1278
1136
 
1279
1137
 
1280
- def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
1281
- """'clip' is a tool that can classify an image or a cropped detection given a list
1282
- of input classes or tags. It returns the same list of the input classes along with
1283
- their probability scores based on image content.
1284
-
1285
- Parameters:
1286
- image (np.ndarray): The image to classify or tag
1287
- classes (List[str]): The list of classes or tags that is associated with the image
1288
-
1289
- Returns:
1290
- Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
1291
- contains a list of given labels and other a list of scores.
1292
-
1293
- Example
1294
- -------
1295
- >>> clip(image, ['dog', 'cat', 'bird'])
1296
- {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
1297
- """
1298
-
1299
- if image.shape[0] < 1 or image.shape[1] < 1:
1300
- return {"labels": [], "scores": []}
1301
-
1302
- image_b64 = convert_to_b64(image)
1303
- data = {
1304
- "prompt": ",".join(classes),
1305
- "image": image_b64,
1306
- "tool": "closed_set_image_classification",
1307
- "function_name": "clip",
1308
- }
1309
- resp_data: dict[str, Any] = send_inference_request(data, "tools")
1310
- resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
1311
- return resp_data
1312
-
1313
-
1314
1138
  def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1315
1139
  """'vit_image_classification' is a tool that can classify an image. It returns a
1316
1140
  list of classes and their probability scores based on image content.
@@ -1338,6 +1162,12 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1338
1162
  }
1339
1163
  resp_data: dict[str, Any] = send_inference_request(data, "tools")
1340
1164
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
1165
+ _display_tool_trace(
1166
+ vit_image_classification.__name__,
1167
+ data,
1168
+ resp_data,
1169
+ image_b64,
1170
+ )
1341
1171
  return resp_data
1342
1172
 
1343
1173
 
@@ -1369,65 +1199,15 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
1369
1199
  data, "nsfw-classification", v2=True
1370
1200
  )
1371
1201
  resp_data["score"] = round(resp_data["score"], 4)
1202
+ _display_tool_trace(
1203
+ vit_nsfw_classification.__name__,
1204
+ data,
1205
+ resp_data,
1206
+ image_b64,
1207
+ )
1372
1208
  return resp_data
1373
1209
 
1374
1210
 
1375
- def blip_image_caption(image: np.ndarray) -> str:
1376
- """'blip_image_caption' is a tool that can caption an image based on its contents. It
1377
- returns a text describing the image.
1378
-
1379
- Parameters:
1380
- image (np.ndarray): The image to caption
1381
-
1382
- Returns:
1383
- str: A string which is the caption for the given image.
1384
-
1385
- Example
1386
- -------
1387
- >>> blip_image_caption(image)
1388
- 'This image contains a cat sitting on a table with a bowl of milk.'
1389
- """
1390
-
1391
- image_b64 = convert_to_b64(image)
1392
- data = {
1393
- "image": image_b64,
1394
- "tool": "image_captioning",
1395
- "function_name": "blip_image_caption",
1396
- }
1397
-
1398
- answer = send_inference_request(data, "tools")
1399
- return answer["text"][0] # type: ignore
1400
-
1401
-
1402
- def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
1403
- """'florence2_image_caption' is a tool that can caption or describe an image based
1404
- on its contents. It returns a text describing the image.
1405
-
1406
- Parameters:
1407
- image (np.ndarray): The image to caption
1408
- detail_caption (bool): If True, the caption will be as detailed as possible else
1409
- the caption will be a brief description.
1410
-
1411
- Returns:
1412
- str: A string which is the caption for the given image.
1413
-
1414
- Example
1415
- -------
1416
- >>> florence2_image_caption(image, False)
1417
- 'This image contains a cat sitting on a table with a bowl of milk.'
1418
- """
1419
- image_b64 = convert_to_b64(image)
1420
- task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
1421
- data = {
1422
- "image": image_b64,
1423
- "task": task,
1424
- "function_name": "florence2_image_caption",
1425
- }
1426
-
1427
- answer = send_inference_request(data, "florence2", v2=True)
1428
- return answer[task] # type: ignore
1429
-
1430
-
1431
1211
  def florence2_phrase_grounding(
1432
1212
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
1433
1213
  ) -> List[Dict[str, Any]]:
@@ -1490,15 +1270,21 @@ def florence2_phrase_grounding(
1490
1270
  # get the first frame
1491
1271
  bboxes = detections[0]
1492
1272
  bboxes_formatted = [
1493
- ODResponseData(
1494
- label=bbox["label"],
1495
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
1496
- score=round(bbox["score"], 2),
1497
- )
1273
+ {
1274
+ "label": bbox["label"],
1275
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
1276
+ "score": round(bbox["score"], 2),
1277
+ }
1498
1278
  for bbox in bboxes
1499
1279
  ]
1500
1280
 
1501
- return [bbox.model_dump() for bbox in bboxes_formatted]
1281
+ _display_tool_trace(
1282
+ florence2_phrase_grounding.__name__,
1283
+ payload,
1284
+ detections[0],
1285
+ files,
1286
+ )
1287
+ return bboxes_formatted
1502
1288
 
1503
1289
 
1504
1290
  def florence2_phrase_grounding_video(
@@ -1566,15 +1352,21 @@ def florence2_phrase_grounding_video(
1566
1352
  bboxes_formatted = []
1567
1353
  for frame_data in detections:
1568
1354
  bboxes_formatted_per_frame = [
1569
- ODResponseData(
1570
- label=bbox["label"],
1571
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
1572
- score=round(bbox["score"], 2),
1573
- )
1355
+ {
1356
+ "label": bbox["label"],
1357
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
1358
+ "score": round(bbox["score"], 2),
1359
+ }
1574
1360
  for bbox in frame_data
1575
1361
  ]
1576
1362
  bboxes_formatted.append(bboxes_formatted_per_frame)
1577
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
1363
+ _display_tool_trace(
1364
+ florence2_phrase_grounding_video.__name__,
1365
+ payload,
1366
+ detections,
1367
+ files,
1368
+ )
1369
+ return bboxes_formatted
1578
1370
 
1579
1371
 
1580
1372
  def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -1621,6 +1413,12 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
1621
1413
  "score": 1.0,
1622
1414
  }
1623
1415
  )
1416
+ _display_tool_trace(
1417
+ florence2_ocr.__name__,
1418
+ {},
1419
+ detections,
1420
+ image_b64,
1421
+ )
1624
1422
  return return_data
1625
1423
 
1626
1424
 
@@ -1683,6 +1481,12 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
1683
1481
  ),
1684
1482
  }
1685
1483
  )
1484
+ _display_tool_trace(
1485
+ detr_segmentation.__name__,
1486
+ {},
1487
+ return_data,
1488
+ image_b64,
1489
+ )
1686
1490
  return return_data
1687
1491
 
1688
1492
 
@@ -1721,74 +1525,15 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
1721
1525
  depth_map_np.max() - depth_map_np.min()
1722
1526
  )
1723
1527
  depth_map_np = (255 * depth_map_np).astype(np.uint8)
1528
+ _display_tool_trace(
1529
+ depth_anything_v2.__name__,
1530
+ {},
1531
+ depth_map,
1532
+ image_b64,
1533
+ )
1724
1534
  return depth_map_np
1725
1535
 
1726
1536
 
1727
- def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
1728
- """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
1729
- to generate a soft edge image (HED) from a given RGB image. The returned image is
1730
- monochrome and represents object boundaries as soft white edges on black background
1731
-
1732
- Parameters:
1733
- image (np.ndarray): The image to used to generate soft edge image
1734
-
1735
- Returns:
1736
- np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
1737
-
1738
- Example
1739
- -------
1740
- >>> generate_soft_edge_image(image)
1741
- array([[0, 0, 0, ..., 0, 0, 0],
1742
- [0, 20, 24, ..., 0, 100, 103],
1743
- ...,
1744
- [10, 11, 15, ..., 202, 202, 205],
1745
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1746
- """
1747
- image_b64 = convert_to_b64(image)
1748
- data = {
1749
- "image": image_b64,
1750
- "tool": "generate_hed",
1751
- "function_name": "generate_soft_edge_image",
1752
- }
1753
-
1754
- answer = send_inference_request(data, "tools")
1755
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
1756
- return return_data
1757
-
1758
-
1759
- def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
1760
- """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
1761
- image. The returned RGB image is texture mapped image of the surface normals and the
1762
- RGB values represent the surface normals in the x, y, z directions.
1763
-
1764
- Parameters:
1765
- image (np.ndarray): The image to used to generate normal image
1766
-
1767
- Returns:
1768
- np.ndarray: A mapped normal image with RGB pixel values indicating surface
1769
- normals in x, y, z directions.
1770
-
1771
- Example
1772
- -------
1773
- >>> dpt_hybrid_midas(image)
1774
- array([[0, 0, 0, ..., 0, 0, 0],
1775
- [0, 20, 24, ..., 0, 100, 103],
1776
- ...,
1777
- [10, 11, 15, ..., 202, 202, 205],
1778
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1779
- """
1780
- image_b64 = convert_to_b64(image)
1781
- data = {
1782
- "image": image_b64,
1783
- "tool": "generate_normal",
1784
- "function_name": "dpt_hybrid_midas",
1785
- }
1786
-
1787
- answer = send_inference_request(data, "tools")
1788
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
1789
- return return_data
1790
-
1791
-
1792
1537
  def generate_pose_image(image: np.ndarray) -> np.ndarray:
1793
1538
  """'generate_pose_image' is a tool that generates a open pose bone/stick image from
1794
1539
  a given RGB image. The returned bone image is RGB with the pose and keypoints colored
@@ -1817,6 +1562,12 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
1817
1562
 
1818
1563
  pos_img = send_inference_request(data, "pose-detector", v2=True)
1819
1564
  return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
1565
+ _display_tool_trace(
1566
+ generate_pose_image.__name__,
1567
+ {},
1568
+ pos_img,
1569
+ image_b64,
1570
+ )
1820
1571
  return return_data
1821
1572
 
1822
1573
 
@@ -1851,130 +1602,28 @@ def template_match(
1851
1602
  template_image_b64 = convert_to_b64(template_image)
1852
1603
  data = {
1853
1604
  "image": image_b64,
1854
- "template": template_image_b64,
1855
- "tool": "template_match",
1856
- "function_name": "template_match",
1857
- }
1858
-
1859
- answer = send_inference_request(data, "tools")
1860
- return_data = []
1861
- for i in range(len(answer["bboxes"])):
1862
- return_data.append(
1863
- {
1864
- "score": round(answer["scores"][i], 2),
1865
- "bbox": normalize_bbox(answer["bboxes"][i], image_size),
1866
- }
1867
- )
1868
- return return_data
1869
-
1870
-
1871
- def minimum_distance(
1872
- det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
1873
- ) -> float:
1874
- """'minimum_distance' calculates the minimum distance between two detections which
1875
- can include bounding boxes and or masks. This will return the closest distance
1876
- between the objects, not the distance between the centers of the objects.
1877
-
1878
- Parameters:
1879
- det1 (Dict[str, Any]): The first detection of boxes or masks.
1880
- det2 (Dict[str, Any]): The second detection of boxes or masks.
1881
- image_size (Tuple[int, int]): The size of the image given as (height, width).
1882
-
1883
- Returns:
1884
- float: The closest distance between the two detections.
1885
-
1886
- Example
1887
- -------
1888
- >>> closest_distance(det1, det2, image_size)
1889
- 141.42
1890
- """
1891
-
1892
- if "mask" in det1 and "mask" in det2:
1893
- return closest_mask_distance(det1["mask"], det2["mask"])
1894
- elif "bbox" in det1 and "bbox" in det2:
1895
- return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
1896
- else:
1897
- raise ValueError("Both detections must have either bbox or mask")
1898
-
1899
-
1900
- def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
1901
- """'closest_mask_distance' calculates the closest distance between two masks.
1902
-
1903
- Parameters:
1904
- mask1 (np.ndarray): The first mask.
1905
- mask2 (np.ndarray): The second mask.
1906
-
1907
- Returns:
1908
- float: The closest distance between the two masks.
1909
-
1910
- Example
1911
- -------
1912
- >>> closest_mask_distance(mask1, mask2)
1913
- 0.5
1914
- """
1915
-
1916
- mask1 = np.clip(mask1, 0, 1)
1917
- mask2 = np.clip(mask2, 0, 1)
1918
- contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1919
- contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1920
- largest_contour1 = max(contours1, key=cv2.contourArea)
1921
- largest_contour2 = max(contours2, key=cv2.contourArea)
1922
- polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
1923
- polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
1924
- min_distance = np.inf
1925
-
1926
- small_polygon, larger_contour = (
1927
- (polygon1, largest_contour2)
1928
- if len(largest_contour1) < len(largest_contour2)
1929
- else (polygon2, largest_contour1)
1930
- )
1931
-
1932
- # For each point in the first polygon
1933
- for point in small_polygon:
1934
- # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
1935
-
1936
- distance = (
1937
- cv2.pointPolygonTest(
1938
- larger_contour, (point[0, 0].item(), point[0, 1].item()), True
1939
- )
1940
- * -1
1941
- )
1942
-
1943
- # If the distance is negative, the point is inside the polygon, so the distance is 0
1944
- if distance < 0:
1945
- continue
1946
- else:
1947
- # Update the minimum distance if the point is outside the polygon
1948
- min_distance = min(min_distance, distance)
1949
-
1950
- return min_distance if min_distance != np.inf else 0.0
1951
-
1952
-
1953
- def closest_box_distance(
1954
- box1: List[float], box2: List[float], image_size: Tuple[int, int]
1955
- ) -> float:
1956
- """'closest_box_distance' calculates the closest distance between two bounding boxes.
1957
-
1958
- Parameters:
1959
- box1 (List[float]): The first bounding box.
1960
- box2 (List[float]): The second bounding box.
1961
- image_size (Tuple[int, int]): The size of the image given as (height, width).
1962
-
1963
- Returns:
1964
- float: The closest distance between the two bounding boxes.
1965
-
1966
- Example
1967
- -------
1968
- >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
1969
- 141.42
1970
- """
1971
-
1972
- x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
1973
- x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
1605
+ "template": template_image_b64,
1606
+ "tool": "template_match",
1607
+ "function_name": "template_match",
1608
+ }
1974
1609
 
1975
- horizontal_distance = np.max([0, x21 - x12, x11 - x22])
1976
- vertical_distance = np.max([0, y21 - y12, y11 - y22])
1977
- return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1610
+ answer = send_inference_request(data, "tools")
1611
+ return_data = []
1612
+ for i in range(len(answer["bboxes"])):
1613
+ return_data.append(
1614
+ {
1615
+ "label": "match",
1616
+ "score": round(answer["scores"][i], 2),
1617
+ "bbox": normalize_bbox(answer["bboxes"][i], image_size),
1618
+ }
1619
+ )
1620
+ _display_tool_trace(
1621
+ template_match.__name__,
1622
+ {"template_image": template_image_b64},
1623
+ return_data,
1624
+ image_b64,
1625
+ )
1626
+ return return_data
1978
1627
 
1979
1628
 
1980
1629
  def flux_image_inpainting(
@@ -2064,6 +1713,12 @@ def flux_image_inpainting(
2064
1713
  )
2065
1714
 
2066
1715
  output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
1716
+ _display_tool_trace(
1717
+ flux_image_inpainting.__name__,
1718
+ payload,
1719
+ output_image,
1720
+ files,
1721
+ )
2067
1722
  return output_image
2068
1723
 
2069
1724
 
@@ -2106,9 +1761,124 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
2106
1761
  metadata_payload={"function_name": "siglip_classification"},
2107
1762
  )
2108
1763
 
1764
+ _display_tool_trace(
1765
+ siglip_classification.__name__,
1766
+ payload,
1767
+ response,
1768
+ files,
1769
+ )
2109
1770
  return response
2110
1771
 
2111
1772
 
1773
+ def minimum_distance(
1774
+ det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
1775
+ ) -> float:
1776
+ """'minimum_distance' calculates the minimum distance between two detections which
1777
+ can include bounding boxes and or masks. This will return the closest distance
1778
+ between the objects, not the distance between the centers of the objects.
1779
+
1780
+ Parameters:
1781
+ det1 (Dict[str, Any]): The first detection of boxes or masks.
1782
+ det2 (Dict[str, Any]): The second detection of boxes or masks.
1783
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
1784
+
1785
+ Returns:
1786
+ float: The closest distance between the two detections.
1787
+
1788
+ Example
1789
+ -------
1790
+ >>> closest_distance(det1, det2, image_size)
1791
+ 141.42
1792
+ """
1793
+
1794
+ if "mask" in det1 and "mask" in det2:
1795
+ return closest_mask_distance(det1["mask"], det2["mask"])
1796
+ elif "bbox" in det1 and "bbox" in det2:
1797
+ return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
1798
+ else:
1799
+ raise ValueError("Both detections must have either bbox or mask")
1800
+
1801
+
1802
+ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
1803
+ """'closest_mask_distance' calculates the closest distance between two masks.
1804
+
1805
+ Parameters:
1806
+ mask1 (np.ndarray): The first mask.
1807
+ mask2 (np.ndarray): The second mask.
1808
+
1809
+ Returns:
1810
+ float: The closest distance between the two masks.
1811
+
1812
+ Example
1813
+ -------
1814
+ >>> closest_mask_distance(mask1, mask2)
1815
+ 0.5
1816
+ """
1817
+
1818
+ mask1 = np.clip(mask1, 0, 1)
1819
+ mask2 = np.clip(mask2, 0, 1)
1820
+ contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1821
+ contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1822
+ largest_contour1 = max(contours1, key=cv2.contourArea)
1823
+ largest_contour2 = max(contours2, key=cv2.contourArea)
1824
+ polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
1825
+ polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
1826
+ min_distance = np.inf
1827
+
1828
+ small_polygon, larger_contour = (
1829
+ (polygon1, largest_contour2)
1830
+ if len(largest_contour1) < len(largest_contour2)
1831
+ else (polygon2, largest_contour1)
1832
+ )
1833
+
1834
+ # For each point in the first polygon
1835
+ for point in small_polygon:
1836
+ # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
1837
+
1838
+ distance = (
1839
+ cv2.pointPolygonTest(
1840
+ larger_contour, (point[0, 0].item(), point[0, 1].item()), True
1841
+ )
1842
+ * -1
1843
+ )
1844
+
1845
+ # If the distance is negative, the point is inside the polygon, so the distance is 0
1846
+ if distance < 0:
1847
+ continue
1848
+ else:
1849
+ # Update the minimum distance if the point is outside the polygon
1850
+ min_distance = min(min_distance, distance)
1851
+
1852
+ return min_distance if min_distance != np.inf else 0.0
1853
+
1854
+
1855
+ def closest_box_distance(
1856
+ box1: List[float], box2: List[float], image_size: Tuple[int, int]
1857
+ ) -> float:
1858
+ """'closest_box_distance' calculates the closest distance between two bounding boxes.
1859
+
1860
+ Parameters:
1861
+ box1 (List[float]): The first bounding box.
1862
+ box2 (List[float]): The second bounding box.
1863
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
1864
+
1865
+ Returns:
1866
+ float: The closest distance between the two bounding boxes.
1867
+
1868
+ Example
1869
+ -------
1870
+ >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
1871
+ 141.42
1872
+ """
1873
+
1874
+ x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
1875
+ x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
1876
+
1877
+ horizontal_distance = np.max([0, x21 - x12, x11 - x22])
1878
+ vertical_distance = np.max([0, y21 - y12, y11 - y22])
1879
+ return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1880
+
1881
+
2112
1882
  # Utility and visualization functions
2113
1883
 
2114
1884
 
@@ -2625,6 +2395,197 @@ def _plot_counting(
2625
2395
  return image
2626
2396
 
2627
2397
 
2398
+ class ODModels(str, Enum):
2399
+ COUNTGD = "countgd"
2400
+ FLORENCE2 = "florence2"
2401
+ OWLV2 = "owlv2"
2402
+
2403
+
2404
+ def od_sam2_video_tracking(
2405
+ od_model: ODModels,
2406
+ prompt: str,
2407
+ frames: List[np.ndarray],
2408
+ chunk_length: Optional[int] = 10,
2409
+ fine_tune_id: Optional[str] = None,
2410
+ ) -> List[List[Dict[str, Any]]]:
2411
+
2412
+ results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
2413
+
2414
+ if chunk_length is None:
2415
+ step = 1 # Process every frame
2416
+ elif chunk_length <= 0:
2417
+ raise ValueError("chunk_length must be a positive integer or None.")
2418
+ else:
2419
+ step = chunk_length # Process frames with the specified step size
2420
+
2421
+ for idx in range(0, len(frames), step):
2422
+ if od_model == ODModels.COUNTGD:
2423
+ results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
2424
+ function_name = "countgd_object_detection"
2425
+ elif od_model == ODModels.OWLV2:
2426
+ results[idx] = owl_v2_image(
2427
+ prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
2428
+ )
2429
+ function_name = "owl_v2_image"
2430
+ elif od_model == ODModels.FLORENCE2:
2431
+ results[idx] = florence2_sam2_image(
2432
+ prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
2433
+ )
2434
+ function_name = "florence2_sam2_image"
2435
+ else:
2436
+ raise NotImplementedError(
2437
+ f"Object detection model '{od_model}' is not implemented."
2438
+ )
2439
+
2440
+ image_size = frames[0].shape[:2]
2441
+
2442
+ def _transform_detections(
2443
+ input_list: List[Optional[List[Dict[str, Any]]]]
2444
+ ) -> List[Optional[Dict[str, Any]]]:
2445
+ output_list: List[Optional[Dict[str, Any]]] = []
2446
+
2447
+ for idx, frame in enumerate(input_list):
2448
+ if frame is not None:
2449
+ labels = [detection["label"] for detection in frame]
2450
+ bboxes = [
2451
+ denormalize_bbox(detection["bbox"], image_size)
2452
+ for detection in frame
2453
+ ]
2454
+
2455
+ output_list.append(
2456
+ {
2457
+ "labels": labels,
2458
+ "bboxes": bboxes,
2459
+ }
2460
+ )
2461
+ else:
2462
+ output_list.append(None)
2463
+
2464
+ return output_list
2465
+
2466
+ output = _transform_detections(results)
2467
+
2468
+ buffer_bytes = frames_to_bytes(frames)
2469
+ files = [("video", buffer_bytes)]
2470
+ payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
2471
+ metadata = {"function_name": function_name}
2472
+
2473
+ detections = send_task_inference_request(
2474
+ payload,
2475
+ "sam2",
2476
+ files=files,
2477
+ metadata=metadata,
2478
+ )
2479
+
2480
+ return_data = []
2481
+ for frame in detections:
2482
+ return_frame_data = []
2483
+ for detection in frame:
2484
+ mask = rle_decode_array(detection["mask"])
2485
+ label = str(detection["id"]) + ": " + detection["label"]
2486
+ return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
2487
+ return_data.append(return_frame_data)
2488
+ return_data = add_bboxes_from_masks(return_data)
2489
+ return nms(return_data, iou_threshold=0.95)
2490
+
2491
+
2492
+ def countgd_sam2_video_tracking(
2493
+ prompt: str,
2494
+ frames: List[np.ndarray],
2495
+ chunk_length: Optional[int] = 10,
2496
+ ) -> List[List[Dict[str, Any]]]:
2497
+ """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
2498
+ prompt such as category names or referring expressions. The categories in the text
2499
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
2500
+ mask file names and associated probability scores.
2501
+
2502
+ Parameters:
2503
+ prompt (str): The prompt to ground to the video frames.
2504
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
2505
+
2506
+ Returns:
2507
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2508
+ bounding box, and mask of the detected objects with normalized coordinates
2509
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2510
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2511
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2512
+ the background.
2513
+
2514
+ Example
2515
+ -------
2516
+ >>> countgd_sam2_video_tracking("car, dinosaur", frames)
2517
+ [
2518
+ [
2519
+ {
2520
+ 'label': '0: dinosaur',
2521
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
2522
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2523
+ [0, 0, 0, ..., 0, 0, 0],
2524
+ ...,
2525
+ [0, 0, 0, ..., 0, 0, 0],
2526
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2527
+ },
2528
+ ],
2529
+ ...
2530
+ ]
2531
+ """
2532
+
2533
+ return od_sam2_video_tracking(
2534
+ ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
2535
+ )
2536
+
2537
+
2538
+ def owlv2_sam2_video_tracking(
2539
+ prompt: str,
2540
+ frames: List[np.ndarray],
2541
+ chunk_length: Optional[int] = 10,
2542
+ fine_tune_id: Optional[str] = None,
2543
+ ) -> List[List[Dict[str, Any]]]:
2544
+ """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
2545
+ prompt such as category names or referring expressions. The categories in the text
2546
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
2547
+ mask file names and associated probability scores.
2548
+
2549
+ Parameters:
2550
+ prompt (str): The prompt to ground to the image.
2551
+ image (np.ndarray): The image to ground the prompt to.
2552
+
2553
+ Returns:
2554
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2555
+ bounding box, and mask of the detected objects with normalized coordinates
2556
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2557
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2558
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2559
+ the background.
2560
+
2561
+ Example
2562
+ -------
2563
+ >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
2564
+ [
2565
+ [
2566
+ {
2567
+ 'label': '0: dinosaur',
2568
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
2569
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2570
+ [0, 0, 0, ..., 0, 0, 0],
2571
+ ...,
2572
+ [0, 0, 0, ..., 0, 0, 0],
2573
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2574
+ },
2575
+ ],
2576
+ ...
2577
+ ]
2578
+ """
2579
+
2580
+ return od_sam2_video_tracking(
2581
+ ODModels.OWLV2,
2582
+ prompt=prompt,
2583
+ frames=frames,
2584
+ chunk_length=chunk_length,
2585
+ fine_tune_id=fine_tune_id,
2586
+ )
2587
+
2588
+
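The new ODModels enum and od_sam2_video_tracking dispatcher back two public wrappers, countgd_sam2_video_tracking and owlv2_sam2_video_tracking, which run the chosen detector every chunk_length frames and let SAM2 carry the masks across the frames in between. A sketch of calling one of them, assuming it is re-exported from vision_agent.tools (placeholder frames stand in for a decoded video):

    import numpy as np

    from vision_agent.tools import countgd_sam2_video_tracking

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]  # placeholder frames
    tracks = countgd_sam2_video_tracking("car, dinosaur", frames, chunk_length=10)
    for frame_dets in tracks:                      # one list of detections per frame
        labels = {d["label"] for d in frame_dets}  # labels look like "0: dinosaur" (track id + class)
        print(labels)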
2628
2589
  FUNCTION_TOOLS = [
2629
2590
  owl_v2_image,
2630
2591
  owl_v2_video,