vision-agent 0.2.210__py3-none-any.whl → 0.2.212__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,9 @@ import logging
4
4
  import os
5
5
  import tempfile
6
6
  import urllib.request
7
+ from base64 import b64encode
7
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from enum import Enum
8
10
  from functools import lru_cache
9
11
  from importlib import resources
10
12
  from pathlib import Path
@@ -14,6 +16,7 @@ from uuid import UUID
14
16
  import cv2
15
17
  import numpy as np
16
18
  import requests
19
+ from IPython.display import display
17
20
  from PIL import Image, ImageDraw, ImageFont
18
21
  from pillow_heif import register_heif_opener # type: ignore
19
22
  from pytube import YouTube # type: ignore
@@ -21,8 +24,8 @@ from pytube import YouTube # type: ignore
21
24
  from vision_agent.clients.landing_public_api import LandingPublicAPI
22
25
  from vision_agent.lmm.lmm import AnthropicLMM, OpenAILMM
23
26
  from vision_agent.tools.tool_utils import (
27
+ ToolCallTrace,
24
28
  add_bboxes_from_masks,
25
- filter_bboxes_by_threshold,
26
29
  get_tool_descriptions,
27
30
  get_tool_documentation,
28
31
  get_tools_df,
@@ -32,7 +35,7 @@ from vision_agent.tools.tool_utils import (
32
35
  send_task_inference_request,
33
36
  single_nms,
34
37
  )
35
- from vision_agent.tools.tools_types import JobStatus, ODResponseData
38
+ from vision_agent.tools.tools_types import JobStatus
36
39
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
37
40
  from vision_agent.utils.execute import FileSerializer, MimeType
38
41
  from vision_agent.utils.image_utils import (
@@ -41,7 +44,6 @@ from vision_agent.utils.image_utils import (
41
44
  convert_to_b64,
42
45
  denormalize_bbox,
43
46
  encode_image_bytes,
44
- get_image_size,
45
47
  normalize_bbox,
46
48
  numpy_to_bytes,
47
49
  rle_decode,
@@ -88,66 +90,33 @@ def get_tool_recommender() -> Sim:
88
90
  return load_cached_sim(TOOLS_DF)
89
91
 
90
92
 
91
- def grounding_dino(
92
- prompt: str,
93
- image: np.ndarray,
94
- box_threshold: float = 0.20,
95
- iou_threshold: float = 0.20,
96
- model_size: str = "large",
97
- ) -> List[Dict[str, Any]]:
98
- """'grounding_dino' is a tool that can detect and count multiple objects given a text
99
- prompt such as category names or referring expressions. The categories in text prompt
100
- are separated by commas or periods. It returns a list of bounding boxes with
101
- normalized coordinates, label names and associated probability scores.
102
-
103
- Parameters:
104
- prompt (str): The prompt to ground to the image.
105
- image (np.ndarray): The image to ground the prompt to.
106
- box_threshold (float, optional): The threshold for the box detection. Defaults
107
- to 0.20.
108
- iou_threshold (float, optional): The threshold for the Intersection over Union
109
- (IoU). Defaults to 0.20.
110
- model_size (str, optional): The size of the model to use.
111
-
112
- Returns:
113
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
114
- bounding box of the detected objects with normalized coordinates between 0
115
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
116
- top-left and xmax and ymax are the coordinates of the bottom-right of the
117
- bounding box.
118
-
119
- Example
120
- -------
121
- >>> grounding_dino("car. dinosaur", image)
122
- [
123
- {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
124
- {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
125
- ]
126
- """
127
- image_size = image.shape[:2]
128
- image_b64 = convert_to_b64(image)
129
- if model_size not in ["large", "tiny"]:
130
- raise ValueError("model_size must be either 'large' or 'tiny'")
131
- request_data = {
132
- "prompt": prompt,
133
- "image": image_b64,
134
- "tool": (
135
- "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
136
- ),
137
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
138
- "function_name": "grounding_dino",
139
- }
140
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
141
- return_data = []
142
- for i in range(len(data["bboxes"])):
143
- return_data.append(
144
- {
145
- "score": round(data["scores"][i], 2),
146
- "label": data["labels"][i],
147
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
148
- }
149
- )
150
- return return_data
93
+ def _display_tool_trace(
94
+ function_name: str,
95
+ request: Dict[str, Any],
96
+ response: Any,
97
+ files: Union[List[Tuple[str, bytes]], str],
98
+ ) -> None:
99
+ # Sends data through IPython's display function so the front-end can show them. We use
100
+ # a function here instead of a decorator because we do not want to re-calculate data
101
+ # such as video bytes, which can be slow. Since this is calculated inside the
102
+ # function we can't capture it with a decorator without adding it as a return value
103
+ # which would change the function signature and affect the agent.
104
+ files_in_b64: List[Tuple[str, str]]
105
+ if isinstance(files, str):
106
+ files_in_b64 = [("images", files)]
107
+ else:
108
+ files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
109
+
110
+ request["function_name"] = function_name
111
+ tool_call_trace = ToolCallTrace(
112
+ endpoint_url="",
113
+ type="tool_func_call",
114
+ request=request,
115
+ response={"data": response},
116
+ error=None,
117
+ files=files_in_b64,
118
+ )
119
+ display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
151
120
 
152
121
 
153
122
  def owl_v2_image(
@@ -223,14 +192,21 @@ def owl_v2_image(
223
192
  # get the first frame
224
193
  bboxes = detections[0]
225
194
  bboxes_formatted = [
226
- ODResponseData(
227
- label=bbox["label"],
228
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
229
- score=round(bbox["score"], 2),
230
- )
195
+ {
196
+ "label": bbox["label"],
197
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
198
+ "score": round(bbox["score"], 2),
199
+ }
231
200
  for bbox in bboxes
232
201
  ]
233
- return [bbox.model_dump() for bbox in bboxes_formatted]
202
+
203
+ _display_tool_trace(
204
+ owl_v2_image.__name__,
205
+ payload,
206
+ detections[0],
207
+ files,
208
+ )
209
+ return bboxes_formatted
234
210
 
235
211
 
236
212
  def owl_v2_video(
@@ -309,81 +285,21 @@ def owl_v2_video(
309
285
  bboxes_formatted = []
310
286
  for frame_data in detections:
311
287
  bboxes_formatted_per_frame = [
312
- ODResponseData(
313
- label=bbox["label"],
314
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
315
- score=round(bbox["score"], 2),
316
- )
288
+ {
289
+ "label": bbox["label"],
290
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
291
+ "score": round(bbox["score"], 2),
292
+ }
317
293
  for bbox in frame_data
318
294
  ]
319
295
  bboxes_formatted.append(bboxes_formatted_per_frame)
320
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
321
-
322
-
323
- def grounding_sam(
324
- prompt: str,
325
- image: np.ndarray,
326
- box_threshold: float = 0.20,
327
- iou_threshold: float = 0.20,
328
- ) -> List[Dict[str, Any]]:
329
- """'grounding_sam' is a tool that can segment multiple objects given a text prompt
330
- such as category names or referring expressions. The categories in text prompt are
331
- separated by commas or periods. It returns a list of bounding boxes, label names,
332
- mask file names and associated probability scores.
333
-
334
- Parameters:
335
- prompt (str): The prompt to ground to the image.
336
- image (np.ndarray): The image to ground the prompt to.
337
- box_threshold (float, optional): The threshold for the box detection. Defaults
338
- to 0.20.
339
- iou_threshold (float, optional): The threshold for the Intersection over Union
340
- (IoU). Defaults to 0.20.
341
-
342
- Returns:
343
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
344
- bounding box, and mask of the detected objects with normalized coordinates
345
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
346
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
347
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
348
- the background.
349
-
350
- Example
351
- -------
352
- >>> grounding_sam("car. dinosaur", image)
353
- [
354
- {
355
- 'score': 0.99,
356
- 'label': 'dinosaur',
357
- 'bbox': [0.1, 0.11, 0.35, 0.4],
358
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
359
- [0, 0, 0, ..., 0, 0, 0],
360
- ...,
361
- [0, 0, 0, ..., 0, 0, 0],
362
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
363
- },
364
- ]
365
- """
366
- image_size = image.shape[:2]
367
- image_b64 = convert_to_b64(image)
368
- request_data = {
369
- "prompt": prompt,
370
- "image": image_b64,
371
- "tool": "visual_grounding_segment",
372
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
373
- "function_name": "grounding_sam",
374
- }
375
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
376
- return_data = []
377
- for i in range(len(data["bboxes"])):
378
- return_data.append(
379
- {
380
- "score": round(data["scores"][i], 2),
381
- "label": data["labels"][i],
382
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
383
- "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
384
- }
385
- )
386
- return return_data
296
+ _display_tool_trace(
297
+ owl_v2_video.__name__,
298
+ payload,
299
+ detections[0],
300
+ files,
301
+ )
302
+ return bboxes_formatted
387
303
 
388
304
 
389
305
  def florence2_sam2_image(
@@ -460,6 +376,13 @@ def florence2_sam2_image(
460
376
  label = detection["label"]
461
377
  bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
462
378
  return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
379
+
380
+ _display_tool_trace(
381
+ florence2_sam2_image.__name__,
382
+ payload,
383
+ detections[0],
384
+ files,
385
+ )
463
386
  return return_data
464
387
 
465
388
 
@@ -545,10 +468,36 @@ def florence2_sam2_video_tracking(
545
468
  for detection in frame:
546
469
  mask = rle_decode_array(detection["mask"])
547
470
  label = str(detection["id"]) + ": " + detection["label"]
548
- return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
471
+ return_frame_data.append(
472
+ {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
473
+ )
549
474
  return_data.append(return_frame_data)
550
475
  return_data = add_bboxes_from_masks(return_data)
551
- return nms(return_data, iou_threshold=0.95)
476
+ return_data = nms(return_data, iou_threshold=0.95)
477
+
478
+ _display_tool_trace(
479
+ florence2_sam2_video_tracking.__name__,
480
+ payload,
481
+ [
482
+ [
483
+ {
484
+ "label": e["label"],
485
+ "score": e["score"],
486
+ "bbox": denormalize_bbox(e["bbox"], frames[0].shape[:2]),
487
+ "mask": e["rle"],
488
+ }
489
+ for e in lst
490
+ ]
491
+ for lst in return_data
492
+ ],
493
+ files,
494
+ )
495
+ # We save the RLE for display purposes; re-calculating RLE can get very expensive.
496
+ # It is deleted here because we return the numpy masks instead.
497
+ for frame in return_data:
498
+ for obj in frame:
499
+ del obj["rle"]
500
+ return return_data
552
501
 
553
502
 
554
503
  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -603,128 +552,134 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
603
552
  box = normalize_bbox(box, image_size)
604
553
  output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
605
554
 
606
- ocr_results = sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
607
- return ocr_results
608
-
609
-
610
- def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
611
- """'loca_zero_shot_counting' is a tool that counts the dominant foreground object given
612
- an image and no other information about the content. It returns only the count of
613
- the objects in the image.
614
-
615
- Parameters:
616
- image (np.ndarray): The image that contains lot of instances of a single object
617
-
618
- Returns:
619
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
620
- value, e.g. {count: 12} and a heat map for visualization purposes.
621
-
622
- Example
623
- -------
624
- >>> loca_zero_shot_counting(image)
625
- {'count': 83,
626
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
627
- [ 0, 0, 0, ..., 0, 0, 0],
628
- [ 0, 0, 0, ..., 0, 0, 1],
629
- ...,
630
- [ 0, 0, 0, ..., 30, 35, 41],
631
- [ 0, 0, 0, ..., 41, 47, 53],
632
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
633
- """
634
-
635
- image_b64 = convert_to_b64(image)
636
- data = {
637
- "image": image_b64,
638
- "function_name": "loca_zero_shot_counting",
639
- }
640
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
641
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
642
- return resp_data
555
+ _display_tool_trace(
556
+ ocr.__name__,
557
+ {},
558
+ data,
559
+ cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
560
+ )
561
+ return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
643
562
 
644
563
 
645
- def loca_visual_prompt_counting(
646
- image: np.ndarray, visual_prompt: Dict[str, List[float]]
564
+ def _sam2(
565
+ image: np.ndarray,
566
+ detections: List[Dict[str, Any]],
567
+ image_size: Tuple[int, ...],
568
+ image_bytes: Optional[bytes] = None,
647
569
  ) -> Dict[str, Any]:
648
- """'loca_visual_prompt_counting' is a tool that counts the dominant foreground object
649
- given an image and a visual prompt which is a bounding box describing the object.
650
- It returns only the count of the objects in the image.
651
-
652
- Parameters:
653
- image (np.ndarray): The image that contains lot of instances of a single object
654
- visual_prompt (Dict[str, List[float]]): Bounding box of the object in
655
- format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
656
-
657
- Returns:
658
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
659
- value, e.g. {count: 12} and a heat map for visualization purposes.
660
-
661
- Example
662
- -------
663
- >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
664
- {'count': 83,
665
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
666
- [ 0, 0, 0, ..., 0, 0, 0],
667
- [ 0, 0, 0, ..., 0, 0, 1],
668
- ...,
669
- [ 0, 0, 0, ..., 30, 35, 41],
670
- [ 0, 0, 0, ..., 41, 47, 53],
671
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
672
- """
673
-
674
- image_size = get_image_size(image)
675
- bbox = visual_prompt["bbox"]
676
- image_b64 = convert_to_b64(image)
570
+ if image_bytes is None:
571
+ image_bytes = numpy_to_bytes(image)
677
572
 
678
- data = {
679
- "image": image_b64,
680
- "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
681
- "function_name": "loca_visual_prompt_counting",
573
+ files = [("images", image_bytes)]
574
+ payload = {
575
+ "model": "sam2",
576
+ "bboxes": json.dumps(
577
+ [
578
+ {
579
+ "labels": [d["label"] for d in detections],
580
+ "bboxes": [
581
+ denormalize_bbox(d["bbox"], image_size) for d in detections
582
+ ],
583
+ }
584
+ ]
585
+ ),
682
586
  }
683
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
684
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
685
- return resp_data
587
+
588
+ metadata = {"function_name": "sam2"}
589
+ pred_detections = send_task_inference_request(
590
+ payload, "sam2", files=files, metadata=metadata
591
+ )
592
+ frame = pred_detections[0]
593
+ return_data = []
594
+ display_data = []
595
+ for inp_detection, detection in zip(detections, frame):
596
+ mask = rle_decode_array(detection["mask"])
597
+ label = detection["label"]
598
+ bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
599
+ return_data.append(
600
+ {
601
+ "label": label,
602
+ "bbox": bbox,
603
+ "mask": mask,
604
+ "score": inp_detection["score"],
605
+ }
606
+ )
607
+ display_data.append(
608
+ {
609
+ "label": label,
610
+ "bbox": detection["bounding_box"],
611
+ "mask": detection["mask"],
612
+ "score": inp_detection["score"],
613
+ }
614
+ )
615
+ return {"files": files, "return_data": return_data, "display_data": display_data}
686
616
 
687
617
 
688
- def countgd_object_detection(
689
- prompt: str,
618
+ def sam2(
690
619
  image: np.ndarray,
691
- box_threshold: float = 0.23,
620
+ detections: List[Dict[str, Any]],
692
621
  ) -> List[Dict[str, Any]]:
693
- """'countgd_object_detection' is a tool that can detect multiple instances of an
694
- object given a text prompt. It is particularly useful when trying to detect and
695
- count a large number of objects. You can optionally separate object names in the
696
- prompt with commas. It returns a list of bounding boxes with normalized
697
- coordinates, label names and associated confidence scores.
622
+ """'sam2' is a tool that can segment multiple objects given an input bounding box,
623
+ label and score. It returns a set of masks along with the corresponding bounding
624
+ boxes and labels.
698
625
 
699
626
  Parameters:
700
- prompt (str): The object that needs to be counted.
701
627
  image (np.ndarray): The image that contains multiple instances of the object.
702
- box_threshold (float, optional): The threshold for detection. Defaults
703
- to 0.23.
628
+ detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
629
+ label, and bounding box of the detected objects with normalized coordinates
630
+ between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
631
+ of the top-left and xmax and ymax are the coordinates of the bottom-right of
632
+ the bounding box.
704
633
 
705
634
  Returns:
706
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
707
- bounding box of the detected objects with normalized coordinates between 0
708
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
709
- top-left and xmax and ymax are the coordinates of the bottom-right of the
710
- bounding box.
635
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
636
+ bounding box, and mask of the detected objects with normalized coordinates
637
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
638
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
639
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
640
+ the background.
711
641
 
712
642
  Example
713
643
  -------
714
- >>> countgd_object_detection("flower", image)
644
+ >>> sam2(image, [
645
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
646
+ ])
715
647
  [
716
- {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
717
- {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
718
- {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
719
- {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
648
+ {
649
+ 'score': 0.49,
650
+ 'label': 'flower',
651
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
652
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
653
+ [0, 0, 0, ..., 0, 0, 0],
654
+ ...,
655
+ [0, 0, 0, ..., 0, 0, 0],
656
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
657
+ },
720
658
  ]
721
659
  """
722
660
  image_size = image.shape[:2]
723
- if image_size[0] < 1 or image_size[1] < 1:
724
- return []
661
+ ret = _sam2(image, detections, image_size)
662
+ _display_tool_trace(
663
+ sam2.__name__,
664
+ {},
665
+ ret["display_data"],
666
+ ret["files"],
667
+ )
725
668
 
726
- buffer_bytes = numpy_to_bytes(image)
727
- files = [("image", buffer_bytes)]
669
+ return ret["return_data"] # type: ignore
670
+
671
+
672
+ def _countgd_object_detection(
673
+ prompt: str,
674
+ image: np.ndarray,
675
+ box_threshold: float,
676
+ image_size: Tuple[int, ...],
677
+ image_bytes: Optional[bytes] = None,
678
+ ) -> Dict[str, Any]:
679
+ if image_bytes is None:
680
+ image_bytes = numpy_to_bytes(image)
681
+
682
+ files = [("image", image_bytes)]
728
683
  prompts = [p.strip() for p in prompt.split(", ")]
729
684
 
730
685
  def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
@@ -747,97 +702,76 @@ def countgd_object_detection(
747
702
  for future in as_completed(futures):
748
703
  bboxes.extend(future.result())
749
704
 
750
- bboxes_formatted = [
751
- ODResponseData(
752
- label=bbox["label"],
753
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
754
- score=round(bbox["score"], 2),
755
- )
705
+ return_data = [
706
+ {
707
+ "label": bbox["label"],
708
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
709
+ "score": round(bbox["score"], 2),
710
+ }
756
711
  for bbox in bboxes
757
712
  ]
758
- # TODO: remove this once we start to use the confidence on countgd
759
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
760
- return_data = [bbox.model_dump() for bbox in filtered_bboxes]
761
- return single_nms(return_data, iou_threshold=0.80)
762
713
 
714
+ return_data = single_nms(return_data, iou_threshold=0.80)
715
+ display_data = [
716
+ {
717
+ "label": e["label"],
718
+ "score": e["score"],
719
+ "bbox": denormalize_bbox(e["bbox"], image_size),
720
+ }
721
+ for e in return_data
722
+ ]
723
+ return {"files": files, "return_data": return_data, "display_data": display_data}
763
724
 
764
- def sam2(
725
+
726
+ def countgd_object_detection(
727
+ prompt: str,
765
728
  image: np.ndarray,
766
- detections: List[Dict[str, Any]],
729
+ box_threshold: float = 0.23,
767
730
  ) -> List[Dict[str, Any]]:
768
- """'sam2' is a tool that can segment multiple objects given an input bounding box,
769
- label and score. It returns a set of masks along with the corresponding bounding
770
- boxes and labels.
731
+ """'countgd_object_detection' is a tool that can detect multiple instances of an
732
+ object given a text prompt. It is particularly useful when trying to detect and
733
+ count a large number of objects. You can optionally separate object names in the
734
+ prompt with commas. It returns a list of bounding boxes with normalized
735
+ coordinates, label names and associated confidence scores.
771
736
 
772
737
  Parameters:
738
+ prompt (str): The object that needs to be counted.
773
739
  image (np.ndarray): The image that contains multiple instances of the object.
774
- detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
775
- label, and bounding box of the detected objects with normalized coordinates
776
- between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
777
- of the top-left and xmax and ymax are the coordinates of the bottom-right of
778
- the bounding box.
740
+ box_threshold (float, optional): The threshold for detection. Defaults
741
+ to 0.23.
779
742
 
780
743
  Returns:
781
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
782
- bounding box, and mask of the detected objects with normalized coordinates
783
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
784
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
785
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
786
- the background.
744
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
745
+ bounding box of the detected objects with normalized coordinates between 0
746
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
747
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
748
+ bounding box.
787
749
 
788
750
  Example
789
751
  -------
790
- >>> sam2(image, [
791
- {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
792
- ])
752
+ >>> countgd_object_detection("flower", image)
793
753
  [
794
- {
795
- 'score': 0.49,
796
- 'label': 'flower',
797
- 'bbox': [0.1, 0.11, 0.35, 0.4],
798
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
799
- [0, 0, 0, ..., 0, 0, 0],
800
- ...,
801
- [0, 0, 0, ..., 0, 0, 0],
802
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
803
- },
754
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
755
+ {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
756
+ {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
757
+ {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
804
758
  ]
805
759
  """
806
760
  image_size = image.shape[:2]
761
+ if image_size[0] < 1 or image_size[1] < 1:
762
+ return []
807
763
 
808
- files = [("images", numpy_to_bytes(image))]
809
- payload = {
810
- "model": "sam2",
811
- "bboxes": json.dumps(
812
- [
813
- {
814
- "labels": [d["label"] for d in detections],
815
- "bboxes": [
816
- denormalize_bbox(d["bbox"], image_size) for d in detections
817
- ],
818
- }
819
- ]
820
- ),
821
- }
822
- metadata = {"function_name": "sam2"}
823
- pred_detections = send_task_inference_request(
824
- payload, "sam2", files=files, metadata=metadata
764
+ ret = _countgd_object_detection(prompt, image, box_threshold, image_size)
765
+ _display_tool_trace(
766
+ countgd_object_detection.__name__,
767
+ {
768
+ "prompts": prompt,
769
+ "confidence": box_threshold,
770
+ },
771
+ ret["display_data"],
772
+ ret["files"],
825
773
  )
826
- frame = pred_detections[0]
827
- return_data = []
828
- for inp_detection, detection in zip(detections, frame):
829
- mask = rle_decode_array(detection["mask"])
830
- label = detection["label"]
831
- bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
832
- return_data.append(
833
- {
834
- "label": label,
835
- "bbox": bbox,
836
- "mask": mask,
837
- "score": inp_detection["score"],
838
- }
839
- )
840
- return return_data
774
+ return ret["return_data"] # type: ignore
841
775
 
842
776
 
843
777
  def countgd_sam2_object_detection(
@@ -881,9 +815,23 @@ def countgd_sam2_object_detection(
881
815
  },
882
816
  ]
883
817
  """
884
- detections = countgd_object_detection(prompt, image, box_threshold)
885
- detections_with_masks = sam2(image, detections)
886
- return detections_with_masks
818
+
819
+ od_ret = _countgd_object_detection(prompt, image, box_threshold, image.shape[:2])
820
+ seg_ret = _sam2(
821
+ image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
822
+ )
823
+
824
+ _display_tool_trace(
825
+ countgd_sam2_object_detection.__name__,
826
+ {
827
+ "prompts": prompt,
828
+ "confidence": box_threshold,
829
+ },
830
+ seg_ret["display_data"],
831
+ seg_ret["files"],
832
+ )
833
+
834
+ return seg_ret["return_data"] # type: ignore
887
835
 
888
836
 
889
837
  def countgd_example_based_counting(
@@ -941,76 +889,28 @@ def countgd_example_based_counting(
941
889
  # get the first frame
942
890
  bboxes_per_frame = detections[0]
943
891
  bboxes_formatted = [
944
- ODResponseData(
945
- label=bbox["label"],
946
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
947
- score=round(bbox["score"], 2),
948
- )
892
+ {
893
+ "label": bbox["label"],
894
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
895
+ "score": round(bbox["score"], 2),
896
+ }
949
897
  for bbox in bboxes_per_frame
950
898
  ]
951
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
952
- return [bbox.model_dump() for bbox in filtered_bboxes]
953
-
954
-
955
- def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
956
- """'florence2_roberta_vqa' is a tool that takes an image and analyzes
957
- its contents, generates detailed captions and then tries to answer the given
958
- question using the generated context. It returns text as an answer to the question.
959
-
960
- Parameters:
961
- prompt (str): The question about the image
962
- image (np.ndarray): The reference image used for the question
963
-
964
- Returns:
965
- str: A string which is the answer to the given prompt.
966
-
967
- Example
968
- -------
969
- >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
970
- 'white tiger'
971
- """
972
-
973
- image_b64 = convert_to_b64(image)
974
- data = {
975
- "image": image_b64,
976
- "question": prompt,
977
- "function_name": "florence2_roberta_vqa",
978
- }
979
-
980
- answer = send_inference_request(data, "florence2-qa", v2=True)
981
- return answer # type: ignore
982
-
983
-
984
- def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
985
- """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
986
- including regular images or images of documents or presentations. It returns text
987
- as an answer to the question.
988
-
989
- Parameters:
990
- prompt (str): The question about the image
991
- image (np.ndarray): The reference image used for the question
992
-
993
- Returns:
994
- str: A string which is the answer to the given prompt.
995
-
996
- Example
997
- -------
998
- >>> ixc25_image_vqa('What is the cat doing?', image)
999
- 'drinking milk'
1000
- """
1001
- if image.shape[0] < 1 or image.shape[1] < 1:
1002
- raise ValueError(f"Image is empty, image shape: {image.shape}")
1003
-
1004
- buffer_bytes = numpy_to_bytes(image)
1005
- files = [("image", buffer_bytes)]
1006
- payload = {
1007
- "prompt": prompt,
1008
- "function_name": "ixc25_image_vqa",
1009
- }
1010
- data: Dict[str, Any] = send_inference_request(
1011
- payload, "internlm-xcomposer2", files=files, v2=True
899
+ _display_tool_trace(
900
+ countgd_example_based_counting.__name__,
901
+ payload,
902
+ [
903
+ {
904
+ "label": e["label"],
905
+ "score": e["score"],
906
+ "bbox": denormalize_bbox(e["bbox"], image_size),
907
+ }
908
+ for e in bboxes_formatted
909
+ ],
910
+ files,
1012
911
  )
1013
- return cast(str, data["answer"])
912
+
913
+ return bboxes_formatted
1014
914
 
1015
915
 
1016
916
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
@@ -1047,61 +947,13 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
1047
947
  data: Dict[str, Any] = send_inference_request(
1048
948
  payload, "image-to-text", files=files, v2=True
1049
949
  )
1050
- return cast(str, data)
1051
-
1052
-
1053
- def claude35_text_extraction(image: np.ndarray) -> str:
1054
- """'claude35_text_extraction' is a tool that can extract text from an image. It
1055
- returns the extracted text as a string and can be used as an alternative to OCR if
1056
- you do not need to know the exact bounding box of the text.
1057
-
1058
- Parameters:
1059
- image (np.ndarray): The image to extract text from.
1060
-
1061
- Returns:
1062
- str: The extracted text from the image.
1063
- """
1064
-
1065
- lmm = AnthropicLMM()
1066
- buffer = io.BytesIO()
1067
- Image.fromarray(image).save(buffer, format="PNG")
1068
- image_bytes = buffer.getvalue()
1069
- image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1070
- text = lmm.generate(
1071
- "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
1072
- [image_b64],
1073
- )
1074
- return cast(str, text)
1075
-
1076
-
1077
- def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1078
- """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
1079
- including regular videos or videos of documents or presentations. It returns text
1080
- as an answer to the question.
1081
-
1082
- Parameters:
1083
- prompt (str): The question about the video
1084
- frames (List[np.ndarray]): The reference frames used for the question
1085
-
1086
- Returns:
1087
- str: A string which is the answer to the given prompt.
1088
-
1089
- Example
1090
- -------
1091
- >>> ixc25_video_vqa('Which football player made the goal?', frames)
1092
- 'Lionel Messi'
1093
- """
1094
-
1095
- buffer_bytes = frames_to_bytes(frames)
1096
- files = [("video", buffer_bytes)]
1097
- payload = {
1098
- "prompt": prompt,
1099
- "function_name": "ixc25_video_vqa",
1100
- }
1101
- data: Dict[str, Any] = send_inference_request(
1102
- payload, "internlm-xcomposer2", files=files, v2=True
950
+ _display_tool_trace(
951
+ qwen2_vl_images_vqa.__name__,
952
+ payload,
953
+ cast(str, data),
954
+ files,
1103
955
  )
1104
- return cast(str, data["answer"])
956
+ return cast(str, data)
1105
957
 
1106
958
 
1107
959
  def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
@@ -1135,9 +987,39 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1135
987
  data: Dict[str, Any] = send_inference_request(
1136
988
  payload, "image-to-text", files=files, v2=True
1137
989
  )
990
+ _display_tool_trace(
991
+ qwen2_vl_video_vqa.__name__,
992
+ payload,
993
+ cast(str, data),
994
+ files,
995
+ )
1138
996
  return cast(str, data)
1139
997
 
1140
998
 
999
+ def claude35_text_extraction(image: np.ndarray) -> str:
1000
+ """'claude35_text_extraction' is a tool that can extract text from an image. It
1001
+ returns the extracted text as a string and can be used as an alternative to OCR if
1002
+ you do not need to know the exact bounding box of the text.
1003
+
1004
+ Parameters:
1005
+ image (np.ndarray): The image to extract text from.
1006
+
1007
+ Returns:
1008
+ str: The extracted text from the image.
1009
+ """
1010
+
1011
+ lmm = AnthropicLMM()
1012
+ buffer = io.BytesIO()
1013
+ Image.fromarray(image).save(buffer, format="PNG")
1014
+ image_bytes = buffer.getvalue()
1015
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1016
+ text = lmm.generate(
1017
+ "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
1018
+ [image_b64],
1019
+ )
1020
+ return cast(str, text)
1021
+
1022
+
1141
1023
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
1142
1024
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
1143
1025
  including regular images or images of documents or presentations. It returns text
@@ -1187,48 +1069,18 @@ def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1187
1069
 
1188
1070
  if len(frames) > 10:
1189
1071
  step = len(frames) / 10
1190
- frames = [frames[int(i * step)] for i in range(10)]
1191
-
1192
- frames_b64 = []
1193
- for frame in frames:
1194
- buffer = io.BytesIO()
1195
- Image.fromarray(frame).save(buffer, format="PNG")
1196
- image_bytes = buffer.getvalue()
1197
- image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1198
- frames_b64.append(image_b64)
1199
-
1200
- resp = lmm.generate(prompt, frames_b64)
1201
- return cast(str, resp)
1202
-
1203
-
1204
- def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
1205
- """'git_vqa_v2' is a tool that can answer questions about the visual
1206
- contents of an image given a question and an image. It returns an answer to the
1207
- question
1208
-
1209
- Parameters:
1210
- prompt (str): The question about the image
1211
- image (np.ndarray): The reference image used for the question
1212
-
1213
- Returns:
1214
- str: A string which is the answer to the given prompt.
1215
-
1216
- Example
1217
- -------
1218
- >>> git_vqa_v2('What is the cat doing ?', image)
1219
- 'drinking milk'
1220
- """
1072
+ frames = [frames[int(i * step)] for i in range(10)]
1221
1073
 
1222
- image_b64 = convert_to_b64(image)
1223
- data = {
1224
- "image": image_b64,
1225
- "prompt": prompt,
1226
- "tool": "image_question_answering",
1227
- "function_name": "git_vqa_v2",
1228
- }
1074
+ frames_b64 = []
1075
+ for frame in frames:
1076
+ buffer = io.BytesIO()
1077
+ Image.fromarray(frame).save(buffer, format="PNG")
1078
+ image_bytes = buffer.getvalue()
1079
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1080
+ frames_b64.append(image_b64)
1229
1081
 
1230
- answer = send_inference_request(data, "tools")
1231
- return answer["text"][0] # type: ignore
1082
+ resp = lmm.generate(prompt, frames_b64)
1083
+ return cast(str, resp)
1232
1084
 
1233
1085
 
1234
1086
  def video_temporal_localization(
@@ -1274,43 +1126,15 @@ def video_temporal_localization(
1274
1126
  data = send_inference_request(
1275
1127
  payload, "video-temporal-localization", files=files, v2=True
1276
1128
  )
1129
+ _display_tool_trace(
1130
+ video_temporal_localization.__name__,
1131
+ payload,
1132
+ data,
1133
+ files,
1134
+ )
1277
1135
  return [cast(float, value) for value in data]
1278
1136
 
1279
1137
 
1280
- def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
1281
- """'clip' is a tool that can classify an image or a cropped detection given a list
1282
- of input classes or tags. It returns the same list of the input classes along with
1283
- their probability scores based on image content.
1284
-
1285
- Parameters:
1286
- image (np.ndarray): The image to classify or tag
1287
- classes (List[str]): The list of classes or tags that is associated with the image
1288
-
1289
- Returns:
1290
- Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
1291
- contains a list of given labels and other a list of scores.
1292
-
1293
- Example
1294
- -------
1295
- >>> clip(image, ['dog', 'cat', 'bird'])
1296
- {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
1297
- """
1298
-
1299
- if image.shape[0] < 1 or image.shape[1] < 1:
1300
- return {"labels": [], "scores": []}
1301
-
1302
- image_b64 = convert_to_b64(image)
1303
- data = {
1304
- "prompt": ",".join(classes),
1305
- "image": image_b64,
1306
- "tool": "closed_set_image_classification",
1307
- "function_name": "clip",
1308
- }
1309
- resp_data: dict[str, Any] = send_inference_request(data, "tools")
1310
- resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
1311
- return resp_data
1312
-
1313
-
1314
1138
  def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1315
1139
  """'vit_image_classification' is a tool that can classify an image. It returns a
1316
1140
  list of classes and their probability scores based on image content.
@@ -1338,6 +1162,12 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1338
1162
  }
1339
1163
  resp_data: dict[str, Any] = send_inference_request(data, "tools")
1340
1164
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
1165
+ _display_tool_trace(
1166
+ vit_image_classification.__name__,
1167
+ data,
1168
+ resp_data,
1169
+ image_b64,
1170
+ )
1341
1171
  return resp_data
1342
1172
 
1343
1173
 
@@ -1369,65 +1199,15 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
1369
1199
  data, "nsfw-classification", v2=True
1370
1200
  )
1371
1201
  resp_data["score"] = round(resp_data["score"], 4)
1202
+ _display_tool_trace(
1203
+ vit_nsfw_classification.__name__,
1204
+ data,
1205
+ resp_data,
1206
+ image_b64,
1207
+ )
1372
1208
  return resp_data
1373
1209
 
1374
1210
 
1375
- def blip_image_caption(image: np.ndarray) -> str:
1376
- """'blip_image_caption' is a tool that can caption an image based on its contents. It
1377
- returns a text describing the image.
1378
-
1379
- Parameters:
1380
- image (np.ndarray): The image to caption
1381
-
1382
- Returns:
1383
- str: A string which is the caption for the given image.
1384
-
1385
- Example
1386
- -------
1387
- >>> blip_image_caption(image)
1388
- 'This image contains a cat sitting on a table with a bowl of milk.'
1389
- """
1390
-
1391
- image_b64 = convert_to_b64(image)
1392
- data = {
1393
- "image": image_b64,
1394
- "tool": "image_captioning",
1395
- "function_name": "blip_image_caption",
1396
- }
1397
-
1398
- answer = send_inference_request(data, "tools")
1399
- return answer["text"][0] # type: ignore
1400
-
1401
-
1402
- def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
1403
- """'florence2_image_caption' is a tool that can caption or describe an image based
1404
- on its contents. It returns a text describing the image.
1405
-
1406
- Parameters:
1407
- image (np.ndarray): The image to caption
1408
- detail_caption (bool): If True, the caption will be as detailed as possible else
1409
- the caption will be a brief description.
1410
-
1411
- Returns:
1412
- str: A string which is the caption for the given image.
1413
-
1414
- Example
1415
- -------
1416
- >>> florence2_image_caption(image, False)
1417
- 'This image contains a cat sitting on a table with a bowl of milk.'
1418
- """
1419
- image_b64 = convert_to_b64(image)
1420
- task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
1421
- data = {
1422
- "image": image_b64,
1423
- "task": task,
1424
- "function_name": "florence2_image_caption",
1425
- }
1426
-
1427
- answer = send_inference_request(data, "florence2", v2=True)
1428
- return answer[task] # type: ignore
1429
-
1430
-
1431
1211
  def florence2_phrase_grounding(
1432
1212
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
1433
1213
  ) -> List[Dict[str, Any]]:
@@ -1490,15 +1270,21 @@ def florence2_phrase_grounding(
1490
1270
  # get the first frame
1491
1271
  bboxes = detections[0]
1492
1272
  bboxes_formatted = [
1493
- ODResponseData(
1494
- label=bbox["label"],
1495
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
1496
- score=round(bbox["score"], 2),
1497
- )
1273
+ {
1274
+ "label": bbox["label"],
1275
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
1276
+ "score": round(bbox["score"], 2),
1277
+ }
1498
1278
  for bbox in bboxes
1499
1279
  ]
1500
1280
 
1501
- return [bbox.model_dump() for bbox in bboxes_formatted]
1281
+ _display_tool_trace(
1282
+ florence2_phrase_grounding.__name__,
1283
+ payload,
1284
+ detections[0],
1285
+ files,
1286
+ )
1287
+ return [bbox for bbox in bboxes_formatted]
1502
1288
 
1503
1289
 
1504
1290
  def florence2_phrase_grounding_video(
@@ -1566,15 +1352,21 @@ def florence2_phrase_grounding_video(
1566
1352
  bboxes_formatted = []
1567
1353
  for frame_data in detections:
1568
1354
  bboxes_formatted_per_frame = [
1569
- ODResponseData(
1570
- label=bbox["label"],
1571
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
1572
- score=round(bbox["score"], 2),
1573
- )
1355
+ {
1356
+ "label": bbox["label"],
1357
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
1358
+ "score": round(bbox["score"], 2),
1359
+ }
1574
1360
  for bbox in frame_data
1575
1361
  ]
1576
1362
  bboxes_formatted.append(bboxes_formatted_per_frame)
1577
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
1363
+ _display_tool_trace(
1364
+ florence2_phrase_grounding_video.__name__,
1365
+ payload,
1366
+ detections,
1367
+ files,
1368
+ )
1369
+ return bboxes_formatted
1578
1370
 
1579
1371
 
1580
1372
  def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -1621,6 +1413,12 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
1621
1413
  "score": 1.0,
1622
1414
  }
1623
1415
  )
1416
+ _display_tool_trace(
1417
+ florence2_ocr.__name__,
1418
+ {},
1419
+ detections,
1420
+ image_b64,
1421
+ )
1624
1422
  return return_data
1625
1423
 
1626
1424
 
@@ -1683,6 +1481,12 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
1683
1481
  ),
1684
1482
  }
1685
1483
  )
1484
+ _display_tool_trace(
1485
+ detr_segmentation.__name__,
1486
+ {},
1487
+ return_data,
1488
+ image_b64,
1489
+ )
1686
1490
  return return_data
1687
1491
 
1688
1492
 
@@ -1721,74 +1525,15 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
1721
1525
  depth_map_np.max() - depth_map_np.min()
1722
1526
  )
1723
1527
  depth_map_np = (255 * depth_map_np).astype(np.uint8)
1528
+ _display_tool_trace(
1529
+ depth_anything_v2.__name__,
1530
+ {},
1531
+ depth_map,
1532
+ image_b64,
1533
+ )
1724
1534
  return depth_map_np
1725
1535
 
1726
1536
 
1727
- def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
1728
- """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
1729
- to generate a soft edge image (HED) from a given RGB image. The returned image is
1730
- monochrome and represents object boundaries as soft white edges on black background
1731
-
1732
- Parameters:
1733
- image (np.ndarray): The image to used to generate soft edge image
1734
-
1735
- Returns:
1736
- np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
1737
-
1738
- Example
1739
- -------
1740
- >>> generate_soft_edge_image(image)
1741
- array([[0, 0, 0, ..., 0, 0, 0],
1742
- [0, 20, 24, ..., 0, 100, 103],
1743
- ...,
1744
- [10, 11, 15, ..., 202, 202, 205],
1745
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1746
- """
1747
- image_b64 = convert_to_b64(image)
1748
- data = {
1749
- "image": image_b64,
1750
- "tool": "generate_hed",
1751
- "function_name": "generate_soft_edge_image",
1752
- }
1753
-
1754
- answer = send_inference_request(data, "tools")
1755
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
1756
- return return_data
1757
-
1758
-
1759
- def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
1760
- """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
1761
- image. The returned RGB image is texture mapped image of the surface normals and the
1762
- RGB values represent the surface normals in the x, y, z directions.
1763
-
1764
- Parameters:
1765
- image (np.ndarray): The image to used to generate normal image
1766
-
1767
- Returns:
1768
- np.ndarray: A mapped normal image with RGB pixel values indicating surface
1769
- normals in x, y, z directions.
1770
-
1771
- Example
1772
- -------
1773
- >>> dpt_hybrid_midas(image)
1774
- array([[0, 0, 0, ..., 0, 0, 0],
1775
- [0, 20, 24, ..., 0, 100, 103],
1776
- ...,
1777
- [10, 11, 15, ..., 202, 202, 205],
1778
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1779
- """
1780
- image_b64 = convert_to_b64(image)
1781
- data = {
1782
- "image": image_b64,
1783
- "tool": "generate_normal",
1784
- "function_name": "dpt_hybrid_midas",
1785
- }
1786
-
1787
- answer = send_inference_request(data, "tools")
1788
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
1789
- return return_data
1790
-
1791
-
1792
1537
  def generate_pose_image(image: np.ndarray) -> np.ndarray:
1793
1538
  """'generate_pose_image' is a tool that generates an open pose bone/stick image from
1794
1539
  a given RGB image. The returned bone image is RGB with the pose and keypoints colored
@@ -1817,6 +1562,12 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
1817
1562
 
1818
1563
  pos_img = send_inference_request(data, "pose-detector", v2=True)
1819
1564
  return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
1565
+ _display_tool_trace(
1566
+ generate_pose_image.__name__,
1567
+ {},
1568
+ pos_img,
1569
+ image_b64,
1570
+ )
1820
1571
  return return_data
1821
1572
 
1822
1573
 
@@ -1851,130 +1602,28 @@ def template_match(
1851
1602
  template_image_b64 = convert_to_b64(template_image)
1852
1603
  data = {
1853
1604
  "image": image_b64,
1854
- "template": template_image_b64,
1855
- "tool": "template_match",
1856
- "function_name": "template_match",
1857
- }
1858
-
1859
- answer = send_inference_request(data, "tools")
1860
- return_data = []
1861
- for i in range(len(answer["bboxes"])):
1862
- return_data.append(
1863
- {
1864
- "score": round(answer["scores"][i], 2),
1865
- "bbox": normalize_bbox(answer["bboxes"][i], image_size),
1866
- }
1867
- )
1868
- return return_data
1869
-
1870
-
1871
- def minimum_distance(
1872
- det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
1873
- ) -> float:
1874
- """'minimum_distance' calculates the minimum distance between two detections which
1875
- can include bounding boxes and or masks. This will return the closest distance
1876
- between the objects, not the distance between the centers of the objects.
1877
-
1878
- Parameters:
1879
- det1 (Dict[str, Any]): The first detection of boxes or masks.
1880
- det2 (Dict[str, Any]): The second detection of boxes or masks.
1881
- image_size (Tuple[int, int]): The size of the image given as (height, width).
1882
-
1883
- Returns:
1884
- float: The closest distance between the two detections.
1885
-
1886
- Example
1887
- -------
1888
- >>> closest_distance(det1, det2, image_size)
1889
- 141.42
1890
- """
1891
-
1892
- if "mask" in det1 and "mask" in det2:
1893
- return closest_mask_distance(det1["mask"], det2["mask"])
1894
- elif "bbox" in det1 and "bbox" in det2:
1895
- return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
1896
- else:
1897
- raise ValueError("Both detections must have either bbox or mask")
1898
-
1899
-
1900
- def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
1901
- """'closest_mask_distance' calculates the closest distance between two masks.
1902
-
1903
- Parameters:
1904
- mask1 (np.ndarray): The first mask.
1905
- mask2 (np.ndarray): The second mask.
1906
-
1907
- Returns:
1908
- float: The closest distance between the two masks.
1909
-
1910
- Example
1911
- -------
1912
- >>> closest_mask_distance(mask1, mask2)
1913
- 0.5
1914
- """
1915
-
1916
- mask1 = np.clip(mask1, 0, 1)
1917
- mask2 = np.clip(mask2, 0, 1)
1918
- contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1919
- contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1920
- largest_contour1 = max(contours1, key=cv2.contourArea)
1921
- largest_contour2 = max(contours2, key=cv2.contourArea)
1922
- polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
1923
- polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
1924
- min_distance = np.inf
1925
-
1926
- small_polygon, larger_contour = (
1927
- (polygon1, largest_contour2)
1928
- if len(largest_contour1) < len(largest_contour2)
1929
- else (polygon2, largest_contour1)
1930
- )
1931
-
1932
- # For each point in the first polygon
1933
- for point in small_polygon:
1934
- # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
1935
-
1936
- distance = (
1937
- cv2.pointPolygonTest(
1938
- larger_contour, (point[0, 0].item(), point[0, 1].item()), True
1939
- )
1940
- * -1
1941
- )
1942
-
1943
- # If the distance is negative, the point is inside the polygon, so the distance is 0
1944
- if distance < 0:
1945
- continue
1946
- else:
1947
- # Update the minimum distance if the point is outside the polygon
1948
- min_distance = min(min_distance, distance)
1949
-
1950
- return min_distance if min_distance != np.inf else 0.0
1951
-
1952
-
1953
- def closest_box_distance(
1954
- box1: List[float], box2: List[float], image_size: Tuple[int, int]
1955
- ) -> float:
1956
- """'closest_box_distance' calculates the closest distance between two bounding boxes.
1957
-
1958
- Parameters:
1959
- box1 (List[float]): The first bounding box.
1960
- box2 (List[float]): The second bounding box.
1961
- image_size (Tuple[int, int]): The size of the image given as (height, width).
1962
-
1963
- Returns:
1964
- float: The closest distance between the two bounding boxes.
1965
-
1966
- Example
1967
- -------
1968
- >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
1969
- 141.42
1970
- """
1971
-
1972
- x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
1973
- x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
1605
+ "template": template_image_b64,
1606
+ "tool": "template_match",
1607
+ "function_name": "template_match",
1608
+ }
1974
1609
 
1975
- horizontal_distance = np.max([0, x21 - x12, x11 - x22])
1976
- vertical_distance = np.max([0, y21 - y12, y11 - y22])
1977
- return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1610
+ answer = send_inference_request(data, "tools")
1611
+ return_data = []
1612
+ for i in range(len(answer["bboxes"])):
1613
+ return_data.append(
1614
+ {
1615
+ "label": "match",
1616
+ "score": round(answer["scores"][i], 2),
1617
+ "bbox": normalize_bbox(answer["bboxes"][i], image_size),
1618
+ }
1619
+ )
1620
+ _display_tool_trace(
1621
+ template_match.__name__,
1622
+ {"template_image": template_image_b64},
1623
+ return_data,
1624
+ image_b64,
1625
+ )
1626
+ return return_data
1978
1627
 
1979
1628
 
1980
1629
  def flux_image_inpainting(
@@ -2064,6 +1713,12 @@ def flux_image_inpainting(
2064
1713
  )
2065
1714
 
2066
1715
  output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
1716
+ _display_tool_trace(
1717
+ flux_image_inpainting.__name__,
1718
+ payload,
1719
+ output_image,
1720
+ files,
1721
+ )
2067
1722
  return output_image
2068
1723
 
2069
1724
 
@@ -2106,9 +1761,124 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
2106
1761
  metadata_payload={"function_name": "siglip_classification"},
2107
1762
  )
2108
1763
 
1764
+ _display_tool_trace(
1765
+ siglip_classification.__name__,
1766
+ payload,
1767
+ response,
1768
+ files,
1769
+ )
2109
1770
  return response
2110
1771
 
2111
1772
 
1773
+ def minimum_distance(
1774
+ det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
1775
+ ) -> float:
1776
+ """'minimum_distance' calculates the minimum distance between two detections which
1777
+ can include bounding boxes and/or masks. This will return the closest distance
1778
+ between the objects, not the distance between the centers of the objects.
1779
+
1780
+ Parameters:
1781
+ det1 (Dict[str, Any]): The first detection of boxes or masks.
1782
+ det2 (Dict[str, Any]): The second detection of boxes or masks.
1783
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
1784
+
1785
+ Returns:
1786
+ float: The closest distance between the two detections.
1787
+
1788
+ Example
1789
+ -------
1790
+ >>> minimum_distance(det1, det2, image_size)
1791
+ 141.42
1792
+ """
1793
+
1794
+ if "mask" in det1 and "mask" in det2:
1795
+ return closest_mask_distance(det1["mask"], det2["mask"])
1796
+ elif "bbox" in det1 and "bbox" in det2:
1797
+ return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
1798
+ else:
1799
+ raise ValueError("Both detections must have either bbox or mask")
1800
+
1801
+
1802
+ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
1803
+ """'closest_mask_distance' calculates the closest distance between two masks.
1804
+
1805
+ Parameters:
1806
+ mask1 (np.ndarray): The first mask.
1807
+ mask2 (np.ndarray): The second mask.
1808
+
1809
+ Returns:
1810
+ float: The closest distance between the two masks.
1811
+
1812
+ Example
1813
+ -------
1814
+ >>> closest_mask_distance(mask1, mask2)
1815
+ 0.5
1816
+ """
1817
+
1818
+ mask1 = np.clip(mask1, 0, 1)
1819
+ mask2 = np.clip(mask2, 0, 1)
1820
+ contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1821
+ contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1822
+ largest_contour1 = max(contours1, key=cv2.contourArea)
1823
+ largest_contour2 = max(contours2, key=cv2.contourArea)
1824
+ polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
1825
+ polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
1826
+ min_distance = np.inf
1827
+
1828
+ small_polygon, larger_contour = (
1829
+ (polygon1, largest_contour2)
1830
+ if len(largest_contour1) < len(largest_contour2)
1831
+ else (polygon2, largest_contour1)
1832
+ )
1833
+
1834
+ # For each point in the first polygon
1835
+ for point in small_polygon:
1836
+ # Calculate the distance to the second polygon; -1 inverts the result since a point inside the polygon is positive
1837
+
1838
+ distance = (
1839
+ cv2.pointPolygonTest(
1840
+ larger_contour, (point[0, 0].item(), point[0, 1].item()), True
1841
+ )
1842
+ * -1
1843
+ )
1844
+
1845
+ # If the distance is negative, the point is inside the polygon, so the distance is 0
1846
+ if distance < 0:
1847
+ continue
1848
+ else:
1849
+ # Update the minimum distance if the point is outside the polygon
1850
+ min_distance = min(min_distance, distance)
1851
+
1852
+ return min_distance if min_distance != np.inf else 0.0
1853
+
1854
+
+ def closest_box_distance(
+     box1: List[float], box2: List[float], image_size: Tuple[int, int]
+ ) -> float:
+     """'closest_box_distance' calculates the closest distance between two bounding boxes.
+
+     Parameters:
+         box1 (List[float]): The first bounding box.
+         box2 (List[float]): The second bounding box.
+         image_size (Tuple[int, int]): The size of the image given as (height, width).
+
+     Returns:
+         float: The closest distance between the two bounding boxes.
+
+     Example
+     -------
+         >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400], image_size)
+         141.42
+     """
+
+     x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
+     x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
+
+     horizontal_distance = np.max([0, x21 - x12, x11 - x22])
+     vertical_distance = np.max([0, y21 - y12, y11 - y22])
+     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+
+
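# A worked sketch of the box-distance math above, assuming denormalize_bbox simply
# scales normalized coordinates by the image size; the boxes and image size are
# invented for illustration.
from vision_agent.tools import closest_box_distance  # import path assumed

box1 = [0.1, 0.1, 0.2, 0.2]  # -> roughly (100, 100, 200, 200) on a 1000x1000 image
box2 = [0.3, 0.3, 0.4, 0.4]  # -> roughly (300, 300, 400, 400)
# horizontal gap = 300 - 200 = 100, vertical gap = 300 - 200 = 100,
# so the distance is sqrt(100**2 + 100**2) ~= 141.42
print(closest_box_distance(box1, box2, (1000, 1000)))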
  # Utility and visualization functions


@@ -2625,6 +2395,197 @@ def _plot_counting(
      return image


+ class ODModels(str, Enum):
+     COUNTGD = "countgd"
+     FLORENCE2 = "florence2"
+     OWLV2 = "owlv2"
+
+
+ def od_sam2_video_tracking(
+     od_model: ODModels,
+     prompt: str,
+     frames: List[np.ndarray],
+     chunk_length: Optional[int] = 10,
+     fine_tune_id: Optional[str] = None,
+ ) -> List[List[Dict[str, Any]]]:
+
+     results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+
+     if chunk_length is None:
+         step = 1  # Process every frame
+     elif chunk_length <= 0:
+         raise ValueError("chunk_length must be a positive integer or None.")
+     else:
+         step = chunk_length  # Process frames with the specified step size
+
+     for idx in range(0, len(frames), step):
+         if od_model == ODModels.COUNTGD:
+             results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+             function_name = "countgd_object_detection"
+         elif od_model == ODModels.OWLV2:
+             results[idx] = owl_v2_image(
+                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+             )
+             function_name = "owl_v2_image"
+         elif od_model == ODModels.FLORENCE2:
+             results[idx] = florence2_sam2_image(
+                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+             )
+             function_name = "florence2_sam2_image"
+         else:
+             raise NotImplementedError(
+                 f"Object detection model '{od_model}' is not implemented."
+             )
+
+     image_size = frames[0].shape[:2]
+
+     def _transform_detections(
+         input_list: List[Optional[List[Dict[str, Any]]]]
+     ) -> List[Optional[Dict[str, Any]]]:
+         output_list: List[Optional[Dict[str, Any]]] = []
+
+         for idx, frame in enumerate(input_list):
+             if frame is not None:
+                 labels = [detection["label"] for detection in frame]
+                 bboxes = [
+                     denormalize_bbox(detection["bbox"], image_size)
+                     for detection in frame
+                 ]
+
+                 output_list.append(
+                     {
+                         "labels": labels,
+                         "bboxes": bboxes,
+                     }
+                 )
+             else:
+                 output_list.append(None)
+
+         return output_list
+
+     output = _transform_detections(results)
+
+     buffer_bytes = frames_to_bytes(frames)
+     files = [("video", buffer_bytes)]
+     payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+     metadata = {"function_name": function_name}
+
+     detections = send_task_inference_request(
+         payload,
+         "sam2",
+         files=files,
+         metadata=metadata,
+     )
+
+     return_data = []
+     for frame in detections:
+         return_frame_data = []
+         for detection in frame:
+             mask = rle_decode_array(detection["mask"])
+             label = str(detection["id"]) + ": " + detection["label"]
+             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+         return_data.append(return_frame_data)
+     return_data = add_bboxes_from_masks(return_data)
+     return nms(return_data, iou_threshold=0.95)
+
+
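# A minimal sketch of the sampling step above (the frame count is invented): with
# the default chunk_length of 10, the object detector only runs on every 10th frame
# and SAM2 propagates those detections across the frames in between.
num_frames = 25
chunk_length = 10
step = chunk_length if chunk_length is not None else 1
detector_frames = list(range(0, num_frames, step))
print(detector_frames)  # [0, 10, 20] -> only these frames are sent to the detector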
+ def countgd_sam2_video_tracking(
+     prompt: str,
+     frames: List[np.ndarray],
+     chunk_length: Optional[int] = 10,
+ ) -> List[List[Dict[str, Any]]]:
+     """'countgd_sam2_video_tracking' is a tool that can segment and track multiple
+     objects in a video given a text prompt such as category names or referring
+     expressions. The categories in the text prompt are separated by commas. It
+     returns a list of bounding boxes, label names, masks and associated probability
+     scores for each frame.
+
+     Parameters:
+         prompt (str): The prompt to ground to the video.
+         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+         chunk_length (Optional[int]): The number of frames between runs of the object
+             detector; SAM2 tracks the detections in between. Defaults to 10.
+
+     Returns:
+         List[List[Dict[str, Any]]]: A list of detections per frame. Each detection is
+             a dictionary containing the score, label, bounding box, and mask of a
+             detected object with normalized coordinates (xmin, ymin, xmax, ymax).
+             xmin and ymin are the coordinates of the top-left and xmax and ymax are
+             the coordinates of the bottom-right of the bounding box. The mask is a
+             binary 2D numpy array where 1 indicates the object and 0 indicates the
+             background.
+
+     Example
+     -------
+         >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+         [
+             [
+                 {
+                     'label': '0: dinosaur',
+                     'bbox': [0.1, 0.11, 0.35, 0.4],
+                     'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                         [0, 0, 0, ..., 0, 0, 0],
+                         ...,
+                         [0, 0, 0, ..., 0, 0, 0],
+                         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                 },
+             ],
+             ...
+         ]
+     """
+
+     return od_sam2_video_tracking(
+         ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+     )
+
+
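# Usage sketch for the CountGD-based tracker. The clip path and the frame-loading
# helper are placeholders (any routine that yields a list of RGB numpy arrays
# works); the import path is assumed.
from vision_agent.tools import countgd_sam2_video_tracking  # import path assumed

frames = load_video_frames("example_clip.mp4")  # hypothetical helper
tracks = countgd_sam2_video_tracking("car, person", frames, chunk_length=10)
for frame_idx, detections in enumerate(tracks):
    print(frame_idx, [d["label"] for d in detections])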
+ def owlv2_sam2_video_tracking(
+     prompt: str,
+     frames: List[np.ndarray],
+     chunk_length: Optional[int] = 10,
+     fine_tune_id: Optional[str] = None,
+ ) -> List[List[Dict[str, Any]]]:
+     """'owlv2_sam2_video_tracking' is a tool that can segment and track multiple
+     objects in a video given a text prompt such as category names or referring
+     expressions. The categories in the text prompt are separated by commas. It
+     returns a list of bounding boxes, label names, masks and associated probability
+     scores for each frame.
+
+     Parameters:
+         prompt (str): The prompt to ground to the video.
+         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+         chunk_length (Optional[int]): The number of frames between runs of the object
+             detector; SAM2 tracks the detections in between. Defaults to 10.
+         fine_tune_id (Optional[str]): Optional id of a fine-tuned model to use for the
+             per-frame object detection.
+
+     Returns:
+         List[List[Dict[str, Any]]]: A list of detections per frame. Each detection is
+             a dictionary containing the score, label, bounding box, and mask of a
+             detected object with normalized coordinates (xmin, ymin, xmax, ymax).
+             xmin and ymin are the coordinates of the top-left and xmax and ymax are
+             the coordinates of the bottom-right of the bounding box. The mask is a
+             binary 2D numpy array where 1 indicates the object and 0 indicates the
+             background.
+
+     Example
+     -------
+         >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
+         [
+             [
+                 {
+                     'label': '0: dinosaur',
+                     'bbox': [0.1, 0.11, 0.35, 0.4],
+                     'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                         [0, 0, 0, ..., 0, 0, 0],
+                         ...,
+                         [0, 0, 0, ..., 0, 0, 0],
+                         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                 },
+             ],
+             ...
+         ]
+     """
+
+     return od_sam2_video_tracking(
+         ODModels.OWLV2,
+         prompt=prompt,
+         frames=frames,
+         chunk_length=chunk_length,
+         fine_tune_id=fine_tune_id,
+     )
+
+
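# Sketch of the fine-tuned variant, reusing the frames list from the earlier sketch.
# Passing fine_tune_id routes the per-frame detections through a fine-tuned OWLv2
# model before SAM2 tracking; the id below is a placeholder, not a real job id.
from vision_agent.tools import owlv2_sam2_video_tracking  # import path assumed

tracks = owlv2_sam2_video_tracking(
    "defect",
    frames,
    chunk_length=10,
    fine_tune_id="00000000-0000-0000-0000-000000000000",
)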
  FUNCTION_TOOLS = [
      owl_v2_image,
      owl_v2_video,