vision-agent 0.2.210__py3-none-any.whl → 0.2.211__py3-none-any.whl

@@ -4,6 +4,7 @@ import logging
  import os
  import tempfile
  import urllib.request
+ from base64 import b64encode
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from functools import lru_cache
  from importlib import resources
@@ -14,6 +15,7 @@ from uuid import UUID
  import cv2
  import numpy as np
  import requests
+ from IPython.display import display
  from PIL import Image, ImageDraw, ImageFont
  from pillow_heif import register_heif_opener # type: ignore
  from pytube import YouTube # type: ignore
@@ -21,8 +23,8 @@ from pytube import YouTube # type: ignore
  from vision_agent.clients.landing_public_api import LandingPublicAPI
  from vision_agent.lmm.lmm import AnthropicLMM, OpenAILMM
  from vision_agent.tools.tool_utils import (
+ ToolCallTrace,
  add_bboxes_from_masks,
- filter_bboxes_by_threshold,
  get_tool_descriptions,
  get_tool_documentation,
  get_tools_df,
@@ -32,7 +34,7 @@ from vision_agent.tools.tool_utils import (
  send_task_inference_request,
  single_nms,
  )
- from vision_agent.tools.tools_types import JobStatus, ODResponseData
+ from vision_agent.tools.tools_types import JobStatus
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
  from vision_agent.utils.execute import FileSerializer, MimeType
  from vision_agent.utils.image_utils import (
@@ -41,7 +43,6 @@ from vision_agent.utils.image_utils import (
  convert_to_b64,
  denormalize_bbox,
  encode_image_bytes,
- get_image_size,
  normalize_bbox,
  numpy_to_bytes,
  rle_decode,
@@ -88,66 +89,33 @@ def get_tool_recommender() -> Sim:
  return load_cached_sim(TOOLS_DF)


- def grounding_dino(
- prompt: str,
- image: np.ndarray,
- box_threshold: float = 0.20,
- iou_threshold: float = 0.20,
- model_size: str = "large",
- ) -> List[Dict[str, Any]]:
- """'grounding_dino' is a tool that can detect and count multiple objects given a text
- prompt such as category names or referring expressions. The categories in text prompt
- are separated by commas or periods. It returns a list of bounding boxes with
- normalized coordinates, label names and associated probability scores.
-
- Parameters:
- prompt (str): The prompt to ground to the image.
- image (np.ndarray): The image to ground the prompt to.
- box_threshold (float, optional): The threshold for the box detection. Defaults
- to 0.20.
- iou_threshold (float, optional): The threshold for the Intersection over Union
- (IoU). Defaults to 0.20.
- model_size (str, optional): The size of the model to use.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
- bounding box of the detected objects with normalized coordinates between 0
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
- top-left and xmax and ymax are the coordinates of the bottom-right of the
- bounding box.
-
- Example
- -------
- >>> grounding_dino("car. dinosaur", image)
- [
- {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
- ]
- """
- image_size = image.shape[:2]
- image_b64 = convert_to_b64(image)
- if model_size not in ["large", "tiny"]:
- raise ValueError("model_size must be either 'large' or 'tiny'")
- request_data = {
- "prompt": prompt,
- "image": image_b64,
- "tool": (
- "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
- ),
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
- "function_name": "grounding_dino",
- }
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
- return_data = []
- for i in range(len(data["bboxes"])):
- return_data.append(
- {
- "score": round(data["scores"][i], 2),
- "label": data["labels"][i],
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
- }
- )
- return return_data
+ def _display_tool_trace(
+ function_name: str,
+ request: Dict[str, Any],
+ response: Any,
+ files: Union[List[Tuple[str, bytes]], str],
+ ) -> None:
+ # Sends data through IPython's display function so the front-end can show it. We use
+ # a function here instead of a decorator because we do not want to re-calculate data
+ # such as video bytes, which can be slow. Since this is calculated inside the
+ # function we can't capture it with a decorator without adding it as a return value,
+ # which would change the function signature and affect the agent.
+ files_in_b64: List[Tuple[str, str]]
+ if isinstance(files, str):
+ files_in_b64 = [("images", files)]
+ else:
+ files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
+
+ request["function_name"] = function_name
+ tool_call_trace = ToolCallTrace(
+ endpoint_url="",
+ type="tool_func_call",
+ request=request,
+ response={"data": response},
+ error=None,
+ files=files_in_b64,
+ )
+ display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)


  def owl_v2_image(
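To make the new tracing pattern easier to follow, here is a minimal, self-contained sketch of the call sequence a tool goes through: build the request payload, run inference, then hand the payload, response, and raw file bytes to the trace helper. The dict-based trace and the local display stub stand in for the real ToolCallTrace model and IPython's display; fake_detector, its payload, and the byte string are hypothetical.

from base64 import b64encode
from typing import Any, Dict, List, Tuple

def display(data: Dict[str, Any], raw: bool = False) -> None:
    # Stand-in for IPython.display.display so the sketch runs outside a notebook.
    print("would display:", list(data.keys()))

def _display_tool_trace_sketch(
    function_name: str,
    request: Dict[str, Any],
    response: Any,
    files: List[Tuple[str, bytes]],
) -> None:
    # Mirrors the helper above: encode the raw file bytes once, attach the calling
    # tool's name to the request, and hand everything to the front end as one record.
    files_in_b64 = [(name, b64encode(data).decode("utf-8")) for name, data in files]
    request["function_name"] = function_name
    trace = {
        "type": "tool_func_call",
        "request": request,
        "response": {"data": response},
        "files": files_in_b64,
    }
    display({"application/json": trace}, raw=True)

# Hypothetical tool call following the same pattern as the tools changed below.
payload = {"prompts": ["car"], "model": "owlv2"}
detections = [{"label": "car", "bounding_box": [10, 10, 50, 50], "score": 0.9}]
_display_tool_trace_sketch("fake_detector", payload, detections, [("image", b"not-a-real-png")])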
@@ -223,14 +191,21 @@ def owl_v2_image(
  # get the first frame
  bboxes = detections[0]
  bboxes_formatted = [
- ODResponseData(
- label=bbox["label"],
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
- score=round(bbox["score"], 2),
- )
+ {
+ "label": bbox["label"],
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+ "score": round(bbox["score"], 2),
+ }
  for bbox in bboxes
  ]
- return [bbox.model_dump() for bbox in bboxes_formatted]
+
+ _display_tool_trace(
+ owl_v2_image.__name__,
+ payload,
+ detections[0],
+ files,
+ )
+ return bboxes_formatted


  def owl_v2_video(
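This hunk, like several below, drops the ODResponseData wrapper and returns plain dictionaries. A small sketch of the resulting record shape, with a hypothetical keep_confident filter standing in for whatever a caller might do with the scores:

from typing import Any, Dict, List

# Shape of a single detection as now returned directly by the tool
# (previously built via ODResponseData(...).model_dump()).
detection: Dict[str, Any] = {
    "label": "car",
    "bbox": [0.12, 0.34, 0.56, 0.78],  # normalized xmin, ymin, xmax, ymax
    "score": 0.91,
}

def keep_confident(dets: List[Dict[str, Any]], threshold: float = 0.5) -> List[Dict[str, Any]]:
    # Hypothetical downstream filter, shown only to illustrate the dict keys.
    return [d for d in dets if d["score"] >= threshold]

print(keep_confident([detection]))  # keeps the single detection above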
@@ -309,81 +284,21 @@ def owl_v2_video(
  bboxes_formatted = []
  for frame_data in detections:
  bboxes_formatted_per_frame = [
- ODResponseData(
- label=bbox["label"],
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
- score=round(bbox["score"], 2),
- )
+ {
+ "label": bbox["label"],
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+ "score": round(bbox["score"], 2),
+ }
  for bbox in frame_data
  ]
  bboxes_formatted.append(bboxes_formatted_per_frame)
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
-
-
- def grounding_sam(
- prompt: str,
- image: np.ndarray,
- box_threshold: float = 0.20,
- iou_threshold: float = 0.20,
- ) -> List[Dict[str, Any]]:
- """'grounding_sam' is a tool that can segment multiple objects given a text prompt
- such as category names or referring expressions. The categories in text prompt are
- separated by commas or periods. It returns a list of bounding boxes, label names,
- mask file names and associated probability scores.
-
- Parameters:
- prompt (str): The prompt to ground to the image.
- image (np.ndarray): The image to ground the prompt to.
- box_threshold (float, optional): The threshold for the box detection. Defaults
- to 0.20.
- iou_threshold (float, optional): The threshold for the Intersection over Union
- (IoU). Defaults to 0.20.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
- bounding box, and mask of the detected objects with normalized coordinates
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
- the background.
-
- Example
- -------
- >>> grounding_sam("car. dinosaur", image)
- [
- {
- 'score': 0.99,
- 'label': 'dinosaur',
- 'bbox': [0.1, 0.11, 0.35, 0.4],
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0],
- ...,
- [0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
- },
- ]
- """
- image_size = image.shape[:2]
- image_b64 = convert_to_b64(image)
- request_data = {
- "prompt": prompt,
- "image": image_b64,
- "tool": "visual_grounding_segment",
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
- "function_name": "grounding_sam",
- }
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
- return_data = []
- for i in range(len(data["bboxes"])):
- return_data.append(
- {
- "score": round(data["scores"][i], 2),
- "label": data["labels"][i],
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
- "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
- }
- )
- return return_data
+ _display_tool_trace(
+ owl_v2_video.__name__,
+ payload,
+ detections[0],
+ files,
+ )
+ return bboxes_formatted


  def florence2_sam2_image(
@@ -460,6 +375,13 @@ def florence2_sam2_image(
  label = detection["label"]
  bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
  return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+
+ _display_tool_trace(
+ florence2_sam2_image.__name__,
+ payload,
+ detections[0],
+ files,
+ )
  return return_data


@@ -545,10 +467,36 @@ def florence2_sam2_video_tracking(
  for detection in frame:
  mask = rle_decode_array(detection["mask"])
  label = str(detection["id"]) + ": " + detection["label"]
- return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+ return_frame_data.append(
+ {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
+ )
  return_data.append(return_frame_data)
  return_data = add_bboxes_from_masks(return_data)
- return nms(return_data, iou_threshold=0.95)
+ return_data = nms(return_data, iou_threshold=0.95)
+
+ _display_tool_trace(
+ florence2_sam2_video_tracking.__name__,
+ payload,
+ [
+ [
+ {
+ "label": e["label"],
+ "score": e["score"],
+ "bbox": denormalize_bbox(e["bbox"], frames[0].shape[:2]),
+ "mask": e["rle"],
+ }
+ for e in lst
+ ]
+ for lst in return_data
+ ],
+ files,
+ )
+ # We save the RLE for display purposes; re-calculating the RLE can get very expensive.
+ # It is deleted here because we are returning the numpy masks instead.
+ for frame in return_data:
+ for obj in frame:
+ del obj["rle"]
+ return return_data


  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
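The tracking change above keeps the run-length encoding alongside the decoded mask just long enough to feed the display trace, then strips it before returning. A minimal sketch of that keep-then-strip step, using hypothetical frame data:

from typing import Any, Dict, List

def strip_rle(frames: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
    # Remove the cached RLE in place so callers only see the decoded masks,
    # mirroring the loop at the end of florence2_sam2_video_tracking.
    for frame in frames:
        for obj in frame:
            obj.pop("rle", None)
    return frames

frames = [[{"label": "1: person", "mask": [[0, 1], [1, 1]], "score": 1.0,
            "rle": {"counts": "12", "size": [2, 2]}}]]
print(strip_rle(frames))  # the "rle" key is gone, the mask remains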
@@ -603,86 +551,175 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
  box = normalize_bbox(box, image_size)
  output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})

- ocr_results = sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
- return ocr_results
+ _display_tool_trace(
+ ocr.__name__,
+ {},
+ data,
+ cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
+ )
+ return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
+
+
+ def _sam2(
+ image: np.ndarray,
+ detections: List[Dict[str, Any]],
+ image_size: Tuple[int, ...],
+ image_bytes: Optional[bytes] = None,
+ ) -> Dict[str, Any]:
+ if image_bytes is None:
+ image_bytes = numpy_to_bytes(image)
+
+ files = [("images", image_bytes)]
+ payload = {
+ "model": "sam2",
+ "bboxes": json.dumps(
+ [
+ {
+ "labels": [d["label"] for d in detections],
+ "bboxes": [
+ denormalize_bbox(d["bbox"], image_size) for d in detections
+ ],
+ }
+ ]
+ ),
+ }
+
+ metadata = {"function_name": "sam2"}
+ pred_detections = send_task_inference_request(
+ payload, "sam2", files=files, metadata=metadata
+ )
+ frame = pred_detections[0]
+ return_data = []
+ display_data = []
+ for inp_detection, detection in zip(detections, frame):
+ mask = rle_decode_array(detection["mask"])
+ label = detection["label"]
+ bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
+ return_data.append(
+ {
+ "label": label,
+ "bbox": bbox,
+ "mask": mask,
+ "score": inp_detection["score"],
+ }
+ )
+ display_data.append(
+ {
+ "label": label,
+ "bbox": detection["bounding_box"],
+ "mask": detection["mask"],
+ "score": inp_detection["score"],
+ }
+ )
+ return {"files": files, "return_data": return_data, "display_data": display_data}


- def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
- """'loca_zero_shot_counting' is a tool that counts the dominant foreground object given
- an image and no other information about the content. It returns only the count of
- the objects in the image.
+ def sam2(
+ image: np.ndarray,
+ detections: List[Dict[str, Any]],
+ ) -> List[Dict[str, Any]]:
+ """'sam2' is a tool that can segment multiple objects given an input bounding box,
+ label and score. It returns a set of masks along with the corresponding bounding
+ boxes and labels.

  Parameters:
- image (np.ndarray): The image that contains lot of instances of a single object
+ image (np.ndarray): The image that contains multiple instances of the object.
+ detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
+ label, and bounding box of the detected objects with normalized coordinates
+ between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+ of the top-left and xmax and ymax are the coordinates of the bottom-right of
+ the bounding box.

  Returns:
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
- value, e.g. {count: 12} and a heat map for visualization purposes.
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+ bounding box, and mask of the detected objects with normalized coordinates
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+ the background.

  Example
  -------
- >>> loca_zero_shot_counting(image)
- {'count': 83,
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
- [ 0, 0, 0, ..., 0, 0, 0],
- [ 0, 0, 0, ..., 0, 0, 1],
- ...,
- [ 0, 0, 0, ..., 30, 35, 41],
- [ 0, 0, 0, ..., 41, 47, 53],
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
+ >>> sam2(image, [
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ ])
+ [
+ {
+ 'score': 0.49,
+ 'label': 'flower',
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0],
+ ...,
+ [0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+ },
+ ]
  """
+ image_size = image.shape[:2]
+ ret = _sam2(image, detections, image_size)
+ _display_tool_trace(
+ sam2.__name__,
+ {},
+ ret["display_data"],
+ ret["files"],
+ )

- image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "function_name": "loca_zero_shot_counting",
- }
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
- return resp_data
+ return ret["return_data"] # type: ignore


- def loca_visual_prompt_counting(
- image: np.ndarray, visual_prompt: Dict[str, List[float]]
+ def _countgd_object_detection(
+ prompt: str,
+ image: np.ndarray,
+ box_threshold: float,
+ image_size: Tuple[int, ...],
+ image_bytes: Optional[bytes] = None,
  ) -> Dict[str, Any]:
- """'loca_visual_prompt_counting' is a tool that counts the dominant foreground object
- given an image and a visual prompt which is a bounding box describing the object.
- It returns only the count of the objects in the image.
+ if image_bytes is None:
+ image_bytes = numpy_to_bytes(image)

- Parameters:
- image (np.ndarray): The image that contains lot of instances of a single object
- visual_prompt (Dict[str, List[float]]): Bounding box of the object in
- format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
+ files = [("image", image_bytes)]
+ prompts = [p.strip() for p in prompt.split(", ")]

- Returns:
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
- value, e.g. {count: 12} and a heat map for visualization purposes.
+ def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
+ payload = {
+ "prompts": [prompt],
+ "confidence": box_threshold, # still not being used in the API
+ "model": "countgd",
+ }
+ metadata = {"function_name": "countgd_counting"}

- Example
- -------
- >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
- {'count': 83,
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
- [ 0, 0, 0, ..., 0, 0, 0],
- [ 0, 0, 0, ..., 0, 0, 1],
- ...,
- [ 0, 0, 0, ..., 30, 35, 41],
- [ 0, 0, 0, ..., 41, 47, 53],
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
- """
+ detections = send_task_inference_request(
+ payload, "text-to-object-detection", files=files, metadata=metadata
+ )
+ # get the first frame
+ return detections[0] # type: ignore

- image_size = get_image_size(image)
- bbox = visual_prompt["bbox"]
- image_b64 = convert_to_b64(image)
+ bboxes = []
+ with ThreadPoolExecutor() as executor:
+ futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
+ for future in as_completed(futures):
+ bboxes.extend(future.result())

- data = {
- "image": image_b64,
- "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
- "function_name": "loca_visual_prompt_counting",
- }
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
- return resp_data
+ return_data = [
+ {
+ "label": bbox["label"],
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+ "score": round(bbox["score"], 2),
+ }
+ for bbox in bboxes
+ ]
+
+ return_data = single_nms(return_data, iou_threshold=0.80)
+ display_data = [
+ {
+ "label": e["label"],
+ "score": e["score"],
+ "bbox": denormalize_bbox(e["bbox"], image_size),
+ }
+ for e in return_data
+ ]
+ return {"files": files, "return_data": return_data, "display_data": display_data}


  def countgd_object_detection(
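The ocr, sam2, and countgd changes above converge on private helpers that return one dict with 'files', 'return_data', and 'display_data', so the public tool can return normalized detections while the trace gets pixel-space data plus the already-encoded request files. A sketch of that contract with made-up values; split_helper_result is a hypothetical convenience, the tools in the diff index the dict directly:

from typing import Any, Dict, List, Tuple

HelperResult = Dict[str, Any]  # {"files": [...], "return_data": [...], "display_data": [...]}

def split_helper_result(
    ret: HelperResult,
) -> Tuple[List[Tuple[str, bytes]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    return ret["files"], ret["return_data"], ret["display_data"]

example: HelperResult = {
    "files": [("image", b"encoded-image-bytes")],
    "return_data": [{"label": "flower", "bbox": [0.1, 0.1, 0.4, 0.4], "score": 0.49}],
    "display_data": [{"label": "flower", "bbox": [64, 48, 256, 192], "score": 0.49}],
}
files, return_data, display_data = split_helper_result(example)
print(return_data)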
@@ -723,121 +760,17 @@ def countgd_object_detection(
  if image_size[0] < 1 or image_size[1] < 1:
  return []

- buffer_bytes = numpy_to_bytes(image)
- files = [("image", buffer_bytes)]
- prompts = [p.strip() for p in prompt.split(", ")]
-
- def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
- payload = {
- "prompts": [prompt],
- "confidence": box_threshold, # still not being used in the API
- "model": "countgd",
- }
- metadata = {"function_name": "countgd_counting"}
-
- detections = send_task_inference_request(
- payload, "text-to-object-detection", files=files, metadata=metadata
- )
- # get the first frame
- return detections[0] # type: ignore
-
- bboxes = []
- with ThreadPoolExecutor() as executor:
- futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
- for future in as_completed(futures):
- bboxes.extend(future.result())
-
- bboxes_formatted = [
- ODResponseData(
- label=bbox["label"],
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
- score=round(bbox["score"], 2),
- )
- for bbox in bboxes
- ]
- # TODO: remove this once we start to use the confidence on countgd
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
- return_data = [bbox.model_dump() for bbox in filtered_bboxes]
- return single_nms(return_data, iou_threshold=0.80)
-
-
- def sam2(
- image: np.ndarray,
- detections: List[Dict[str, Any]],
- ) -> List[Dict[str, Any]]:
- """'sam2' is a tool that can segment multiple objects given an input bounding box,
- label and score. It returns a set of masks along with the corresponding bounding
- boxes and labels.
-
- Parameters:
- image (np.ndarray): The image that contains multiple instances of the object.
- detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
- label, and bounding box of the detected objects with normalized coordinates
- between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
- of the top-left and xmax and ymax are the coordinates of the bottom-right of
- the bounding box.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
- bounding box, and mask of the detected objects with normalized coordinates
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
- the background.
-
- Example
- -------
- >>> sam2(image, [
- {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- ])
- [
- {
- 'score': 0.49,
- 'label': 'flower',
- 'bbox': [0.1, 0.11, 0.35, 0.4],
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0],
- ...,
- [0, 0, 0, ..., 0, 0, 0],
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
- },
- ]
- """
- image_size = image.shape[:2]
-
- files = [("images", numpy_to_bytes(image))]
- payload = {
- "model": "sam2",
- "bboxes": json.dumps(
- [
- {
- "labels": [d["label"] for d in detections],
- "bboxes": [
- denormalize_bbox(d["bbox"], image_size) for d in detections
- ],
- }
- ]
- ),
- }
- metadata = {"function_name": "sam2"}
- pred_detections = send_task_inference_request(
- payload, "sam2", files=files, metadata=metadata
- )
- frame = pred_detections[0]
- return_data = []
- for inp_detection, detection in zip(detections, frame):
- mask = rle_decode_array(detection["mask"])
- label = detection["label"]
- bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
- return_data.append(
- {
- "label": label,
- "bbox": bbox,
- "mask": mask,
- "score": inp_detection["score"],
- }
- )
- return return_data
+ ret = _countgd_object_detection(prompt, image, box_threshold, image_size)
+ _display_tool_trace(
+ countgd_object_detection.__name__,
+ {
+ "prompts": prompt,
+ "confidence": box_threshold,
+ },
+ ret["display_data"],
+ ret["files"],
+ )
+ return ret["return_data"] # type: ignore


  def countgd_sam2_object_detection(
@@ -881,9 +814,23 @@ def countgd_sam2_object_detection(
  },
  ]
  """
- detections = countgd_object_detection(prompt, image, box_threshold)
- detections_with_masks = sam2(image, detections)
- return detections_with_masks
+
+ od_ret = _countgd_object_detection(prompt, image, box_threshold, image.shape[:2])
+ seg_ret = _sam2(
+ image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+ )
+
+ _display_tool_trace(
+ countgd_sam2_object_detection.__name__,
+ {
+ "prompts": prompt,
+ "confidence": box_threshold,
+ },
+ seg_ret["display_data"],
+ seg_ret["files"],
+ )
+
+ return seg_ret["return_data"] # type: ignore


  def countgd_example_based_counting(
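countgd_sam2_object_detection now forwards od_ret["files"][0][1] into _sam2 as image_bytes, so the frame is encoded once and reused for both the detection and the segmentation request. A toy sketch of that reuse; encode_once and the tobytes call are stand-ins for the real numpy_to_bytes helper:

import numpy as np
from typing import List, Tuple

def encode_once(image: np.ndarray) -> Tuple[List[Tuple[str, bytes]], bytes]:
    # Encode the frame a single time and hand the same bytes to both requests.
    image_bytes = image.tobytes()  # hypothetical encoding; the real helper produces compressed image bytes
    return [("image", image_bytes)], image_bytes

files, reused_bytes = encode_once(np.zeros((4, 4, 3), dtype=np.uint8))
assert files[0][1] is reused_bytes  # same buffer, no second encode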
@@ -941,76 +888,28 @@ def countgd_example_based_counting(
  # get the first frame
  bboxes_per_frame = detections[0]
  bboxes_formatted = [
- ODResponseData(
- label=bbox["label"],
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
- score=round(bbox["score"], 2),
- )
+ {
+ "label": bbox["label"],
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+ "score": round(bbox["score"], 2),
+ }
  for bbox in bboxes_per_frame
  ]
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
- return [bbox.model_dump() for bbox in filtered_bboxes]
-
-
- def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
- """'florence2_roberta_vqa' is a tool that takes an image and analyzes
- its contents, generates detailed captions and then tries to answer the given
- question using the generated context. It returns text as an answer to the question.
-
- Parameters:
- prompt (str): The question about the image
- image (np.ndarray): The reference image used for the question
-
- Returns:
- str: A string which is the answer to the given prompt.
-
- Example
- -------
- >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
- 'white tiger'
- """
-
- image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "question": prompt,
- "function_name": "florence2_roberta_vqa",
- }
-
- answer = send_inference_request(data, "florence2-qa", v2=True)
- return answer # type: ignore
-
-
- def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
- """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
- including regular images or images of documents or presentations. It returns text
- as an answer to the question.
-
- Parameters:
- prompt (str): The question about the image
- image (np.ndarray): The reference image used for the question
-
- Returns:
- str: A string which is the answer to the given prompt.
-
- Example
- -------
- >>> ixc25_image_vqa('What is the cat doing?', image)
- 'drinking milk'
- """
- if image.shape[0] < 1 or image.shape[1] < 1:
- raise ValueError(f"Image is empty, image shape: {image.shape}")
-
- buffer_bytes = numpy_to_bytes(image)
- files = [("image", buffer_bytes)]
- payload = {
- "prompt": prompt,
- "function_name": "ixc25_image_vqa",
- }
- data: Dict[str, Any] = send_inference_request(
- payload, "internlm-xcomposer2", files=files, v2=True
+ _display_tool_trace(
+ countgd_example_based_counting.__name__,
+ payload,
+ [
+ {
+ "label": e["label"],
+ "score": e["score"],
+ "bbox": denormalize_bbox(e["bbox"], image_size),
+ }
+ for e in bboxes_formatted
+ ],
+ files,
  )
- return cast(str, data["answer"])
+
+ return bboxes_formatted


  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
@@ -1047,61 +946,13 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
  data: Dict[str, Any] = send_inference_request(
  payload, "image-to-text", files=files, v2=True
  )
- return cast(str, data)
-
-
- def claude35_text_extraction(image: np.ndarray) -> str:
- """'claude35_text_extraction' is a tool that can extract text from an image. It
- returns the extracted text as a string and can be used as an alternative to OCR if
- you do not need to know the exact bounding box of the text.
-
- Parameters:
- image (np.ndarray): The image to extract text from.
-
- Returns:
- str: The extracted text from the image.
- """
-
- lmm = AnthropicLMM()
- buffer = io.BytesIO()
- Image.fromarray(image).save(buffer, format="PNG")
- image_bytes = buffer.getvalue()
- image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
- text = lmm.generate(
- "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
- [image_b64],
- )
- return cast(str, text)
-
-
- def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
- """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
- including regular videos or videos of documents or presentations. It returns text
- as an answer to the question.
-
- Parameters:
- prompt (str): The question about the video
- frames (List[np.ndarray]): The reference frames used for the question
-
- Returns:
- str: A string which is the answer to the given prompt.
-
- Example
- -------
- >>> ixc25_video_vqa('Which football player made the goal?', frames)
- 'Lionel Messi'
- """
-
- buffer_bytes = frames_to_bytes(frames)
- files = [("video", buffer_bytes)]
- payload = {
- "prompt": prompt,
- "function_name": "ixc25_video_vqa",
- }
- data: Dict[str, Any] = send_inference_request(
- payload, "internlm-xcomposer2", files=files, v2=True
+ _display_tool_trace(
+ qwen2_vl_images_vqa.__name__,
+ payload,
+ cast(str, data),
+ files,
  )
- return cast(str, data["answer"])
+ return cast(str, data)


  def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
@@ -1135,9 +986,39 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
  data: Dict[str, Any] = send_inference_request(
  payload, "image-to-text", files=files, v2=True
  )
+ _display_tool_trace(
+ qwen2_vl_video_vqa.__name__,
+ payload,
+ cast(str, data),
+ files,
+ )
  return cast(str, data)


+ def claude35_text_extraction(image: np.ndarray) -> str:
+ """'claude35_text_extraction' is a tool that can extract text from an image. It
+ returns the extracted text as a string and can be used as an alternative to OCR if
+ you do not need to know the exact bounding box of the text.
+
+ Parameters:
+ image (np.ndarray): The image to extract text from.
+
+ Returns:
+ str: The extracted text from the image.
+ """
+
+ lmm = AnthropicLMM()
+ buffer = io.BytesIO()
+ Image.fromarray(image).save(buffer, format="PNG")
+ image_bytes = buffer.getvalue()
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+ text = lmm.generate(
+ "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
+ [image_b64],
+ )
+ return cast(str, text)
+
+
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
  including regular images or images of documents or presentations. It returns text
@@ -1201,36 +1082,6 @@ def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
  return cast(str, resp)


- def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
- """'git_vqa_v2' is a tool that can answer questions about the visual
- contents of an image given a question and an image. It returns an answer to the
- question
-
- Parameters:
- prompt (str): The question about the image
- image (np.ndarray): The reference image used for the question
-
- Returns:
- str: A string which is the answer to the given prompt.
-
- Example
- -------
- >>> git_vqa_v2('What is the cat doing ?', image)
- 'drinking milk'
- """
-
- image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "prompt": prompt,
- "tool": "image_question_answering",
- "function_name": "git_vqa_v2",
- }
-
- answer = send_inference_request(data, "tools")
- return answer["text"][0] # type: ignore
-
-
  def video_temporal_localization(
  prompt: str,
  frames: List[np.ndarray],
@@ -1274,70 +1125,48 @@ def video_temporal_localization(
  data = send_inference_request(
  payload, "video-temporal-localization", files=files, v2=True
  )
+ _display_tool_trace(
+ video_temporal_localization.__name__,
+ payload,
+ data,
+ files,
+ )
  return [cast(float, value) for value in data]


- def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
- """'clip' is a tool that can classify an image or a cropped detection given a list
- of input classes or tags. It returns the same list of the input classes along with
- their probability scores based on image content.
+ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
+ """'vit_image_classification' is a tool that can classify an image. It returns a
+ list of classes and their probability scores based on image content.

  Parameters:
  image (np.ndarray): The image to classify or tag
- classes (List[str]): The list of classes or tags that is associated with the image

  Returns:
  Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
- contains a list of given labels and other a list of scores.
+ contains a list of labels and other a list of scores.

  Example
  -------
- >>> clip(image, ['dog', 'cat', 'bird'])
- {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
+ >>> vit_image_classification(image)
+ {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
  """
-
  if image.shape[0] < 1 or image.shape[1] < 1:
  return {"labels": [], "scores": []}

  image_b64 = convert_to_b64(image)
  data = {
- "prompt": ",".join(classes),
  "image": image_b64,
- "tool": "closed_set_image_classification",
- "function_name": "clip",
- }
- resp_data: dict[str, Any] = send_inference_request(data, "tools")
- resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
- return resp_data
-
-
- def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
- """'vit_image_classification' is a tool that can classify an image. It returns a
- list of classes and their probability scores based on image content.
-
- Parameters:
- image (np.ndarray): The image to classify or tag
-
- Returns:
- Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
- contains a list of labels and other a list of scores.
-
- Example
- -------
- >>> vit_image_classification(image)
- {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
- """
- if image.shape[0] < 1 or image.shape[1] < 1:
- return {"labels": [], "scores": []}
-
- image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "tool": "image_classification",
- "function_name": "vit_image_classification",
+ "tool": "image_classification",
+ "function_name": "vit_image_classification",
  }
  resp_data: dict[str, Any] = send_inference_request(data, "tools")
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+ _display_tool_trace(
+ vit_image_classification.__name__,
+ data,
+ resp_data,
+ image_b64,
+ )
  return resp_data


@@ -1369,65 +1198,15 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
  data, "nsfw-classification", v2=True
  )
  resp_data["score"] = round(resp_data["score"], 4)
+ _display_tool_trace(
+ vit_nsfw_classification.__name__,
+ data,
+ resp_data,
+ image_b64,
+ )
  return resp_data


- def blip_image_caption(image: np.ndarray) -> str:
- """'blip_image_caption' is a tool that can caption an image based on its contents. It
- returns a text describing the image.
-
- Parameters:
- image (np.ndarray): The image to caption
-
- Returns:
- str: A string which is the caption for the given image.
-
- Example
- -------
- >>> blip_image_caption(image)
- 'This image contains a cat sitting on a table with a bowl of milk.'
- """
-
- image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "tool": "image_captioning",
- "function_name": "blip_image_caption",
- }
-
- answer = send_inference_request(data, "tools")
- return answer["text"][0] # type: ignore
-
-
- def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
- """'florence2_image_caption' is a tool that can caption or describe an image based
- on its contents. It returns a text describing the image.
-
- Parameters:
- image (np.ndarray): The image to caption
- detail_caption (bool): If True, the caption will be as detailed as possible else
- the caption will be a brief description.
-
- Returns:
- str: A string which is the caption for the given image.
-
- Example
- -------
- >>> florence2_image_caption(image, False)
- 'This image contains a cat sitting on a table with a bowl of milk.'
- """
- image_b64 = convert_to_b64(image)
- task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
- data = {
- "image": image_b64,
- "task": task,
- "function_name": "florence2_image_caption",
- }
-
- answer = send_inference_request(data, "florence2", v2=True)
- return answer[task] # type: ignore
-
-
  def florence2_phrase_grounding(
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
  ) -> List[Dict[str, Any]]:
@@ -1490,15 +1269,21 @@ def florence2_phrase_grounding(
  # get the first frame
  bboxes = detections[0]
  bboxes_formatted = [
- ODResponseData(
- label=bbox["label"],
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
- score=round(bbox["score"], 2),
- )
+ {
+ "label": bbox["label"],
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+ "score": round(bbox["score"], 2),
+ }
  for bbox in bboxes
  ]

- return [bbox.model_dump() for bbox in bboxes_formatted]
+ _display_tool_trace(
+ florence2_phrase_grounding.__name__,
+ payload,
+ detections[0],
+ files,
+ )
+ return [bbox for bbox in bboxes_formatted]


  def florence2_phrase_grounding_video(
@@ -1566,15 +1351,21 @@ def florence2_phrase_grounding_video(
  bboxes_formatted = []
  for frame_data in detections:
  bboxes_formatted_per_frame = [
- ODResponseData(
- label=bbox["label"],
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
- score=round(bbox["score"], 2),
- )
+ {
+ "label": bbox["label"],
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+ "score": round(bbox["score"], 2),
+ }
  for bbox in frame_data
  ]
  bboxes_formatted.append(bboxes_formatted_per_frame)
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+ _display_tool_trace(
+ florence2_phrase_grounding_video.__name__,
+ payload,
+ detections,
+ files,
+ )
+ return bboxes_formatted


  def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -1621,6 +1412,12 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
  "score": 1.0,
  }
  )
+ _display_tool_trace(
+ florence2_ocr.__name__,
+ {},
+ detections,
+ image_b64,
+ )
  return return_data


@@ -1683,6 +1480,12 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
  ),
  }
  )
+ _display_tool_trace(
+ detr_segmentation.__name__,
+ {},
+ return_data,
+ image_b64,
+ )
  return return_data


@@ -1721,74 +1524,15 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
  depth_map_np.max() - depth_map_np.min()
  )
  depth_map_np = (255 * depth_map_np).astype(np.uint8)
+ _display_tool_trace(
+ depth_anything_v2.__name__,
+ {},
+ depth_map,
+ image_b64,
+ )
  return depth_map_np


- def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
- """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
- to generate a soft edge image (HED) from a given RGB image. The returned image is
- monochrome and represents object boundaries as soft white edges on black background
-
- Parameters:
- image (np.ndarray): The image to used to generate soft edge image
-
- Returns:
- np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
-
- Example
- -------
- >>> generate_soft_edge_image(image)
- array([[0, 0, 0, ..., 0, 0, 0],
- [0, 20, 24, ..., 0, 100, 103],
- ...,
- [10, 11, 15, ..., 202, 202, 205],
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
- """
- image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "tool": "generate_hed",
- "function_name": "generate_soft_edge_image",
- }
-
- answer = send_inference_request(data, "tools")
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
- return return_data
-
-
- def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
- """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
- image. The returned RGB image is texture mapped image of the surface normals and the
- RGB values represent the surface normals in the x, y, z directions.
-
- Parameters:
- image (np.ndarray): The image to used to generate normal image
-
- Returns:
- np.ndarray: A mapped normal image with RGB pixel values indicating surface
- normals in x, y, z directions.
-
- Example
- -------
- >>> dpt_hybrid_midas(image)
- array([[0, 0, 0, ..., 0, 0, 0],
- [0, 20, 24, ..., 0, 100, 103],
- ...,
- [10, 11, 15, ..., 202, 202, 205],
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
- """
- image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "tool": "generate_normal",
- "function_name": "dpt_hybrid_midas",
- }
-
- answer = send_inference_request(data, "tools")
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
- return return_data
-
-
  def generate_pose_image(image: np.ndarray) -> np.ndarray:
  """'generate_pose_image' is a tool that generates a open pose bone/stick image from
  a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
@@ -1817,6 +1561,12 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:

  pos_img = send_inference_request(data, "pose-detector", v2=True)
  return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
+ _display_tool_trace(
+ generate_pose_image.__name__,
+ {},
+ pos_img,
+ image_b64,
+ )
  return return_data


@@ -1861,120 +1611,18 @@ def template_match(
  for i in range(len(answer["bboxes"])):
  return_data.append(
  {
+ "label": "match",
  "score": round(answer["scores"][i], 2),
  "bbox": normalize_bbox(answer["bboxes"][i], image_size),
  }
  )
- return return_data
-
-
- def minimum_distance(
- det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
- ) -> float:
- """'minimum_distance' calculates the minimum distance between two detections which
- can include bounding boxes and or masks. This will return the closest distance
- between the objects, not the distance between the centers of the objects.
-
- Parameters:
- det1 (Dict[str, Any]): The first detection of boxes or masks.
- det2 (Dict[str, Any]): The second detection of boxes or masks.
- image_size (Tuple[int, int]): The size of the image given as (height, width).
-
- Returns:
- float: The closest distance between the two detections.
-
- Example
- -------
- >>> closest_distance(det1, det2, image_size)
- 141.42
- """
-
- if "mask" in det1 and "mask" in det2:
- return closest_mask_distance(det1["mask"], det2["mask"])
- elif "bbox" in det1 and "bbox" in det2:
- return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
- else:
- raise ValueError("Both detections must have either bbox or mask")
-
-
- def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
- """'closest_mask_distance' calculates the closest distance between two masks.
-
- Parameters:
- mask1 (np.ndarray): The first mask.
- mask2 (np.ndarray): The second mask.
-
- Returns:
- float: The closest distance between the two masks.
-
- Example
- -------
- >>> closest_mask_distance(mask1, mask2)
- 0.5
- """
-
- mask1 = np.clip(mask1, 0, 1)
- mask2 = np.clip(mask2, 0, 1)
- contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
- contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
- largest_contour1 = max(contours1, key=cv2.contourArea)
- largest_contour2 = max(contours2, key=cv2.contourArea)
- polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
- polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
- min_distance = np.inf
-
- small_polygon, larger_contour = (
- (polygon1, largest_contour2)
- if len(largest_contour1) < len(largest_contour2)
- else (polygon2, largest_contour1)
+ _display_tool_trace(
+ template_match.__name__,
+ {"template_image": template_image_b64},
+ return_data,
+ image_b64,
  )
-
- # For each point in the first polygon
- for point in small_polygon:
- # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
-
- distance = (
- cv2.pointPolygonTest(
- larger_contour, (point[0, 0].item(), point[0, 1].item()), True
- )
- * -1
- )
-
- # If the distance is negative, the point is inside the polygon, so the distance is 0
- if distance < 0:
- continue
- else:
- # Update the minimum distance if the point is outside the polygon
- min_distance = min(min_distance, distance)
-
- return min_distance if min_distance != np.inf else 0.0
-
-
- def closest_box_distance(
- box1: List[float], box2: List[float], image_size: Tuple[int, int]
- ) -> float:
- """'closest_box_distance' calculates the closest distance between two bounding boxes.
-
- Parameters:
- box1 (List[float]): The first bounding box.
- box2 (List[float]): The second bounding box.
- image_size (Tuple[int, int]): The size of the image given as (height, width).
-
- Returns:
- float: The closest distance between the two bounding boxes.
-
- Example
- -------
- >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
- 141.42
- """
-
- x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
- x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
-
- horizontal_distance = np.max([0, x21 - x12, x11 - x22])
- vertical_distance = np.max([0, y21 - y12, y11 - y22])
- return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+ return return_data


  def flux_image_inpainting(
@@ -2064,6 +1712,12 @@ def flux_image_inpainting(
  )

  output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
+ _display_tool_trace(
+ flux_image_inpainting.__name__,
+ payload,
+ output_image,
+ files,
+ )
  return output_image


@@ -2106,9 +1760,124 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
  metadata_payload={"function_name": "siglip_classification"},
  )

+ _display_tool_trace(
+ siglip_classification.__name__,
+ payload,
+ response,
+ files,
+ )
  return response


+ def minimum_distance(
+ det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
+ ) -> float:
+ """'minimum_distance' calculates the minimum distance between two detections which
+ can include bounding boxes and or masks. This will return the closest distance
+ between the objects, not the distance between the centers of the objects.
+
+ Parameters:
+ det1 (Dict[str, Any]): The first detection of boxes or masks.
+ det2 (Dict[str, Any]): The second detection of boxes or masks.
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
+
+ Returns:
+ float: The closest distance between the two detections.
+
+ Example
+ -------
+ >>> closest_distance(det1, det2, image_size)
+ 141.42
+ """
+
+ if "mask" in det1 and "mask" in det2:
+ return closest_mask_distance(det1["mask"], det2["mask"])
+ elif "bbox" in det1 and "bbox" in det2:
+ return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
+ else:
+ raise ValueError("Both detections must have either bbox or mask")
+
+
+ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
+ """'closest_mask_distance' calculates the closest distance between two masks.
+
+ Parameters:
+ mask1 (np.ndarray): The first mask.
+ mask2 (np.ndarray): The second mask.
+
+ Returns:
+ float: The closest distance between the two masks.
+
+ Example
+ -------
+ >>> closest_mask_distance(mask1, mask2)
+ 0.5
+ """
+
+ mask1 = np.clip(mask1, 0, 1)
+ mask2 = np.clip(mask2, 0, 1)
+ contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+ contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+ largest_contour1 = max(contours1, key=cv2.contourArea)
+ largest_contour2 = max(contours2, key=cv2.contourArea)
+ polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
+ polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
+ min_distance = np.inf
+
+ small_polygon, larger_contour = (
+ (polygon1, largest_contour2)
+ if len(largest_contour1) < len(largest_contour2)
+ else (polygon2, largest_contour1)
+ )
+
+ # For each point in the first polygon
+ for point in small_polygon:
+ # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
+
+ distance = (
+ cv2.pointPolygonTest(
+ larger_contour, (point[0, 0].item(), point[0, 1].item()), True
+ )
+ * -1
+ )
+
+ # If the distance is negative, the point is inside the polygon, so the distance is 0
+ if distance < 0:
+ continue
+ else:
+ # Update the minimum distance if the point is outside the polygon
+ min_distance = min(min_distance, distance)
+
+ return min_distance if min_distance != np.inf else 0.0
+
+
+ def closest_box_distance(
+ box1: List[float], box2: List[float], image_size: Tuple[int, int]
+ ) -> float:
+ """'closest_box_distance' calculates the closest distance between two bounding boxes.
+
+ Parameters:
+ box1 (List[float]): The first bounding box.
+ box2 (List[float]): The second bounding box.
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
+
+ Returns:
+ float: The closest distance between the two bounding boxes.
+
+ Example
+ -------
+ >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+ 141.42
+ """
+
+ x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
+ x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
+
+ horizontal_distance = np.max([0, x21 - x12, x11 - x22])
+ vertical_distance = np.max([0, y21 - y12, y11 - y22])
+ return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+
+
  # Utility and visualization functions

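As a quick arithmetic check of the closest_box_distance example above (assuming the boxes are already in pixel coordinates, so denormalize_bbox leaves them unchanged):

import math

# Boxes [100, 100, 200, 200] and [300, 300, 400, 400]: both gaps are
# max(0, 300 - 200, 100 - 400) = 100, so the closest distance is
# sqrt(100**2 + 100**2), which rounds to 141.42.
horizontal = max(0, 300 - 200, 100 - 400)
vertical = max(0, 300 - 200, 100 - 400)
print(round(math.sqrt(horizontal**2 + vertical**2), 2))  # 141.42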