vision-agent 0.2.210__py3-none-any.whl → 0.2.211__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  import tempfile
6
6
  import urllib.request
7
+ from base64 import b64encode
7
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
8
9
  from functools import lru_cache
9
10
  from importlib import resources
@@ -14,6 +15,7 @@ from uuid import UUID
14
15
  import cv2
15
16
  import numpy as np
16
17
  import requests
18
+ from IPython.display import display
17
19
  from PIL import Image, ImageDraw, ImageFont
18
20
  from pillow_heif import register_heif_opener # type: ignore
19
21
  from pytube import YouTube # type: ignore
@@ -21,8 +23,8 @@ from pytube import YouTube # type: ignore
21
23
  from vision_agent.clients.landing_public_api import LandingPublicAPI
22
24
  from vision_agent.lmm.lmm import AnthropicLMM, OpenAILMM
23
25
  from vision_agent.tools.tool_utils import (
26
+ ToolCallTrace,
24
27
  add_bboxes_from_masks,
25
- filter_bboxes_by_threshold,
26
28
  get_tool_descriptions,
27
29
  get_tool_documentation,
28
30
  get_tools_df,
@@ -32,7 +34,7 @@ from vision_agent.tools.tool_utils import (
32
34
  send_task_inference_request,
33
35
  single_nms,
34
36
  )
35
- from vision_agent.tools.tools_types import JobStatus, ODResponseData
37
+ from vision_agent.tools.tools_types import JobStatus
36
38
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
37
39
  from vision_agent.utils.execute import FileSerializer, MimeType
38
40
  from vision_agent.utils.image_utils import (
@@ -41,7 +43,6 @@ from vision_agent.utils.image_utils import (
41
43
  convert_to_b64,
42
44
  denormalize_bbox,
43
45
  encode_image_bytes,
44
- get_image_size,
45
46
  normalize_bbox,
46
47
  numpy_to_bytes,
47
48
  rle_decode,
@@ -88,66 +89,33 @@ def get_tool_recommender() -> Sim:
88
89
  return load_cached_sim(TOOLS_DF)
89
90
 
90
91
 
91
- def grounding_dino(
92
- prompt: str,
93
- image: np.ndarray,
94
- box_threshold: float = 0.20,
95
- iou_threshold: float = 0.20,
96
- model_size: str = "large",
97
- ) -> List[Dict[str, Any]]:
98
- """'grounding_dino' is a tool that can detect and count multiple objects given a text
99
- prompt such as category names or referring expressions. The categories in text prompt
100
- are separated by commas or periods. It returns a list of bounding boxes with
101
- normalized coordinates, label names and associated probability scores.
102
-
103
- Parameters:
104
- prompt (str): The prompt to ground to the image.
105
- image (np.ndarray): The image to ground the prompt to.
106
- box_threshold (float, optional): The threshold for the box detection. Defaults
107
- to 0.20.
108
- iou_threshold (float, optional): The threshold for the Intersection over Union
109
- (IoU). Defaults to 0.20.
110
- model_size (str, optional): The size of the model to use.
111
-
112
- Returns:
113
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
114
- bounding box of the detected objects with normalized coordinates between 0
115
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
116
- top-left and xmax and ymax are the coordinates of the bottom-right of the
117
- bounding box.
118
-
119
- Example
120
- -------
121
- >>> grounding_dino("car. dinosaur", image)
122
- [
123
- {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
124
- {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
125
- ]
126
- """
127
- image_size = image.shape[:2]
128
- image_b64 = convert_to_b64(image)
129
- if model_size not in ["large", "tiny"]:
130
- raise ValueError("model_size must be either 'large' or 'tiny'")
131
- request_data = {
132
- "prompt": prompt,
133
- "image": image_b64,
134
- "tool": (
135
- "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
136
- ),
137
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
138
- "function_name": "grounding_dino",
139
- }
140
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
141
- return_data = []
142
- for i in range(len(data["bboxes"])):
143
- return_data.append(
144
- {
145
- "score": round(data["scores"][i], 2),
146
- "label": data["labels"][i],
147
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
148
- }
149
- )
150
- return return_data
92
+ def _display_tool_trace(
93
+ function_name: str,
94
+ request: Dict[str, Any],
95
+ response: Any,
96
+ files: Union[List[Tuple[str, bytes]], str],
97
+ ) -> None:
98
+ # Sends data through IPython's display function so the front-end can show it. We use
99
+ # a function here instead of a decorator because we do not want to re-calculate data
100
+ # such as video bytes, which can be slow. Since this is calculated inside the
101
+ # function, we can't capture it with a decorator without adding it as a return value,
102
+ # which would change the function signature and affect the agent.
103
+ files_in_b64: List[Tuple[str, str]]
104
+ if isinstance(files, str):
105
+ files_in_b64 = [("images", files)]
106
+ else:
107
+ files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
108
+
109
+ request["function_name"] = function_name
110
+ tool_call_trace = ToolCallTrace(
111
+ endpoint_url="",
112
+ type="tool_func_call",
113
+ request=request,
114
+ response={"data": response},
115
+ error=None,
116
+ files=files_in_b64,
117
+ )
118
+ display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
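A minimal sketch of the calling pattern the new tool functions use to report a trace; my_tool, the "my-model" payload and the "my-task" endpoint are illustrative placeholders, not names from the package:

    def my_tool(image: np.ndarray) -> List[Dict[str, Any]]:
        files = [("image", numpy_to_bytes(image))]      # raw bytes shipped with the request
        payload = {"model": "my-model"}                 # hypothetical request body
        detections = send_task_inference_request(
            payload, "my-task", files=files, metadata={"function_name": "my_tool"}
        )
        # Forward the request, the raw response and the files so the front-end can render
        # them. When only a base64 string is available (e.g. vit_image_classification),
        # that string can be passed as files and is wrapped as [("images", <b64 string>)].
        _display_tool_trace(my_tool.__name__, payload, detections[0], files)
        return detections[0]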
151
119
 
152
120
 
153
121
  def owl_v2_image(
@@ -223,14 +191,21 @@ def owl_v2_image(
223
191
  # get the first frame
224
192
  bboxes = detections[0]
225
193
  bboxes_formatted = [
226
- ODResponseData(
227
- label=bbox["label"],
228
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
229
- score=round(bbox["score"], 2),
230
- )
194
+ {
195
+ "label": bbox["label"],
196
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
197
+ "score": round(bbox["score"], 2),
198
+ }
231
199
  for bbox in bboxes
232
200
  ]
233
- return [bbox.model_dump() for bbox in bboxes_formatted]
201
+
202
+ _display_tool_trace(
203
+ owl_v2_image.__name__,
204
+ payload,
205
+ detections[0],
206
+ files,
207
+ )
208
+ return bboxes_formatted
234
209
 
235
210
 
236
211
  def owl_v2_video(
@@ -309,81 +284,21 @@ def owl_v2_video(
309
284
  bboxes_formatted = []
310
285
  for frame_data in detections:
311
286
  bboxes_formatted_per_frame = [
312
- ODResponseData(
313
- label=bbox["label"],
314
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
315
- score=round(bbox["score"], 2),
316
- )
287
+ {
288
+ "label": bbox["label"],
289
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
290
+ "score": round(bbox["score"], 2),
291
+ }
317
292
  for bbox in frame_data
318
293
  ]
319
294
  bboxes_formatted.append(bboxes_formatted_per_frame)
320
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
321
-
322
-
323
- def grounding_sam(
324
- prompt: str,
325
- image: np.ndarray,
326
- box_threshold: float = 0.20,
327
- iou_threshold: float = 0.20,
328
- ) -> List[Dict[str, Any]]:
329
- """'grounding_sam' is a tool that can segment multiple objects given a text prompt
330
- such as category names or referring expressions. The categories in text prompt are
331
- separated by commas or periods. It returns a list of bounding boxes, label names,
332
- mask file names and associated probability scores.
333
-
334
- Parameters:
335
- prompt (str): The prompt to ground to the image.
336
- image (np.ndarray): The image to ground the prompt to.
337
- box_threshold (float, optional): The threshold for the box detection. Defaults
338
- to 0.20.
339
- iou_threshold (float, optional): The threshold for the Intersection over Union
340
- (IoU). Defaults to 0.20.
341
-
342
- Returns:
343
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
344
- bounding box, and mask of the detected objects with normalized coordinates
345
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
346
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
347
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
348
- the background.
349
-
350
- Example
351
- -------
352
- >>> grounding_sam("car. dinosaur", image)
353
- [
354
- {
355
- 'score': 0.99,
356
- 'label': 'dinosaur',
357
- 'bbox': [0.1, 0.11, 0.35, 0.4],
358
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
359
- [0, 0, 0, ..., 0, 0, 0],
360
- ...,
361
- [0, 0, 0, ..., 0, 0, 0],
362
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
363
- },
364
- ]
365
- """
366
- image_size = image.shape[:2]
367
- image_b64 = convert_to_b64(image)
368
- request_data = {
369
- "prompt": prompt,
370
- "image": image_b64,
371
- "tool": "visual_grounding_segment",
372
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
373
- "function_name": "grounding_sam",
374
- }
375
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
376
- return_data = []
377
- for i in range(len(data["bboxes"])):
378
- return_data.append(
379
- {
380
- "score": round(data["scores"][i], 2),
381
- "label": data["labels"][i],
382
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
383
- "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
384
- }
385
- )
386
- return return_data
295
+ _display_tool_trace(
296
+ owl_v2_video.__name__,
297
+ payload,
298
+ detections[0],
299
+ files,
300
+ )
301
+ return bboxes_formatted
387
302
 
388
303
 
389
304
  def florence2_sam2_image(
@@ -460,6 +375,13 @@ def florence2_sam2_image(
460
375
  label = detection["label"]
461
376
  bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
462
377
  return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
378
+
379
+ _display_tool_trace(
380
+ florence2_sam2_image.__name__,
381
+ payload,
382
+ detections[0],
383
+ files,
384
+ )
463
385
  return return_data
464
386
 
465
387
 
@@ -545,10 +467,36 @@ def florence2_sam2_video_tracking(
545
467
  for detection in frame:
546
468
  mask = rle_decode_array(detection["mask"])
547
469
  label = str(detection["id"]) + ": " + detection["label"]
548
- return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
470
+ return_frame_data.append(
471
+ {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
472
+ )
549
473
  return_data.append(return_frame_data)
550
474
  return_data = add_bboxes_from_masks(return_data)
551
- return nms(return_data, iou_threshold=0.95)
475
+ return_data = nms(return_data, iou_threshold=0.95)
476
+
477
+ _display_tool_trace(
478
+ florence2_sam2_video_tracking.__name__,
479
+ payload,
480
+ [
481
+ [
482
+ {
483
+ "label": e["label"],
484
+ "score": e["score"],
485
+ "bbox": denormalize_bbox(e["bbox"], frames[0].shape[:2]),
486
+ "mask": e["rle"],
487
+ }
488
+ for e in lst
489
+ ]
490
+ for lst in return_data
491
+ ],
492
+ files,
493
+ )
494
+ # We save the RLE for display purposes; re-calculating the RLE can get very expensive.
495
+ # It is deleted here because we are returning the numpy masks instead.
496
+ for frame in return_data:
497
+ for obj in frame:
498
+ del obj["rle"]
499
+ return return_data
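A hedged sketch of the per-frame structure returned above once the temporary "rle" entries are dropped; the labels follow the "<id>: <label>" convention from this hunk, the boxes come from add_bboxes_from_masks, scores are fixed at 1.0, and the concrete values are illustrative:

    [
        [  # frame 0
            {"label": "1: person", "bbox": [0.1, 0.2, 0.4, 0.6], "mask": array(..., dtype=uint8), "score": 1.0},
            {"label": "2: person", "bbox": [0.5, 0.2, 0.8, 0.6], "mask": array(..., dtype=uint8), "score": 1.0},
        ],
        # ... one inner list per input frame
    ]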
552
500
 
553
501
 
554
502
  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -603,86 +551,175 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
603
551
  box = normalize_bbox(box, image_size)
604
552
  output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
605
553
 
606
- ocr_results = sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
607
- return ocr_results
554
+ _display_tool_trace(
555
+ ocr.__name__,
556
+ {},
557
+ data,
558
+ cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
559
+ )
560
+ return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
561
+
562
+
563
+ def _sam2(
564
+ image: np.ndarray,
565
+ detections: List[Dict[str, Any]],
566
+ image_size: Tuple[int, ...],
567
+ image_bytes: Optional[bytes] = None,
568
+ ) -> Dict[str, Any]:
569
+ if image_bytes is None:
570
+ image_bytes = numpy_to_bytes(image)
571
+
572
+ files = [("images", image_bytes)]
573
+ payload = {
574
+ "model": "sam2",
575
+ "bboxes": json.dumps(
576
+ [
577
+ {
578
+ "labels": [d["label"] for d in detections],
579
+ "bboxes": [
580
+ denormalize_bbox(d["bbox"], image_size) for d in detections
581
+ ],
582
+ }
583
+ ]
584
+ ),
585
+ }
586
+
587
+ metadata = {"function_name": "sam2"}
588
+ pred_detections = send_task_inference_request(
589
+ payload, "sam2", files=files, metadata=metadata
590
+ )
591
+ frame = pred_detections[0]
592
+ return_data = []
593
+ display_data = []
594
+ for inp_detection, detection in zip(detections, frame):
595
+ mask = rle_decode_array(detection["mask"])
596
+ label = detection["label"]
597
+ bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
598
+ return_data.append(
599
+ {
600
+ "label": label,
601
+ "bbox": bbox,
602
+ "mask": mask,
603
+ "score": inp_detection["score"],
604
+ }
605
+ )
606
+ display_data.append(
607
+ {
608
+ "label": label,
609
+ "bbox": detection["bounding_box"],
610
+ "mask": detection["mask"],
611
+ "score": inp_detection["score"],
612
+ }
613
+ )
614
+ return {"files": files, "return_data": return_data, "display_data": display_data}
608
615
 
609
616
 
610
- def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
611
- """'loca_zero_shot_counting' is a tool that counts the dominant foreground object given
612
- an image and no other information about the content. It returns only the count of
613
- the objects in the image.
617
+ def sam2(
618
+ image: np.ndarray,
619
+ detections: List[Dict[str, Any]],
620
+ ) -> List[Dict[str, Any]]:
621
+ """'sam2' is a tool that can segment multiple objects given an input bounding box,
622
+ label and score. It returns a set of masks along with the corresponding bounding
623
+ boxes and labels.
614
624
 
615
625
  Parameters:
616
- image (np.ndarray): The image that contains lot of instances of a single object
626
+ image (np.ndarray): The image that contains multiple instances of the object.
627
+ detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
628
+ label, and bounding box of the detected objects with normalized coordinates
629
+ between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
630
+ of the top-left and xmax and ymax are the coordinates of the bottom-right of
631
+ the bounding box.
617
632
 
618
633
  Returns:
619
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
620
- value, e.g. {count: 12} and a heat map for visualization purposes.
634
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
635
+ bounding box, and mask of the detected objects with normalized coordinates
636
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
637
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
638
+ The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
639
+ the background.
621
640
 
622
641
  Example
623
642
  -------
624
- >>> loca_zero_shot_counting(image)
625
- {'count': 83,
626
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
627
- [ 0, 0, 0, ..., 0, 0, 0],
628
- [ 0, 0, 0, ..., 0, 0, 1],
629
- ...,
630
- [ 0, 0, 0, ..., 30, 35, 41],
631
- [ 0, 0, 0, ..., 41, 47, 53],
632
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
643
+ >>> sam2(image, [
644
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
645
+ ])
646
+ [
647
+ {
648
+ 'score': 0.49,
649
+ 'label': 'flower',
650
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
651
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
652
+ [0, 0, 0, ..., 0, 0, 0],
653
+ ...,
654
+ [0, 0, 0, ..., 0, 0, 0],
655
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
656
+ },
657
+ ]
633
658
  """
659
+ image_size = image.shape[:2]
660
+ ret = _sam2(image, detections, image_size)
661
+ _display_tool_trace(
662
+ sam2.__name__,
663
+ {},
664
+ ret["display_data"],
665
+ ret["files"],
666
+ )
634
667
 
635
- image_b64 = convert_to_b64(image)
636
- data = {
637
- "image": image_b64,
638
- "function_name": "loca_zero_shot_counting",
639
- }
640
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
641
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
642
- return resp_data
668
+ return ret["return_data"] # type: ignore
643
669
 
644
670
 
645
- def loca_visual_prompt_counting(
646
- image: np.ndarray, visual_prompt: Dict[str, List[float]]
671
+ def _countgd_object_detection(
672
+ prompt: str,
673
+ image: np.ndarray,
674
+ box_threshold: float,
675
+ image_size: Tuple[int, ...],
676
+ image_bytes: Optional[bytes] = None,
647
677
  ) -> Dict[str, Any]:
648
- """'loca_visual_prompt_counting' is a tool that counts the dominant foreground object
649
- given an image and a visual prompt which is a bounding box describing the object.
650
- It returns only the count of the objects in the image.
678
+ if image_bytes is None:
679
+ image_bytes = numpy_to_bytes(image)
651
680
 
652
- Parameters:
653
- image (np.ndarray): The image that contains lot of instances of a single object
654
- visual_prompt (Dict[str, List[float]]): Bounding box of the object in
655
- format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
681
+ files = [("image", image_bytes)]
682
+ prompts = [p.strip() for p in prompt.split(", ")]
656
683
 
657
- Returns:
658
- Dict[str, Any]: A dictionary containing the key 'count' and the count as a
659
- value, e.g. {count: 12} and a heat map for visualization purposes.
684
+ def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
685
+ payload = {
686
+ "prompts": [prompt],
687
+ "confidence": box_threshold, # still not being used in the API
688
+ "model": "countgd",
689
+ }
690
+ metadata = {"function_name": "countgd_counting"}
660
691
 
661
- Example
662
- -------
663
- >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
664
- {'count': 83,
665
- 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
666
- [ 0, 0, 0, ..., 0, 0, 0],
667
- [ 0, 0, 0, ..., 0, 0, 1],
668
- ...,
669
- [ 0, 0, 0, ..., 30, 35, 41],
670
- [ 0, 0, 0, ..., 41, 47, 53],
671
- [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
672
- """
692
+ detections = send_task_inference_request(
693
+ payload, "text-to-object-detection", files=files, metadata=metadata
694
+ )
695
+ # get the first frame
696
+ return detections[0] # type: ignore
673
697
 
674
- image_size = get_image_size(image)
675
- bbox = visual_prompt["bbox"]
676
- image_b64 = convert_to_b64(image)
698
+ bboxes = []
699
+ with ThreadPoolExecutor() as executor:
700
+ futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
701
+ for future in as_completed(futures):
702
+ bboxes.extend(future.result())
677
703
 
678
- data = {
679
- "image": image_b64,
680
- "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
681
- "function_name": "loca_visual_prompt_counting",
682
- }
683
- resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
684
- resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
685
- return resp_data
704
+ return_data = [
705
+ {
706
+ "label": bbox["label"],
707
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
708
+ "score": round(bbox["score"], 2),
709
+ }
710
+ for bbox in bboxes
711
+ ]
712
+
713
+ return_data = single_nms(return_data, iou_threshold=0.80)
714
+ display_data = [
715
+ {
716
+ "label": e["label"],
717
+ "score": e["score"],
718
+ "bbox": denormalize_bbox(e["bbox"], image_size),
719
+ }
720
+ for e in return_data
721
+ ]
722
+ return {"files": files, "return_data": return_data, "display_data": display_data}
686
723
 
687
724
 
688
725
  def countgd_object_detection(
@@ -723,121 +760,17 @@ def countgd_object_detection(
723
760
  if image_size[0] < 1 or image_size[1] < 1:
724
761
  return []
725
762
 
726
- buffer_bytes = numpy_to_bytes(image)
727
- files = [("image", buffer_bytes)]
728
- prompts = [p.strip() for p in prompt.split(", ")]
729
-
730
- def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
731
- payload = {
732
- "prompts": [prompt],
733
- "confidence": box_threshold, # still not being used in the API
734
- "model": "countgd",
735
- }
736
- metadata = {"function_name": "countgd_counting"}
737
-
738
- detections = send_task_inference_request(
739
- payload, "text-to-object-detection", files=files, metadata=metadata
740
- )
741
- # get the first frame
742
- return detections[0] # type: ignore
743
-
744
- bboxes = []
745
- with ThreadPoolExecutor() as executor:
746
- futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
747
- for future in as_completed(futures):
748
- bboxes.extend(future.result())
749
-
750
- bboxes_formatted = [
751
- ODResponseData(
752
- label=bbox["label"],
753
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
754
- score=round(bbox["score"], 2),
755
- )
756
- for bbox in bboxes
757
- ]
758
- # TODO: remove this once we start to use the confidence on countgd
759
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
760
- return_data = [bbox.model_dump() for bbox in filtered_bboxes]
761
- return single_nms(return_data, iou_threshold=0.80)
762
-
763
-
764
- def sam2(
765
- image: np.ndarray,
766
- detections: List[Dict[str, Any]],
767
- ) -> List[Dict[str, Any]]:
768
- """'sam2' is a tool that can segment multiple objects given an input bounding box,
769
- label and score. It returns a set of masks along with the corresponding bounding
770
- boxes and labels.
771
-
772
- Parameters:
773
- image (np.ndarray): The image that contains multiple instances of the object.
774
- detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
775
- label, and bounding box of the detected objects with normalized coordinates
776
- between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
777
- of the top-left and xmax and ymax are the coordinates of the bottom-right of
778
- the bounding box.
779
-
780
- Returns:
781
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
782
- bounding box, and mask of the detected objects with normalized coordinates
783
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
784
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
785
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
786
- the background.
787
-
788
- Example
789
- -------
790
- >>> sam2(image, [
791
- {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
792
- ])
793
- [
794
- {
795
- 'score': 0.49,
796
- 'label': 'flower',
797
- 'bbox': [0.1, 0.11, 0.35, 0.4],
798
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
799
- [0, 0, 0, ..., 0, 0, 0],
800
- ...,
801
- [0, 0, 0, ..., 0, 0, 0],
802
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
803
- },
804
- ]
805
- """
806
- image_size = image.shape[:2]
807
-
808
- files = [("images", numpy_to_bytes(image))]
809
- payload = {
810
- "model": "sam2",
811
- "bboxes": json.dumps(
812
- [
813
- {
814
- "labels": [d["label"] for d in detections],
815
- "bboxes": [
816
- denormalize_bbox(d["bbox"], image_size) for d in detections
817
- ],
818
- }
819
- ]
820
- ),
821
- }
822
- metadata = {"function_name": "sam2"}
823
- pred_detections = send_task_inference_request(
824
- payload, "sam2", files=files, metadata=metadata
825
- )
826
- frame = pred_detections[0]
827
- return_data = []
828
- for inp_detection, detection in zip(detections, frame):
829
- mask = rle_decode_array(detection["mask"])
830
- label = detection["label"]
831
- bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
832
- return_data.append(
833
- {
834
- "label": label,
835
- "bbox": bbox,
836
- "mask": mask,
837
- "score": inp_detection["score"],
838
- }
839
- )
840
- return return_data
763
+ ret = _countgd_object_detection(prompt, image, box_threshold, image_size)
764
+ _display_tool_trace(
765
+ countgd_object_detection.__name__,
766
+ {
767
+ "prompts": prompt,
768
+ "confidence": box_threshold,
769
+ },
770
+ ret["display_data"],
771
+ ret["files"],
772
+ )
773
+ return ret["return_data"] # type: ignore
841
774
 
842
775
 
843
776
  def countgd_sam2_object_detection(
@@ -881,9 +814,23 @@ def countgd_sam2_object_detection(
881
814
  },
882
815
  ]
883
816
  """
884
- detections = countgd_object_detection(prompt, image, box_threshold)
885
- detections_with_masks = sam2(image, detections)
886
- return detections_with_masks
817
+
818
+ od_ret = _countgd_object_detection(prompt, image, box_threshold, image.shape[:2])
819
+ seg_ret = _sam2(
820
+ image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
821
+ )
822
+
823
+ _display_tool_trace(
824
+ countgd_sam2_object_detection.__name__,
825
+ {
826
+ "prompts": prompt,
827
+ "confidence": box_threshold,
828
+ },
829
+ seg_ret["display_data"],
830
+ seg_ret["files"],
831
+ )
832
+
833
+ return seg_ret["return_data"] # type: ignore
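For comparison, the removed implementation composed the two public tools directly, as sketched below; the refactor above reuses the already-encoded bytes from _countgd_object_detection, presumably to avoid encoding the image twice and emitting two separate tool traces:

    detections = countgd_object_detection(prompt, image, box_threshold)
    detections_with_masks = sam2(image, detections)  # would re-encode the image internally
    return detections_with_masks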
887
834
 
888
835
 
889
836
  def countgd_example_based_counting(
@@ -941,76 +888,28 @@ def countgd_example_based_counting(
941
888
  # get the first frame
942
889
  bboxes_per_frame = detections[0]
943
890
  bboxes_formatted = [
944
- ODResponseData(
945
- label=bbox["label"],
946
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
947
- score=round(bbox["score"], 2),
948
- )
891
+ {
892
+ "label": bbox["label"],
893
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
894
+ "score": round(bbox["score"], 2),
895
+ }
949
896
  for bbox in bboxes_per_frame
950
897
  ]
951
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
952
- return [bbox.model_dump() for bbox in filtered_bboxes]
953
-
954
-
955
- def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
956
- """'florence2_roberta_vqa' is a tool that takes an image and analyzes
957
- its contents, generates detailed captions and then tries to answer the given
958
- question using the generated context. It returns text as an answer to the question.
959
-
960
- Parameters:
961
- prompt (str): The question about the image
962
- image (np.ndarray): The reference image used for the question
963
-
964
- Returns:
965
- str: A string which is the answer to the given prompt.
966
-
967
- Example
968
- -------
969
- >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
970
- 'white tiger'
971
- """
972
-
973
- image_b64 = convert_to_b64(image)
974
- data = {
975
- "image": image_b64,
976
- "question": prompt,
977
- "function_name": "florence2_roberta_vqa",
978
- }
979
-
980
- answer = send_inference_request(data, "florence2-qa", v2=True)
981
- return answer # type: ignore
982
-
983
-
984
- def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
985
- """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
986
- including regular images or images of documents or presentations. It returns text
987
- as an answer to the question.
988
-
989
- Parameters:
990
- prompt (str): The question about the image
991
- image (np.ndarray): The reference image used for the question
992
-
993
- Returns:
994
- str: A string which is the answer to the given prompt.
995
-
996
- Example
997
- -------
998
- >>> ixc25_image_vqa('What is the cat doing?', image)
999
- 'drinking milk'
1000
- """
1001
- if image.shape[0] < 1 or image.shape[1] < 1:
1002
- raise ValueError(f"Image is empty, image shape: {image.shape}")
1003
-
1004
- buffer_bytes = numpy_to_bytes(image)
1005
- files = [("image", buffer_bytes)]
1006
- payload = {
1007
- "prompt": prompt,
1008
- "function_name": "ixc25_image_vqa",
1009
- }
1010
- data: Dict[str, Any] = send_inference_request(
1011
- payload, "internlm-xcomposer2", files=files, v2=True
898
+ _display_tool_trace(
899
+ countgd_example_based_counting.__name__,
900
+ payload,
901
+ [
902
+ {
903
+ "label": e["label"],
904
+ "score": e["score"],
905
+ "bbox": denormalize_bbox(e["bbox"], image_size),
906
+ }
907
+ for e in bboxes_formatted
908
+ ],
909
+ files,
1012
910
  )
1013
- return cast(str, data["answer"])
911
+
912
+ return bboxes_formatted
1014
913
 
1015
914
 
1016
915
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
@@ -1047,61 +946,13 @@ def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
1047
946
  data: Dict[str, Any] = send_inference_request(
1048
947
  payload, "image-to-text", files=files, v2=True
1049
948
  )
1050
- return cast(str, data)
1051
-
1052
-
1053
- def claude35_text_extraction(image: np.ndarray) -> str:
1054
- """'claude35_text_extraction' is a tool that can extract text from an image. It
1055
- returns the extracted text as a string and can be used as an alternative to OCR if
1056
- you do not need to know the exact bounding box of the text.
1057
-
1058
- Parameters:
1059
- image (np.ndarray): The image to extract text from.
1060
-
1061
- Returns:
1062
- str: The extracted text from the image.
1063
- """
1064
-
1065
- lmm = AnthropicLMM()
1066
- buffer = io.BytesIO()
1067
- Image.fromarray(image).save(buffer, format="PNG")
1068
- image_bytes = buffer.getvalue()
1069
- image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1070
- text = lmm.generate(
1071
- "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
1072
- [image_b64],
1073
- )
1074
- return cast(str, text)
1075
-
1076
-
1077
- def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1078
- """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
1079
- including regular videos or videos of documents or presentations. It returns text
1080
- as an answer to the question.
1081
-
1082
- Parameters:
1083
- prompt (str): The question about the video
1084
- frames (List[np.ndarray]): The reference frames used for the question
1085
-
1086
- Returns:
1087
- str: A string which is the answer to the given prompt.
1088
-
1089
- Example
1090
- -------
1091
- >>> ixc25_video_vqa('Which football player made the goal?', frames)
1092
- 'Lionel Messi'
1093
- """
1094
-
1095
- buffer_bytes = frames_to_bytes(frames)
1096
- files = [("video", buffer_bytes)]
1097
- payload = {
1098
- "prompt": prompt,
1099
- "function_name": "ixc25_video_vqa",
1100
- }
1101
- data: Dict[str, Any] = send_inference_request(
1102
- payload, "internlm-xcomposer2", files=files, v2=True
949
+ _display_tool_trace(
950
+ qwen2_vl_images_vqa.__name__,
951
+ payload,
952
+ cast(str, data),
953
+ files,
1103
954
  )
1104
- return cast(str, data["answer"])
955
+ return cast(str, data)
1105
956
 
1106
957
 
1107
958
  def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
@@ -1135,9 +986,39 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1135
986
  data: Dict[str, Any] = send_inference_request(
1136
987
  payload, "image-to-text", files=files, v2=True
1137
988
  )
989
+ _display_tool_trace(
990
+ qwen2_vl_video_vqa.__name__,
991
+ payload,
992
+ cast(str, data),
993
+ files,
994
+ )
1138
995
  return cast(str, data)
1139
996
 
1140
997
 
998
+ def claude35_text_extraction(image: np.ndarray) -> str:
999
+ """'claude35_text_extraction' is a tool that can extract text from an image. It
1000
+ returns the extracted text as a string and can be used as an alternative to OCR if
1001
+ you do not need to know the exact bounding box of the text.
1002
+
1003
+ Parameters:
1004
+ image (np.ndarray): The image to extract text from.
1005
+
1006
+ Returns:
1007
+ str: The extracted text from the image.
1008
+ """
1009
+
1010
+ lmm = AnthropicLMM()
1011
+ buffer = io.BytesIO()
1012
+ Image.fromarray(image).save(buffer, format="PNG")
1013
+ image_bytes = buffer.getvalue()
1014
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
1015
+ text = lmm.generate(
1016
+ "Extract and return any text you see in this image and nothing else. If you do not read any text respond with an empty string.",
1017
+ [image_b64],
1018
+ )
1019
+ return cast(str, text)
1020
+
1021
+
1141
1022
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
1142
1023
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
1143
1024
  including regular images or images of documents or presentations. It returns text
@@ -1201,36 +1082,6 @@ def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
1201
1082
  return cast(str, resp)
1202
1083
 
1203
1084
 
1204
- def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
1205
- """'git_vqa_v2' is a tool that can answer questions about the visual
1206
- contents of an image given a question and an image. It returns an answer to the
1207
- question
1208
-
1209
- Parameters:
1210
- prompt (str): The question about the image
1211
- image (np.ndarray): The reference image used for the question
1212
-
1213
- Returns:
1214
- str: A string which is the answer to the given prompt.
1215
-
1216
- Example
1217
- -------
1218
- >>> git_vqa_v2('What is the cat doing ?', image)
1219
- 'drinking milk'
1220
- """
1221
-
1222
- image_b64 = convert_to_b64(image)
1223
- data = {
1224
- "image": image_b64,
1225
- "prompt": prompt,
1226
- "tool": "image_question_answering",
1227
- "function_name": "git_vqa_v2",
1228
- }
1229
-
1230
- answer = send_inference_request(data, "tools")
1231
- return answer["text"][0] # type: ignore
1232
-
1233
-
1234
1085
  def video_temporal_localization(
1235
1086
  prompt: str,
1236
1087
  frames: List[np.ndarray],
@@ -1274,70 +1125,48 @@ def video_temporal_localization(
1274
1125
  data = send_inference_request(
1275
1126
  payload, "video-temporal-localization", files=files, v2=True
1276
1127
  )
1128
+ _display_tool_trace(
1129
+ video_temporal_localization.__name__,
1130
+ payload,
1131
+ data,
1132
+ files,
1133
+ )
1277
1134
  return [cast(float, value) for value in data]
1278
1135
 
1279
1136
 
1280
- def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
1281
- """'clip' is a tool that can classify an image or a cropped detection given a list
1282
- of input classes or tags. It returns the same list of the input classes along with
1283
- their probability scores based on image content.
1137
+ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1138
+ """'vit_image_classification' is a tool that can classify an image. It returns a
1139
+ list of classes and their probability scores based on image content.
1284
1140
 
1285
1141
  Parameters:
1286
1142
  image (np.ndarray): The image to classify or tag
1287
- classes (List[str]): The list of classes or tags that is associated with the image
1288
1143
 
1289
1144
  Returns:
1290
1145
  Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
1291
- contains a list of given labels and other a list of scores.
1146
+ contains a list of labels and other a list of scores.
1292
1147
 
1293
1148
  Example
1294
1149
  -------
1295
- >>> clip(image, ['dog', 'cat', 'bird'])
1296
- {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
1150
+ >>> vit_image_classification(image)
1151
+ {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
1297
1152
  """
1298
-
1299
1153
  if image.shape[0] < 1 or image.shape[1] < 1:
1300
1154
  return {"labels": [], "scores": []}
1301
1155
 
1302
1156
  image_b64 = convert_to_b64(image)
1303
1157
  data = {
1304
- "prompt": ",".join(classes),
1305
1158
  "image": image_b64,
1306
- "tool": "closed_set_image_classification",
1307
- "function_name": "clip",
1308
- }
1309
- resp_data: dict[str, Any] = send_inference_request(data, "tools")
1310
- resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
1311
- return resp_data
1312
-
1313
-
1314
- def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1315
- """'vit_image_classification' is a tool that can classify an image. It returns a
1316
- list of classes and their probability scores based on image content.
1317
-
1318
- Parameters:
1319
- image (np.ndarray): The image to classify or tag
1320
-
1321
- Returns:
1322
- Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
1323
- contains a list of labels and other a list of scores.
1324
-
1325
- Example
1326
- -------
1327
- >>> vit_image_classification(image)
1328
- {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
1329
- """
1330
- if image.shape[0] < 1 or image.shape[1] < 1:
1331
- return {"labels": [], "scores": []}
1332
-
1333
- image_b64 = convert_to_b64(image)
1334
- data = {
1335
- "image": image_b64,
1336
- "tool": "image_classification",
1337
- "function_name": "vit_image_classification",
1159
+ "tool": "image_classification",
1160
+ "function_name": "vit_image_classification",
1338
1161
  }
1339
1162
  resp_data: dict[str, Any] = send_inference_request(data, "tools")
1340
1163
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
1164
+ _display_tool_trace(
1165
+ vit_image_classification.__name__,
1166
+ data,
1167
+ resp_data,
1168
+ image_b64,
1169
+ )
1341
1170
  return resp_data
1342
1171
 
1343
1172
 
@@ -1369,65 +1198,15 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
1369
1198
  data, "nsfw-classification", v2=True
1370
1199
  )
1371
1200
  resp_data["score"] = round(resp_data["score"], 4)
1201
+ _display_tool_trace(
1202
+ vit_nsfw_classification.__name__,
1203
+ data,
1204
+ resp_data,
1205
+ image_b64,
1206
+ )
1372
1207
  return resp_data
1373
1208
 
1374
1209
 
1375
- def blip_image_caption(image: np.ndarray) -> str:
1376
- """'blip_image_caption' is a tool that can caption an image based on its contents. It
1377
- returns a text describing the image.
1378
-
1379
- Parameters:
1380
- image (np.ndarray): The image to caption
1381
-
1382
- Returns:
1383
- str: A string which is the caption for the given image.
1384
-
1385
- Example
1386
- -------
1387
- >>> blip_image_caption(image)
1388
- 'This image contains a cat sitting on a table with a bowl of milk.'
1389
- """
1390
-
1391
- image_b64 = convert_to_b64(image)
1392
- data = {
1393
- "image": image_b64,
1394
- "tool": "image_captioning",
1395
- "function_name": "blip_image_caption",
1396
- }
1397
-
1398
- answer = send_inference_request(data, "tools")
1399
- return answer["text"][0] # type: ignore
1400
-
1401
-
1402
- def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
1403
- """'florence2_image_caption' is a tool that can caption or describe an image based
1404
- on its contents. It returns a text describing the image.
1405
-
1406
- Parameters:
1407
- image (np.ndarray): The image to caption
1408
- detail_caption (bool): If True, the caption will be as detailed as possible else
1409
- the caption will be a brief description.
1410
-
1411
- Returns:
1412
- str: A string which is the caption for the given image.
1413
-
1414
- Example
1415
- -------
1416
- >>> florence2_image_caption(image, False)
1417
- 'This image contains a cat sitting on a table with a bowl of milk.'
1418
- """
1419
- image_b64 = convert_to_b64(image)
1420
- task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
1421
- data = {
1422
- "image": image_b64,
1423
- "task": task,
1424
- "function_name": "florence2_image_caption",
1425
- }
1426
-
1427
- answer = send_inference_request(data, "florence2", v2=True)
1428
- return answer[task] # type: ignore
1429
-
1430
-
1431
1210
  def florence2_phrase_grounding(
1432
1211
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
1433
1212
  ) -> List[Dict[str, Any]]:
@@ -1490,15 +1269,21 @@ def florence2_phrase_grounding(
1490
1269
  # get the first frame
1491
1270
  bboxes = detections[0]
1492
1271
  bboxes_formatted = [
1493
- ODResponseData(
1494
- label=bbox["label"],
1495
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
1496
- score=round(bbox["score"], 2),
1497
- )
1272
+ {
1273
+ "label": bbox["label"],
1274
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
1275
+ "score": round(bbox["score"], 2),
1276
+ }
1498
1277
  for bbox in bboxes
1499
1278
  ]
1500
1279
 
1501
- return [bbox.model_dump() for bbox in bboxes_formatted]
1280
+ _display_tool_trace(
1281
+ florence2_phrase_grounding.__name__,
1282
+ payload,
1283
+ detections[0],
1284
+ files,
1285
+ )
1286
+ return [bbox for bbox in bboxes_formatted]
1502
1287
 
1503
1288
 
1504
1289
  def florence2_phrase_grounding_video(
@@ -1566,15 +1351,21 @@ def florence2_phrase_grounding_video(
1566
1351
  bboxes_formatted = []
1567
1352
  for frame_data in detections:
1568
1353
  bboxes_formatted_per_frame = [
1569
- ODResponseData(
1570
- label=bbox["label"],
1571
- bbox=normalize_bbox(bbox["bounding_box"], image_size),
1572
- score=round(bbox["score"], 2),
1573
- )
1354
+ {
1355
+ "label": bbox["label"],
1356
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
1357
+ "score": round(bbox["score"], 2),
1358
+ }
1574
1359
  for bbox in frame_data
1575
1360
  ]
1576
1361
  bboxes_formatted.append(bboxes_formatted_per_frame)
1577
- return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
1362
+ _display_tool_trace(
1363
+ florence2_phrase_grounding_video.__name__,
1364
+ payload,
1365
+ detections,
1366
+ files,
1367
+ )
1368
+ return bboxes_formatted
1578
1369
 
1579
1370
 
1580
1371
  def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -1621,6 +1412,12 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
1621
1412
  "score": 1.0,
1622
1413
  }
1623
1414
  )
1415
+ _display_tool_trace(
1416
+ florence2_ocr.__name__,
1417
+ {},
1418
+ detections,
1419
+ image_b64,
1420
+ )
1624
1421
  return return_data
1625
1422
 
1626
1423
 
@@ -1683,6 +1480,12 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
1683
1480
  ),
1684
1481
  }
1685
1482
  )
1483
+ _display_tool_trace(
1484
+ detr_segmentation.__name__,
1485
+ {},
1486
+ return_data,
1487
+ image_b64,
1488
+ )
1686
1489
  return return_data
1687
1490
 
1688
1491
 
@@ -1721,74 +1524,15 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
1721
1524
  depth_map_np.max() - depth_map_np.min()
1722
1525
  )
1723
1526
  depth_map_np = (255 * depth_map_np).astype(np.uint8)
1527
+ _display_tool_trace(
1528
+ depth_anything_v2.__name__,
1529
+ {},
1530
+ depth_map,
1531
+ image_b64,
1532
+ )
1724
1533
  return depth_map_np
1725
1534
 
1726
1535
 
1727
- def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
1728
- """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
1729
- to generate a soft edge image (HED) from a given RGB image. The returned image is
1730
- monochrome and represents object boundaries as soft white edges on black background
1731
-
1732
- Parameters:
1733
- image (np.ndarray): The image to used to generate soft edge image
1734
-
1735
- Returns:
1736
- np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
1737
-
1738
- Example
1739
- -------
1740
- >>> generate_soft_edge_image(image)
1741
- array([[0, 0, 0, ..., 0, 0, 0],
1742
- [0, 20, 24, ..., 0, 100, 103],
1743
- ...,
1744
- [10, 11, 15, ..., 202, 202, 205],
1745
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1746
- """
1747
- image_b64 = convert_to_b64(image)
1748
- data = {
1749
- "image": image_b64,
1750
- "tool": "generate_hed",
1751
- "function_name": "generate_soft_edge_image",
1752
- }
1753
-
1754
- answer = send_inference_request(data, "tools")
1755
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
1756
- return return_data
1757
-
1758
-
1759
- def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
1760
- """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
1761
- image. The returned RGB image is texture mapped image of the surface normals and the
1762
- RGB values represent the surface normals in the x, y, z directions.
1763
-
1764
- Parameters:
1765
- image (np.ndarray): The image to used to generate normal image
1766
-
1767
- Returns:
1768
- np.ndarray: A mapped normal image with RGB pixel values indicating surface
1769
- normals in x, y, z directions.
1770
-
1771
- Example
1772
- -------
1773
- >>> dpt_hybrid_midas(image)
1774
- array([[0, 0, 0, ..., 0, 0, 0],
1775
- [0, 20, 24, ..., 0, 100, 103],
1776
- ...,
1777
- [10, 11, 15, ..., 202, 202, 205],
1778
- [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1779
- """
1780
- image_b64 = convert_to_b64(image)
1781
- data = {
1782
- "image": image_b64,
1783
- "tool": "generate_normal",
1784
- "function_name": "dpt_hybrid_midas",
1785
- }
1786
-
1787
- answer = send_inference_request(data, "tools")
1788
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
1789
- return return_data
1790
-
1791
-
1792
1536
  def generate_pose_image(image: np.ndarray) -> np.ndarray:
1793
1537
  """'generate_pose_image' is a tool that generates a open pose bone/stick image from
1794
1538
  a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
@@ -1817,6 +1561,12 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
1817
1561
 
1818
1562
  pos_img = send_inference_request(data, "pose-detector", v2=True)
1819
1563
  return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
1564
+ _display_tool_trace(
1565
+ generate_pose_image.__name__,
1566
+ {},
1567
+ pos_img,
1568
+ image_b64,
1569
+ )
1820
1570
  return return_data
1821
1571
 
1822
1572
 
@@ -1861,120 +1611,18 @@ def template_match(
1861
1611
  for i in range(len(answer["bboxes"])):
1862
1612
  return_data.append(
1863
1613
  {
1614
+ "label": "match",
1864
1615
  "score": round(answer["scores"][i], 2),
1865
1616
  "bbox": normalize_bbox(answer["bboxes"][i], image_size),
1866
1617
  }
1867
1618
  )
1868
- return return_data
1869
-
1870
-
1871
- def minimum_distance(
1872
- det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
1873
- ) -> float:
1874
- """'minimum_distance' calculates the minimum distance between two detections which
1875
- can include bounding boxes and or masks. This will return the closest distance
1876
- between the objects, not the distance between the centers of the objects.
1877
-
1878
- Parameters:
1879
- det1 (Dict[str, Any]): The first detection of boxes or masks.
1880
- det2 (Dict[str, Any]): The second detection of boxes or masks.
1881
- image_size (Tuple[int, int]): The size of the image given as (height, width).
1882
-
1883
- Returns:
1884
- float: The closest distance between the two detections.
1885
-
1886
- Example
1887
- -------
1888
- >>> closest_distance(det1, det2, image_size)
1889
- 141.42
1890
- """
1891
-
1892
- if "mask" in det1 and "mask" in det2:
1893
- return closest_mask_distance(det1["mask"], det2["mask"])
1894
- elif "bbox" in det1 and "bbox" in det2:
1895
- return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
1896
- else:
1897
- raise ValueError("Both detections must have either bbox or mask")
1898
-
1899
-
1900
- def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
1901
- """'closest_mask_distance' calculates the closest distance between two masks.
1902
-
1903
- Parameters:
1904
- mask1 (np.ndarray): The first mask.
1905
- mask2 (np.ndarray): The second mask.
1906
-
1907
- Returns:
1908
- float: The closest distance between the two masks.
1909
-
1910
- Example
1911
- -------
1912
- >>> closest_mask_distance(mask1, mask2)
1913
- 0.5
1914
- """
1915
-
1916
- mask1 = np.clip(mask1, 0, 1)
1917
- mask2 = np.clip(mask2, 0, 1)
1918
- contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1919
- contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1920
- largest_contour1 = max(contours1, key=cv2.contourArea)
1921
- largest_contour2 = max(contours2, key=cv2.contourArea)
1922
- polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
1923
- polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
1924
- min_distance = np.inf
1925
-
1926
- small_polygon, larger_contour = (
1927
- (polygon1, largest_contour2)
1928
- if len(largest_contour1) < len(largest_contour2)
1929
- else (polygon2, largest_contour1)
1619
+ _display_tool_trace(
1620
+ template_match.__name__,
1621
+ {"template_image": template_image_b64},
1622
+ return_data,
1623
+ image_b64,
1930
1624
  )
1931
-
1932
- # For each point in the first polygon
1933
- for point in small_polygon:
1934
- # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
1935
-
1936
- distance = (
1937
- cv2.pointPolygonTest(
1938
- larger_contour, (point[0, 0].item(), point[0, 1].item()), True
1939
- )
1940
- * -1
1941
- )
1942
-
1943
- # If the distance is negative, the point is inside the polygon, so the distance is 0
1944
- if distance < 0:
1945
- continue
1946
- else:
1947
- # Update the minimum distance if the point is outside the polygon
1948
- min_distance = min(min_distance, distance)
1949
-
1950
- return min_distance if min_distance != np.inf else 0.0
1951
-
1952
-
1953
- def closest_box_distance(
1954
- box1: List[float], box2: List[float], image_size: Tuple[int, int]
1955
- ) -> float:
1956
- """'closest_box_distance' calculates the closest distance between two bounding boxes.
1957
-
1958
- Parameters:
1959
- box1 (List[float]): The first bounding box.
1960
- box2 (List[float]): The second bounding box.
1961
- image_size (Tuple[int, int]): The size of the image given as (height, width).
1962
-
1963
- Returns:
1964
- float: The closest distance between the two bounding boxes.
1965
-
1966
- Example
1967
- -------
1968
- >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
1969
- 141.42
1970
- """
1971
-
1972
- x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
1973
- x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
1974
-
1975
- horizontal_distance = np.max([0, x21 - x12, x11 - x22])
1976
- vertical_distance = np.max([0, y21 - y12, y11 - y22])
1977
- return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1625
+ return return_data
1978
1626
 
1979
1627
 
1980
1628
  def flux_image_inpainting(
@@ -2064,6 +1712,12 @@ def flux_image_inpainting(
2064
1712
  )
2065
1713
 
2066
1714
  output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
1715
+ _display_tool_trace(
1716
+ flux_image_inpainting.__name__,
1717
+ payload,
1718
+ output_image,
1719
+ files,
1720
+ )
2067
1721
  return output_image
2068
1722
 
2069
1723
 
@@ -2106,9 +1760,124 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
2106
1760
  metadata_payload={"function_name": "siglip_classification"},
2107
1761
  )
2108
1762
 
1763
+ _display_tool_trace(
1764
+ siglip_classification.__name__,
1765
+ payload,
1766
+ response,
1767
+ files,
1768
+ )
2109
1769
  return response
2110
1770
 
2111
1771
 
1772
+ def minimum_distance(
1773
+ det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
1774
+ ) -> float:
1775
+ """'minimum_distance' calculates the minimum distance between two detections which
1776
+ can include bounding boxes and or masks. This will return the closest distance
1777
+ between the objects, not the distance between the centers of the objects.
1778
+
1779
+ Parameters:
1780
+ det1 (Dict[str, Any]): The first detection of boxes or masks.
1781
+ det2 (Dict[str, Any]): The second detection of boxes or masks.
1782
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
1783
+
1784
+ Returns:
1785
+ float: The closest distance between the two detections.
1786
+
1787
+ Example
1788
+ -------
1789
+ >>> minimum_distance(det1, det2, image_size)
1790
+ 141.42
1791
+ """
1792
+
1793
+ if "mask" in det1 and "mask" in det2:
1794
+ return closest_mask_distance(det1["mask"], det2["mask"])
1795
+ elif "bbox" in det1 and "bbox" in det2:
1796
+ return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
1797
+ else:
1798
+ raise ValueError("Both detections must have either bbox or mask")
1799
+
1800
+
1801
+ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
1802
+ """'closest_mask_distance' calculates the closest distance between two masks.
1803
+
1804
+ Parameters:
1805
+ mask1 (np.ndarray): The first mask.
1806
+ mask2 (np.ndarray): The second mask.
1807
+
1808
+ Returns:
1809
+ float: The closest distance between the two masks.
1810
+
1811
+ Example
1812
+ -------
1813
+ >>> closest_mask_distance(mask1, mask2)
1814
+ 0.5
1815
+ """
1816
+
1817
+ mask1 = np.clip(mask1, 0, 1)
1818
+ mask2 = np.clip(mask2, 0, 1)
1819
+ contours1, _ = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1820
+ contours2, _ = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
1821
+ largest_contour1 = max(contours1, key=cv2.contourArea)
1822
+ largest_contour2 = max(contours2, key=cv2.contourArea)
1823
+ polygon1 = cv2.approxPolyDP(largest_contour1, 1.0, True)
1824
+ polygon2 = cv2.approxPolyDP(largest_contour2, 1.0, True)
1825
+ min_distance = np.inf
1826
+
1827
+ small_polygon, larger_contour = (
1828
+ (polygon1, largest_contour2)
1829
+ if len(largest_contour1) < len(largest_contour2)
1830
+ else (polygon2, largest_contour1)
1831
+ )
1832
+
1833
+ # For each point in the first polygon
1834
+ for point in small_polygon:
1835
+ # Calculate the distance to the second polygon, -1 is to invert result as point inside the polygon is positive
1836
+
1837
+ distance = (
1838
+ cv2.pointPolygonTest(
1839
+ larger_contour, (point[0, 0].item(), point[0, 1].item()), True
1840
+ )
1841
+ * -1
1842
+ )
1843
+
1844
+ # If the distance is negative, the point is inside the polygon, so the distance is 0
1845
+ if distance < 0:
1846
+ continue
1847
+ else:
1848
+ # Update the minimum distance if the point is outside the polygon
1849
+ min_distance = min(min_distance, distance)
1850
+
1851
+ return min_distance if min_distance != np.inf else 0.0
1852
+
1853
+
1854
+ def closest_box_distance(
1855
+ box1: List[float], box2: List[float], image_size: Tuple[int, int]
1856
+ ) -> float:
1857
+ """'closest_box_distance' calculates the closest distance between two bounding boxes.
1858
+
1859
+ Parameters:
1860
+ box1 (List[float]): The first bounding box.
1861
+ box2 (List[float]): The second bounding box.
1862
+ image_size (Tuple[int, int]): The size of the image given as (height, width).
1863
+
1864
+ Returns:
1865
+ float: The closest distance between the two bounding boxes.
1866
+
1867
+ Example
1868
+ -------
1869
+ >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
1870
+ 141.42
1871
+ """
1872
+
1873
+ x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
1874
+ x21, y21, x22, y22 = denormalize_bbox(box2, image_size)
1875
+
1876
+ horizontal_distance = np.max([0, x21 - x12, x11 - x22])
1877
+ vertical_distance = np.max([0, y21 - y12, y11 - y22])
1878
+ return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
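Working the docstring example above through the formula, treating the coordinates as already being in pixel space (i.e., skipping the denormalize_bbox step):

    x11, y11, x12, y12 = 100, 100, 200, 200              # box1
    x21, y21, x22, y22 = 300, 300, 400, 400              # box2
    horizontal_distance = max(0, x21 - x12, x11 - x22)   # max(0, 100, -300) = 100
    vertical_distance = max(0, y21 - y12, y11 - y22)     # max(0, 100, -300) = 100
    (horizontal_distance**2 + vertical_distance**2) ** 0.5   # sqrt(20000) ~= 141.42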
1879
+
1880
+
2112
1881
  # Utility and visualization functions
2113
1882
 
2114
1883