vision-agent 0.2.110__py3-none-any.whl → 0.2.112__py3-none-any.whl

@@ -2,35 +2,49 @@ import io
 import json
 import logging
 import tempfile
-from pathlib import Path
 from importlib import resources
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from uuid import UUID
 
 import cv2
-import requests
 import numpy as np
-from pytube import YouTube  # type: ignore
+import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore
 
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
-    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
+    send_inference_request,
+)
+from vision_agent.tools.tools_types import (
+    BboxInput,
+    BboxInputBase64,
+    FineTuning,
+    Florencev2FtRequest,
+    JobStatus,
+    PromptTask,
 )
 from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
+    convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    frames_to_bytes,
     get_image_size,
     normalize_bbox,
-    convert_quad_box_to_bbox,
+    numpy_to_bytes,
     rle_decode,
+    rle_decode_array,
 )
 
 register_heif_opener()
@@ -130,9 +144,9 @@ def owl_v2(
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in text prompt
-    are separated by commas. It returns a list of bounding boxes with
-    normalized coordinates, label names and associated probability scores.
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes with normalized
+    coordinates, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
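
A minimal usage sketch (editorial, not part of the package diff; the file name is hypothetical, and it assumes these tools are re-exported from vision_agent.tools):

# Sketch: detect objects with owl_v2 and visualize them.
import numpy as np
from PIL import Image

from vision_agent.tools import overlay_bounding_boxes, owl_v2, save_image

image = np.array(Image.open("workers.png"))  # hypothetical input file
dets = owl_v2("person, hard hat", image, box_threshold=0.10)
# Each detection is {'score': ..., 'label': ..., 'bbox': [xmin, ymin, xmax, ymax]}.
save_image(overlay_bounding_boxes(image, dets), "workers_annotated.png")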
@@ -183,10 +197,10 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can segment multiple objects given a
-    text prompt such as category names or referring expressions. The categories in text
-    prompt are separated by commas or periods. It returns a list of bounding boxes,
-    label names, mask file names and associated probability scores.
+    """'grounding_sam' is a tool that can segment multiple objects given a text prompt
+    such as category names or referring expressions. The categories in the text prompt
+    are separated by commas or periods. It returns a list of bounding boxes, label
+    names, mask file names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
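
A hedged sketch of working with the returned masks (editorial; it assumes the masks come back as binary numpy arrays, as in the florence2_sam2_image example added further below, and the file name is hypothetical):

# Sketch: per-object pixel area from grounding_sam masks.
import numpy as np
from PIL import Image

from vision_agent.tools import grounding_sam

image = np.array(Image.open("shelf.png"))  # hypothetical input file
for det in grounding_sam("box, bottle", image):
    area = int(np.sum(det["mask"]))  # binary 2D mask: 1 = object, 0 = background
    print(det["label"], round(det["score"], 2), area)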
@@ -243,52 +257,114 @@ def grounding_sam(
     return return_data
 
 
-def extract_frames(
-    video_uri: Union[str, Path], fps: float = 0.5
-) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path or youtube
-    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
-    time in seconds where the frame was captured. The frame is a numpy array.
+def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_sam2_image' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    masks and associated probability scores of 1.0.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file or youtube link
-        fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to 0.5.
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
 
     Returns:
-        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
-            as a numpy array and the timestamp in seconds.
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding
+            box. The mask is a binary 2D numpy array where 1 indicates the object and
+            0 indicates the background.
 
     Example
     -------
-        >>> extract_frames("path/to/video.mp4")
-        [(frame1, 0.0), (frame2, 0.5), ...]
+        >>> florence2_sam2_image("car, dinosaur", image)
+        [
+            {
+                'score': 1.0,
+                'label': 'dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
     """
+    buffer_bytes = numpy_to_bytes(image)
 
-    if str(video_uri).startswith(
-        (
-            "http://www.youtube.com/",
-            "https://www.youtube.com/",
-            "http://youtu.be/",
-            "https://youtu.be/",
-        )
-    ):
-        with tempfile.TemporaryDirectory() as temp_dir:
-            yt = YouTube(str(video_uri))
-            # Download the highest resolution video
-            video = (
-                yt.streams.filter(progressive=True, file_extension="mp4")
-                .order_by("resolution")
-                .desc()
-                .first()
-            )
-            if not video:
-                raise Exception("No suitable video stream found")
-            video_file_path = video.download(output_path=temp_dir)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_image",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for _, data_i in data["0"].items():
+        mask = rle_decode_array(data_i["mask"])
+        label = data_i["label"]
+        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+        return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+    return return_data
 
-            return extract_frames_from_video(video_file_path, fps)
 
-    return extract_frames_from_video(str(video_uri), fps)
+def florence2_sam2_video(
+    prompt: str, frames: List[np.ndarray]
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_sam2_video' is a tool that can segment and track multiple entities
+    in a video given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It only tracks
+    entities present in the first frame and only returns segmentation masks. It is
+    useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+            label and segmentation mask. The outer list represents each frame and the
+            inner list is the entities per frame. The label contains the object ID
+            followed by the label name. The objects are only identified in the first
+            frame and tracked throughout the video.
+
+    Example
+    -------
+        >>> florence2_sam2_video("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+        ]
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": prompt.split(","),
+        "function_name": "florence2_sam2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for frame_i in data.keys():
+        return_frame_data = []
+        for obj_id, data_j in data[frame_i].items():
+            mask = rle_decode_array(data_j["mask"])
+            label = obj_id + ": " + data_j["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return return_data
 
 
 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
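
The two new functions compose naturally with the video utilities elsewhere in this file. A hedged end-to-end sketch (editorial; the file names and frame rate are assumptions):

# Sketch: track entities through a video and render the masks back out.
from vision_agent.tools import (
    extract_frames,
    florence2_sam2_video,
    overlay_segmentation_masks,
    save_video,
)

frames = [frame for frame, _ in extract_frames("path/to/video.mp4", fps=1)]
masks_per_frame = florence2_sam2_video("car, dinosaur", frames)
num_entities = len(masks_per_frame[0])  # entities are fixed after the first frame
save_video(overlay_segmentation_masks(frames, masks_per_frame), "tracked.mp4", fps=1)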
@@ -357,12 +433,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value. E.g. {count: 12}.
+            value, e.g. {'count': 12}, plus a 'heat_map' key with a heat map for
+            visualization purposes.
 
     Example
     -------
         >>> loca_zero_shot_counting(image)
-        {'count': 45},
+        {'count': 83,
+        'heat_map': array([[ 0,  0,  0, ...,  0,  0,  0],
+            [ 0,  0,  0, ...,  0,  0,  0],
+            [ 0,  0,  0, ...,  0,  0,  1],
+            ...,
+            [ 0,  0,  0, ..., 30, 35, 41],
+            [ 0,  0,  0, ..., 41, 47, 53],
+            [ 0,  0,  0, ..., 53, 59, 64]], dtype=uint8)}
     """
 
     image_b64 = convert_to_b64(image)
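
Since the return value now carries a 'heat_map' entry, it can be fed to overlay_heat_map further down in this file. A hedged sketch (editorial; the input file is hypothetical, and passing the whole result dict assumes overlay_heat_map reads the 'heat_map' key):

# Sketch: count objects and visualize the new heat map output.
import numpy as np
from PIL import Image

from vision_agent.tools import loca_zero_shot_counting, overlay_heat_map, save_image

image = np.array(Image.open("crowd.png"))
result = loca_zero_shot_counting(image)
print(result["count"])
save_image(overlay_heat_map(image, result), "crowd_heat_map.png")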
@@ -387,12 +470,19 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value. E.g. {count: 12}.
+            value, e.g. {'count': 12}, plus a 'heat_map' key with a heat map for
+            visualization purposes.
 
     Example
     -------
         >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
-        {'count': 45},
+        {'count': 83,
+        'heat_map': array([[ 0,  0,  0, ...,  0,  0,  0],
+            [ 0,  0,  0, ...,  0,  0,  0],
+            [ 0,  0,  0, ...,  0,  0,  1],
+            ...,
+            [ 0,  0,  0, ..., 30, 35, 41],
+            [ 0,  0,  0, ..., 41, 47, 53],
+            [ 0,  0,  0, ..., 53, 59, 64]], dtype=uint8)}
     """
 
     image_size = get_image_size(image)
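
The visual prompt is a normalized bounding box drawn around one exemplar of the object to count; a short sketch under that assumption (editorial; 'image' as loaded in the previous sketch):

# Sketch: the bbox marks one exemplar object, in fractions of image size.
visual_prompt = {"bbox": [0.1, 0.1, 0.4, 0.42]}  # [xmin, ymin, xmax, ymax]
result = loca_visual_prompt_counting(image, visual_prompt)
print(result["count"])  # result["heat_map"] is also available, as above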
@@ -409,8 +499,8 @@
     return resp_data
 
 
-def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
-    """'florencev2_roberta_vqa' is a tool that takes an image and analyzes
+def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florence2_roberta_vqa' is a tool that takes an image and analyzes
     its contents, generates detailed captions and then tries to answer the given
     question using the generated context. It returns text as an answer to the question.
 
@@ -423,7 +513,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
 
     Example
     -------
-        >>> florencev2_roberta_vqa('What is the top left animal in this image ?', image)
+        >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
         'white tiger'
     """
 
@@ -431,13 +521,73 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "question": prompt,
-        "function_name": "florencev2_roberta_vqa",
+        "function_name": "florence2_roberta_vqa",
     }
 
     answer = send_inference_request(data, "florence2-qa", v2=True)
     return answer  # type: ignore
 
 
+def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary
+    images, including regular images or images of documents or presentations. It
+    returns text as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> ixc25_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_image_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
+def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary
+    videos, including regular videos or videos of documents or presentations. It
+    returns text as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> ixc25_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_video_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
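
A hedged sketch combining the new video VQA tool with extract_frames (editorial; the file name and frame rate are assumptions):

# Sketch: sample frames from a video, then ask a question about them.
from vision_agent.tools import extract_frames, ixc25_video_vqa

frames = [frame for frame, _ in extract_frames("path/to/match.mp4", fps=1)]
print(ixc25_video_vqa("Which football player made the goal?", frames))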
@@ -581,8 +731,8 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore
 
 
-def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
-    """'florencev2_image_caption' is a tool that can caption or describe an image based
+def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florence2_image_caption' is a tool that can caption or describe an image based
     on its contents. It returns a text describing the image.
 
     Parameters:
@@ -595,7 +745,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
 
     Example
     -------
-        >>> florencev2_image_caption(image, False)
+        >>> florence2_image_caption(image, False)
         'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
@@ -603,17 +753,19 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     data = {
         "image": image_b64,
         "task": task,
-        "function_name": "florencev2_image_caption",
+        "function_name": "florence2_image_caption",
     }
 
     answer = send_inference_request(data, "florence2", v2=True)
     return answer[task]  # type: ignore
 
 
-def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect objects given a text
-    prompt such as a phrase or class names separated by commas. It returns a list of
-    detected objects as labels and their location as bounding boxes with score of 1.0.
+def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_object_detection' is a tool that can detect and count multiple
+    objects given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
+    probability scores of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
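
A quick sketch of the detail_caption switch (editorial; the input file is hypothetical):

# Sketch: detail_caption toggles between a detailed and a brief caption task.
import numpy as np
from PIL import Image

from vision_agent.tools import florence2_image_caption

image = np.array(Image.open("cat.png"))
print(florence2_image_caption(image, detail_caption=True))   # longer description
print(florence2_image_caption(image, detail_caption=False))  # short caption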
@@ -628,7 +780,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
 
     Example
     -------
-        >>> florencev2_object_detection('person looking at a coyote', image)
+        >>> florence2_object_detection('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
@@ -640,7 +792,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florencev2_object_detection",
+        "function_name": "florence2_object_detection",
     }
 
     detections = send_inference_request(data, "florence2", v2=True)
@@ -657,8 +809,8 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
     return return_data
 
 
-def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
+def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
     the text region as a bounding box with normalized coordinates, and confidence
     scores. The results are sorted from top-left to bottom-right.
@@ -672,7 +824,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-        >>> florencev2_ocr(image)
+        >>> florence2_ocr(image)
         [
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
@@ -683,7 +835,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "task": "<OCR_WITH_REGION>",
-        "function_name": "florencev2_ocr",
+        "function_name": "florence2_ocr",
     }
 
     detections = send_inference_request(data, "florence2", v2=True)
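
Because the regions come back sorted top-left to bottom-right, they can be joined into a plain-text transcript. A hedged sketch (editorial; the input file is hypothetical):

# Sketch: collapse florence2_ocr regions into one text block.
import numpy as np
from PIL import Image

from vision_agent.tools import florence2_ocr

image = np.array(Image.open("receipt.png"))
text = "\n".join(region["label"] for region in florence2_ocr(image))
print(text)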
@@ -1024,6 +1176,54 @@ def closest_box_distance(
 # Utility and visualization functions
 
 
+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 1
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video, which can be a file path or a
+    YouTube link, and returns a list of tuples (frame, timestamp), where timestamp is
+    the relative time in seconds at which the frame was captured. The frame is a
+    numpy array.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file or YouTube link
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 1.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+            as a numpy array and the timestamp in seconds.
+
+    Example
+    -------
+        >>> extract_frames("path/to/video.mp4")
+        [(frame1, 0.0), (frame2, 1.0), ...]
+    """
+
+    if str(video_uri).startswith(
+        (
+            "http://www.youtube.com/",
+            "https://www.youtube.com/",
+            "http://youtu.be/",
+            "https://youtu.be/",
+        )
+    ):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            yt = YouTube(str(video_uri))
+            # Download the highest resolution video
+            video = (
+                yt.streams.filter(progressive=True, file_extension="mp4")
+                .order_by("resolution")
+                .desc()
+                .first()
+            )
+            if not video:
+                raise Exception("No suitable video stream found")
+            video_file_path = video.download(output_path=temp_dir)
+
+            return extract_frames_from_video(video_file_path, fps)
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
 def save_json(data: Any, file_path: str) -> None:
     """'save_json' is a utility function that saves data as a JSON file. It is helpful
     for saving data that contains NumPy arrays which are not JSON serializable.
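
The function accepts local paths and YouTube URLs interchangeably; a hedged sketch (editorial; both URIs are placeholders):

# Sketch: both call forms return (frame, timestamp) tuples at the requested fps.
from vision_agent.tools import extract_frames

for frame, ts in extract_frames("path/to/video.mp4", fps=1)[:3]:
    print(ts, frame.shape)
frames = extract_frames("https://youtu.be/some_video_id", fps=1)  # placeholder URL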
@@ -1088,7 +1288,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
 
 
 def save_video(
-    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 4
+    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
 ) -> str:
     """'save_video' is a utility function that saves a list of frames as an mp4 video file on disk.
 
@@ -1190,15 +1390,43 @@ def overlay_bounding_boxes(
     return np.array(pil_image)
 
 
+def _get_text_coords_from_mask(
+    mask: np.ndarray, v_gap: int = 10, h_gap: int = 10
+) -> Tuple[int, int]:
+    mask = mask.astype(np.uint8)
+    if np.sum(mask) == 0:
+        return (0, 0)
+
+    rows, cols = np.nonzero(mask)
+    top = rows.min()
+    bottom = rows.max()
+    left = cols.min()
+    right = cols.max()
+
+    if top - v_gap < 0:
+        if bottom + v_gap > mask.shape[0]:
+            top = top
+        else:
+            top = bottom + v_gap
+    else:
+        top = top - v_gap
+
+    return left + (right - left) // 2 - h_gap, top
+
+
 def overlay_segmentation_masks(
-    image: np.ndarray, masks: List[Dict[str, Any]]
-) -> np.ndarray:
+    medias: Union[np.ndarray, List[np.ndarray]],
+    masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
+    draw_label: bool = True,
+) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_segmentation_masks' is a utility function that displays segmentation
     masks.
 
     Parameters:
-        image (np.ndarray): The image to display the masks on.
-        masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+        medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
+            the masks on.
+        masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+            dictionaries containing the masks.
 
     Returns:
         np.ndarray: The image with the masks displayed.
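
The private helper anchors each label v_gap pixels above its mask (falling back to below the mask when it touches the top edge) and centers it horizontally, shifted left by h_gap. A toy check of that geometry (editorial sketch calling the module-private helper):

# Toy check: object spans rows 40-59, cols 20-79 of a 100x100 mask.
import numpy as np

mask = np.zeros((100, 100), dtype=np.uint8)
mask[40:60, 20:80] = 1
x, y = _get_text_coords_from_mask(mask, v_gap=10, h_gap=15)
print(x, y)  # (34, 30): centered at col 49 minus h_gap, 10 px above the mask top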
@@ -1218,27 +1446,50 @@ def overlay_segmentation_masks(
         }],
     )
     """
-    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
+    medias_int: List[np.ndarray] = (
+        [medias] if isinstance(medias, np.ndarray) else medias
+    )
+    masks_int = [masks] if isinstance(masks[0], dict) else masks
+    masks_int = cast(List[List[Dict[str, Any]]], masks_int)
 
-    if len(set([mask["label"] for mask in masks])) > len(COLORS):
-        _LOGGER.warning(
-            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
-        )
+    labels = set()
+    for mask_i in masks_int:
+        for mask_j in mask_i:
+            labels.add(mask_j["label"])
+    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
 
-    color = {
-        label: COLORS[i % len(COLORS)]
-        for i, label in enumerate(set([mask["label"] for mask in masks]))
-    }
-    masks = sorted(masks, key=lambda x: x["label"], reverse=True)
+    width, height = Image.fromarray(medias_int[0]).size
+    fontsize = max(12, int(min(width, height) / 40))
+    font = ImageFont.truetype(
+        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+        fontsize,
+    )
 
-    for elt in masks:
-        mask = elt["mask"]
-        label = elt["label"]
-        np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
-        np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
-        mask_img = Image.fromarray(np_mask.astype(np.uint8))
-        pil_image = Image.alpha_composite(pil_image, mask_img)
-    return np.array(pil_image)
+    frame_out = []
+    for i, frame in enumerate(medias_int):
+        pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
+        for elt in masks_int[i]:
+            mask = elt["mask"]
+            label = elt["label"]
+            np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+            np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+            mask_img = Image.fromarray(np_mask.astype(np.uint8))
+            pil_image = Image.alpha_composite(pil_image, mask_img)
+
+            if draw_label:
+                draw = ImageDraw.Draw(pil_image)
+                text_box = draw.textbbox((0, 0), text=label, font=font)
+                x, y = _get_text_coords_from_mask(
+                    mask,
+                    v_gap=(text_box[3] - text_box[1]) + 10,
+                    h_gap=(text_box[2] - text_box[0]) // 2,
+                )
+                if x != 0 and y != 0:
+                    text_box = draw.textbbox((x, y), text=label, font=font)
+                    draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
+                    draw.text((x, y), label, fill="black", font=font)
+        frame_out.append(np.array(pil_image))
+    return frame_out[0] if len(frame_out) == 1 else frame_out
 
 
 def overlay_heat_map(
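
One behavioral note on the rewritten overlay: the final unwrap keys on len(frame_out) == 1 rather than on the input type, so a one-element list of frames comes back as a bare array instead of a one-element list. Callers that always expect a list should handle that case.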
@@ -1286,9 +1537,121 @@ def overlay_heat_map(
     return np.array(combined)
 
 
+# TODO: add this function to the imports so that it is picked up by the agent
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
+    to detect objects in an image based on a given dataset. It returns the
+    fine-tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        UUID: The fine-tuning job id; this id will be used to retrieve the
+            fine-tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "OBJECT_DETECTION"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_input = PromptTask[task]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_input, fine_tuning_request
+    )
+
+
+# TODO: add this function to the imports so that it is picked up by the agent
+def florencev2_fine_tuned_object_detection(
+    image: np.ndarray, prompt: str, model_id: UUID, task: str
+) -> List[Dict[str, Any]]:
+    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine-tuned model
+    to detect objects given a text prompt such as a phrase or class names separated by
+    commas. It returns a list of detected objects as labels and their location as
+    bounding boxes with score of 1.0.
+
+    Parameters:
+        image (np.ndarray): The image used to detect objects.
+        prompt (str): The prompt to help find objects in the image.
+        model_id (UUID): The fine-tuned model id.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded.
+
+    Example
+    -------
+        >>> florencev2_fine_tuned_object_detection(
+            image,
+            'person looking at a coyote',
+            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
+        )
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
+        ]
+    """
+    # check if job succeeded first
+    landing_api = LandingPublicAPI()
+    status = landing_api.check_fine_tuning_job(model_id)
+    if status is not JobStatus.SUCCEEDED:
+        raise FineTuneModelIsNotReady()
+
+    task = PromptTask[task]
+    if task is PromptTask.OBJECT_DETECTION:
+        prompt = ""
+
+    data_obj = Florencev2FtRequest(
+        image=convert_to_b64(image),
+        task=task,
+        tool="florencev2_fine_tuning",
+        prompt=prompt,
+        fine_tuning=FineTuning(job_id=model_id),
+    )
+    data = data_obj.model_dump(by_alias=True)
+    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
+    detections = send_inference_request(
+        data, "tools", v2=False, metadata_payload=metadata_payload
+    )
+
+    detections = detections[task.value]
+    return_data = []
+    image_size = image.shape[:2]
+    for i in range(len(detections["bboxes"])):
+        return_data.append(
+            {
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 TOOLS = [
     owl_v2,
-    grounding_sam,
     extract_frames,
     ocr,
     clip,
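
Taken together, the two TODO-marked functions imply the following workflow. A hedged end-to-end sketch (editorial): since neither function is exported yet, the import path targets the module directly; the polling cadence is arbitrary; and JobStatus.FAILED is assumed to exist alongside SUCCEEDED.

# Sketch: launch a fine-tuning job, wait for it, then run the tuned detector.
import time

import numpy as np
from PIL import Image

from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.tools.tools import (
    florencev2_fine_tuned_object_detection,
    florencev2_fine_tuning,
)
from vision_agent.tools.tools_types import JobStatus

job_id = florencev2_fine_tuning(
    [{"image_path": "filename.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]}],
    "OBJECT_DETECTION",
)
api = LandingPublicAPI()
while api.check_fine_tuning_job(job_id) not in (JobStatus.SUCCEEDED, JobStatus.FAILED):
    time.sleep(30)  # polling cadence is an assumption

image = np.array(Image.open("filename.png"))
dets = florencev2_fine_tuned_object_detection(image, "screw", job_id, "OBJECT_DETECTION")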
@@ -1296,13 +1659,15 @@ TOOLS = [
     vit_nsfw_classification,
     loca_zero_shot_counting,
     loca_visual_prompt_counting,
-    florencev2_roberta_vqa,
-    florencev2_image_caption,
-    florencev2_ocr,
+    florence2_image_caption,
+    florence2_ocr,
+    florence2_sam2_image,
+    florence2_sam2_video,
+    florence2_object_detection,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     detr_segmentation,
     depth_anything_v2,
-    generate_soft_edge_image,
-    dpt_hybrid_midas,
     generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
@@ -1313,7 +1678,6 @@ TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    template_match,
 ]
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore