vision-agent 0.2.110__py3-none-any.whl → 0.2.112__py3-none-any.whl
- vision_agent/agent/agent_utils.py +3 -8
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/agent/vision_agent_coder.py +28 -20
- vision_agent/agent/vision_agent_coder_prompts.py +9 -7
- vision_agent/agent/vision_agent_prompts.py +11 -10
- vision_agent/clients/http.py +15 -3
- vision_agent/clients/landing_public_api.py +14 -2
- vision_agent/tools/__init__.py +11 -5
- vision_agent/tools/meta_tools.py +1 -46
- vision_agent/tools/tool_utils.py +25 -10
- vision_agent/tools/tools.py +463 -99
- vision_agent/tools/tools_types.py +84 -0
- vision_agent/utils/exceptions.py +13 -0
- vision_agent/utils/execute.py +0 -1
- vision_agent/utils/image_utils.py +52 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/METADATA +1 -1
- vision_agent-0.2.112.dist-info/RECORD +33 -0
- vision_agent/tools/meta_tools_types.py +0 -30
- vision_agent-0.2.110.dist-info/RECORD +0 -33
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -2,35 +2,49 @@ import io
 import json
 import logging
 import tempfile
-from pathlib import Path
 from importlib import resources
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from uuid import UUID
 
 import cv2
-import requests
 import numpy as np
-
+import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener # type: ignore
+from pytube import YouTube # type: ignore
 
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
-    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
+    send_inference_request,
+)
+from vision_agent.tools.tools_types import (
+    BboxInput,
+    BboxInputBase64,
+    FineTuning,
+    Florencev2FtRequest,
+    JobStatus,
+    PromptTask,
 )
 from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
+    convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    frames_to_bytes,
     get_image_size,
     normalize_bbox,
-
+    numpy_to_bytes,
     rle_decode,
+    rle_decode_array,
 )
 
 register_heif_opener()
@@ -130,9 +144,9 @@ def owl_v2(
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in text
-    are separated by commas. It returns a list of bounding boxes with
-
+    prompt such as category names or referring expressions. The categories in text
+    prompt are separated by commas. It returns a list of bounding boxes with normalized
+    coordinates, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -183,10 +197,10 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can segment multiple objects given a
-
-
-
+    """'grounding_sam' is a tool that can segment multiple objects given a text prompt
+    such as category names or referring expressions. The categories in text prompt are
+    separated by commas or periods. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -243,52 +257,114 @@ def grounding_sam(
     return return_data
 
 
-def extract_frames(
-    video_uri: Union[str, Path], fps: float = 0.5
-) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path or youtube
-    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
-    time in seconds where the frame was captured. The frame is a numpy array.
+def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_sam2_image' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores of 1.0.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file or youtube link
-        fps (float, optional): The frame rate per second to extract the frames. Defaults
-            to 0.5.
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
 
     Returns:
-        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
-            as a numpy array and the timestamp in seconds.
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.
 
     Example
     -------
-        >>> extract_frames("path/to/video.mp4")
-        [(frame1, 0.0), (frame2, 0.5), ...]
+        >>> florence2_sam2_image("car, dinosaur", image)
+        [
+            {
+                'score': 1.0,
+                'label': 'dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
     """
+    buffer_bytes = numpy_to_bytes(image)
 
-    if str(video_uri).startswith(
-        (
-            "http://www.youtube.com/",
-            "https://www.youtube.com/",
-            "http://youtu.be/",
-            "https://youtu.be/",
-        )
-    ):
-        with tempfile.TemporaryDirectory() as temp_dir:
-            yt = YouTube(str(video_uri))
-            # Download the highest resolution video
-            video = (
-                yt.streams.filter(progressive=True, file_extension="mp4")
-                .order_by("resolution")
-                .desc()
-                .first()
-            )
-            if not video:
-                raise Exception("No suitable video stream found")
-            video_file_path = video.download(output_path=temp_dir)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_image",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for _, data_i in data["0"].items():
+        mask = rle_decode_array(data_i["mask"])
+        label = data_i["label"]
+        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+        return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+    return return_data
 
-            return extract_frames_from_video(video_file_path, fps)
 
-    return extract_frames_from_video(str(video_uri), fps)
+def florence2_sam2_video(
+    prompt: str, frames: List[np.ndarray]
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_sam2_video' is a tool that can segment and track multiple entities
+    in a video given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It only tracks
+    entities present in the first frame and only returns segmentation masks. It is
+    useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
+        and segment mask. The outer list represents each frame and the inner list is
+        the entities per frame. The label contains the object ID followed by the label
+        name. The objects are only identified in the first framed and tracked
+        throughout the video.
+
+    Example
+    -------
+        >>> florence2_sam2_video("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+        ]
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": prompt.split(","),
+        "function_name": "florence2_sam2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for frame_i in data.keys():
+        return_frame_data = []
+        for obj_id, data_j in data[frame_i].items():
+            mask = rle_decode_array(data_j["mask"])
+            label = obj_id + ": " + data_j["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return return_data
 
 
 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
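The hunk above introduces florence2_sam2_image and florence2_sam2_video. A minimal usage sketch follows (not part of the diff): it assumes the functions are imported straight from vision_agent.tools.tools as defined above, that LandingAI API credentials are configured (both tools call send_inference_request against the hosted florence2-sam2 endpoint), and it uses zero-filled arrays as stand-in images.

import numpy as np

from vision_agent.tools.tools import florence2_sam2_image, florence2_sam2_video

# Stand-in RGB image; in practice load a real photo as a numpy array.
image = np.zeros((480, 640, 3), dtype=np.uint8)

# Segment everything matching the comma-separated categories.
detections = florence2_sam2_image("car, dinosaur", image)
for det in detections:
    # Each entry carries 'score' (always 1.0), 'label', a normalized 'bbox'
    # and a binary 2D 'mask' with the same height and width as the input.
    print(det["label"], det["bbox"], det["mask"].shape)

# Track the same categories across frames; only objects visible in the
# first frame are tracked, which avoids double counting.
frames = [image.copy() for _ in range(4)]
tracks = florence2_sam2_video("car, dinosaur", frames)
for frame_idx, entities in enumerate(tracks):
    for entity in entities:
        print(frame_idx, entity["label"])  # e.g. "0: dinosaur"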
@@ -357,12 +433,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value
+            value, e.g. {count: 12} and a heat map for visaulization purposes.
 
     Example
     -------
         >>> loca_zero_shot_counting(image)
-        {'count':
+        {'count': 83,
+        'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 1],
+            ...,
+            [ 0, 0, 0, ..., 30, 35, 41],
+            [ 0, 0, 0, ..., 41, 47, 53],
+            [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
     """
 
     image_b64 = convert_to_b64(image)
@@ -387,12 +470,19 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value
+            value, e.g. {count: 12} and a heat map for visaulization purposes.
 
     Example
     -------
         >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
-        {'count':
+        {'count': 83,
+        'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 1],
+            ...,
+            [ 0, 0, 0, ..., 30, 35, 41],
+            [ 0, 0, 0, ..., 41, 47, 53],
+            [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
     """
 
     image_size = get_image_size(image)
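Both counting docstrings now document a heat_map entry next to count. A short, hedged sketch of how the richer return value might be consumed: it assumes configured API credentials, uses a zero-filled stand-in image, and assumes save_image (shown further below) accepts the single-channel uint8 array; the output path is illustrative.

import numpy as np

from vision_agent.tools.tools import loca_zero_shot_counting, save_image

image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real photo

result = loca_zero_shot_counting(image)
print("objects counted:", result["count"])

# The heat map comes back as a uint8 array, so it can be written out for
# visual inspection; 'heat.png' is just an example path.
save_image(result["heat_map"], "heat.png")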
@@ -409,8 +499,8 @@ def loca_visual_prompt_counting(
     return resp_data
 
 
-def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
-    """'florencev2_roberta_vqa' is a tool that takes an image and analyzes
+def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florence2_roberta_vqa' is a tool that takes an image and analyzes
     its contents, generates detailed captions and then tries to answer the given
     question using the generated context. It returns text as an answer to the question.
 
@@ -423,7 +513,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
 
     Example
     -------
-        >>> florencev2_roberta_vqa('What is the top left animal in this image?', image)
+        >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
         'white tiger'
     """
 
@@ -431,13 +521,73 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "question": prompt,
-        "function_name": "florencev2_roberta_vqa",
+        "function_name": "florence2_roberta_vqa",
     }
 
     answer = send_inference_request(data, "florence2-qa", v2=True)
     return answer # type: ignore
 
 
+def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> ixc25_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_image_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
+def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> ixc25_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_video_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
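A sketch of how the two new InternLM-XComposer2 VQA tools might be called (again not part of the diff): it assumes configured API credentials, a zero-filled stand-in image, and an illustrative video path, and it reuses extract_frames (moved further down in this file) to turn a video into frames.

import numpy as np

from vision_agent.tools.tools import extract_frames, ixc25_image_vqa, ixc25_video_vqa

image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in image
print(ixc25_image_vqa("What is the cat doing?", image))

# For video questions, sample frames first; extract_frames returns
# (frame, timestamp) tuples, and only the frame arrays are passed on.
frames_with_ts = extract_frames("path/to/video.mp4", fps=1)
frames = [frame for frame, _timestamp in frames_with_ts]
print(ixc25_video_vqa("Which football player made the goal?", frames))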
@@ -581,8 +731,8 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0] # type: ignore
 
 
-def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
-    """'florencev2_image_caption' is a tool that can caption or describe an image based
+def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florence2_image_caption' is a tool that can caption or describe an image based
     on its contents. It returns a text describing the image.
 
     Parameters:
@@ -595,7 +745,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
 
     Example
     -------
-        >>> florencev2_image_caption(image, False)
+        >>> florence2_image_caption(image, False)
         'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
@@ -603,17 +753,19 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     data = {
         "image": image_b64,
         "task": task,
-        "function_name": "florencev2_image_caption",
+        "function_name": "florence2_image_caption",
     }
 
     answer = send_inference_request(data, "florence2", v2=True)
     return answer[task] # type: ignore
 
 
-def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect
-    prompt such as
-
+def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect and count multiple
+    objects given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
+    probability scores of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -628,7 +780,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
 
     Example
     -------
-        >>> florencev2_object_detection('person looking at a coyote', image)
+        >>> florence2_object_detection('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -640,7 +792,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florencev2_object_detection",
+        "function_name": "florence2_object_detection",
     }
 
     detections = send_inference_request(data, "florence2", v2=True)
@@ -657,8 +809,8 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
     return return_data
 
 
-def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
+def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
     the text region as a bounding box with normalized coordinates, and confidence
     scores. The results are sorted from top-left to bottom right.
@@ -672,7 +824,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-        >>> florencev2_ocr(image)
+        >>> florence2_ocr(image)
         [
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
@@ -683,7 +835,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "task": "<OCR_WITH_REGION>",
-        "function_name": "florencev2_ocr",
+        "function_name": "florence2_ocr",
     }
 
     detections = send_inference_request(data, "florence2", v2=True)
@@ -1024,6 +1176,54 @@ def closest_box_distance(
 # Utility and visualization functions
 
 
+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 1
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video which can be a file path or youtube
+    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
+    time in seconds where the frame was captured. The frame is a numpy array.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file or youtube link
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 10.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+            as a numpy array and the timestamp in seconds.
+
+    Example
+    -------
+        >>> extract_frames("path/to/video.mp4")
+        [(frame1, 0.0), (frame2, 0.5), ...]
+    """
+
+    if str(video_uri).startswith(
+        (
+            "http://www.youtube.com/",
+            "https://www.youtube.com/",
+            "http://youtu.be/",
+            "https://youtu.be/",
+        )
+    ):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            yt = YouTube(str(video_uri))
+            # Download the highest resolution video
+            video = (
+                yt.streams.filter(progressive=True, file_extension="mp4")
+                .order_by("resolution")
+                .desc()
+                .first()
+            )
+            if not video:
+                raise Exception("No suitable video stream found")
+            video_file_path = video.download(output_path=temp_dir)
+
+            return extract_frames_from_video(video_file_path, fps)
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
 def save_json(data: Any, file_path: str) -> None:
     """'save_json' is a utility function that saves data as a JSON file. It is helpful
     for saving data that contains NumPy arrays which are not JSON serializable.
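Since extract_frames now lives with the other utility functions and accepts either a local path or a YouTube link, a brief sketch of the local-file case follows (the paths are illustrative; YouTube URLs additionally require pytube and network access, as the hunk above shows):

from vision_agent.tools.tools import extract_frames, save_video

# A local path is used here; http(s) YouTube links are also accepted and are
# downloaded to a temporary directory before sampling.
frames_with_ts = extract_frames("path/to/video.mp4", fps=1)
frames = [frame for frame, _timestamp in frames_with_ts]

# Re-encode the sampled frames; save_video returns the path it wrote to.
out_path = save_video(frames, "sampled.mp4", fps=1)
print(out_path, len(frames))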
@@ -1088,7 +1288,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
 
 
 def save_video(
-    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float =
+    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
 ) -> str:
     """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
 
@@ -1190,15 +1390,43 @@ def overlay_bounding_boxes(
     return np.array(pil_image)
 
 
+def _get_text_coords_from_mask(
+    mask: np.ndarray, v_gap: int = 10, h_gap: int = 10
+) -> Tuple[int, int]:
+    mask = mask.astype(np.uint8)
+    if np.sum(mask) == 0:
+        return (0, 0)
+
+    rows, cols = np.nonzero(mask)
+    top = rows.min()
+    bottom = rows.max()
+    left = cols.min()
+    right = cols.max()
+
+    if top - v_gap < 0:
+        if bottom + v_gap > mask.shape[0]:
+            top = top
+        else:
+            top = bottom + v_gap
+    else:
+        top = top - v_gap
+
+    return left + (right - left) // 2 - h_gap, top
+
+
 def overlay_segmentation_masks(
-
-
+    medias: Union[np.ndarray, List[np.ndarray]],
+    masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
+    draw_label: bool = True,
+) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_segmentation_masks' is a utility function that displays segmentation
     masks.
 
     Parameters:
-
-
+        medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
+            the masks on.
+        masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+            dictionaries containing the masks.
 
     Returns:
         np.ndarray: The image with the masks displayed.
@@ -1218,27 +1446,50 @@ def overlay_segmentation_masks(
         }],
     )
     """
-
+    medias_int: List[np.ndarray] = (
+        [medias] if isinstance(medias, np.ndarray) else medias
+    )
+    masks_int = [masks] if isinstance(masks[0], dict) else masks
+    masks_int = cast(List[List[Dict[str, Any]]], masks_int)
 
-
-
-
-
+    labels = set()
+    for mask_i in masks_int:
+        for mask_j in mask_i:
+            labels.add(mask_j["label"])
+    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
 
-
-
-
-
-
+    width, height = Image.fromarray(medias_int[0]).size
+    fontsize = max(12, int(min(width, height) / 40))
+    font = ImageFont.truetype(
+        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+        fontsize,
+    )
 
-
-
-
-
-
-
-
-
+    frame_out = []
+    for i, frame in enumerate(medias_int):
+        pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
+        for elt in masks_int[i]:
+            mask = elt["mask"]
+            label = elt["label"]
+            np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+            np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+            mask_img = Image.fromarray(np_mask.astype(np.uint8))
+            pil_image = Image.alpha_composite(pil_image, mask_img)
+
+            if draw_label:
+                draw = ImageDraw.Draw(pil_image)
+                text_box = draw.textbbox((0, 0), text=label, font=font)
+                x, y = _get_text_coords_from_mask(
+                    mask,
+                    v_gap=(text_box[3] - text_box[1]) + 10,
+                    h_gap=(text_box[2] - text_box[0]) // 2,
+                )
+                if x != 0 and y != 0:
+                    text_box = draw.textbbox((x, y), text=label, font=font)
+                    draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
+                    draw.text((x, y), label, fill="black", font=font)
+        frame_out.append(np.array(pil_image))
+    return frame_out[0] if len(frame_out) == 1 else frame_out
 
 
 def overlay_heat_map(
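overlay_segmentation_masks now accepts either one image with one list of masks or a list of frames with per-frame masks, and label drawing can be turned off. A sketch of both call patterns, using the segmentation tools added earlier in this diff; it assumes API credentials, uses zero-filled stand-in arrays, and the output paths are illustrative.

import numpy as np

from vision_agent.tools.tools import (
    florence2_sam2_image,
    florence2_sam2_video,
    overlay_segmentation_masks,
    save_image,
    save_video,
)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in image

# Single image in, single annotated image out.
detections = florence2_sam2_image("car", image)
annotated = overlay_segmentation_masks(image, detections)
save_image(annotated, "overlay.png")

# List of frames in, list of annotated frames out; labels suppressed.
frames = [image.copy() for _ in range(4)]
tracks = florence2_sam2_video("car", frames)
annotated_frames = overlay_segmentation_masks(frames, tracks, draw_label=False)
save_video(annotated_frames, "overlay.mp4", fps=1)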
@@ -1286,9 +1537,121 @@ def overlay_heat_map(
     return np.array(combined)
 
 
+# TODO: add this function to the imports so that is picked in the agent
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
+    to detect objects in an image based on a given dataset. It returns the fine
+    tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "OBJECT_DETECTION"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_input = PromptTask[task]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_input, fine_tuning_request
+    )
+
+
+# TODO: add this function to the imports so that is picked in the agent
+def florencev2_fine_tuned_object_detection(
+    image: np.ndarray, prompt: str, model_id: UUID, task: str
+) -> List[Dict[str, Any]]:
+    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
+    to detect objects given a text prompt such as a phrase or class names separated by
+    commas. It returns a list of detected objects as labels and their location as
+    bounding boxes with score of 1.0.
+
+    Parameters:
+        image (np.ndarray): The image to used to detect objects.
+        prompt (str): The prompt to help find objects in the image.
+        model_id (UUID): The fine-tuned model id.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded
+
+    Example
+    -------
+        >>> florencev2_fine_tuned_object_detection(
+            image,
+            'person looking at a coyote',
+            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
+        )
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        ]
+    """
+    # check if job succeeded first
+    landing_api = LandingPublicAPI()
+    status = landing_api.check_fine_tuning_job(model_id)
+    if status is not JobStatus.SUCCEEDED:
+        raise FineTuneModelIsNotReady()
+
+    task = PromptTask[task]
+    if task is PromptTask.OBJECT_DETECTION:
+        prompt = ""
+
+    data_obj = Florencev2FtRequest(
+        image=convert_to_b64(image),
+        task=task,
+        tool="florencev2_fine_tuning",
+        prompt=prompt,
+        fine_tuning=FineTuning(job_id=model_id),
+    )
+    data = data_obj.model_dump(by_alias=True)
+    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
+    detections = send_inference_request(
+        data, "tools", v2=False, metadata_payload=metadata_payload
+    )
+
+    detections = detections[task.value]
+    return_data = []
+    image_size = image.shape[:2]
+    for i in range(len(detections["bboxes"])):
+        return_data.append(
+            {
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 TOOLS = [
     owl_v2,
-    grounding_sam,
     extract_frames,
     ocr,
     clip,
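The two fine-tuning helpers are not yet registered in TOOLS (note the TODO comments above), so they have to be imported explicitly. A hedged sketch of the intended round trip; file names and labels are illustrative, LandingPublicAPI credentials are assumed, and the detection call raises FineTuneModelIsNotReady until the job has actually succeeded.

import numpy as np

from vision_agent.tools.tools import (
    florencev2_fine_tuned_object_detection,
    florencev2_fine_tuning,
)

# Launch a fine-tuning job from labeled bounding boxes in pixel coordinates.
job_id = florencev2_fine_tuning(
    [
        {"image_path": "screws_1.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]},
        {"image_path": "screws_2.png", "labels": ["screw"], "bboxes": [[120, 0, 300, 170]]},
    ],
    "OBJECT_DETECTION",
)

# Once the job has succeeded, run the fine-tuned model on a new image.
image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real image
detections = florencev2_fine_tuned_object_detection(
    image, "screw", job_id, "OBJECT_DETECTION"
)
print(detections)  # e.g. [{'score': 1.0, 'label': 'screw', 'bbox': [...]}, ...]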
@@ -1296,13 +1659,15 @@ TOOLS = [
     vit_nsfw_classification,
     loca_zero_shot_counting,
     loca_visual_prompt_counting,
-
-
-
+    florence2_image_caption,
+    florence2_ocr,
+    florence2_sam2_image,
+    florence2_sam2_video,
+    florence2_object_detection,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     detr_segmentation,
     depth_anything_v2,
-    generate_soft_edge_image,
-    dpt_hybrid_midas,
     generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
@@ -1313,7 +1678,6 @@ TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    template_match,
 ]
 TOOLS_DF = get_tools_df(TOOLS) # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore