vision-agent 0.2.110__py3-none-any.whl → 0.2.112__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- vision_agent/agent/agent_utils.py +3 -8
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/agent/vision_agent_coder.py +28 -20
- vision_agent/agent/vision_agent_coder_prompts.py +9 -7
- vision_agent/agent/vision_agent_prompts.py +11 -10
- vision_agent/clients/http.py +15 -3
- vision_agent/clients/landing_public_api.py +14 -2
- vision_agent/tools/__init__.py +11 -5
- vision_agent/tools/meta_tools.py +1 -46
- vision_agent/tools/tool_utils.py +25 -10
- vision_agent/tools/tools.py +463 -99
- vision_agent/tools/tools_types.py +84 -0
- vision_agent/utils/exceptions.py +13 -0
- vision_agent/utils/execute.py +0 -1
- vision_agent/utils/image_utils.py +52 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/METADATA +1 -1
- vision_agent-0.2.112.dist-info/RECORD +33 -0
- vision_agent/tools/meta_tools_types.py +0 -30
- vision_agent-0.2.110.dist-info/RECORD +0 -33
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -2,35 +2,49 @@ import io
 import json
 import logging
 import tempfile
-from pathlib import Path
 from importlib import resources
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from uuid import UUID

 import cv2
-import requests
 import numpy as np
-
+import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore

+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
-    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
+    send_inference_request,
+)
+from vision_agent.tools.tools_types import (
+    BboxInput,
+    BboxInputBase64,
+    FineTuning,
+    Florencev2FtRequest,
+    JobStatus,
+    PromptTask,
 )
 from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
+    convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    frames_to_bytes,
     get_image_size,
     normalize_bbox,
-
+    numpy_to_bytes,
     rle_decode,
+    rle_decode_array,
 )

 register_heif_opener()
@@ -130,9 +144,9 @@ def owl_v2(
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in text
-    are separated by commas. It returns a list of bounding boxes with
-
+    prompt such as category names or referring expressions. The categories in text
+    prompt are separated by commas. It returns a list of bounding boxes with normalized
+    coordinates, label names and associated probability scores.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -183,10 +197,10 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can segment multiple objects given a
-
-
-
+    """'grounding_sam' is a tool that can segment multiple objects given a text prompt
+    such as category names or referring expressions. The categories in text prompt are
+    separated by commas or periods. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -243,52 +257,114 @@ def grounding_sam(
     return return_data


-def
-
-
-
-
-    time in seconds where the frame was captured. The frame is a numpy array.
+def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_sam2_image' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores of 1.0.

     Parameters:
-
-
-            to 0.5.
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.

     Returns:
-        List[
-
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.

     Example
     -------
-    >>>
-    [
+    >>> florence2_sam2_image("car, dinosaur", image)
+    [
+        {
+            'score': 1.0,
+            'label': 'dinosaur',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
     """
+    buffer_bytes = numpy_to_bytes(image)

-
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-            .first()
-            )
-            if not video:
-                raise Exception("No suitable video stream found")
-            video_file_path = video.download(output_path=temp_dir)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_image",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for _, data_i in data["0"].items():
+        mask = rle_decode_array(data_i["mask"])
+        label = data_i["label"]
+        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+        return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+    return return_data

-            return extract_frames_from_video(video_file_path, fps)

-
+def florence2_sam2_video(
+    prompt: str, frames: List[np.ndarray]
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_sam2_video' is a tool that can segment and track multiple entities
+    in a video given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It only tracks
+    entities present in the first frame and only returns segmentation masks. It is
+    useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
+        and segment mask. The outer list represents each frame and the inner list is
+        the entities per frame. The label contains the object ID followed by the label
+        name. The objects are only identified in the first framed and tracked
+        throughout the video.
+
+    Example
+    -------
+    >>> florence2_sam2_video("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+    ]
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": prompt.split(","),
+        "function_name": "florence2_sam2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for frame_i in data.keys():
+        return_frame_data = []
+        for obj_id, data_j in data[frame_i].items():
+            mask = rle_decode_array(data_j["mask"])
+            label = obj_id + ": " + data_j["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return return_data


 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -357,12 +433,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:

     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-        value
+        value, e.g. {count: 12} and a heat map for visaulization purposes.

     Example
     -------
     >>> loca_zero_shot_counting(image)
-    {'count':
+    {'count': 83,
+    'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+        [ 0, 0, 0, ..., 0, 0, 0],
+        [ 0, 0, 0, ..., 0, 0, 1],
+        ...,
+        [ 0, 0, 0, ..., 30, 35, 41],
+        [ 0, 0, 0, ..., 41, 47, 53],
+        [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
     """

     image_b64 = convert_to_b64(image)
@@ -387,12 +470,19 @@ def loca_visual_prompt_counting(

     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-        value
+        value, e.g. {count: 12} and a heat map for visaulization purposes.

     Example
     -------
     >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
-    {'count':
+    {'count': 83,
+    'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+        [ 0, 0, 0, ..., 0, 0, 0],
+        [ 0, 0, 0, ..., 0, 0, 1],
+        ...,
+        [ 0, 0, 0, ..., 30, 35, 41],
+        [ 0, 0, 0, ..., 41, 47, 53],
+        [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
     """

     image_size = get_image_size(image)
@@ -409,8 +499,8 @@ def loca_visual_prompt_counting(
     return resp_data


-def
-    """'
+def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florence2_roberta_vqa' is a tool that takes an image and analyzes
     its contents, generates detailed captions and then tries to answer the given
     question using the generated context. It returns text as an answer to the question.

@@ -423,7 +513,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:

     Example
     -------
-    >>>
+    >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
     'white tiger'
     """

@@ -431,13 +521,73 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "question": prompt,
-        "function_name": "
+        "function_name": "florence2_roberta_vqa",
     }

     answer = send_inference_request(data, "florence2-qa", v2=True)
     return answer  # type: ignore


+def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> ixc25_image_vqa('What is the cat doing?', image)
+    'drinking milk'
+    """
+
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_image_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
+def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> ixc25_video_vqa('Which football player made the goal?', frames)
+    'Lionel Messi'
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_video_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
@@ -581,8 +731,8 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore


-def
-    """'
+def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florence2_image_caption' is a tool that can caption or describe an image based
     on its contents. It returns a text describing the image.

     Parameters:
@@ -595,7 +745,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->

     Example
     -------
-    >>>
+    >>> florence2_image_caption(image, False)
     'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
@@ -603,17 +753,19 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     data = {
         "image": image_b64,
         "task": task,
-        "function_name": "
+        "function_name": "florence2_image_caption",
     }

     answer = send_inference_request(data, "florence2", v2=True)
     return answer[task]  # type: ignore


-def
-    """'florencev2_object_detection' is a tool that can detect
-    prompt such as
-
+def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect and count multiple
+    objects given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
+    probability scores of 1.0.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -628,7 +780,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str

     Example
     -------
-    >>>
+    >>> florence2_object_detection('person looking at a coyote', image)
     [
         {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -640,7 +792,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "
+        "function_name": "florence2_object_detection",
     }

     detections = send_inference_request(data, "florence2", v2=True)
@@ -657,8 +809,8 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
     return return_data


-def
-    """'
+def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
     the text region as a bounding box with normalized coordinates, and confidence
     scores. The results are sorted from top-left to bottom right.
@@ -672,7 +824,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:

     Example
     -------
-    >>>
+    >>> florence2_ocr(image)
     [
         {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
     ]
@@ -683,7 +835,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "task": "<OCR_WITH_REGION>",
-        "function_name": "
+        "function_name": "florence2_ocr",
     }

     detections = send_inference_request(data, "florence2", v2=True)
@@ -1024,6 +1176,54 @@ def closest_box_distance(
 # Utility and visualization functions


+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 1
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video which can be a file path or youtube
+    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
+    time in seconds where the frame was captured. The frame is a numpy array.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file or youtube link
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 10.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+        as a numpy array and the timestamp in seconds.
+
+    Example
+    -------
+    >>> extract_frames("path/to/video.mp4")
+    [(frame1, 0.0), (frame2, 0.5), ...]
+    """
+
+    if str(video_uri).startswith(
+        (
+            "http://www.youtube.com/",
+            "https://www.youtube.com/",
+            "http://youtu.be/",
+            "https://youtu.be/",
+        )
+    ):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            yt = YouTube(str(video_uri))
+            # Download the highest resolution video
+            video = (
+                yt.streams.filter(progressive=True, file_extension="mp4")
+                .order_by("resolution")
+                .desc()
+                .first()
+            )
+            if not video:
+                raise Exception("No suitable video stream found")
+            video_file_path = video.download(output_path=temp_dir)
+
+            return extract_frames_from_video(video_file_path, fps)
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
 def save_json(data: Any, file_path: str) -> None:
     """'save_json' is a utility function that saves data as a JSON file. It is helpful
     for saving data that contains NumPy arrays which are not JSON serializable.
@@ -1088,7 +1288,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:


 def save_video(
-    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float =
+    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
 ) -> str:
     """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.

@@ -1190,15 +1390,43 @@ def overlay_bounding_boxes(
     return np.array(pil_image)


+def _get_text_coords_from_mask(
+    mask: np.ndarray, v_gap: int = 10, h_gap: int = 10
+) -> Tuple[int, int]:
+    mask = mask.astype(np.uint8)
+    if np.sum(mask) == 0:
+        return (0, 0)
+
+    rows, cols = np.nonzero(mask)
+    top = rows.min()
+    bottom = rows.max()
+    left = cols.min()
+    right = cols.max()
+
+    if top - v_gap < 0:
+        if bottom + v_gap > mask.shape[0]:
+            top = top
+        else:
+            top = bottom + v_gap
+    else:
+        top = top - v_gap
+
+    return left + (right - left) // 2 - h_gap, top
+
+
 def overlay_segmentation_masks(
-
-
+    medias: Union[np.ndarray, List[np.ndarray]],
+    masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
+    draw_label: bool = True,
+) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_segmentation_masks' is a utility function that displays segmentation
     masks.

     Parameters:
-
-
+        medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
+            the masks on.
+        masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+            dictionaries containing the masks.

     Returns:
         np.ndarray: The image with the masks displayed.
@@ -1218,27 +1446,50 @@ def overlay_segmentation_masks(
         }],
     )
     """
-
+    medias_int: List[np.ndarray] = (
+        [medias] if isinstance(medias, np.ndarray) else medias
+    )
+    masks_int = [masks] if isinstance(masks[0], dict) else masks
+    masks_int = cast(List[List[Dict[str, Any]]], masks_int)

-
-
-
-
+    labels = set()
+    for mask_i in masks_int:
+        for mask_j in mask_i:
+            labels.add(mask_j["label"])
+    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}

-
-
-
-
-
+    width, height = Image.fromarray(medias_int[0]).size
+    fontsize = max(12, int(min(width, height) / 40))
+    font = ImageFont.truetype(
+        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+        fontsize,
+    )

-
-
-
-
-
-
-
-
+    frame_out = []
+    for i, frame in enumerate(medias_int):
+        pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
+        for elt in masks_int[i]:
+            mask = elt["mask"]
+            label = elt["label"]
+            np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+            np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+            mask_img = Image.fromarray(np_mask.astype(np.uint8))
+            pil_image = Image.alpha_composite(pil_image, mask_img)
+
+            if draw_label:
+                draw = ImageDraw.Draw(pil_image)
+                text_box = draw.textbbox((0, 0), text=label, font=font)
+                x, y = _get_text_coords_from_mask(
+                    mask,
+                    v_gap=(text_box[3] - text_box[1]) + 10,
+                    h_gap=(text_box[2] - text_box[0]) // 2,
+                )
+                if x != 0 and y != 0:
+                    text_box = draw.textbbox((x, y), text=label, font=font)
+                    draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
+                    draw.text((x, y), label, fill="black", font=font)
+        frame_out.append(np.array(pil_image))
+    return frame_out[0] if len(frame_out) == 1 else frame_out


 def overlay_heat_map(
@@ -1286,9 +1537,121 @@ def overlay_heat_map(
     return np.array(combined)


+# TODO: add this function to the imports so that is picked in the agent
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
+    to detect objects in an image based on a given dataset. It returns the fine
+    tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+    >>> fine_tuning_job_id = florencev2_fine_tuning(
+        [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+        {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+        "OBJECT_DETECTION"
+    )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_input = PromptTask[task]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_input, fine_tuning_request
+    )
+
+
+# TODO: add this function to the imports so that is picked in the agent
+def florencev2_fine_tuned_object_detection(
+    image: np.ndarray, prompt: str, model_id: UUID, task: str
+) -> List[Dict[str, Any]]:
+    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
+    to detect objects given a text prompt such as a phrase or class names separated by
+    commas. It returns a list of detected objects as labels and their location as
+    bounding boxes with score of 1.0.
+
+    Parameters:
+        image (np.ndarray): The image to used to detect objects.
+        prompt (str): The prompt to help find objects in the image.
+        model_id (UUID): The fine-tuned model id.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+        bounding box of the detected objects with normalized coordinates between 0
+        and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+        top-left and xmax and ymax are the coordinates of the bottom-right of the
+        bounding box. The scores are always 1.0 and cannot be thresholded
+
+    Example
+    -------
+    >>> florencev2_fine_tuned_object_detection(
+        image,
+        'person looking at a coyote',
+        UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
+    )
+    [
+        {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+    ]
+    """
+    # check if job succeeded first
+    landing_api = LandingPublicAPI()
+    status = landing_api.check_fine_tuning_job(model_id)
+    if status is not JobStatus.SUCCEEDED:
+        raise FineTuneModelIsNotReady()
+
+    task = PromptTask[task]
+    if task is PromptTask.OBJECT_DETECTION:
+        prompt = ""
+
+    data_obj = Florencev2FtRequest(
+        image=convert_to_b64(image),
+        task=task,
+        tool="florencev2_fine_tuning",
+        prompt=prompt,
+        fine_tuning=FineTuning(job_id=model_id),
+    )
+    data = data_obj.model_dump(by_alias=True)
+    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
+    detections = send_inference_request(
+        data, "tools", v2=False, metadata_payload=metadata_payload
+    )
+
+    detections = detections[task.value]
+    return_data = []
+    image_size = image.shape[:2]
+    for i in range(len(detections["bboxes"])):
+        return_data.append(
+            {
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 TOOLS = [
     owl_v2,
-    grounding_sam,
     extract_frames,
     ocr,
     clip,
@@ -1296,13 +1659,15 @@ TOOLS = [
     vit_nsfw_classification,
     loca_zero_shot_counting,
     loca_visual_prompt_counting,
-
-
-
+    florence2_image_caption,
+    florence2_ocr,
+    florence2_sam2_image,
+    florence2_sam2_video,
+    florence2_object_detection,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     detr_segmentation,
     depth_anything_v2,
-    generate_soft_edge_image,
-    dpt_hybrid_midas,
     generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
@@ -1313,7 +1678,6 @@ TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    template_match,
 ]
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore