vision-agent 0.2.75__py3-none-any.whl → 0.2.77__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +9 -0
- vision_agent/tools/tools.py +373 -12
- vision_agent/utils/execute.py +23 -25
- {vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/METADATA +3 -2
- {vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/RECORD +7 -7
- {vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
@@ -12,10 +12,18 @@ from .tools import (
     closest_box_distance,
     closest_mask_distance,
     extract_frames,
+    florencev2_image_caption,
     get_tool_documentation,
+    florencev2_object_detection,
+    detr_segmentation,
+    depth_anything_v2,
+    generate_soft_edge_image,
+    dpt_hybrid_midas,
+    generate_pose_image,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    florencev2_roberta_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
@@ -27,6 +35,7 @@ from .tools import (
     save_image,
     save_json,
     save_video,
+    template_match,
     vit_image_classification,
     vit_nsfw_classification,
 )
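These exports make the new tools importable directly from vision_agent.tools. A minimal sketch, not part of the diff, assuming 0.2.77 is installed and that load_image reads a local path into a numpy array; the file name is hypothetical:

```python
# Sketch: importing a few of the tools newly exported from vision_agent.tools.
from vision_agent.tools import florencev2_image_caption, load_image

image = load_image("example.jpg")  # hypothetical local image
print(florencev2_image_caption(image, detail_caption=False))
```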
vision_agent/tools/tools.py
CHANGED
@@ -14,6 +14,7 @@ import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore

 from vision_agent.tools.tool_utils import send_inference_request
 from vision_agent.utils import extract_frames_from_video
@@ -126,7 +127,7 @@ def owl_v2(
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
-    are separated by commas
+    are separated by commas. It returns a list of bounding boxes with
     normalized coordinates, label names and associated probability scores.

     Parameters:
@@ -136,7 +137,6 @@ def owl_v2(
             to 0.10.
         iou_threshold (float, optional): The threshold for the Intersection over Union
             (IoU). Defaults to 0.10.
-        model_size (str, optional): The size of the model to use.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -180,7 +180,7 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can
+    """'grounding_sam' is a tool that can segment multiple objects given a
     text prompt such as category names or referring expressions. The categories in text
     prompt are separated by commas or periods. It returns a list of bounding boxes,
     label names, mask file names and associated probability scores.
@@ -242,12 +242,12 @@ def grounding_sam(
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 0.5
 ) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video
-    timestamp), where timestamp is the relative
-    captured. The frame is a numpy array.
+    """'extract_frames' extracts frames from a video which can be a file path or youtube
+    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
+    time in seconds where the frame was captured. The frame is a numpy array.

     Parameters:
-        video_uri (Union[str, Path]): The path to the video file
+        video_uri (Union[str, Path]): The path to the video file or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
             to 0.5.

@@ -261,6 +261,29 @@ def extract_frames(
         [(frame1, 0.0), (frame2, 0.5), ...]
     """

+    if str(video_uri).startswith(
+        (
+            "http://www.youtube.com/",
+            "https://www.youtube.com/",
+            "http://youtu.be/",
+            "https://youtu.be/",
+        )
+    ):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            yt = YouTube(str(video_uri))
+            # Download the highest resolution video
+            video = (
+                yt.streams.filter(progressive=True, file_extension="mp4")
+                .order_by("resolution")
+                .desc()
+                .first()
+            )
+            if not video:
+                raise Exception("No suitable video stream found")
+            video_file_path = video.download(output_path=temp_dir)
+
+            return extract_frames_from_video(video_file_path, fps)
+
     return extract_frames_from_video(str(video_uri), fps)


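With this change extract_frames accepts a YouTube link as well as a local path, downloading the highest-resolution progressive mp4 stream via pytube before sampling frames. A usage sketch, not part of the diff; the URL is a placeholder:

```python
# Sketch: frame extraction from a YouTube link (placeholder URL) at 1 frame/sec.
from vision_agent.tools import extract_frames

frames = extract_frames("https://www.youtube.com/watch?v=VIDEO_ID", fps=1.0)
for frame, timestamp in frames:
    print(f"{timestamp:.1f}s -> frame shape {frame.shape}")
```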
@@ -381,6 +404,35 @@ def loca_visual_prompt_counting(
     return resp_data


+def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florencev2_roberta_vqa' is a tool that takes an image and analyzes
+    its contents, generates detailed captions and then tries to answer the given
+    question using the generated context. It returns text as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> florencev2_roberta_vqa('What is the top left animal in this image ?', image)
+    'white tiger'
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "prompt": prompt,
+        "tool": "image_question_answering_with_context",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
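A short sketch of the new VQA tool, following the docstring example above; it is not part of the diff, and the input file name is hypothetical:

```python
# Sketch: Florence-2 + RoBERTa visual question answering on a loaded image.
from vision_agent.tools import florencev2_roberta_vqa, load_image

image = load_image("animals.jpg")  # hypothetical input image
answer = florencev2_roberta_vqa("What is the top left animal in this image?", image)
print(answer)
```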
@@ -391,8 +443,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         image (np.ndarray): The reference image used for the question

     Returns:
-        str: A string which is the answer to the given prompt.
-            image contains a cat sitting on a table with a bowl of milk.'}.
+        str: A string which is the answer to the given prompt.

     Example
     -------
@@ -521,6 +572,309 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore


+def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florencev2_image_caption' is a tool that can caption or describe an image based
+    on its contents. It returns a text describing the image.
+
+    Parameters:
+        image (np.ndarray): The image to caption
+        detail_caption (bool): If True, the caption will be as detailed as possible else
+            the caption will be a brief description.
+
+    Returns:
+        str: A string which is the caption for the given image.
+
+    Example
+    -------
+    >>> florencev2_image_caption(image, False)
+    'This image contains a cat sitting on a table with a bowl of milk.'
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "florence2_image_captioning",
+        "detail_caption": detail_caption,
+    }
+
+    answer = send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
+def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect common objects in an
+    image without any text prompt or thresholding. It returns a list of detected objects
+    as labels and their location as bounding boxes.
+
+    Parameters:
+        image (np.ndarray): The image to used to detect objects
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded
+
+    Example
+    -------
+    >>> florencev2_object_detection(image)
+    [
+        {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+        {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "object_detection",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+    for i in range(len(answer["bboxes"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "label": answer["labels"][i],
+                "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
+def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'detr_segmentation' is a tool that can segment common objects in an
+    image without any text prompt. It returns a list of detected objects
+    as labels, their regions as masks and their scores.
+
+    Parameters:
+        image (np.ndarray): The image used to segment things and objects
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label
+            and mask of the detected objects. The mask is binary 2D numpy array where 1
+            indicates the object and 0 indicates the background.
+
+    Example
+    -------
+    >>> detr_segmentation(image)
+    [
+        {
+            'score': 0.45,
+            'label': 'window',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+        {
+            'score': 0.70,
+            'label': 'bird',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "panoptic_segmentation",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+
+    for i in range(len(answer["scores"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "label": answer["labels"][i],
+                "mask": rle_decode(
+                    mask_rle=answer["masks"][i], shape=answer["mask_shape"][0]
+                ),
+            }
+        )
+    return return_data
+
+
+def depth_anything_v2(image: np.ndarray) -> np.ndarray:
+    """'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
+    depth image from a given RGB image. The returned depth image is monochrome and
+    represents depth values as pixel intesities with pixel values ranging from 0 to 255.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate depth image
+
+    Returns:
+        np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255.
+
+    Example
+    -------
+    >>> depth_anything_v2(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_depth",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
+    return return_data
+
+
+def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
+    """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
+    to generate a soft edge image (HED) from a given RGB image. The returned image is
+    monochrome and represents object boundaries as soft white edges on black background
+
+    Parameters:
+        image (np.ndarray): The image to used to generate soft edge image
+
+    Returns:
+        np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
+
+    Example
+    -------
+    >>> generate_soft_edge_image(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_hed",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
+    return return_data
+
+
+def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
+    """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
+    image. The returned RGB image is texture mapped image of the surface normals and the
+    RGB values represent the surface normals in the x, y, z directions.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate normal image
+
+    Returns:
+        np.ndarray: A mapped normal image with RGB pixel values indicating surface
+            normals in x, y, z directions.
+
+    Example
+    -------
+    >>> dpt_hybrid_midas(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_normal",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+    return return_data
+
+
+def generate_pose_image(image: np.ndarray) -> np.ndarray:
+    """'generate_pose_image' is a tool that generates a open pose bone/stick image from
+    a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
+    and background as black.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate pose image
+
+    Returns:
+        np.ndarray: A bone or pose image indicating the pose and keypoints
+
+    Example
+    -------
+    >>> generate_pose_image(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_pose",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+    return return_data
+
+
+def template_match(
+    image: np.ndarray, template_image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'template_match' is a tool that can detect all instances of a template in
+    a given image. It returns the locations of the detected template, a corresponding
+    similarity score of the same
+
+    Parameters:
+        image (np.ndarray): The image used for searching the template
+        template_image (np.ndarray): The template image or crop to search in the image
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score and
+            bounding box of the detected template with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+    >>> template_match(image, template)
+    [
+        {'score': 0.79, 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.38, 'bbox': [0.2, 0.21, 0.45, 0.5},
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    template_image_b64 = convert_to_b64(template_image)
+    data = {
+        "image": image_b64,
+        "template": template_image_b64,
+        "tool": "template_match",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+    for i in range(len(answer["bboxes"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
     """'closest_mask_distance' calculates the closest distance between two masks.

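The eight functions above all follow the same pattern: base64-encode the image, post it to the hosted "tools" inference endpoint via send_inference_request, and decode the response. A sketch, not part of the diff, chaining a few of them; it assumes network access, valid LandingAI credentials, and a hypothetical input file:

```python
# Sketch: running several of the new tools on one image.
from vision_agent.tools import (
    depth_anything_v2,
    detr_segmentation,
    florencev2_object_detection,
    load_image,
)

image = load_image("street.jpg")               # hypothetical input image
objects = florencev2_object_detection(image)   # list of {'score', 'label', 'bbox'}
segments = detr_segmentation(image)            # list of {'score', 'label', 'mask'}
depth = depth_anything_v2(image)               # uint8 grayscale depth map
print(len(objects), len(segments), depth.shape)
```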
@@ -733,7 +1087,7 @@ def overlay_bounding_boxes(
         image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
     )
     """
-    pil_image = Image.fromarray(image.astype(np.uint8))
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")

     if len(set([box["label"] for box in bboxes])) > len(COLORS):
         _LOGGER.warning(
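The added .convert("RGB") means overlay_bounding_boxes no longer assumes a 3-channel input before drawing. A sketch, not part of the diff, with a synthetic single-channel array; the expected output shape is an assumption based on that conversion:

```python
# Sketch: overlaying a box on a grayscale image, which the RGB conversion above
# now normalizes before drawing.
import numpy as np
from vision_agent.tools.tools import overlay_bounding_boxes

gray = np.zeros((480, 640), dtype=np.uint8)  # synthetic grayscale frame
out = overlay_bounding_boxes(
    gray, [{"score": 0.99, "label": "dinosaur", "bbox": [0.1, 0.11, 0.35, 0.4]}]
)
print(np.asarray(out).shape)  # expected (480, 640, 3)
```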
@@ -920,8 +1274,14 @@ TOOLS = [
     vit_nsfw_classification,
     loca_zero_shot_counting,
     loca_visual_prompt_counting,
-
-
+    florencev2_roberta_vqa,
+    florencev2_image_caption,
+    florencev2_object_detection,
+    detr_segmentation,
+    depth_anything_v2,
+    generate_soft_edge_image,
+    dpt_hybrid_midas,
+    generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
     save_json,
@@ -931,6 +1291,7 @@ TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
+    template_match,
 ]
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
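The new functions are also appended to the TOOLS registry, so they flow into the generated dataframe and descriptions the agent uses for tool selection. A quick sketch, not part of the diff:

```python
# Sketch: the module-level registry in vision_agent.tools.tools now includes
# the tools added in this release.
from vision_agent.tools.tools import TOOLS, depth_anything_v2, template_match

print(len(TOOLS))
print(template_match in TOOLS, depth_anything_v2 in TOOLS)  # expected: True True
```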
vision_agent/utils/execute.py
CHANGED
@@ -11,10 +11,9 @@ import tempfile
 import traceback
 import warnings
 from enum import Enum
-from io import IOBase
 from pathlib import Path
 from time import sleep
-from typing import
+from typing import Any, Dict, Iterable, List, Optional, Union

 import nbformat
 import tenacity
@@ -33,6 +32,7 @@ from typing_extensions import Self

 load_dotenv()
 _LOGGER = logging.getLogger(__name__)
+_SESSION_TIMEOUT = 300  # 5 minutes


 class MimeType(str, Enum):
@@ -403,11 +403,8 @@ class CodeInterpreter(abc.ABC):
         self.restart_kernel()
         return self.exec_cell(code)

-    def upload_file(self, file: Union[str, Path
+    def upload_file(self, file: Union[str, Path]) -> str:
         # Default behavior is a no-op (for local code interpreter)
-        assert not isinstance(
-            file, IO
-        ), "Don't pass IO objects to upload_file() of local interpreter"
         return str(file)

     def download_file(self, file_path: str) -> Path:
@@ -416,7 +413,6 @@ class CodeInterpreter(abc.ABC):


 class E2BCodeInterpreter(CodeInterpreter):
-    KEEP_ALIVE_SEC: int = 300

     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
@@ -437,8 +433,8 @@ print(f"Vision Agent version: {va_version}")"""
         _LOGGER.info(f"E2BCodeInterpreter initialized:\n{sys_versions}")

     def close(self, *args: Any, **kwargs: Any) -> None:
-        self.interpreter.notebook.close()
         self.interpreter.close()
+        self.interpreter.kill()

     def restart_kernel(self) -> None:
         self.interpreter.notebook.restart_kernel()
@@ -449,25 +445,27 @@ print(f"Vision Agent version: {va_version}")"""
         retry=tenacity.retry_if_exception_type(TimeoutError),
     )
     def exec_cell(self, code: str) -> Execution:
-        self.interpreter.
+        if not self.interpreter.is_running():
+            raise ConnectionResetError(
+                "Remote sandbox is closed unexpectedly. Please retry the operation."
+            )
+        self.interpreter.set_timeout(_SESSION_TIMEOUT)  # Extend the life of the sandbox
         execution = self.interpreter.notebook.exec_cell(code, timeout=self.timeout)
         return Execution.from_e2b_execution(execution)

-    def upload_file(self, file: Union[str, Path
-
-
-
-
-
-
-        file.close()
-        _LOGGER.info(f"File ({file}) is uploaded to: {file.name}")
+    def upload_file(self, file: Union[str, Path]) -> str:
+        file_name = Path(file).name
+        remote_path = f"/home/user/{file_name}"
+        with open(file, "rb") as f:
+            self.interpreter.files.write(path=remote_path, data=f)
+        _LOGGER.info(f"File ({file}) is uploaded to: {remote_path}")
+        return remote_path

     def download_file(self, file_path: str) -> Path:
-
-
-
-
+        with tempfile.NamedTemporaryFile(mode="w+b", delete=False) as file:
+            file.write(self.interpreter.files.read(path=file_path, format="bytes"))
+            _LOGGER.info(f"File ({file_path}) is downloaded to: {file.name}")
+            return Path(file.name)

     @staticmethod
     @tenacity.retry(
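Taken together, the execute.py changes replace the per-class keep-alive constant with a shared _SESSION_TIMEOUT, guard exec_cell against a dead sandbox, and implement real file transfer through the e2b files API. A sketch of the round trip, not part of the diff; it assumes CODE_SANDBOX_RUNTIME=e2b, a valid E2B API key, and a hypothetical local file:

```python
# Sketch: uploading a file into the sandbox, using it in a cell, and
# downloading it back. "data.csv" is a hypothetical local file.
from vision_agent.utils.execute import CodeInterpreterFactory

interpreter = CodeInterpreterFactory.new_instance()
remote_path = interpreter.upload_file("data.csv")      # -> /home/user/data.csv
execution = interpreter.exec_cell(f"print(open('{remote_path}').readline())")
local_copy = interpreter.download_file(remote_path)    # Path to a local temp copy
print(local_copy)
interpreter.close()
```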
@@ -480,7 +478,7 @@ print(f"Vision Agent version: {va_version}")"""


 class LocalCodeInterpreter(CodeInterpreter):
-    def __init__(self, timeout: int =
+    def __init__(self, timeout: int = _SESSION_TIMEOUT) -> None:
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
         self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
@@ -568,9 +566,9 @@ class CodeInterpreterFactory:
     @staticmethod
     def new_instance() -> CodeInterpreter:
         if os.getenv("CODE_SANDBOX_RUNTIME") == "e2b":
-            instance: CodeInterpreter = E2BCodeInterpreter(timeout=
+            instance: CodeInterpreter = E2BCodeInterpreter(timeout=_SESSION_TIMEOUT)
         else:
-            instance = LocalCodeInterpreter(timeout=
+            instance = LocalCodeInterpreter(timeout=_SESSION_TIMEOUT)
         atexit.register(instance.close)
         return instance

{vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.
+Version: 0.2.77
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,7 +10,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: e2b (>=0.17.1,<0.18.0)
-Requires-Dist: e2b-code-interpreter (
+Requires-Dist: e2b-code-interpreter (==0.0.11a1)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
@@ -23,6 +23,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+Requires-Dist: pytube (==15.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
{vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/RECORD
CHANGED
@@ -7,17 +7,17 @@ vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=bw24xyQJHGzmph5e-bKCiTh9AX6tRFI2OUd0mofxjZI,68
 vision_agent/lmm/lmm.py,sha256=TzzACjTP1MNSrHolUWY7fEJzdVfZELQyImRpT8IU_1E,11690
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=mF47kfi5X5jfboUxULJnWnFbv1M9uTmmCU3_0uBZVwk,1838
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=ZOY45bCX3nlo6iGwaZ8RVpRJB-vWxkXDed9oegT7-p0,1838
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=TkZqNYX-ocwdaCdXd6c6tysSa_HX2y6Nrgl4JKni4IQ,43661
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=DMaQz5-yULxDx-TlSMTRKOPHE7VmyR7PArhXXilm7h0,21368
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=ci6Eta73dDgLP1Ajtknbgmf1g8aAvBHqlVQvBuLMKXQ,4427
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.77.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.77.dist-info/METADATA,sha256=2GjXWlij7wzd19pzbjNRt__AVhpVtLcAe_WfGnydxTI,9433
+vision_agent-0.2.77.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.77.dist-info/RECORD,,
{vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/LICENSE
File without changes
{vision_agent-0.2.75.dist-info → vision_agent-0.2.77.dist-info}/WHEEL
File without changes