vision-agent 0.2.47__py3-none-any.whl → 0.2.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +5 -1
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/tools.py +39 -31
- vision_agent/utils/execute.py +3 -0
- vision_agent/utils/video.py +0 -2
- {vision_agent-0.2.47.dist-info → vision_agent-0.2.49.dist-info}/METADATA +1 -1
- {vision_agent-0.2.47.dist-info → vision_agent-0.2.49.dist-info}/RECORD +9 -9
- {vision_agent-0.2.47.dist-info → vision_agent-0.2.49.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.47.dist-info → vision_agent-0.2.49.dist-info}/WHEEL +0 -0
@@ -36,7 +36,11 @@ logging.basicConfig(stream=sys.stdout)
|
|
36
36
|
_LOGGER = logging.getLogger(__name__)
|
37
37
|
_MAX_TABULATE_COL_WIDTH = 80
|
38
38
|
_CONSOLE = Console()
|
39
|
-
_DEFAULT_IMPORT = "\n".join(T.__new_tools__)
|
39
|
+
_DEFAULT_IMPORT = "\n".join(T.__new_tools__) + "\n".join(
|
40
|
+
[
|
41
|
+
"from typing import *",
|
42
|
+
]
|
43
|
+
)
|
40
44
|
|
41
45
|
|
42
46
|
def get_diff(before: str, after: str) -> str:
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -5,12 +5,13 @@ import logging
|
|
5
5
|
import tempfile
|
6
6
|
from importlib import resources
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Any, Callable, Dict, List, Tuple, Union, cast
|
8
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
9
9
|
|
10
10
|
import cv2
|
11
11
|
import numpy as np
|
12
12
|
import pandas as pd
|
13
13
|
import requests
|
14
|
+
from moviepy.editor import ImageSequenceClip
|
14
15
|
from PIL import Image, ImageDraw, ImageFont
|
15
16
|
|
16
17
|
from vision_agent.tools.tool_utils import _send_inference_request
|
@@ -545,24 +546,49 @@ def save_image(image: np.ndarray) -> str:
|
|
545
546
|
>>> save_image(image)
|
546
547
|
"/tmp/tmpabc123.png"
|
547
548
|
"""
|
549
|
+
from IPython.display import display
|
548
550
|
|
551
|
+
pil_image = Image.fromarray(image.astype(np.uint8))
|
552
|
+
display(pil_image)
|
549
553
|
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
|
550
|
-
pil_image = Image.fromarray(image.astype(np.uint8))
|
551
554
|
pil_image.save(f, "PNG")
|
552
555
|
return f.name
|
553
556
|
|
554
557
|
|
555
|
-
def
|
556
|
-
|
557
|
-
|
558
|
+
def save_video(
|
559
|
+
frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 4
|
560
|
+
) -> str:
|
561
|
+
"""'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
|
558
562
|
|
559
563
|
Parameters:
|
560
|
-
|
564
|
+
frames (list[np.ndarray]): A list of frames to save.
|
565
|
+
output_video_path (str): The path to save the video file. If not provided, a temporary file will be created.
|
566
|
+
fps (float): The number of frames composes a second in the video.
|
567
|
+
|
568
|
+
Returns:
|
569
|
+
str: The path to the saved video file.
|
561
570
|
|
562
571
|
Example
|
563
572
|
-------
|
564
|
-
>>>
|
573
|
+
>>> save_video(frames)
|
574
|
+
"/tmp/tmpvideo123.mp4"
|
565
575
|
"""
|
576
|
+
if fps <= 0:
|
577
|
+
_LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
|
578
|
+
fps = 4
|
579
|
+
with ImageSequenceClip(frames, fps=fps) as video:
|
580
|
+
if output_video_path:
|
581
|
+
f = open(output_video_path, "wb")
|
582
|
+
else:
|
583
|
+
f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) # type: ignore
|
584
|
+
video.write_videofile(f.name, codec="libx264")
|
585
|
+
f.close()
|
586
|
+
_save_video_to_result(f.name)
|
587
|
+
return f.name
|
588
|
+
|
589
|
+
|
590
|
+
def _save_video_to_result(video_uri: str) -> None:
|
591
|
+
"""Saves a video into the result of the code execution (as an intermediate output)."""
|
566
592
|
from IPython.display import display
|
567
593
|
|
568
594
|
serializer = FileSerializer(video_uri)
|
@@ -595,8 +621,6 @@ def overlay_bounding_boxes(
|
|
595
621
|
image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
|
596
622
|
)
|
597
623
|
"""
|
598
|
-
from IPython.display import display
|
599
|
-
|
600
624
|
pil_image = Image.fromarray(image.astype(np.uint8))
|
601
625
|
|
602
626
|
if len(set([box["label"] for box in bboxes])) > len(COLORS):
|
@@ -623,20 +647,14 @@ def overlay_bounding_boxes(
|
|
623
647
|
box = elt["bbox"]
|
624
648
|
scores = elt["score"]
|
625
649
|
|
626
|
-
box
|
627
|
-
|
628
|
-
|
629
|
-
int(box[2] * width),
|
630
|
-
int(box[3] * height),
|
631
|
-
]
|
650
|
+
# denormalize the box if it is normalized
|
651
|
+
box = denormalize_bbox(box, (height, width))
|
652
|
+
|
632
653
|
draw.rectangle(box, outline=color[label], width=4)
|
633
654
|
text = f"{label}: {scores:.2f}"
|
634
655
|
text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
|
635
656
|
draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
|
636
657
|
draw.text((box[0], box[1]), text, fill="black", font=font)
|
637
|
-
|
638
|
-
pil_image = pil_image.convert("RGB")
|
639
|
-
display(pil_image)
|
640
658
|
return np.array(pil_image)
|
641
659
|
|
642
660
|
|
@@ -668,8 +686,6 @@ def overlay_segmentation_masks(
|
|
668
686
|
}],
|
669
687
|
)
|
670
688
|
"""
|
671
|
-
from IPython.display import display
|
672
|
-
|
673
689
|
pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
|
674
690
|
|
675
691
|
if len(set([mask["label"] for mask in masks])) > len(COLORS):
|
@@ -690,9 +706,6 @@ def overlay_segmentation_masks(
|
|
690
706
|
np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
|
691
707
|
mask_img = Image.fromarray(np_mask.astype(np.uint8))
|
692
708
|
pil_image = Image.alpha_composite(pil_image, mask_img)
|
693
|
-
|
694
|
-
pil_image = pil_image.convert("RGB")
|
695
|
-
display(pil_image)
|
696
709
|
return np.array(pil_image)
|
697
710
|
|
698
711
|
|
@@ -723,8 +736,6 @@ def overlay_heat_map(
|
|
723
736
|
},
|
724
737
|
)
|
725
738
|
"""
|
726
|
-
from IPython.display import display
|
727
|
-
|
728
739
|
pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
|
729
740
|
|
730
741
|
if "heat_map" not in heat_map or len(heat_map["heat_map"]) == 0:
|
@@ -740,10 +751,7 @@ def overlay_heat_map(
|
|
740
751
|
combined = Image.alpha_composite(
|
741
752
|
pil_image.convert("RGBA"), overlay.resize(pil_image.size)
|
742
753
|
)
|
743
|
-
|
744
|
-
pil_image = combined.convert("RGB")
|
745
|
-
display(pil_image)
|
746
|
-
return np.array(pil_image)
|
754
|
+
return np.array(combined)
|
747
755
|
|
748
756
|
|
749
757
|
def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
|
@@ -805,7 +813,7 @@ TOOLS = [
|
|
805
813
|
save_json,
|
806
814
|
load_image,
|
807
815
|
save_image,
|
808
|
-
|
816
|
+
save_video,
|
809
817
|
overlay_bounding_boxes,
|
810
818
|
overlay_segmentation_masks,
|
811
819
|
overlay_heat_map,
|
@@ -818,7 +826,7 @@ UTILITIES_DOCSTRING = get_tool_documentation(
|
|
818
826
|
save_json,
|
819
827
|
load_image,
|
820
828
|
save_image,
|
821
|
-
|
829
|
+
save_video,
|
822
830
|
overlay_bounding_boxes,
|
823
831
|
overlay_segmentation_masks,
|
824
832
|
overlay_heat_map,
|
vision_agent/utils/execute.py
CHANGED
@@ -401,6 +401,8 @@ class CodeInterpreter(abc.ABC):
|
|
401
401
|
|
402
402
|
|
403
403
|
class E2BCodeInterpreter(CodeInterpreter):
|
404
|
+
KEEP_ALIVE_SEC: int = 300
|
405
|
+
|
404
406
|
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
405
407
|
super().__init__(*args, **kwargs)
|
406
408
|
assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set"
|
@@ -432,6 +434,7 @@ print(f"Vision Agent version: {va_version}")"""
|
|
432
434
|
retry=tenacity.retry_if_exception_type(TimeoutError),
|
433
435
|
)
|
434
436
|
def exec_cell(self, code: str) -> Execution:
|
437
|
+
self.interpreter.keep_alive(E2BCodeInterpreter.KEEP_ALIVE_SEC)
|
435
438
|
execution = self.interpreter.notebook.exec_cell(code, timeout=self.timeout)
|
436
439
|
return Execution.from_e2b_execution(execution)
|
437
440
|
|
vision_agent/utils/video.py
CHANGED
@@ -31,7 +31,6 @@ def play_video(video_base64: str) -> None:
|
|
31
31
|
# Display the first frame and wait for any key press to start the video
|
32
32
|
ret, frame = cap.read()
|
33
33
|
if ret:
|
34
|
-
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
35
34
|
cv2.imshow("Video Player", frame)
|
36
35
|
_LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
|
37
36
|
cv2.waitKey(0) # Wait for any key press
|
@@ -40,7 +39,6 @@ def play_video(video_base64: str) -> None:
|
|
40
39
|
ret, frame = cap.read()
|
41
40
|
if not ret:
|
42
41
|
break
|
43
|
-
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
44
42
|
cv2.imshow("Video Player", frame)
|
45
43
|
# Press 'q' to exit the video
|
46
44
|
if cv2.waitKey(200) & 0xFF == ord("q"):
|
@@ -11,7 +11,7 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
|
|
11
11
|
vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
|
12
12
|
vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
|
13
13
|
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
14
|
-
vision_agent/agent/vision_agent.py,sha256=
|
14
|
+
vision_agent/agent/vision_agent.py,sha256=X_LF2wRXVYAr8xMuJs3Omi8n06uVgLNgtF25sidKtfM,20424
|
15
15
|
vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
|
16
16
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
@@ -19,18 +19,18 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
|
|
19
19
|
vision_agent/llm/llm.py,sha256=UZ73GqQHE-NKOJWsrOTWfmdHYsbCBkJ5rZ7dhcSCHHw,5951
|
20
20
|
vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
|
21
21
|
vision_agent/lmm/lmm.py,sha256=NwcZYLTzi95LSMAk0sTtw7G_zBLa9lU-DHM5GUUCiK4,10622
|
22
|
-
vision_agent/tools/__init__.py,sha256=
|
22
|
+
vision_agent/tools/__init__.py,sha256=Sng6dChynJJCYWjraXXM0tep_VPdnYl3L9vb0HMy_Pc,1528
|
23
23
|
vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
|
24
24
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
25
25
|
vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
|
26
|
-
vision_agent/tools/tools.py,sha256=
|
26
|
+
vision_agent/tools/tools.py,sha256=IuTxw-08UodemQAmiIQWdwpqg_Cjf-opGuqtYHv8nuk,26583
|
27
27
|
vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
|
28
|
-
vision_agent/utils/execute.py,sha256=
|
28
|
+
vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
|
29
29
|
vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
|
30
30
|
vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
|
31
31
|
vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
|
32
|
-
vision_agent/utils/video.py,sha256=
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
36
|
-
vision_agent-0.2.
|
32
|
+
vision_agent/utils/video.py,sha256=BJ9fomy2giAl038JThQP1WQZ-u4J4J_nsZB7QEWvlcQ,8767
|
33
|
+
vision_agent-0.2.49.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
34
|
+
vision_agent-0.2.49.dist-info/METADATA,sha256=J8uaMXfLvURGCOujviCSb0aaCYOWQnAphcZHjD1bjWw,6817
|
35
|
+
vision_agent-0.2.49.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
36
|
+
vision_agent-0.2.49.dist-info/RECORD,,
|
File without changes
|
File without changes
|