vision-agent 0.2.46__py3-none-any.whl → 0.2.48__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +5 -1
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/tools.py +36 -25
- vision_agent/utils/execute.py +8 -0
- vision_agent/utils/video.py +0 -2
- {vision_agent-0.2.46.dist-info → vision_agent-0.2.48.dist-info}/METADATA +1 -1
- {vision_agent-0.2.46.dist-info → vision_agent-0.2.48.dist-info}/RECORD +9 -9
- {vision_agent-0.2.46.dist-info → vision_agent-0.2.48.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.46.dist-info → vision_agent-0.2.48.dist-info}/WHEEL +0 -0
@@ -36,7 +36,11 @@ logging.basicConfig(stream=sys.stdout)
|
|
36
36
|
_LOGGER = logging.getLogger(__name__)
|
37
37
|
_MAX_TABULATE_COL_WIDTH = 80
|
38
38
|
_CONSOLE = Console()
|
39
|
-
_DEFAULT_IMPORT = "\n".join(T.__new_tools__)
|
39
|
+
_DEFAULT_IMPORT = "\n".join(T.__new_tools__) + "\n".join(
|
40
|
+
[
|
41
|
+
"from typing import *",
|
42
|
+
]
|
43
|
+
)
|
40
44
|
|
41
45
|
|
42
46
|
def get_diff(before: str, after: str) -> str:
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -5,12 +5,13 @@ import logging
|
|
5
5
|
import tempfile
|
6
6
|
from importlib import resources
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Any, Callable, Dict, List, Tuple, Union, cast
|
8
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
9
9
|
|
10
10
|
import cv2
|
11
11
|
import numpy as np
|
12
12
|
import pandas as pd
|
13
13
|
import requests
|
14
|
+
from moviepy.editor import ImageSequenceClip
|
14
15
|
from PIL import Image, ImageDraw, ImageFont
|
15
16
|
|
16
17
|
from vision_agent.tools.tool_utils import _send_inference_request
|
@@ -545,24 +546,49 @@ def save_image(image: np.ndarray) -> str:
|
|
545
546
|
>>> save_image(image)
|
546
547
|
"/tmp/tmpabc123.png"
|
547
548
|
"""
|
549
|
+
from IPython.display import display
|
548
550
|
|
551
|
+
pil_image = Image.fromarray(image.astype(np.uint8))
|
552
|
+
display(pil_image)
|
549
553
|
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
|
550
|
-
pil_image = Image.fromarray(image.astype(np.uint8))
|
551
554
|
pil_image.save(f, "PNG")
|
552
555
|
return f.name
|
553
556
|
|
554
557
|
|
555
|
-
def
|
556
|
-
|
557
|
-
|
558
|
+
def save_video(
|
559
|
+
frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 4
|
560
|
+
) -> str:
|
561
|
+
"""'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
|
558
562
|
|
559
563
|
Parameters:
|
560
|
-
|
564
|
+
frames (list[np.ndarray]): A list of frames to save.
|
565
|
+
output_video_path (str): The path to save the video file. If not provided, a temporary file will be created.
|
566
|
+
fps (float): The number of frames composes a second in the video.
|
567
|
+
|
568
|
+
Returns:
|
569
|
+
str: The path to the saved video file.
|
561
570
|
|
562
571
|
Example
|
563
572
|
-------
|
564
|
-
>>>
|
573
|
+
>>> save_video(frames)
|
574
|
+
"/tmp/tmpvideo123.mp4"
|
565
575
|
"""
|
576
|
+
if fps <= 0:
|
577
|
+
_LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
|
578
|
+
fps = 4
|
579
|
+
with ImageSequenceClip(frames, fps=fps) as video:
|
580
|
+
if output_video_path:
|
581
|
+
f = open(output_video_path, "wb")
|
582
|
+
else:
|
583
|
+
f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) # type: ignore
|
584
|
+
video.write_videofile(f.name, codec="libx264")
|
585
|
+
f.close()
|
586
|
+
_save_video_to_result(f.name)
|
587
|
+
return f.name
|
588
|
+
|
589
|
+
|
590
|
+
def _save_video_to_result(video_uri: str) -> None:
|
591
|
+
"""Saves a video into the result of the code execution (as an intermediate output)."""
|
566
592
|
from IPython.display import display
|
567
593
|
|
568
594
|
serializer = FileSerializer(video_uri)
|
@@ -595,8 +621,6 @@ def overlay_bounding_boxes(
|
|
595
621
|
image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
|
596
622
|
)
|
597
623
|
"""
|
598
|
-
from IPython.display import display
|
599
|
-
|
600
624
|
pil_image = Image.fromarray(image.astype(np.uint8))
|
601
625
|
|
602
626
|
if len(set([box["label"] for box in bboxes])) > len(COLORS):
|
@@ -634,9 +658,6 @@ def overlay_bounding_boxes(
|
|
634
658
|
text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
|
635
659
|
draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
|
636
660
|
draw.text((box[0], box[1]), text, fill="black", font=font)
|
637
|
-
|
638
|
-
pil_image = pil_image.convert("RGB")
|
639
|
-
display(pil_image)
|
640
661
|
return np.array(pil_image)
|
641
662
|
|
642
663
|
|
@@ -668,8 +689,6 @@ def overlay_segmentation_masks(
|
|
668
689
|
}],
|
669
690
|
)
|
670
691
|
"""
|
671
|
-
from IPython.display import display
|
672
|
-
|
673
692
|
pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
|
674
693
|
|
675
694
|
if len(set([mask["label"] for mask in masks])) > len(COLORS):
|
@@ -690,9 +709,6 @@ def overlay_segmentation_masks(
|
|
690
709
|
np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
|
691
710
|
mask_img = Image.fromarray(np_mask.astype(np.uint8))
|
692
711
|
pil_image = Image.alpha_composite(pil_image, mask_img)
|
693
|
-
|
694
|
-
pil_image = pil_image.convert("RGB")
|
695
|
-
display(pil_image)
|
696
712
|
return np.array(pil_image)
|
697
713
|
|
698
714
|
|
@@ -723,8 +739,6 @@ def overlay_heat_map(
|
|
723
739
|
},
|
724
740
|
)
|
725
741
|
"""
|
726
|
-
from IPython.display import display
|
727
|
-
|
728
742
|
pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
|
729
743
|
|
730
744
|
if "heat_map" not in heat_map or len(heat_map["heat_map"]) == 0:
|
@@ -740,10 +754,7 @@ def overlay_heat_map(
|
|
740
754
|
combined = Image.alpha_composite(
|
741
755
|
pil_image.convert("RGBA"), overlay.resize(pil_image.size)
|
742
756
|
)
|
743
|
-
|
744
|
-
pil_image = combined.convert("RGB")
|
745
|
-
display(pil_image)
|
746
|
-
return np.array(pil_image)
|
757
|
+
return np.array(combined)
|
747
758
|
|
748
759
|
|
749
760
|
def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
|
@@ -805,7 +816,7 @@ TOOLS = [
|
|
805
816
|
save_json,
|
806
817
|
load_image,
|
807
818
|
save_image,
|
808
|
-
|
819
|
+
save_video,
|
809
820
|
overlay_bounding_boxes,
|
810
821
|
overlay_segmentation_masks,
|
811
822
|
overlay_heat_map,
|
@@ -818,7 +829,7 @@ UTILITIES_DOCSTRING = get_tool_documentation(
|
|
818
829
|
save_json,
|
819
830
|
load_image,
|
820
831
|
save_image,
|
821
|
-
|
832
|
+
save_video,
|
822
833
|
overlay_bounding_boxes,
|
823
834
|
overlay_segmentation_masks,
|
824
835
|
overlay_heat_map,
|
vision_agent/utils/execute.py
CHANGED
@@ -401,6 +401,8 @@ class CodeInterpreter(abc.ABC):
|
|
401
401
|
|
402
402
|
|
403
403
|
class E2BCodeInterpreter(CodeInterpreter):
|
404
|
+
KEEP_ALIVE_SEC: int = 300
|
405
|
+
|
404
406
|
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
405
407
|
super().__init__(*args, **kwargs)
|
406
408
|
assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set"
|
@@ -426,7 +428,13 @@ print(f"Vision Agent version: {va_version}")"""
|
|
426
428
|
def restart_kernel(self) -> None:
|
427
429
|
self.interpreter.notebook.restart_kernel()
|
428
430
|
|
431
|
+
@tenacity.retry(
|
432
|
+
wait=tenacity.wait_exponential_jitter(),
|
433
|
+
stop=tenacity.stop_after_attempt(2),
|
434
|
+
retry=tenacity.retry_if_exception_type(TimeoutError),
|
435
|
+
)
|
429
436
|
def exec_cell(self, code: str) -> Execution:
|
437
|
+
self.interpreter.keep_alive(E2BCodeInterpreter.KEEP_ALIVE_SEC)
|
430
438
|
execution = self.interpreter.notebook.exec_cell(code, timeout=self.timeout)
|
431
439
|
return Execution.from_e2b_execution(execution)
|
432
440
|
|
vision_agent/utils/video.py
CHANGED
@@ -31,7 +31,6 @@ def play_video(video_base64: str) -> None:
|
|
31
31
|
# Display the first frame and wait for any key press to start the video
|
32
32
|
ret, frame = cap.read()
|
33
33
|
if ret:
|
34
|
-
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
35
34
|
cv2.imshow("Video Player", frame)
|
36
35
|
_LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
|
37
36
|
cv2.waitKey(0) # Wait for any key press
|
@@ -40,7 +39,6 @@ def play_video(video_base64: str) -> None:
|
|
40
39
|
ret, frame = cap.read()
|
41
40
|
if not ret:
|
42
41
|
break
|
43
|
-
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
44
42
|
cv2.imshow("Video Player", frame)
|
45
43
|
# Press 'q' to exit the video
|
46
44
|
if cv2.waitKey(200) & 0xFF == ord("q"):
|
@@ -11,7 +11,7 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
|
|
11
11
|
vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
|
12
12
|
vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
|
13
13
|
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
14
|
-
vision_agent/agent/vision_agent.py,sha256=
|
14
|
+
vision_agent/agent/vision_agent.py,sha256=X_LF2wRXVYAr8xMuJs3Omi8n06uVgLNgtF25sidKtfM,20424
|
15
15
|
vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
|
16
16
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
@@ -19,18 +19,18 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
|
|
19
19
|
vision_agent/llm/llm.py,sha256=UZ73GqQHE-NKOJWsrOTWfmdHYsbCBkJ5rZ7dhcSCHHw,5951
|
20
20
|
vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
|
21
21
|
vision_agent/lmm/lmm.py,sha256=NwcZYLTzi95LSMAk0sTtw7G_zBLa9lU-DHM5GUUCiK4,10622
|
22
|
-
vision_agent/tools/__init__.py,sha256=
|
22
|
+
vision_agent/tools/__init__.py,sha256=Sng6dChynJJCYWjraXXM0tep_VPdnYl3L9vb0HMy_Pc,1528
|
23
23
|
vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
|
24
24
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
25
25
|
vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
|
26
|
-
vision_agent/tools/tools.py,sha256=
|
26
|
+
vision_agent/tools/tools.py,sha256=Vpn2SxtjEcnztovat6qMiH52gFsDHo3ikEPrAT4e5yc,26639
|
27
27
|
vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
|
28
|
-
vision_agent/utils/execute.py,sha256=
|
28
|
+
vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
|
29
29
|
vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
|
30
30
|
vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
|
31
31
|
vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
|
32
|
-
vision_agent/utils/video.py,sha256=
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
36
|
-
vision_agent-0.2.
|
32
|
+
vision_agent/utils/video.py,sha256=BJ9fomy2giAl038JThQP1WQZ-u4J4J_nsZB7QEWvlcQ,8767
|
33
|
+
vision_agent-0.2.48.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
34
|
+
vision_agent-0.2.48.dist-info/METADATA,sha256=sJSWNAHN2-JMNb5hi4iA-HTzKNskLioIse9sdrMDuy4,6817
|
35
|
+
vision_agent-0.2.48.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
36
|
+
vision_agent-0.2.48.dist-info/RECORD,,
|
File without changes
|
File without changes
|