PyPI - vision-agent - Versions diffs - 0.2.125__tar.gz → 0.2.127__tar.gz - Mend

vision-agent 0.2.125.tar.gz → 0.2.127.tar.gz

Files changed (34) hide show

{vision_agent-0.2.125 → vision_agent-0.2.127}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.125
+Version: 0.2.127
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,11 +10,12 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
+Requires-Dist: av (>=11.0.0,<12.0.0)
 Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
 Requires-Dist: e2b-code-interpreter (==0.0.11a37)
+Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
-Requires-Dist: moviepy (>=1.0.0,<2.0.0)
 Requires-Dist: nbclient (>=0.10.0,<0.11.0)
 Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)

{vision_agent-0.2.125 → vision_agent-0.2.127}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.125"
+version = "0.2.127"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -25,7 +25,6 @@ tqdm = ">=4.64.0,<5.0.0"
 pandas = "2.*"
 openai = "1.*"
 typing_extensions = "4.*"
-moviepy = "1.*"
 opencv-python = "4.*"
 tabulate = "^0.9.0"
 pydantic-settings = "^2.2.1"
@@ -42,6 +41,8 @@ pillow-heif = "^0.16.0"
 pytube = "15.0.0"
 anthropic = "^0.31.0"
 pydantic = "2.7.4"
+eva-decord = "^0.6.1"
+av = "^11.0.0"
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"
@@ -100,10 +101,8 @@ show_error_codes = true
 ignore_missing_imports = true
 module = [
     "cv2.*",
-    "faiss.*",
     "openai.*",
     "sentence_transformers.*",
-    "moviepy.*",
     "e2b_code_interpreter.*",
     "e2b.*"
 ]

{vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/vision_agent_coder.py RENAMED Viewed

@@ -173,7 +173,7 @@ def pick_plan(
     if verbosity == 2:
         _print_code("Initial code and tests:", code)
-        _LOGGER.info(f"Initial code execution result:\n{tool_output.text()}")
+        _LOGGER.info(f"Initial code execution result:\n{tool_output_str}")
     log_progress(
         {

{vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/tools/tools.py RENAMED Viewed

@@ -12,7 +12,6 @@ from uuid import UUID
 import cv2
 import numpy as np
 import requests
-from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageEnhance, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
@@ -35,7 +34,6 @@ from vision_agent.tools.tools_types import (
     ODResponseData,
     PromptTask,
 )
-from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
@@ -44,13 +42,17 @@ from vision_agent.utils.image_utils import (
     convert_to_b64,
     denormalize_bbox,
     encode_image_bytes,
-    frames_to_bytes,
     get_image_size,
     normalize_bbox,
     numpy_to_bytes,
     rle_decode,
     rle_decode_array,
 )
+from vision_agent.utils.video import (
+    extract_frames_from_video,
+    frames_to_bytes,
+    video_writer,
+)
 register_heif_opener()
@@ -1513,17 +1515,16 @@ def save_video(
         "/tmp/tmpvideo123.mp4"
     """
     if fps <= 0:
-        _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
-        fps = 4
-    with ImageSequenceClip(frames, fps=fps) as video:
-        if output_video_path:
-            f = open(output_video_path, "wb")
-        else:
-            f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)  # type: ignore
-        video.write_videofile(f.name, codec="libx264")
-        f.close()
-        _save_video_to_result(f.name)
-    return f.name
+        raise ValueError(f"fps must be greater than 0 got {fps}")
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(
+            delete=False, suffix=".mp4"
+        ).name
+    output_video_path = video_writer(frames, fps, output_video_path)
+    _save_video_to_result(output_video_path)
+    return output_video_path
 def _save_video_to_result(video_uri: str) -> None:

{vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/__init__.py RENAMED Viewed

@@ -7,4 +7,4 @@ from .execute import (
     Result,
 )
 from .sim import AzureSim, OllamaSim, Sim, load_sim, merge_sim
-from .video import extract_frames_from_video
+from .video import extract_frames_from_video, video_writer

{vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/image_utils.py RENAMED Viewed

@@ -2,14 +2,12 @@
 import base64
 import io
-import tempfile
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Tuple, Union
 import numpy as np
-from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from PIL.Image import Image as ImageType
@@ -90,24 +88,6 @@ def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
     return binary_mask
-def frames_to_bytes(
-    frames: List[np.ndarray], fps: float = 10, file_ext: str = "mp4"
-) -> bytes:
-    r"""Convert a list of frames to a video file encoded into a byte string.
-    Parameters:
-        frames: the list of frames
-        fps: the frames per second of the video
-        file_ext: the file extension of the video file
-    """
-    with tempfile.NamedTemporaryFile(delete=True) as temp_file:
-        clip = ImageSequenceClip(frames, fps=fps)
-        clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, codec="libx264")
-        with open(temp_file.name + f".{file_ext}", "rb") as f:
-            buffer_bytes = f.read()
-    return buffer_bytes
 def b64_to_pil(b64_str: str) -> ImageType:
     r"""Convert a base64 string to a PIL Image.

vision_agent-0.2.127/vision_agent/utils/video.py ADDED Viewed

@@ -0,0 +1,126 @@
+import base64
+import logging
+import tempfile
+from functools import lru_cache
+from typing import List, Optional, Tuple
+import cv2
+import av  # type: ignore
+import numpy as np
+from decord import VideoReader  # type: ignore
+_LOGGER = logging.getLogger(__name__)
+# The maximum length of the clip to extract frames from, in seconds
+def play_video(video_base64: str) -> None:
+    """Play a video file"""
+    video_data = base64.b64decode(video_base64)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
+        temp_video.write(video_data)
+        temp_video_path = temp_video.name
+        cap = cv2.VideoCapture(temp_video_path)
+        if not cap.isOpened():
+            _LOGGER.error("Error: Could not open video.")
+            return
+        # Display the first frame and wait for any key press to start the video
+        ret, frame = cap.read()
+        if ret:
+            cv2.imshow("Video Player", frame)
+            _LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
+            cv2.waitKey(0)  # Wait for any key press
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            cv2.imshow("Video Player", frame)
+            # Press 'q' to exit the video
+            if cv2.waitKey(200) & 0xFF == ord("q"):
+                break
+        cap.release()
+        cv2.destroyAllWindows()
+def _resize_frame(frame: np.ndarray) -> np.ndarray:
+    height, width = frame.shape[:2]
+    new_width = width - (width % 2)
+    new_height = height - (height % 2)
+    return cv2.resize(frame, (new_width, new_height))
+def video_writer(
+    frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
+) -> str:
+    if filename is None:
+        filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+    container = av.open(filename, mode="w")
+    stream = container.add_stream("h264", rate=fps)
+    height, width = frames[0].shape[:2]
+    stream.height = height - (height % 2)
+    stream.width = width - (width % 2)
+    stream.pix_fmt = "yuv420p"
+    for frame in frames:
+        # Remove the alpha channel (convert RGBA to RGB)
+        frame_rgb = frame[:, :, :3]
+        # Resize the frame to make dimensions divisible by 2
+        frame_rgb = _resize_frame(frame_rgb)
+        av_frame = av.VideoFrame.from_ndarray(frame_rgb, format="rgb24")
+        for packet in stream.encode(av_frame):
+            container.mux(packet)
+    for packet in stream.encode():
+        container.mux(packet)
+    container.close()
+    return filename
+def frames_to_bytes(
+    frames: List[np.ndarray], fps: float = 10, file_ext: str = ".mp4"
+) -> bytes:
+    r"""Convert a list of frames to a video file encoded into a byte string.
+    Parameters:
+        frames: the list of frames
+        fps: the frames per second of the video
+        file_ext: the file extension of the video file
+    """
+    with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as temp_file:
+        video_writer(frames, fps, temp_file.name)
+        with open(temp_file.name, "rb") as f:
+            buffer_bytes = f.read()
+    return buffer_bytes
+# WARNING: this cache is a little dangerous because if the underlying video
+# contents change but the filename remains the same it will return the old file contents
+# but for vision agent it's unlikely to change the file contents while keeping the
+# same file name and the time savings are very large.
+@lru_cache(maxsize=8)
+def extract_frames_from_video(
+    video_uri: str, fps: float = 1.0
+) -> List[Tuple[np.ndarray, float]]:
+    """Extract frames from a video
+    Parameters:
+        video_uri (str): the path to the video file or a video file url
+        fps (float): the frame rate per second to extract the frames
+    Returns:
+        a list of tuples containing the extracted frame and the timestamp in seconds.
+            E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
+            from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
+            the video. The frames are sorted by the timestamp in ascending order.
+    """
+    vr = VideoReader(video_uri)
+    orig_fps = vr.get_avg_fps()
+    if fps > orig_fps:
+        fps = orig_fps
+    s = orig_fps / fps
+    samples = [(int(i * s), int(i * s) / orig_fps) for i in range(int(len(vr) / s))]
+    frames = vr.get_batch([s[0] for s in samples]).asnumpy()
+    return [(frames[i, :, :, :], samples[i][1]) for i in range(len(samples))]

vision_agent-0.2.125/vision_agent/utils/video.py DELETED Viewed

@@ -1,215 +0,0 @@
-import base64
-import logging
-import math
-import os
-import tempfile
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from typing import List, Tuple, cast
-import cv2
-import numpy as np
-from moviepy.video.io.VideoFileClip import VideoFileClip
-from tqdm import tqdm
-_LOGGER = logging.getLogger(__name__)
-# The maximum length of the clip to extract frames from, in seconds
-_CLIP_LENGTH = 30.0
-def play_video(video_base64: str) -> None:
-    """Play a video file"""
-    video_data = base64.b64decode(video_base64)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
-        temp_video.write(video_data)
-        temp_video_path = temp_video.name
-        cap = cv2.VideoCapture(temp_video_path)
-        if not cap.isOpened():
-            _LOGGER.error("Error: Could not open video.")
-            return
-        # Display the first frame and wait for any key press to start the video
-        ret, frame = cap.read()
-        if ret:
-            cv2.imshow("Video Player", frame)
-            _LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
-            cv2.waitKey(0)  # Wait for any key press
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-            cv2.imshow("Video Player", frame)
-            # Press 'q' to exit the video
-            if cv2.waitKey(200) & 0xFF == ord("q"):
-                break
-        cap.release()
-        cv2.destroyAllWindows()
-def extract_frames_from_video(
-    video_uri: str, fps: float = 0.5, motion_detection_threshold: float = 0.0
-) -> List[Tuple[np.ndarray, float]]:
-    """Extract frames from a video
-    Parameters:
-        video_uri: the path to the video file or a video file url
-        fps: the frame rate per second to extract the frames
-        motion_detection_threshold: The threshold to detect motion between
-            changes/frames. A value between 0-1, which represents the percentage change
-            required for the frames to be considered in motion. For example, a lower
-            value means more frames will be extracted. A non-positive value will disable
-            motion detection and extract all frames.
-    Returns:
-        a list of tuples containing the extracted frame and the timestamp in seconds.
-            E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
-            from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
-            the video. The frames are sorted by the timestamp in ascending order.
-    """
-    with VideoFileClip(video_uri) as video:
-        video_duration: float = video.duration
-        num_workers = os.cpu_count()
-        clip_length: float = min(video_duration, _CLIP_LENGTH)
-        start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length)))
-        assert start_times, f"No frames to extract from the input video: {video_uri}"
-        segment_args = [
-            {
-                "video_uri": video_uri,
-                "start": start,
-                "end": (
-                    start + clip_length if i < len(start_times) - 1 else video_duration
-                ),
-                "fps": fps,
-                "motion_detection_threshold": motion_detection_threshold,
-            }
-            for i, start in enumerate(start_times)
-        ]
-        if (
-            cast(float, segment_args[-1]["end"])
-            - cast(float, segment_args[-1]["start"])
-            < 1
-        ):
-            # If the last segment is less than 1s, merge it with the previous segment
-            # This is to avoid the failure of the last segment extraction
-            assert (
-                len(segment_args) > 1
-            ), "Development bug - Expect at least 2 segments."
-            segment_args[-2]["end"] = video_duration
-            segment_args.pop(-1)
-        _LOGGER.info(
-            f"""Created {len(segment_args)} segments from the input video  {video_uri} of length {video.duration}s, with clip size: {clip_length}s and {num_workers} workers.
-            Segments: {segment_args}
-            """
-        )
-        frames = []
-        with tqdm(total=len(segment_args)) as pbar:
-            with ProcessPoolExecutor(max_workers=num_workers) as executor:
-                futures = [
-                    executor.submit(_extract_frames_by_clip, **kwargs)  # type: ignore
-                    for kwargs in segment_args
-                ]
-                for future in as_completed(futures):
-                    result = future.result()
-                    frames.extend(result)
-                    pbar.update(1)
-        frames.sort(key=lambda x: x[1])
-        _LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}")
-        return frames
-def _extract_frames_by_clip(
-    video_uri: str,
-    start: int = 0,
-    end: float = -1,
-    fps: int = 2,
-    motion_detection_threshold: float = 0.06,
-) -> List[Tuple[np.ndarray, float]]:
-    """Extract frames from a video clip with start and end time in seconds.
-    Parameters:
-        video_uri: the path to the video file or a video file url
-        start: the start time (in seconds) of the clip to extract
-        end: the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video
-        fps: the frame rate to extract the frames
-        motion_detection_threshold: the threshold to detect the motion between frames
-    """
-    with VideoFileClip(video_uri) as video:
-        source_fps = video.fps
-        if end <= 0:
-            end = video.duration
-        _LOGGER.info(
-            f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s"
-        )
-        clip = video.subclip(start, end)
-        processable_frames = int(clip.duration * fps)
-        _LOGGER.info(
-            f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}"
-        )
-        frames = []
-        total_count, skipped_count = 0, 0
-        prev_processed_frame = None
-        pbar = tqdm(
-            total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
-        )
-        for i, frame in enumerate(clip.iter_frames(fps=fps, dtype="uint8")):
-            total_count += 1
-            pbar.update(1)
-            if motion_detection_threshold > 0:
-                curr_processed_frame = _preprocess_frame(frame)
-                # Skip the frame if it is similar to the previous one
-                if prev_processed_frame is not None and _similar_frame(
-                    prev_processed_frame,
-                    curr_processed_frame,
-                    threshold=motion_detection_threshold,
-                ):
-                    skipped_count += 1
-                    continue
-                prev_processed_frame = curr_processed_frame
-            ts = round(clip.reader.pos / source_fps, 3)
-            frames.append((frame, ts))
-        _LOGGER.info(
-            f"""Finished!
-                Frames extracted: {len(frames)}
-                Extracted frame timestamp: {[f[1] for f in frames]}
-                Total processed frames: {total_count}
-                Skipped frames:  {skipped_count}
-                Scan FPS: {fps}
-                Clip start time: {start}s, {clip.pos}
-                Clip end time: {end}s
-                Clip duration: {clip.duration}s
-                Clip total frames: {clip.duration * source_fps}
-                Video duration: {video.duration}s
-                Video FPS: {video.fps}
-                Video total frames: {video.reader.nframes}"""
-        )
-        return frames
-def _preprocess_frame(frame: np.ndarray) -> np.ndarray:
-    # Convert to grayscale
-    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-    frame = cv2.GaussianBlur(src=frame, ksize=(5, 5), sigmaX=0)
-    return frame
-def _similar_frame(
-    prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float
-) -> bool:
-    """Detect two frames are similar or not
-    Parameters:
-        threshold: similarity threshold, a value between 0-1, the percentage change that is considered a different frame.
-    """
-    # calculate difference and update previous frame TODO: don't assume the processed image is cached
-    diff_frame = cv2.absdiff(src1=prev_frame, src2=curr_frame)
-    # Only take different areas that are different enough (>20 / 255)
-    thresh_frame = cv2.threshold(
-        src=diff_frame, thresh=20, maxval=255, type=cv2.THRESH_BINARY
-    )[1]
-    change_percentage = cv2.countNonZero(thresh_frame) / (
-        curr_frame.shape[0] * curr_frame.shape[1]
-    )
-    _LOGGER.debug(f"Image diff: {change_percentage}")
-    return change_percentage < threshold