videopython 0.2.1__tar.gz → 0.3.0__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
Potentially problematic release: this version of videopython has been flagged as possibly problematic.
- {videopython-0.2.1 → videopython-0.3.0}/.gitignore +4 -1
- {videopython-0.2.1 → videopython-0.3.0}/PKG-INFO +3 -15
- {videopython-0.2.1 → videopython-0.3.0}/pyproject.toml +12 -31
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/transitions.py +2 -2
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/video.py +88 -101
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/audio.py +25 -13
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/image.py +4 -0
- {videopython-0.2.1 → videopython-0.3.0}/LICENSE +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/README.md +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/compose.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/effects.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/transforms.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/image.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/video.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/py.typed +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/common.py +0 -0
{videopython-0.2.1 → videopython-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: videopython
-Version: 0.2.1
+Version: 0.3.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://github.com/bartwojtowicz/videopython/
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -19,20 +19,8 @@ Requires-Dist: numpy>=1.25.2
 Requires-Dist: opencv-python>=4.9.0.80
 Requires-Dist: pillow>=10.3.0
 Requires-Dist: pydub>=0.25.1
+Requires-Dist: soundpython>=0.1.9
 Requires-Dist: tqdm>=4.66.3
-Provides-Extra: dev
-Requires-Dist: black==24.3.0; extra == 'dev'
-Requires-Dist: isort==5.12.0; extra == 'dev'
-Requires-Dist: mypy==1.8.0; extra == 'dev'
-Requires-Dist: pydub-stubs==0.25.1.1; extra == 'dev'
-Requires-Dist: pytest==7.4.0; extra == 'dev'
-Requires-Dist: types-pillow==10.2.0.20240213; extra == 'dev'
-Requires-Dist: types-tqdm==4.66.0.20240106; extra == 'dev'
-Provides-Extra: generation
-Requires-Dist: accelerate>=0.29.2; extra == 'generation'
-Requires-Dist: diffusers>=0.26.3; extra == 'generation'
-Requires-Dist: torch>=2.1.0; extra == 'generation'
-Requires-Dist: transformers>=4.38.1; extra == 'generation'
 Description-Content-Type: text/markdown
 
 # About
{videopython-0.2.1 → videopython-0.3.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.2.1"
+version = "0.3.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -24,18 +24,18 @@ dependencies = [
     "opencv-python>=4.9.0.80",
     "pillow>=10.3.0",
     "pydub>=0.25.1",
+    "soundpython>=0.1.9",
     "tqdm>=4.66.3",
 ]
 
-[project.optional-dependencies]
+[dependency-groups]
 dev = [
-    "black==24.3.0",
-    "isort==5.12.0",
-    "mypy==1.8.0",
-    "pytest==7.4.0",
-    "types-Pillow==10.2.0.20240213",
-    "types-tqdm==4.66.0.20240106",
-    "pydub-stubs==0.25.1.1",
+    "black>=24.3.0",
+    "isort>=5.12.0",
+    "mypy>=1.8.0",
+    "pytest>=7.4.0",
+    "types-Pillow>=10.2.0.20240213",
+    "types-tqdm>=4.66.0.20240106",
 ]
 generation = [
     "accelerate>=0.29.2",
@@ -49,25 +49,8 @@ Homepage = "https://github.com/bartwojtowicz/videopython/"
 Repository = "https://github.com/bartwojtowicz/videopython/"
 Documentation = "https://github.com/bartwojtowicz/videopython/"
 
-[tool.rye]
-
-dev-dependencies = [
-    "black==24.3.0",
-    "isort==5.12.0",
-    "mypy==1.8.0",
-    "pytest==7.4.0",
-    "types-Pillow==10.2.0.20240213",
-    "types-tqdm==4.66.0.20240106",
-    "pydub-stubs==0.25.1.1",
-]
-
-[tool.rye.scripts]
-test-unit = "pytest"
-test-type = "mypy src"
-test-static = { chain = [
-    "black src -l 120 --check",
-    "isort src --profile black --check"
-]}
+[tool.mypy]
+mypy_path = "stubs"
 
 [build-system]
 requires = ["hatchling"]
@@ -79,10 +62,8 @@ packages = ["src/videopython"]
 [tool.hatch.build.targets.sdist]
 include = ["src/videopython", "src/videopython/py.typed"]
 
-[tool.mypy]
-mypy_path = "stubs"
-
 [tool.pytest]
+pythonpath = [".src/"]
 testpaths = ["src/tests"]
 python_files = ["test_*.py"]
 addopts = "-v --tb=short"
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/transitions.py

@@ -67,7 +67,7 @@ class FadeTransition(Transition):
             ],
             fps=video_fps,
         )
-        faded_videos.audio = videos[0].audio.
+        faded_videos.audio = videos[0].audio.concat(videos[1].audio, crossfade=(effect_time_fps / video_fps))
         return faded_videos
 
 
@@ -102,5 +102,5 @@ class BlurTransition(Transition):
             ],
             fps=video_fps,
         )
-        blurred_videos.audio = videos[0].audio.
+        blurred_videos.audio = videos[0].audio.concat(videos[1].audio)
         return blurred_videos
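Both transitions now join audio through soundpython's seconds-based concat instead of pydub's millisecond arithmetic. A minimal sketch of the calls the new code relies on; the file names are placeholders, and only concat and its crossfade keyword are confirmed by the diff itself:

    from soundpython import Audio

    a = Audio.from_file("clip_a.wav")  # placeholder input
    b = Audio.from_file("clip_b.wav")  # placeholder input

    joined = a.concat(b)                # plain join, as in BlurTransition
    faded = a.concat(b, crossfade=0.5)  # 0.5 s crossfade, as in FadeTransition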
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/video.py

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import shlex
 import subprocess
 import tempfile
 from dataclasses import dataclass
@@ -9,7 +8,7 @@ from typing import Literal, get_args
 
 import cv2
 import numpy as np
-from pydub import AudioSegment
+from soundpython import Audio
 
 from videopython.utils.common import generate_random_name
 
|
|
|
42
41
|
|
|
43
42
|
@classmethod
|
|
44
43
|
def from_path(cls, video_path: str) -> VideoMetadata:
|
|
45
|
-
"""Creates VideoMetadata object from video file.
|
|
46
|
-
|
|
47
|
-
Args:
|
|
48
|
-
video_path: Path to video file.
|
|
49
|
-
"""
|
|
44
|
+
"""Creates VideoMetadata object from video file."""
|
|
50
45
|
video = cv2.VideoCapture(video_path)
|
|
51
46
|
frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
|
52
47
|
fps = round(video.get(cv2.CAP_PROP_FPS), 2)
|
|
@@ -64,13 +59,7 @@ class VideoMetadata:
 
     @classmethod
     def from_video(cls, video: Video) -> VideoMetadata:
-        """Creates VideoMetadata object from
-
-        Args:
-            frames: Frames of the video.
-            fps: Frames per second of the video.
-        """
-
+        """Creates VideoMetadata object from Video instance."""
         frame_count, height, width, _ = video.frames.shape
         total_seconds = round(frame_count / video.fps, 2)
 
@@ -116,11 +105,14 @@ class Video:
     def from_path(cls, path: str) -> Video:
         new_vid = cls()
         new_vid.frames, new_vid.fps = cls._load_video_from_path(path)
-
-
+
+        try:
+            new_vid.audio = Audio.from_file(path)
+        except Exception as e:
             print(f"No audio found for `{path}`, adding silent track!")
-            audio =
-
+            new_vid.audio = Audio.create_silent(
+                duration_seconds=round(new_vid.total_seconds, 2), stereo=True, sample_rate=44100
+            )
         return new_vid
 
     @classmethod
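The practical upshot: Video.from_path now always leaves a usable soundpython track on the instance, falling back to silence when the container has no audio. A quick sketch assuming the 0.3.0 API shown above (input.mp4 is a placeholder):

    from videopython.base.video import Video

    video = Video.from_path("input.mp4")
    # A silent 44.1 kHz stereo track is substituted if the file had no audio.
    print(video.audio.metadata.duration_seconds)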
@@ -134,7 +126,9 @@ class Video:
             raise ValueError(f"Unsupported number of dimensions: {frames.shape}!")
         new_vid.frames = frames
         new_vid.fps = fps
-        new_vid.audio =
+        new_vid.audio = Audio.create_silent(
+            duration_seconds=round(new_vid.total_seconds, 2), stereo=True, sample_rate=44100
+        )
         return new_vid
 
     @classmethod
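from_frames gets the same guarantee: the silent track's length is derived from the frame count and fps. A small illustration; the signature is inferred from copy(), which calls from_frames(self.frames.copy(), self.fps):

    import numpy as np

    frames = np.zeros((48, 720, 1280, 3), dtype=np.uint8)  # 48 black frames
    video = Video.from_frames(frames, fps=24.0)
    # 48 frames at 24 fps -> 2.0 s of video, matched by a 2.0 s silent track.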
@@ -144,12 +138,12 @@ class Video:
         image = np.expand_dims(image, axis=0)
         new_vid.frames = np.repeat(image, round(length_seconds * fps), axis=0)
         new_vid.fps = fps
-        new_vid.audio =
+        new_vid.audio = Audio.create_silent(duration_seconds=length_seconds, stereo=True, sample_rate=44100)
         return new_vid
 
     def copy(self) -> Video:
         copied = Video().from_frames(self.frames.copy(), self.fps)
-        copied.audio = self.audio
+        copied.audio = self.audio  # Audio objects are immutable, no need to copy
         return copied
 
     def is_loaded(self) -> bool:
@@ -165,25 +159,18 @@ class Video:
             self.from_frames(self.frames[:frame_idx], self.fps),
             self.from_frames(self.frames[frame_idx:], self.fps),
         )
-        audio_midpoint = (frame_idx / self.fps) * 1000
-        split_videos[0].audio = self.audio[:audio_midpoint]
-        split_videos[1].audio = self.audio[audio_midpoint:]
-        return split_videos
 
-
-
+        # Split audio at the corresponding time point
+        split_time = frame_idx / self.fps
+        split_videos[0].audio = self.audio.slice(start_seconds=0, end_seconds=split_time)
+        split_videos[1].audio = self.audio.slice(start_seconds=split_time)
 
-
-            filename: Name of the output video file. Generates random name if not provided.
-            format: Output format (default is 'mp4').
+        return split_videos
 
-
-            Path to the saved video file.
-        """
+    def save(self, filename: str | Path | None = None, format: ALLOWED_VIDEO_FORMATS = "mp4") -> Path:
         if not self.is_loaded():
             raise RuntimeError("Video is not loaded, cannot save!")
 
-        # Check if the format is allowed
         if format.lower() not in get_args(ALLOWED_VIDEO_FORMATS):
             raise ValueError(
                 f"Unsupported format: {format}. Allowed formats are: {', '.join(get_args(ALLOWED_VIDEO_FORMATS))}"
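The split logic now slices audio in seconds, keeping both halves aligned with the cut frame. A sketch; the method name and signature are an assumption inferred from the diff's split_videos and frame_idx, since the hunk starts mid-method:

    video = Video.from_path("input.mp4")  # placeholder
    first, second = video.split(frame_idx=120)  # hypothetical signature
    # frame_idx / fps seconds of audio go to `first`, the remainder to `second`.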
@@ -203,79 +190,95 @@ class Video:
                 frame_path = temp_dir_path / f"frame_{i:04d}.png"
                 cv2.imwrite(str(frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
 
-            #
+            # Calculate exact video duration
+            video_duration = len(self.frames) / self.fps
+
+            # Ensure audio duration matches video duration
+            if (
+                abs(self.audio.metadata.duration_seconds - video_duration) > 0.001
+            ):  # Small threshold for float comparison
+                if self.audio.metadata.duration_seconds < video_duration:
+                    # Create silent audio for the remaining duration
+                    remaining_duration = video_duration - self.audio.metadata.duration_seconds
+                    silent_audio = Audio.create_silent(
+                        duration_seconds=remaining_duration,
+                        stereo=(self.audio.metadata.channels == 2),
+                        sample_rate=self.audio.metadata.sample_rate,
+                        sample_width=self.audio.metadata.sample_width,
+                    )
+                    # Concatenate original audio with silent padding
+                    padded_audio = self.audio.concat(silent_audio)
+                else:
+                    # Trim audio to match video duration
+                    padded_audio = self.audio.slice(end_seconds=video_duration)
+            else:
+                padded_audio = self.audio
+
+            # Save audio to temporary WAV file
             temp_audio = temp_dir_path / "temp_audio.wav"
-
+            padded_audio.save(str(temp_audio), format="wav")
 
-            # Construct FFmpeg command
+            # Construct FFmpeg command with explicit duration
             ffmpeg_command = [
                 "ffmpeg",
-                "-y",
-                "-
-                str(self.fps),  #
+                "-y",
+                "-framerate",
+                str(self.fps),  # Use -framerate instead of -r for input
                 "-i",
-                str(temp_dir_path / "frame_%04d.png"),
+                str(temp_dir_path / "frame_%04d.png"),
                 "-i",
-                str(temp_audio),
+                str(temp_audio),
                 "-c:v",
-                "libx264",
+                "libx264",
                 "-preset",
-                "medium",
+                "medium",
                 "-crf",
-                "23",
+                "23",
                 "-c:a",
-                "
+                "aac",  # Use AAC instead of copy for more reliable audio
                 "-b:a",
-                "192k",
+                "192k",
                 "-pix_fmt",
-                "yuv420p",
-                "-
+                "yuv420p",
+                "-map",
+                "0:v:0",  # Map video from first input
+                "-map",
+                "1:a:0",  # Map audio from second input
+                "-vsync",
+                "cfr",  # Force constant frame rate
                 str(filename),
             ]
 
             try:
                 subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
-                print(f"Video saved successfully to: {filename}")
                 return filename
             except subprocess.CalledProcessError as e:
                 print(f"Error saving video: {e}")
                 print(f"FFmpeg stderr: {e.stderr}")
                 raise
 
-    def add_audio(self, audio:
-        self.audio
-
-
-
-
-
-        return
-
-        self.audio = self._process_audio(audio=new_audio, overlay=overlay, overlay_gain=overlay_gain, loop=loop)
-
-    def _process_audio(
-        self, audio: AudioSegment, overlay: bool = True, overlay_gain: int = 0, loop: bool = False
-    ) -> AudioSegment:
-        if (duration_diff := round(self.total_seconds - audio.duration_seconds)) > 0 and not loop:
-            audio = audio + AudioSegment.silent(duration_diff * 1000)
-        elif audio.duration_seconds > self.total_seconds:
-            audio = audio[: round(self.total_seconds * 1000)]
+    def add_audio(self, audio: Audio, overlay: bool = True) -> None:
+        if self.audio.is_silent:
+            self.audio = audio
+        elif overlay:
+            self.audio = self.audio.overlay(audio, position=0.0)
+        else:
+            self.audio = audio
 
-
-
-
+    def add_audio_from_file(self, path: str, overlay: bool = True) -> None:
+        try:
+            new_audio = Audio.from_file(path)
+            self.add_audio(new_audio, overlay)
+        except Exception as e:
+            print(f"Audio file `{path}` not found or invalid, skipping!")
 
     def __add__(self, other: Video) -> Video:
-        # TODO: Should it be class method? How to make it work with sum()?
         if self.fps != other.fps:
             raise ValueError("FPS of videos do not match!")
         elif self.frame_shape != other.frame_shape:
-            raise ValueError(
-                "Resolutions of the images do not match: "
-                f"{self.frame_shape} not compatible with {other.frame_shape}."
-            )
+            raise ValueError(f"Resolutions do not match: {self.frame_shape} vs {other.frame_shape}")
         new_video = self.from_frames(np.r_["0,2", self.frames, other.frames], fps=self.fps)
-        new_video.audio = self.audio
+        new_video.audio = self.audio.concat(other.audio)
        return new_video
 
     def __str__(self) -> str:
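The net effect of the save rewrite is that audio is always padded with silence or trimmed to the exact video duration before FFmpeg muxes it, and the new add_audio/add_audio_from_file pair replaces the pydub-based _process_audio. A rough usage sketch under the 0.3.0 API (file names are placeholders):

    from videopython.base.video import Video

    video = Video.from_path("input.mp4")
    video.add_audio_from_file("speech.wav")  # overlays onto the existing track by default
    saved_path = video.save("output.mp4")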
@@ -285,37 +288,25 @@ class Video:
         if not isinstance(val, slice):
             raise ValueError("Only slices are supported for video indexing!")
 
-        # Sub-slice video
+        # Sub-slice video frames
         sliced = self.from_frames(self.frames[val], fps=self.fps)
-
+
+        # Handle slicing bounds for audio
         start = val.start if val.start else 0
         stop = val.stop if val.stop else len(self.frames)
-        # Handle negative values for audio slices
         if start < 0:
             start = len(self.frames) + start
         if stop < 0:
             stop = len(self.frames) + stop
-        # Append audio to the slice
-        audio_start = round(start / self.fps) * 1000
-        audio_end = round(stop / self.fps) * 1000
-        sliced.audio = self.audio[audio_start:audio_end]
-        return sliced
 
-
-
-
-
-
-        except IndexError:
-            return None
+        # Slice audio to match video duration
+        audio_start = start / self.fps
+        audio_end = stop / self.fps
+        sliced.audio = self.audio.slice(start_seconds=audio_start, end_seconds=audio_end)
+        return sliced
 
     @staticmethod
     def _load_video_from_path(path: str) -> tuple[np.ndarray, float]:
-        """Loads frames and fps information from video file.
-
-        Args:
-            path: Path to video file.
-        """
         cap = cv2.VideoCapture(path)
         if not cap.isOpened():
             raise ValueError(f"Unable to open video file: {path}")
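With audio now sliced in seconds rather than pydub milliseconds, video[a:b] keeps frames and sound in lockstep. A small illustration under the same assumptions as above:

    video = Video.from_path("input.mp4")  # placeholder
    clip = video[30:90]  # frames 30..89; audio sliced from 30/fps s to 90/fps s
    print(clip.total_seconds, clip.audio.metadata.duration_seconds)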
@@ -339,20 +330,16 @@ class Video:
 
     @property
     def video_shape(self) -> tuple[int, int, int, int]:
-        """Returns 4D video shape."""
         return self.frames.shape
 
     @property
     def frame_shape(self) -> tuple[int, int, int]:
-        """Returns 3D frame shape."""
         return self.frames.shape[1:]
 
     @property
     def total_seconds(self) -> float:
-        """Returns total seconds of the video."""
         return round(self.frames.shape[0] / self.fps, 4)
 
     @property
     def metadata(self) -> VideoMetadata:
-        """Returns VideoMetadata object."""
         return VideoMetadata.from_video(self)
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/audio.py

@@ -1,6 +1,5 @@
-import numpy as np
 import torch
-from pydub import AudioSegment
+from soundpython import Audio, AudioMetadata
 from transformers import (
     AutoProcessor,
     AutoTokenizer,
@@ -17,15 +16,24 @@ class TextToSpeech:
         self.pipeline = VitsModel.from_pretrained(TEXT_TO_SPEECH_MODEL)
         self.tokenizer = AutoTokenizer.from_pretrained(TEXT_TO_SPEECH_MODEL)
 
-    def generate_audio(self, text: str) ->
+    def generate_audio(self, text: str) -> Audio:
         tokenized = self.tokenizer(text, return_tensors="pt")
 
         with torch.no_grad():
             output = self.pipeline(**tokenized).waveform
 
-
-
-
+        # Convert to float32 and normalize to [-1, 1]
+        audio_data = output.T.float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=self.pipeline.config.sampling_rate,
+            channels=1,
+            sample_width=4,
+            duration_seconds=len(audio_data) / self.pipeline.config.sampling_rate,
+            frame_count=len(audio_data),
+        )
+
+        return Audio(audio_data, metadata)
 
 
 class TextToMusic:
@@ -37,7 +45,7 @@ class TextToMusic:
         self.processor = AutoProcessor.from_pretrained(MUSIC_GENERATION_MODEL_SMALL)
         self.model = MusicgenForConditionalGeneration.from_pretrained(MUSIC_GENERATION_MODEL_SMALL)
 
-    def generate_audio(self, text: str, max_new_tokens: int) ->
+    def generate_audio(self, text: str, max_new_tokens: int) -> Audio:
         inputs = self.processor(
             text=[text],
             padding=True,
@@ -45,12 +53,16 @@ class TextToMusic:
         )
         audio_values = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
         sampling_rate = self.model.config.audio_encoder.sampling_rate
-        output = (audio_values[0, 0].float().numpy() * (2**31 - 1)).astype(np.int32)
 
-
-
-
+        # Convert to float32 and normalize to [-1, 1]
+        audio_data = audio_values[0, 0].float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=sampling_rate,
             channels=1,
+            sample_width=4,
+            duration_seconds=len(audio_data) / sampling_rate,
+            frame_count=len(audio_data),
         )
-
+
+        return Audio(audio_data, metadata)
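Both generators now return soundpython Audio objects, so their output can be attached to a Video directly. A sketch under the 0.3.0 API in this diff; model weights download on first use and the file names are placeholders:

    from videopython.base.video import Video
    from videopython.generation.audio import TextToSpeech

    tts = TextToSpeech()
    speech = tts.generate_audio("Hello from videopython!")

    video = Video.from_path("input.mp4")
    video.add_audio(speech)
    video.save("narrated.mp4")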
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/image.py

@@ -197,6 +197,10 @@ class ImageText:
         # Find bounding rectangle for written text
         box_slice = img[y:current_text_height, x : x + box_width]
         text_mask = np.any(box_slice != 0, axis=2).astype(np.uint8)
+        if not isinstance(text_mask, np.ndarray):
+            raise TypeError(
+                f"The returned text mask is of type {type(text_mask)}, " "but it should be numpy array!"
+            )
         xmin, xmax, ymin, ymax = self._find_smallest_bounding_rect(text_mask)
         # Get global bounding box position
         xmin += x - background_padding