videopython 0.26.4__tar.gz → 0.26.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.26.4 → videopython-0.26.5}/PKG-INFO +1 -1
  2. {videopython-0.26.4 → videopython-0.26.5}/pyproject.toml +1 -1
  3. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/dubbing/dubber.py +23 -3
  4. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/dubbing/pipeline.py +64 -19
  5. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/understanding/separation.py +27 -40
  6. {videopython-0.26.4 → videopython-0.26.5}/.gitignore +0 -0
  7. {videopython-0.26.4 → videopython-0.26.5}/LICENSE +0 -0
  8. {videopython-0.26.4 → videopython-0.26.5}/README.md +0 -0
  9. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/__init__.py +0 -0
  10. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/__init__.py +0 -0
  11. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/_device.py +0 -0
  12. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/dubbing/__init__.py +0 -0
  13. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/dubbing/models.py +0 -0
  14. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/dubbing/remux.py +0 -0
  15. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/dubbing/timing.py +0 -0
  16. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/generation/__init__.py +0 -0
  17. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/generation/audio.py +0 -0
  18. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/generation/image.py +0 -0
  19. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/generation/translation.py +0 -0
  20. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/understanding/audio.py +0 -0
  30. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/understanding/image.py +0 -0
  31. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/understanding/temporal.py +0 -0
  32. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/ai/video_analysis.py +0 -0
  33. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/description.py +0 -0
  39. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/base/video.py +0 -0
  52. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.26.4 → videopython-0.26.5}/src/videopython/py.typed +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.4
+Version: 0.26.5
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.4"
+version = "0.26.5"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
src/videopython/ai/dubbing/dubber.py
@@ -8,6 +8,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
 
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
+from videopython.ai.dubbing.pipeline import WhisperModel
 
 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -25,19 +26,38 @@ class VideoDubber:
             model is resident at a time. Trades per-run latency (~10-30s of
             extra model loads) for a much lower memory ceiling. Recommended for
             GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+        whisper_model: Whisper model size used for transcription. Larger models
+            give better accuracy at the cost of VRAM and latency. One of
+            ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+            Default ``small``.
     """
 
-    def __init__(self, device: str | None = None, low_memory: bool = False):
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
         self.device = device
         self.low_memory = low_memory
+        self.whisper_model = whisper_model
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
+        logger.info(
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            requested,
+            low_memory,
+            whisper_model,
+        )
 
     def _init_local_pipeline(self) -> None:
         from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
 
-        self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
+        self._local_pipeline = LocalDubbingPipeline(
+            device=self.device,
+            low_memory=self.low_memory,
+            whisper_model=self.whisper_model,
+        )
 
     def dub(
         self,
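As a usage sketch of the new constructor only (the import path follows the file list above; the `dub` call and its arguments are unchanged in this release and omitted here):

import videopython.ai.dubbing.dubber as dubbing

# The new knob threads from VideoDubber through LocalDubbingPipeline into
# AudioToText. "small" stays the default; "turbo" or "large" trade VRAM and
# latency for accuracy.
dubber = dubbing.VideoDubber(device="cuda", low_memory=True, whisper_model="turbo")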
src/videopython/ai/dubbing/pipeline.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
 from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -11,6 +11,8 @@ from videopython.ai.dubbing.timing import TimingSynchronizer
 if TYPE_CHECKING:
     from videopython.base.audio import Audio
 
+WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+
 logger = logging.getLogger(__name__)
 
 
@@ -23,14 +25,21 @@ class LocalDubbingPipeline:
     with <=12GB VRAM or hosts with <32GB RAM.
     """
 
-    def __init__(self, device: str | None = None, low_memory: bool = False):
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
         self.device = device
         self.low_memory = low_memory
+        self.whisper_model = whisper_model
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
             requested,
             low_memory,
+            whisper_model,
         )
 
         self._transcriber: Any = None
@@ -62,7 +71,11 @@ class LocalDubbingPipeline:
         """Initialize the transcription model."""
         from videopython.ai.understanding.audio import AudioToText
 
-        self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
+        self._transcriber = AudioToText(
+            model_name=self.whisper_model,
+            device=self.device,
+            enable_diarization=enable_diarization,
+        )
 
     def _init_translator(self) -> None:
         """Initialize the translation model."""
@@ -94,6 +107,7 @@
         max_duration: float = 10.0,
     ) -> dict[str, Any]:
         """Extract voice samples for each speaker from the audio."""
+        from videopython.base.audio import Audio
 
         voice_samples: dict[str, Audio] = {}
 
@@ -120,7 +134,11 @@
             if best_segment is not None:
                 start = best_segment.start
                 end = min(best_segment.end, start + max_duration)
-                voice_samples[speaker] = audio.slice(start, end)
+                sliced = audio.slice(start, end)
+                # Audio.slice returns a numpy view into the source. Copy so the
+                # short voice sample doesn't keep the full vocals array (~1.3 GB
+                # for 2h sources) alive across translate + TTS.
+                voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
 
         return voice_samples
 
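The copy matters because a NumPy basic slice is a view that holds a reference to its base array. A standalone sketch of the effect, independent of videopython's Audio class:

import numpy as np

full = np.zeros((10_000_000, 2), dtype=np.float32)  # stand-in for a long vocals buffer
view = full[:44_100 * 5]   # basic slicing returns a view; no data is copied
assert view.base is full   # keeping `view` keeps all of `full` alive
sample = view.copy()       # independent ~1.7 MB buffer with no base reference
del full, view             # the large buffer is now actually freeable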
@@ -175,6 +193,7 @@
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
+        background_audio: Audio | None = None
 
         if preserve_background:
             report_progress("Separating audio", 0.15)
@@ -184,12 +203,24 @@
             separated_audio = self._separator.separate(source_audio)
             self._maybe_unload("_separator")
             vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            # In low_memory mode, drop the SeparatedAudio container so vocals
+            # and background can be released as soon as their last local
+            # reference goes (after voice-sample extraction and final overlay
+            # respectively). The result will report separated_audio=None.
+            if self.low_memory:
+                separated_audio = None
 
         voice_samples: dict[str, Audio] = {}
         if voice_clone:
             report_progress("Extracting voice samples", 0.25)
             voice_samples = self._extract_voice_samples(vocal_audio, transcription)
 
+        # vocals is no longer needed; voice_samples are independent copies.
+        # In low_memory mode this is the only ref keeping the buffer alive
+        # (separated_audio was dropped above), so dropping the local frees it.
+        del vocal_audio
+
         report_progress("Translating text", 0.35)
         if self._translator is None:
             self._init_translator()
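The container-dropping above is plain CPython reference counting: once no name references a buffer, it is freed immediately rather than at function exit. A minimal sketch, with a hypothetical Holder standing in for SeparatedAudio:

import numpy as np
from dataclasses import dataclass

@dataclass
class Holder:              # stands in for SeparatedAudio
    vocals: np.ndarray
    background: np.ndarray

holder = Holder(np.zeros(10_000_000), np.zeros(10_000_000))
vocals, background = holder.vocals, holder.background
holder = None              # container dropped; arrays live only via the locals
del vocals                 # first buffer is freed here, mid-function,
                           # instead of when the frame is torn down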
@@ -237,17 +268,23 @@
         assert self._synchronizer is not None
 
         synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        del dubbed_segments
 
         report_progress("Assembling final audio", 0.90)
         total_duration = source_audio.metadata.duration_seconds
         dubbed_speech = self._synchronizer.assemble_with_timing(synchronized_segments, start_times, total_duration)
+        del synchronized_segments
 
-        if separated_audio is not None:
-            background_sr = separated_audio.background.metadata.sample_rate
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
             if dubbed_speech.metadata.sample_rate != background_sr:
                 dubbed_speech = dubbed_speech.resample(background_sr)
 
-            final_audio = separated_audio.background.overlay(dubbed_speech, position=0.0)
+            final_audio = background_audio.overlay(dubbed_speech, position=0.0)
+            # Drop the local; in low_memory this releases the background
+            # buffer (~1.3 GB for 2h sources). In non-low_memory the same
+            # array is still held by separated_audio.background.
+            del background_audio
         else:
             final_audio = dubbed_speech
 
@@ -294,6 +331,7 @@
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
+        background_audio: Audio | None = None
 
         if preserve_background:
             report_progress("Separating audio", 0.20)
@@ -303,6 +341,9 @@
             separated_audio = self._separator.separate(source_audio)
             self._maybe_unload("_separator")
             vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            if self.low_memory:
+                separated_audio = None
 
         report_progress("Extracting voice sample", 0.40)
         voice_sample: Audio | None = None
@@ -314,7 +355,11 @@
 
         if voice_sample is None:
             sample_duration = min(6.0, original_duration)
-            voice_sample = vocal_audio.slice(0, sample_duration)
+            sliced = vocal_audio.slice(0, sample_duration)
+            # Copy so the short sample doesn't pin the full vocals array.
+            voice_sample = Audio(sliced.data.copy(), sliced.metadata)
+
+        del vocal_audio
 
         report_progress("Generating speech", 0.60)
         if self._tts is None or self._tts_language != "en":
@@ -327,24 +372,24 @@
 
         report_progress("Assembling audio", 0.85)
 
-        if separated_audio is not None:
-            background_sr = separated_audio.background.metadata.sample_rate
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
             if generated_speech.metadata.sample_rate != background_sr:
                 generated_speech = generated_speech.resample(background_sr)
 
-            background = separated_audio.background
-            if background.metadata.duration_seconds > speech_duration:
-                background = background.slice(0, speech_duration)
-            elif background.metadata.duration_seconds < speech_duration:
-                silence_duration = speech_duration - background.metadata.duration_seconds
+            if background_audio.metadata.duration_seconds > speech_duration:
+                background_audio = background_audio.slice(0, speech_duration)
+            elif background_audio.metadata.duration_seconds < speech_duration:
+                silence_duration = speech_duration - background_audio.metadata.duration_seconds
                 silence = Audio.silence(
                     duration=silence_duration,
                     sample_rate=background_sr,
-                    channels=background.metadata.channels,
+                    channels=background_audio.metadata.channels,
                 )
-                background = background.concat(silence)
+                background_audio = background_audio.concat(silence)
 
-            final_audio = background.overlay(generated_speech, position=0.0)
+            final_audio = background_audio.overlay(generated_speech, position=0.0)
+            del background_audio
         else:
             final_audio = generated_speech
 
src/videopython/ai/understanding/separation.py
@@ -42,7 +42,15 @@ class AudioSeparator:
         )
 
     def _separate_local(self, audio: Audio) -> SeparatedAudio:
-        """Separate audio using local Demucs model."""
+        """Separate audio using local Demucs model.
+
+        Keeps the input tensor on CPU and passes ``device=self.device`` to
+        ``apply_model`` so per-chunk compute runs on GPU while the full
+        ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+        sources this is the difference between OOM-on-GPU and running cleanly:
+        a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+        comfortable on a 32 GB host.
+        """
         import numpy as np
         import torch
         from demucs.apply import apply_model
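The ~10 GB figure in the new docstring is straightforward to verify for a four-stem float32 output:

seconds = 2 * 60 * 60        # 2h source
sample_rate = 44_100
channels = 2                 # stereo
stems = 4                    # htdemucs: drums, bass, other, vocals
bytes_per_sample = 4         # float32
total = seconds * sample_rate * channels * stems * bytes_per_sample
print(f"{total / 1e9:.1f} GB")  # 10.2 GB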
@@ -65,61 +73,40 @@
         audio_data = audio_data.T
 
         wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-        wav = wav.to(self.device)
 
         with torch.no_grad():
             sources = apply_model(self._model, wav, device=self.device)
 
         sources_np = sources[0].cpu().numpy()
+        del sources
 
         stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+        vocals_idx = stem_names.index("vocals")
+        non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]
 
-        stems: dict[str, Audio] = {}
-        for i, name in enumerate(stem_names):
-            stem_data = sources_np[i].T
-
-            metadata = AudioMetadata(
-                sample_rate=target_sr,
-                channels=2,
-                sample_width=2,
-                duration_seconds=stem_data.shape[0] / target_sr,
-                frame_count=stem_data.shape[0],
-            )
-            stems[name] = Audio(stem_data.astype(np.float32), metadata)
-
-        vocals = stems["vocals"]
-
-        non_vocal_stems = [stems[name] for name in stem_names if name != "vocals"]
-        background_data = np.zeros_like(vocals.data)
-        for stem in non_vocal_stems:
-            background_data += stem.data
+        vocals_data = sources_np[vocals_idx].T
+        background_data = sources_np[non_vocal_indices].sum(axis=0).T
+        del sources_np
 
         max_val = np.max(np.abs(background_data))
         if max_val > 1.0:
-            background_data = background_data / max_val
-
-        background = Audio(background_data.astype(np.float32), vocals.metadata)
-
-        music_stems = ["drums", "bass", "other"]
-        if self.model_name == "htdemucs_6s":
-            music_stems.extend(["guitar", "piano"])
-
-        music_data = np.zeros_like(vocals.data)
-        for name in music_stems:
-            if name in stems:
-                music_data += stems[name].data
-
-        max_val = np.max(np.abs(music_data))
-        if max_val > 1.0:
-            music_data = music_data / max_val
-
-        music = Audio(music_data.astype(np.float32), vocals.metadata)
+            background_data /= max_val
+
+        metadata = AudioMetadata(
+            sample_rate=target_sr,
+            channels=2,
+            sample_width=2,
+            duration_seconds=vocals_data.shape[0] / target_sr,
+            frame_count=vocals_data.shape[0],
+        )
+        vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+        background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)
 
         return SeparatedAudio(
             vocals=vocals,
             background=background,
             original=audio,
-            music=music,
+            music=None,
             effects=None,
         )
 
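For reference, a standalone sketch of the CPU-input/GPU-compute pattern this hunk switches to. The model name and input shape here are illustrative; per the docstring added above, apply_model runs chunked inference on `device` while assembling the full output alongside the CPU input:

import torch
from demucs.apply import apply_model
from demucs.pretrained import get_model

model = get_model("htdemucs")          # 4 stems: drums, bass, other, vocals
wav = torch.zeros(1, 2, 44_100 * 60)   # (batch, channels, samples), kept on CPU
with torch.no_grad():
    # Each chunk is moved to `device` for inference; the assembled
    # (batch, stems, channels, samples) result stays on the input's device,
    # so GPU memory is bounded by chunk size rather than source length.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sources = apply_model(model, wav, device=device)
print(sources.shape)  # torch.Size([1, 4, 2, 2646000])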