videopython 0.26.4__tar.gz → 0.26.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.26.4 → videopython-0.26.6}/PKG-INFO +1 -1
  2. {videopython-0.26.4 → videopython-0.26.6}/pyproject.toml +1 -1
  3. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/dubber.py +40 -9
  4. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/pipeline.py +102 -20
  5. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/audio.py +42 -0
  6. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/separation.py +27 -40
  7. {videopython-0.26.4 → videopython-0.26.6}/.gitignore +0 -0
  8. {videopython-0.26.4 → videopython-0.26.6}/LICENSE +0 -0
  9. {videopython-0.26.4 → videopython-0.26.6}/README.md +0 -0
  10. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/__init__.py +0 -0
  11. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/__init__.py +0 -0
  12. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/_device.py +0 -0
  13. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/__init__.py +0 -0
  14. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/models.py +0 -0
  15. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/remux.py +0 -0
  16. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/timing.py +0 -0
  17. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/__init__.py +0 -0
  18. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/audio.py +0 -0
  19. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/translation.py +0 -0
  21. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/video.py +0 -0
  22. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/registry.py +0 -0
  23. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/__init__.py +0 -0
  24. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/inpainter.py +0 -0
  25. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/models.py +0 -0
  26. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/segmenter.py +0 -0
  27. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/swapper.py +0 -0
  28. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/transforms.py +0 -0
  29. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/__init__.py +0 -0
  30. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/image.py +0 -0
  31. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/temporal.py +0 -0
  32. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/video_analysis.py +0 -0
  33. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/description.py +0 -0
  39. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/video.py +0 -0
  52. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/py.typed +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videopython
- Version: 0.26.4
+ Version: 0.26.6
  Summary: Minimal video generation and processing library.
  Project-URL: Homepage, https://videopython.com
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "videopython"
- version = "0.26.4"
+ version = "0.26.6"
  description = "Minimal video generation and processing library."
  authors = [
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

src/videopython/ai/dubbing/dubber.py
@@ -8,6 +8,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable

  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
+ from videopython.ai.dubbing.pipeline import WhisperModel

  if TYPE_CHECKING:
  from videopython.base.video import Video
@@ -25,19 +26,38 @@ class VideoDubber:
  model is resident at a time. Trades per-run latency (~10-30s of
  extra model loads) for a much lower memory ceiling. Recommended for
  GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+ whisper_model: Whisper model size used for transcription. Larger models
+ give better accuracy at the cost of VRAM and latency. One of
+ ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+ Default ``small``.
  """

- def __init__(self, device: str | None = None, low_memory: bool = False):
+ def __init__(
+ self,
+ device: str | None = None,
+ low_memory: bool = False,
+ whisper_model: WhisperModel = "small",
+ ):
  self.device = device
  self.low_memory = low_memory
+ self.whisper_model = whisper_model
  self._local_pipeline: Any = None
  requested = device.lower() if isinstance(device, str) else "auto"
- logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
+ logger.info(
+ "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+ requested,
+ low_memory,
+ whisper_model,
+ )

  def _init_local_pipeline(self) -> None:
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline

- self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
+ self._local_pipeline = LocalDubbingPipeline(
+ device=self.device,
+ low_memory=self.low_memory,
+ whisper_model=self.whisper_model,
+ )

  def dub(
  self,
@@ -54,9 +74,14 @@ class VideoDubber:

  Args:
  enable_diarization: Enable speaker diarization to clone each speaker's
- voice separately. Requires additional VRAM for the diarization model.
- transcription: Optional pre-computed Transcription object. When provided,
- the internal Whisper transcription step is skipped.
+ voice separately. With ``transcription=None``, runs alongside Whisper.
+ With a supplied ``transcription`` that has no speakers, runs pyannote
+ standalone and overlays speakers onto the supplied words. Ignored when
+ the supplied transcription already has speaker labels.
+ transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+ step. Speaker labels on the supplied transcription drive per-speaker
+ voice cloning. If it has no speakers, pass ``enable_diarization=True``
+ to add them via pyannote (requires word-level timings).
  """
  if self._local_pipeline is None:
  self._init_local_pipeline()
@@ -86,8 +111,10 @@ class VideoDubber:
  """Dub a video and return a new video with the dubbed audio.

  Args:
- transcription: Optional pre-computed Transcription object. When provided,
- the internal Whisper transcription step is skipped.
+ transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+ step. Speaker labels on the supplied transcription drive per-speaker
+ voice cloning. See ``dub()`` for the interaction with
+ ``enable_diarization``.
  """
  result = self.dub(
  video=video,
@@ -132,8 +159,12 @@ class VideoDubber:
  preserve_background: Preserve background music/effects via source separation.
  voice_clone: Clone the source speaker's voice for the dubbed track.
  enable_diarization: Enable speaker diarization for per-speaker voice cloning.
+ See ``dub()`` for the interaction with ``transcription``.
  progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
- transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.
+ transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+ step. Speaker labels on the supplied transcription drive per-speaker
+ voice cloning. If it has no speakers, pass ``enable_diarization=True``
+ to add them via pyannote (requires word-level timings).

  Returns:
  ``DubbingResult`` with the dubbed audio, translated segments, and
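
Taken together, the dubber.py changes surface the Whisper size and the diarization behaviour directly on VideoDubber. A minimal sketch of how the new parameters combine, using only arguments visible in this diff; the video object, the device string, and any further required dub() arguments (such as the target language) are assumptions and are omitted or marked as such:

    from videopython.ai.dubbing.dubber import VideoDubber

    # `video` is assumed to be an already-loaded videopython Video; other required
    # dub() arguments (e.g. the target language) are not shown in this diff and
    # are omitted here.
    dubber = VideoDubber(
        device="cuda",           # assumed device string; None auto-selects
        low_memory=True,         # keep only one model resident at a time
        whisper_model="medium",  # new in 0.26.6, default is "small"
    )

    # No transcription supplied: Whisper runs, diarization runs alongside it.
    result = dubber.dub(video=video, enable_diarization=True)

    # Pre-computed transcription without speakers: pyannote runs standalone and
    # overlays speakers onto the supplied words (requires word-level timings).
    result = dubber.dub(video=video, transcription=my_transcription, enable_diarization=True)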

src/videopython/ai/dubbing/pipeline.py
@@ -3,7 +3,7 @@
  from __future__ import annotations

  import logging
- from typing import TYPE_CHECKING, Any, Callable
+ from typing import TYPE_CHECKING, Any, Callable, Literal

  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
  from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -11,6 +11,8 @@ from videopython.ai.dubbing.timing import TimingSynchronizer
  if TYPE_CHECKING:
  from videopython.base.audio import Audio

+ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+
  logger = logging.getLogger(__name__)

@@ -23,14 +25,21 @@ class LocalDubbingPipeline:
  with <=12GB VRAM or hosts with <32GB RAM.
  """

- def __init__(self, device: str | None = None, low_memory: bool = False):
+ def __init__(
+ self,
+ device: str | None = None,
+ low_memory: bool = False,
+ whisper_model: WhisperModel = "small",
+ ):
  self.device = device
  self.low_memory = low_memory
+ self.whisper_model = whisper_model
  requested = device.lower() if isinstance(device, str) else "auto"
  logger.info(
- "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+ "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
  requested,
  low_memory,
+ whisper_model,
  )

  self._transcriber: Any = None
@@ -62,7 +71,11 @@ class LocalDubbingPipeline:
  """Initialize the transcription model."""
  from videopython.ai.understanding.audio import AudioToText

- self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
+ self._transcriber = AudioToText(
+ model_name=self.whisper_model,
+ device=self.device,
+ enable_diarization=enable_diarization,
+ )

  def _init_translator(self) -> None:
  """Initialize the translation model."""
@@ -94,6 +107,7 @@ class LocalDubbingPipeline:
  max_duration: float = 10.0,
  ) -> dict[str, Any]:
  """Extract voice samples for each speaker from the audio."""
+ from videopython.base.audio import Audio

  voice_samples: dict[str, Audio] = {}

@@ -120,7 +134,11 @@ class LocalDubbingPipeline:
  if best_segment is not None:
  start = best_segment.start
  end = min(best_segment.end, start + max_duration)
- voice_samples[speaker] = audio.slice(start, end)
+ sliced = audio.slice(start, end)
+ # Audio.slice returns a numpy view into the source. Copy so the
+ # short voice sample doesn't keep the full vocals array (~1.3 GB
+ # for 2h sources) alive across translate + TTS.
+ voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)

  return voice_samples

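The copy added above matters because Audio.slice is described here as returning a NumPy view, and a view keeps its whole base array alive. A small, self-contained illustration of that behaviour in plain NumPy (independent of videopython):

    import numpy as np

    full = np.zeros(300_000_000, dtype=np.float32)  # ~1.2 GB, stand-in for a long vocals track
    sample_view = full[:100_000]                    # basic slicing returns a view
    sample_copy = full[:100_000].copy()             # independent ~0.4 MB buffer

    print(sample_view.base is full)   # True: holding the view pins the full ~1.2 GB
    print(sample_copy.base is None)   # True: only the small buffer stays alive
    del full                          # the big buffer is freed only once no view remains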
@@ -144,7 +162,16 @@ class LocalDubbingPipeline:
  transcription: Optional pre-computed Transcription object. When provided,
  the internal Whisper transcription step is skipped (saving time and VRAM).
  Must be a ``videopython.base.text.transcription.Transcription`` instance
- with populated ``segments``.
+ with populated ``segments``. Speaker labels on the supplied transcription
+ drive per-speaker voice cloning. If the supplied transcription has no
+ speakers and ``enable_diarization=True``, pyannote is run standalone on
+ ``source_audio`` and speakers are attached to the supplied words
+ (requires word-level timings).
+ enable_diarization: When True, run speaker diarization to enable per-speaker
+ voice cloning. With ``transcription=None``, runs alongside Whisper. With
+ a supplied ``transcription`` that has no speakers, runs pyannote
+ standalone and overlays speakers onto the supplied words. Ignored when
+ the supplied transcription already has speaker labels.
  """

  def report_progress(stage: str, progress: float) -> None:
@@ -153,6 +180,34 @@ class LocalDubbingPipeline:

  if transcription is not None:
  report_progress("Using provided transcription", 0.05)
+ if transcription.speakers:
+ logger.info(
+ "Using provided transcription: %d segment(s), %d speaker(s)",
+ len(transcription.segments),
+ len(transcription.speakers),
+ )
+ if enable_diarization:
+ logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
+ elif enable_diarization:
+ report_progress("Diarizing supplied transcription", 0.10)
+ if self._transcriber is None or self._transcriber_diarization is not True:
+ self._init_transcriber(enable_diarization=True)
+ self._transcriber_diarization = True
+ transcription = self._transcriber.diarize_transcription(source_audio, transcription)
+ self._maybe_unload("_transcriber")
+ logger.info(
+ "Diarized supplied transcription: %d segment(s), %d speaker(s)",
+ len(transcription.segments),
+ len(transcription.speakers),
+ )
+ else:
+ logger.info(
+ "Using provided transcription: %d segment(s), no speaker labels. "
+ "All segments will share a single voice clone. Pass "
+ "enable_diarization=True to add per-speaker labels, or "
+ "voice_clone=False to use the default TTS voice.",
+ len(transcription.segments),
+ )
  else:
  report_progress("Transcribing audio", 0.05)
@@ -175,6 +230,7 @@ class LocalDubbingPipeline:

  separated_audio: SeparatedAudio | None = None
  vocal_audio = source_audio
+ background_audio: Audio | None = None

  if preserve_background:
  report_progress("Separating audio", 0.15)
@@ -184,12 +240,24 @@ class LocalDubbingPipeline:
  separated_audio = self._separator.separate(source_audio)
  self._maybe_unload("_separator")
  vocal_audio = separated_audio.vocals
+ background_audio = separated_audio.background
+ # In low_memory mode, drop the SeparatedAudio container so vocals
+ # and background can be released as soon as their last local
+ # reference goes (after voice-sample extraction and final overlay
+ # respectively). The result will report separated_audio=None.
+ if self.low_memory:
+ separated_audio = None

  voice_samples: dict[str, Audio] = {}
  if voice_clone:
  report_progress("Extracting voice samples", 0.25)
  voice_samples = self._extract_voice_samples(vocal_audio, transcription)

+ # vocals is no longer needed; voice_samples are independent copies.
+ # In low_memory mode this is the only ref keeping the buffer alive
+ # (separated_audio was dropped above), so dropping the local frees it.
+ del vocal_audio
+
  report_progress("Translating text", 0.35)
  if self._translator is None:
  self._init_translator()
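
The new del statements lean on CPython reference counting: a NumPy buffer is released as soon as its last reference disappears, so dropping the one remaining local is enough and no explicit garbage-collection call is needed. A minimal illustration of that behaviour outside the pipeline:

    import sys
    import numpy as np

    vocals = np.zeros(100_000_000, dtype=np.float32)  # ~400 MB stand-in for the vocals stem
    container = {"vocals": vocals}                     # plays the role of SeparatedAudio

    container = None                    # analogue of `separated_audio = None` in low_memory mode
    print(sys.getrefcount(vocals) - 1)  # 1: only the local name still holds the buffer
    del vocals                          # last reference gone, the ~400 MB is freed immediately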
@@ -237,17 +305,23 @@ class LocalDubbingPipeline:
  assert self._synchronizer is not None

  synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+ del dubbed_segments

  report_progress("Assembling final audio", 0.90)
  total_duration = source_audio.metadata.duration_seconds
  dubbed_speech = self._synchronizer.assemble_with_timing(synchronized_segments, start_times, total_duration)
+ del synchronized_segments

- if separated_audio is not None:
- background_sr = separated_audio.background.metadata.sample_rate
+ if background_audio is not None:
+ background_sr = background_audio.metadata.sample_rate
  if dubbed_speech.metadata.sample_rate != background_sr:
  dubbed_speech = dubbed_speech.resample(background_sr)

- final_audio = separated_audio.background.overlay(dubbed_speech, position=0.0)
+ final_audio = background_audio.overlay(dubbed_speech, position=0.0)
+ # Drop the local; in low_memory this releases the background
+ # buffer (~1.3 GB for 2h sources). In non-low_memory the same
+ # array is still held by separated_audio.background.
+ del background_audio
  else:
  final_audio = dubbed_speech

@@ -294,6 +368,7 @@ class LocalDubbingPipeline:

  separated_audio: SeparatedAudio | None = None
  vocal_audio = source_audio
+ background_audio: Audio | None = None

  if preserve_background:
  report_progress("Separating audio", 0.20)
@@ -303,6 +378,9 @@ class LocalDubbingPipeline:
  separated_audio = self._separator.separate(source_audio)
  self._maybe_unload("_separator")
  vocal_audio = separated_audio.vocals
+ background_audio = separated_audio.background
+ if self.low_memory:
+ separated_audio = None

  report_progress("Extracting voice sample", 0.40)
  voice_sample: Audio | None = None
@@ -314,7 +392,11 @@ class LocalDubbingPipeline:

  if voice_sample is None:
  sample_duration = min(6.0, original_duration)
- voice_sample = vocal_audio.slice(0, sample_duration)
+ sliced = vocal_audio.slice(0, sample_duration)
+ # Copy so the short sample doesn't pin the full vocals array.
+ voice_sample = Audio(sliced.data.copy(), sliced.metadata)
+
+ del vocal_audio

  report_progress("Generating speech", 0.60)
  if self._tts is None or self._tts_language != "en":
@@ -327,24 +409,24 @@ class LocalDubbingPipeline:

  report_progress("Assembling audio", 0.85)

- if separated_audio is not None:
- background_sr = separated_audio.background.metadata.sample_rate
+ if background_audio is not None:
+ background_sr = background_audio.metadata.sample_rate
  if generated_speech.metadata.sample_rate != background_sr:
  generated_speech = generated_speech.resample(background_sr)

- background = separated_audio.background
- if background.metadata.duration_seconds > speech_duration:
- background = background.slice(0, speech_duration)
- elif background.metadata.duration_seconds < speech_duration:
- silence_duration = speech_duration - background.metadata.duration_seconds
+ if background_audio.metadata.duration_seconds > speech_duration:
+ background_audio = background_audio.slice(0, speech_duration)
+ elif background_audio.metadata.duration_seconds < speech_duration:
+ silence_duration = speech_duration - background_audio.metadata.duration_seconds
  silence = Audio.silence(
  duration=silence_duration,
  sample_rate=background_sr,
- channels=background.metadata.channels,
+ channels=background_audio.metadata.channels,
  )
- background = background.concat(silence)
+ background_audio = background_audio.concat(silence)

- final_audio = background.overlay(generated_speech, position=0.0)
+ final_audio = background_audio.overlay(generated_speech, position=0.0)
+ del background_audio
  else:
  final_audio = generated_speech

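Both assembly paths resample the synthetic speech to the background's sample rate before overlaying; mixing tracks with mismatched rates would silently change durations. A quick numeric check of why, with an assumed 24 kHz TTS output against a 44.1 kHz background (the TTS rate is an assumption, not taken from this diff):

    sample_rate_tts, sample_rate_bg = 24_000, 44_100
    seconds = 10
    frames_tts = sample_rate_tts * seconds
    # Interpreted at 44.1 kHz without resampling, those frames would play for only:
    print(frames_tts / sample_rate_bg)  # ~5.44 s instead of 10 s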

src/videopython/ai/understanding/audio.py
@@ -130,6 +130,48 @@ class AudioToText:
  )
  return result

+ def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
+ """Attach speaker labels to a pre-computed transcription using pyannote.
+
+ Useful when callers have a transcription (e.g. pre-computed and edited)
+ but no speakers, and want per-speaker voice cloning in dubbing without
+ re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
+ speakers onto the supplied transcription's words.
+
+ Requires word-level timings: at least one segment must contain more
+ than one word. Transcriptions loaded from SRT (one synthetic word per
+ segment) will not produce useful speakers and are rejected.
+ """
+ import numpy as np
+ import torch
+
+ all_words: list[TranscriptionWord] = list(transcription.words)
+ if not all_words:
+ raise ValueError("Cannot diarize a transcription with no words.")
+
+ if not any(len(seg.words) > 1 for seg in transcription.segments):
+ raise ValueError(
+ "Cannot diarize a transcription without word-level timings. "
+ "Supplied transcription has at most one word per segment "
+ "(e.g. loaded from SRT). Provide a transcription with "
+ "word-level timings, or omit `transcription` to let the "
+ "pipeline transcribe and diarize from scratch."
+ )
+
+ if self._diarization_pipeline is None:
+ self._init_diarization()
+
+ import whisper
+
+ audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+ waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
+ diarization_result = self._diarization_pipeline(
+ {"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
+ )
+
+ all_words = self._assign_speakers_to_words(all_words, diarization_result)
+ return Transcription(words=all_words, language=transcription.language)
+
  def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
  """Transcribe with word timestamps and assign speakers via pyannote."""
  import numpy as np
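
The new method is also usable outside the dubbing pipeline. A sketch under the assumption that `audio` is a videopython Audio object and `transcription` was produced earlier with word-level timings (neither is shown in this diff):

    from videopython.ai.understanding.audio import AudioToText

    transcriber = AudioToText(model_name="small", enable_diarization=True)

    # `audio` and `transcription` are assumed to exist already; the transcription
    # must carry word-level timings (SRT-derived transcriptions are rejected).
    with_speakers = transcriber.diarize_transcription(audio, transcription)
    print(with_speakers.speakers)  # e.g. speaker labels such as SPEAKER_00, SPEAKER_01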

src/videopython/ai/understanding/separation.py
@@ -42,7 +42,15 @@ class AudioSeparator:
  )

  def _separate_local(self, audio: Audio) -> SeparatedAudio:
- """Separate audio using local Demucs model."""
+ """Separate audio using local Demucs model.
+
+ Keeps the input tensor on CPU and passes ``device=self.device`` to
+ ``apply_model`` so per-chunk compute runs on GPU while the full
+ ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+ sources this is the difference between OOM-on-GPU and running cleanly:
+ a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+ comfortable on a 32 GB host.
+ """
  import numpy as np
  import torch
  from demucs.apply import apply_model
@@ -65,61 +73,40 @@ class AudioSeparator:
  audio_data = audio_data.T

  wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
- wav = wav.to(self.device)

  with torch.no_grad():
  sources = apply_model(self._model, wav, device=self.device)

  sources_np = sources[0].cpu().numpy()
+ del sources

  stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+ vocals_idx = stem_names.index("vocals")
+ non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]

- stems: dict[str, Audio] = {}
- for i, name in enumerate(stem_names):
- stem_data = sources_np[i].T
-
- metadata = AudioMetadata(
- sample_rate=target_sr,
- channels=2,
- sample_width=2,
- duration_seconds=stem_data.shape[0] / target_sr,
- frame_count=stem_data.shape[0],
- )
- stems[name] = Audio(stem_data.astype(np.float32), metadata)
-
- vocals = stems["vocals"]
-
- non_vocal_stems = [stems[name] for name in stem_names if name != "vocals"]
- background_data = np.zeros_like(vocals.data)
- for stem in non_vocal_stems:
- background_data += stem.data
+ vocals_data = sources_np[vocals_idx].T
+ background_data = sources_np[non_vocal_indices].sum(axis=0).T
+ del sources_np

  max_val = np.max(np.abs(background_data))
  if max_val > 1.0:
- background_data = background_data / max_val
-
- background = Audio(background_data.astype(np.float32), vocals.metadata)
-
- music_stems = ["drums", "bass", "other"]
- if self.model_name == "htdemucs_6s":
- music_stems.extend(["guitar", "piano"])
-
- music_data = np.zeros_like(vocals.data)
- for name in music_stems:
- if name in stems:
- music_data += stems[name].data
-
- max_val = np.max(np.abs(music_data))
- if max_val > 1.0:
- music_data = music_data / max_val
-
- music = Audio(music_data.astype(np.float32), vocals.metadata)
+ background_data /= max_val
+
+ metadata = AudioMetadata(
+ sample_rate=target_sr,
+ channels=2,
+ sample_width=2,
+ duration_seconds=vocals_data.shape[0] / target_sr,
+ frame_count=vocals_data.shape[0],
+ )
+ vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+ background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)

  return SeparatedAudio(
  vocals=vocals,
  background=background,
  original=audio,
- music=music,
+ music=None,
  effects=None,
  )

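The ~10 GB figure in the new _separate_local docstring follows directly from the output shape kept in CPU RAM. A back-of-the-envelope check, assuming the default 4-stem htdemucs model and Demucs' native 44.1 kHz stereo float32 output:

    # (stems, channels, samples) float32 output for a 2-hour source
    stems, channels, sample_rate, seconds, bytes_per_sample = 4, 2, 44_100, 2 * 3600, 4
    total_bytes = stems * channels * sample_rate * seconds * bytes_per_sample
    print(total_bytes / 1024**3)  # ~9.5 GiB (~10.2 GB): fits a 32 GB host, not an 8 GB GPU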