PyPI - videopython - Versions diffs - 0.26.5__tar.gz → 0.26.6__tar.gz - Mend

videopython 0.26.5.tar.gz → 0.26.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)

{videopython-0.26.5 → videopython-0.26.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.5
+Version: 0.26.6
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.26.5 → videopython-0.26.6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.5"
+version = "0.26.6"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/dubber.py RENAMED Viewed

@@ -74,9 +74,14 @@ class VideoDubber:
         Args:
             enable_diarization: Enable speaker diarization to clone each speaker's
-                voice separately. Requires additional VRAM for the diarization model.
-            transcription: Optional pre-computed Transcription object. When provided,
-                the internal Whisper transcription step is skipped.
+                voice separately. With ``transcription=None``, runs alongside Whisper.
+                With a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
         """
         if self._local_pipeline is None:
             self._init_local_pipeline()
@@ -106,8 +111,10 @@ class VideoDubber:
         """Dub a video and return a new video with the dubbed audio.
         Args:
-            transcription: Optional pre-computed Transcription object. When provided,
-                the internal Whisper transcription step is skipped.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. See ``dub()`` for the interaction with
+                ``enable_diarization``.
         """
         result = self.dub(
             video=video,
@@ -152,8 +159,12 @@ class VideoDubber:
             preserve_background: Preserve background music/effects via source separation.
             voice_clone: Clone the source speaker's voice for the dubbed track.
             enable_diarization: Enable speaker diarization for per-speaker voice cloning.
+                See ``dub()`` for the interaction with ``transcription``.
             progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
-            transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
         Returns:
             ``DubbingResult`` with the dubbed audio, translated segments, and

{videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/pipeline.py RENAMED Viewed

@@ -162,7 +162,16 @@ class LocalDubbingPipeline:
             transcription: Optional pre-computed Transcription object. When provided,
                 the internal Whisper transcription step is skipped (saving time and VRAM).
                 Must be a ``videopython.base.text.transcription.Transcription`` instance
-                with populated ``segments``.
+                with populated ``segments``. Speaker labels on the supplied transcription
+                drive per-speaker voice cloning. If the supplied transcription has no
+                speakers and ``enable_diarization=True``, pyannote is run standalone on
+                ``source_audio`` and speakers are attached to the supplied words
+                (requires word-level timings).
+            enable_diarization: When True, run speaker diarization to enable per-speaker
+                voice cloning. With ``transcription=None``, runs alongside Whisper. With
+                a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
         """
         def report_progress(stage: str, progress: float) -> None:
@@ -171,6 +180,34 @@ class LocalDubbingPipeline:
         if transcription is not None:
             report_progress("Using provided transcription", 0.05)
+            if transcription.speakers:
+                logger.info(
+                    "Using provided transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+                if enable_diarization:
+                    logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
+            elif enable_diarization:
+                report_progress("Diarizing supplied transcription", 0.10)
+                if self._transcriber is None or self._transcriber_diarization is not True:
+                    self._init_transcriber(enable_diarization=True)
+                    self._transcriber_diarization = True
+                transcription = self._transcriber.diarize_transcription(source_audio, transcription)
+                self._maybe_unload("_transcriber")
+                logger.info(
+                    "Diarized supplied transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+            else:
+                logger.info(
+                    "Using provided transcription: %d segment(s), no speaker labels. "
+                    "All segments will share a single voice clone. Pass "
+                    "enable_diarization=True to add per-speaker labels, or "
+                    "voice_clone=False to use the default TTS voice.",
+                    len(transcription.segments),
+                )
         else:
             report_progress("Transcribing audio", 0.05)
             if self._transcriber is None or self._transcriber_diarization != enable_diarization:

{videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/audio.py RENAMED Viewed

@@ -130,6 +130,48 @@ class AudioToText:
             )
         return result
+    def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
+        """Attach speaker labels to a pre-computed transcription using pyannote.
+        Useful when callers have a transcription (e.g. pre-computed and edited)
+        but no speakers, and want per-speaker voice cloning in dubbing without
+        re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
+        speakers onto the supplied transcription's words.
+        Requires word-level timings: at least one segment must contain more
+        than one word. Transcriptions loaded from SRT (one synthetic word per
+        segment) will not produce useful speakers and are rejected.
+        """
+        import numpy as np
+        import torch
+        all_words: list[TranscriptionWord] = list(transcription.words)
+        if not all_words:
+            raise ValueError("Cannot diarize a transcription with no words.")
+        if not any(len(seg.words) > 1 for seg in transcription.segments):
+            raise ValueError(
+                "Cannot diarize a transcription without word-level timings. "
+                "Supplied transcription has at most one word per segment "
+                "(e.g. loaded from SRT). Provide a transcription with "
+                "word-level timings, or omit `transcription` to let the "
+                "pipeline transcribe and diarize from scratch."
+            )
+        if self._diarization_pipeline is None:
+            self._init_diarization()
+        import whisper
+        audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+        waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
+        diarization_result = self._diarization_pipeline(
+            {"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
+        )
+        all_words = self._assign_speakers_to_words(all_words, diarization_result)
+        return Transcription(words=all_words, language=transcription.language)
     def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
         """Transcribe with word timestamps and assign speakers via pyannote."""
         import numpy as np