videopython 0.26.0__tar.gz → 0.26.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.0 → videopython-0.26.2}/PKG-INFO +1 -1
- {videopython-0.26.0 → videopython-0.26.2}/pyproject.toml +1 -1
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/_device.py +27 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/dubber.py +27 -6
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/pipeline.py +58 -10
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/audio.py +11 -1
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/translation.py +23 -1
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/audio.py +10 -1
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/separation.py +9 -1
- {videopython-0.26.0 → videopython-0.26.2}/.gitignore +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/LICENSE +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/README.md +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/description.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/video.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.0 → videopython-0.26.2}/src/videopython/py.typed +0 -0

src/videopython/ai/_device.py
@@ -25,6 +25,33 @@ def log_device_initialization(
     )
 
 
+def release_device_memory(device: str | None) -> None:
+    """Release cached allocator memory for the given device.
+
+    Safe to call when torch is not importable or the device is CPU/None.
+    """
+    try:
+        import torch
+    except ImportError:
+        return
+
+    import gc
+
+    gc.collect()
+
+    if device == "cuda" and torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        return
+
+    if device == "mps":
+        mps_backend = getattr(torch.backends, "mps", None)
+        if mps_backend is not None and mps_backend.is_available():
+            mps_mod = getattr(torch, "mps", None)
+            empty_cache = getattr(mps_mod, "empty_cache", None) if mps_mod is not None else None
+            if callable(empty_cache):
+                empty_cache()
+
+
 def select_device(
     device: str | None,
     *,
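
The helper is meant to be called after a component has dropped its model references; a minimal sketch of that call pattern (the surrounding variable names are illustrative, only release_device_memory itself comes from this diff):

    from videopython.ai._device import release_device_memory

    model = None                     # drop the last strong reference to the model first
    release_device_memory("cuda")    # gc.collect(), then torch.cuda.empty_cache() if CUDA is available
    release_device_memory("mps")     # uses torch.mps.empty_cache() when the MPS backend exposes it
    release_device_memory(None)      # CPU/None, or torch missing: safe no-op beyond garbage collection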

src/videopython/ai/dubbing/dubber.py
@@ -14,18 +14,28 @@ logger = logging.getLogger(__name__)
 
 
 class VideoDubber:
-    """Dubs videos into different languages using the local pipeline."""
-
-    def __init__(self, device: str | None = None):
+    """Dubs videos into different languages using the local pipeline.
+
+    Args:
+        device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
+        low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
+            Chatterbox TTS) is unloaded from memory after it runs, so only one
+            model is resident at a time. Trades per-run latency (~10-30s of
+            extra model loads) for a much lower memory ceiling. Recommended for
+            GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+    """
+
+    def __init__(self, device: str | None = None, low_memory: bool = False):
         self.device = device
+        self.low_memory = low_memory
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info("VideoDubber initialized with device=%s", requested)
+        logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
 
     def _init_local_pipeline(self) -> None:
         from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
 
-        self._local_pipeline = LocalDubbingPipeline(device=self.device)
+        self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
 
     def dub(
         self,
@@ -36,12 +46,15 @@ class VideoDubber:
         voice_clone: bool = True,
         enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
+        transcription: Any = None,
     ) -> DubbingResult:
         """Dub a video into a target language.
 
         Args:
             enable_diarization: Enable speaker diarization to clone each speaker's
                 voice separately. Requires additional VRAM for the diarization model.
+            transcription: Optional pre-computed Transcription object. When provided,
+                the internal Whisper transcription step is skipped.
         """
         if self._local_pipeline is None:
             self._init_local_pipeline()
@@ -54,6 +67,7 @@ class VideoDubber:
             voice_clone=voice_clone,
             enable_diarization=enable_diarization,
             progress_callback=progress_callback,
+            transcription=transcription,
         )
 
     def dub_and_replace(
@@ -65,8 +79,14 @@ class VideoDubber:
         voice_clone: bool = True,
         enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
+        transcription: Any = None,
     ) -> Video:
-        """Dub a video and return a new video with the dubbed audio."""
+        """Dub a video and return a new video with the dubbed audio.
+
+        Args:
+            transcription: Optional pre-computed Transcription object. When provided,
+                the internal Whisper transcription step is skipped.
+        """
         result = self.dub(
             video=video,
             target_lang=target_lang,
@@ -75,6 +95,7 @@ class VideoDubber:
             voice_clone=voice_clone,
             enable_diarization=enable_diarization,
             progress_callback=progress_callback,
+            transcription=transcription,
         )
         return video.add_audio(result.dubbed_audio, overlay=False)
 
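
Taken together, these changes allow transcribing once and reusing the result across several target languages while keeping only one model resident. A rough usage sketch; Video.from_path, save, and the AudioToText constructor call are assumptions not shown in this diff, while VideoDubber(low_memory=...), dub_and_replace(transcription=...), transcribe(), and video.audio are:

    from videopython.ai.dubbing.dubber import VideoDubber
    from videopython.ai.understanding.audio import AudioToText
    from videopython.base.video import Video

    video = Video.from_path("talk.mp4")                      # assumed loader helper

    # Run Whisper once and reuse the transcription for every target language.
    transcription = AudioToText().transcribe(video.audio)

    dubber = VideoDubber(device="cuda", low_memory=True)     # one model resident at a time
    for lang in ("de", "pt"):
        dubbed = dubber.dub_and_replace(
            video=video,
            target_lang=lang,
            transcription=transcription,                     # skips the internal Whisper pass
        )
        dubbed.save(f"talk_{lang}.mp4")                      # assumed writer helper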

src/videopython/ai/dubbing/pipeline.py
@@ -15,12 +15,23 @@ logger = logging.getLogger(__name__)
 
 
 class LocalDubbingPipeline:
-    """Local pipeline for video dubbing."""
+    """Local pipeline for video dubbing.
 
-    def __init__(self, device: str | None = None):
+    When ``low_memory=True``, each stage's model is unloaded after it runs, so
+    only one model is resident at a time. This trades per-run latency (models
+    re-load from disk between stages) for peak memory. Recommended for GPUs
+    with <=12GB VRAM or hosts with <32GB RAM.
+    """
+
+    def __init__(self, device: str | None = None, low_memory: bool = False):
         self.device = device
+        self.low_memory = low_memory
         requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info("LocalDubbingPipeline initialized with device=%s", requested)
+        logger.info(
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+            requested,
+            low_memory,
+        )
 
         self._transcriber: Any = None
         self._transcriber_diarization: bool | None = None
@@ -31,6 +42,23 @@ class LocalDubbingPipeline:
         self._separator: Any = None
         self._synchronizer: TimingSynchronizer | None = None
 
+    def _maybe_unload(self, component_name: str) -> None:
+        """Unload a stage's model when low_memory mode is enabled.
+
+        No-op when low_memory=False or the component was never initialized
+        (e.g. caller supplied a pre-computed transcription so the transcriber
+        was skipped).
+        """
+        if not self.low_memory:
+            return
+        component = getattr(self, component_name, None)
+        if component is None:
+            return
+        unload = getattr(component, "unload", None)
+        if callable(unload):
+            logger.info("low_memory: unloading %s", component_name.lstrip("_"))
+            unload()
+
     def _init_transcriber(self, enable_diarization: bool = False) -> None:
         """Initialize the transcription model."""
         from videopython.ai.understanding.audio import AudioToText
@@ -114,21 +142,34 @@ class LocalDubbingPipeline:
         voice_clone: bool = True,
         enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
+        transcription: Any | None = None,
     ) -> DubbingResult:
-        """Process a video through the local dubbing pipeline."""
+        """Process a video through the local dubbing pipeline.
+
+        Args:
+            transcription: Optional pre-computed Transcription object. When provided,
+                the internal Whisper transcription step is skipped (saving time and VRAM).
+                Must be a ``videopython.base.text.transcription.Transcription`` instance
+                with populated ``segments``.
+        """
         from videopython.base.audio import Audio
 
         def report_progress(stage: str, progress: float) -> None:
             if progress_callback:
                 progress_callback(stage, progress)
 
-        report_progress("Transcribing audio", 0.05)
-        if self._transcriber is None or self._transcriber_diarization != enable_diarization:
-            self._init_transcriber(enable_diarization=enable_diarization)
-            self._transcriber_diarization = enable_diarization
-
         source_audio = video.audio
-        transcription = self._transcriber.transcribe(source_audio)
+
+        if transcription is not None:
+            report_progress("Using provided transcription", 0.05)
+        else:
+            report_progress("Transcribing audio", 0.05)
+            if self._transcriber is None or self._transcriber_diarization != enable_diarization:
+                self._init_transcriber(enable_diarization=enable_diarization)
+                self._transcriber_diarization = enable_diarization
+
+            transcription = self._transcriber.transcribe(source_audio)
+            self._maybe_unload("_transcriber")
 
         if not transcription.segments:
             return DubbingResult(
@@ -150,6 +191,7 @@ class LocalDubbingPipeline:
             self._init_separator()
 
         separated_audio = self._separator.separate(source_audio)
+        self._maybe_unload("_separator")
         vocal_audio = separated_audio.vocals
 
         voice_samples: dict[str, Audio] = {}
@@ -166,6 +208,7 @@ class LocalDubbingPipeline:
             target_lang=target_lang,
             source_lang=detected_lang,
         )
+        self._maybe_unload("_translator")
 
         report_progress("Generating dubbed speech", 0.50)
         if self._tts is None or self._tts_voice_clone != voice_clone or self._tts_language != target_lang:
@@ -196,6 +239,8 @@ class LocalDubbingPipeline:
             target_durations.append(segment.duration)
             start_times.append(segment.start)
 
+        self._maybe_unload("_tts")
+
         report_progress("Synchronizing timing", 0.85)
         if self._synchronizer is None:
             self._init_synchronizer()
@@ -251,6 +296,7 @@ class LocalDubbingPipeline:
             self._transcriber_diarization = False
 
         transcription = self._transcriber.transcribe(source_audio)
+        self._maybe_unload("_transcriber")
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
@@ -261,6 +307,7 @@ class LocalDubbingPipeline:
             self._init_separator()
 
         separated_audio = self._separator.separate(source_audio)
+        self._maybe_unload("_separator")
         vocal_audio = separated_audio.vocals
 
         report_progress("Extracting voice sample", 0.40)
@@ -283,6 +330,7 @@ class LocalDubbingPipeline:
 
         generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
         speech_duration = generated_speech.metadata.duration_seconds
+        self._maybe_unload("_tts")
 
         report_progress("Assembling audio", 0.85)
 
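
_maybe_unload is duck-typed: it looks the stage up by attribute name and only calls unload() when the component exists and exposes one, so never-initialized stages (for example the transcriber when a pre-computed transcription is supplied) are skipped. A small illustration of that contract with a stand-in stage object instead of a real model; the private attributes are used here purely for illustration:

    from videopython.ai.dubbing.pipeline import LocalDubbingPipeline


    class _StubStage:
        """Stand-in stage: all _maybe_unload() needs is a callable ``unload``."""

        def __init__(self) -> None:
            self.loaded = True

        def unload(self) -> None:
            self.loaded = False


    pipeline = LocalDubbingPipeline(low_memory=True)
    pipeline._tts = _StubStage()

    pipeline._maybe_unload("_tts")          # attribute found -> unload() is called
    assert pipeline._tts.loaded is False

    pipeline._maybe_unload("_transcriber")  # still None -> silently skipped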

src/videopython/ai/generation/audio.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.audio import Audio, AudioMetadata
 
 
@@ -151,6 +151,16 @@ class TextToSpeech:
 
         return self._generate_local(text, effective_voice)
 
+    def unload(self) -> None:
+        """Release the TTS model(s) so the next generate_audio() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        self._processor = None
+        self._chatterbox_model = None
+        release_device_memory(self.device)
+
 
 class TextToMusic:
     """Generates music from text descriptions using MusicGen."""
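
Since unload() only drops the model references and empties the device cache, the next generate_audio() call re-initializes the model lazily. A sketch of that round trip; the TextToSpeech constructor arguments and the text-only call are assumptions (this diff only shows generate_audio(text, voice_sample=...) used elsewhere):

    from videopython.ai.generation.audio import TextToSpeech

    tts = TextToSpeech(device="cuda")            # constructor args assumed
    first = tts.generate_audio("Hello there.")   # model is loaded on first use

    tts.unload()                                 # references dropped, CUDA cache emptied

    second = tts.generate_audio("Back again.")   # transparently re-initializes the model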

src/videopython/ai/generation/translation.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.ai.dubbing.models import TranslatedSegment
 from videopython.base.text.transcription import TranscriptionSegment
 
@@ -48,6 +48,15 @@ LANGUAGE_NAMES = {
 class TextTranslator:
     """Translates text between languages using local seq2seq models."""
 
+    # Languages without a direct opus-mt-{src}-{tgt} model. Maps (source, target)
+    # to an alternative HuggingFace model identifier.
+    _MODEL_OVERRIDES: dict[tuple[str, str], str] = {
+        ("en", "pt"): "Helsinki-NLP/opus-mt-tc-big-en-pt",
+        ("en", "ko"): "Helsinki-NLP/opus-mt-tc-big-en-ko",
+        ("en", "ja"): "Helsinki-NLP/opus-mt-en-jap",
+        ("en", "pl"): "Helsinki-NLP/opus-mt-en-zlw",
+    }
+
     def __init__(self, model_name: str | None = None, device: str | None = None):
         self.model_name = model_name
         self.device = device
@@ -58,6 +67,9 @@ class TextTranslator:
     def _get_local_model_name(self, source_lang: str, target_lang: str) -> str:
         if self.model_name:
             return self.model_name
+        override = self._MODEL_OVERRIDES.get((source_lang, target_lang))
+        if override:
+            return override
         return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
 
     def _init_local(self, source_lang: str, target_lang: str) -> None:
@@ -168,6 +180,16 @@ class TextTranslator:
 
         return translated_segments
 
+    def unload(self) -> None:
+        """Release the translation model so the next translate() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        self._tokenizer = None
+        self._current_lang_pair = None
+        release_device_memory(self.device)
+
     @staticmethod
     def get_supported_languages() -> dict[str, str]:
         return LANGUAGE_NAMES.copy()
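
Model resolution now checks three sources in order: an explicit model_name, the _MODEL_OVERRIDES entry for the language pair, and finally the default Helsinki-NLP/opus-mt-{src}-{tgt} pattern. The expected resolution, shown via the (internal) _get_local_model_name helper:

    from videopython.ai.generation.translation import TextTranslator

    translator = TextTranslator()
    translator._get_local_model_name("en", "pt")  # "Helsinki-NLP/opus-mt-tc-big-en-pt" (override)
    translator._get_local_model_name("en", "de")  # "Helsinki-NLP/opus-mt-en-de" (default pattern)

    pinned = TextTranslator(model_name="Helsinki-NLP/opus-mt-tc-big-en-pt")
    pinned._get_local_model_name("en", "pt")      # explicit model_name always wins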

src/videopython/ai/understanding/audio.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any, Literal
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.audio import Audio
 from videopython.base.description import AudioClassification, AudioEvent
 from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
@@ -51,6 +51,15 @@ class AudioToText:
         self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
         self._diarization_pipeline.to(torch.device(self.device))
 
+    def unload(self) -> None:
+        """Release the Whisper and diarization models so the next call re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        self._diarization_pipeline = None
+        release_device_memory(self.device)
+
     def _process_transcription_result(self, transcription_result: dict) -> Transcription:
         """Process raw transcription result into a Transcription object."""
         transcription_segments = []

src/videopython/ai/understanding/separation.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.ai.dubbing.models import SeparatedAudio
 from videopython.base.audio import Audio, AudioMetadata
 
@@ -134,3 +134,11 @@ class AudioSeparator:
     def extract_background(self, audio: Audio) -> Audio:
         """Convenience method to extract only background from audio."""
         return self.separate(audio).background
+
+    def unload(self) -> None:
+        """Release the Demucs model so the next separate() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        release_device_memory(self.device)
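
The same unload pattern applies to source separation; a short sketch, where Audio.from_file and the AudioSeparator constructor arguments are assumptions (only separate(), .vocals, .background, and unload() appear in this diff):

    from videopython.ai.understanding.separation import AudioSeparator
    from videopython.base.audio import Audio

    audio = Audio.from_file("scene.wav")        # assumed loader helper
    separator = AudioSeparator(device="cuda")   # constructor args assumed

    separated = separator.separate(audio)       # Demucs is loaded on first use
    vocals, background = separated.vocals, separated.background

    separator.unload()                          # drop the Demucs model, release cached VRAM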