videopython 0.26.3__tar.gz → 0.26.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.3 → videopython-0.26.5}/PKG-INFO +1 -1
- {videopython-0.26.3 → videopython-0.26.5}/pyproject.toml +1 -1
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/dubber.py +23 -3
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/pipeline.py +70 -35
- videopython-0.26.5/src/videopython/ai/generation/audio.py +156 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/separation.py +27 -40
- videopython-0.26.3/src/videopython/ai/generation/audio.py +0 -215
- {videopython-0.26.3 → videopython-0.26.5}/.gitignore +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/LICENSE +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/README.md +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/description.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/video.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/py.typed +0 -0
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/dubber.py

@@ -8,6 +8,7 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable

from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
+from videopython.ai.dubbing.pipeline import WhisperModel

if TYPE_CHECKING:
    from videopython.base.video import Video
@@ -25,19 +26,38 @@ class VideoDubber:
            model is resident at a time. Trades per-run latency (~10-30s of
            extra model loads) for a much lower memory ceiling. Recommended for
            GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+        whisper_model: Whisper model size used for transcription. Larger models
+            give better accuracy at the cost of VRAM and latency. One of
+            ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+            Default ``small``.
    """

-    def __init__(
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
        self.device = device
        self.low_memory = low_memory
+        self.whisper_model = whisper_model
        self._local_pipeline: Any = None
        requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info(
+        logger.info(
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            requested,
+            low_memory,
+            whisper_model,
+        )

    def _init_local_pipeline(self) -> None:
        from videopython.ai.dubbing.pipeline import LocalDubbingPipeline

-        self._local_pipeline = LocalDubbingPipeline(
+        self._local_pipeline = LocalDubbingPipeline(
+            device=self.device,
+            low_memory=self.low_memory,
+            whisper_model=self.whisper_model,
+        )

    def dub(
        self,
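The only public-facing change in dubber.py is the new ``whisper_model`` constructor argument. A minimal usage sketch of the updated constructor, using only the arguments shown in this diff (the ``dub()`` call itself is unchanged here and omitted):

```python
from videopython.ai.dubbing.dubber import VideoDubber

# Pick a Whisper checkpoint to trade accuracy for VRAM and latency.
# Valid values per the new WhisperModel Literal:
# "tiny", "base", "small", "medium", "large", "turbo" (default "small").
dubber = VideoDubber(device="cuda", low_memory=True, whisper_model="base")
```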
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/pipeline.py

@@ -3,7 +3,7 @@
from __future__ import annotations

import logging
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Literal

from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -11,6 +11,8 @@ from videopython.ai.dubbing.timing import TimingSynchronizer
if TYPE_CHECKING:
    from videopython.base.audio import Audio

+WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+
logger = logging.getLogger(__name__)


@@ -23,21 +25,27 @@ class LocalDubbingPipeline:
    with <=12GB VRAM or hosts with <32GB RAM.
    """

-    def __init__(
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
        self.device = device
        self.low_memory = low_memory
+        self.whisper_model = whisper_model
        requested = device.lower() if isinstance(device, str) else "auto"
        logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
            requested,
            low_memory,
+            whisper_model,
        )

        self._transcriber: Any = None
        self._transcriber_diarization: bool | None = None
        self._translator: Any = None
        self._tts: Any = None
-        self._tts_voice_clone: bool | None = None
        self._tts_language: str | None = None
        self._separator: Any = None
        self._synchronizer: TimingSynchronizer | None = None
@@ -63,7 +71,11 @@ class LocalDubbingPipeline:
        """Initialize the transcription model."""
        from videopython.ai.understanding.audio import AudioToText

-        self._transcriber = AudioToText(
+        self._transcriber = AudioToText(
+            model_name=self.whisper_model,
+            device=self.device,
+            enable_diarization=enable_diarization,
+        )

    def _init_translator(self) -> None:
        """Initialize the translation model."""
@@ -71,18 +83,11 @@ class LocalDubbingPipeline:

        self._translator = TextTranslator(device=self.device)

-    def _init_tts(self,
+    def _init_tts(self, language: str = "en") -> None:
        """Initialize the text-to-speech model."""
        from videopython.ai.generation.audio import TextToSpeech

-
-            self._tts = TextToSpeech(
-                model_size="chatterbox",
-                device=self.device,
-                language=language,
-            )
-        else:
-            self._tts = TextToSpeech(device=self.device, language=language)
+        self._tts = TextToSpeech(device=self.device, language=language)

    def _init_separator(self) -> None:
        """Initialize the audio separator."""
@@ -102,6 +107,7 @@ class LocalDubbingPipeline:
        max_duration: float = 10.0,
    ) -> dict[str, Any]:
        """Extract voice samples for each speaker from the audio."""
+        from videopython.base.audio import Audio

        voice_samples: dict[str, Audio] = {}

@@ -128,7 +134,11 @@ class LocalDubbingPipeline:
        if best_segment is not None:
            start = best_segment.start
            end = min(best_segment.end, start + max_duration)
-
+            sliced = audio.slice(start, end)
+            # Audio.slice returns a numpy view into the source. Copy so the
+            # short voice sample doesn't keep the full vocals array (~1.3 GB
+            # for 2h sources) alive across translate + TTS.
+            voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)

        return voice_samples

@@ -183,6 +193,7 @@ class LocalDubbingPipeline:

        separated_audio: SeparatedAudio | None = None
        vocal_audio = source_audio
+        background_audio: Audio | None = None

        if preserve_background:
            report_progress("Separating audio", 0.15)
@@ -192,12 +203,24 @@ class LocalDubbingPipeline:
            separated_audio = self._separator.separate(source_audio)
            self._maybe_unload("_separator")
            vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            # In low_memory mode, drop the SeparatedAudio container so vocals
+            # and background can be released as soon as their last local
+            # reference goes (after voice-sample extraction and final overlay
+            # respectively). The result will report separated_audio=None.
+            if self.low_memory:
+                separated_audio = None

        voice_samples: dict[str, Audio] = {}
        if voice_clone:
            report_progress("Extracting voice samples", 0.25)
            voice_samples = self._extract_voice_samples(vocal_audio, transcription)

+        # vocals is no longer needed; voice_samples are independent copies.
+        # In low_memory mode this is the only ref keeping the buffer alive
+        # (separated_audio was dropped above), so dropping the local frees it.
+        del vocal_audio
+
        report_progress("Translating text", 0.35)
        if self._translator is None:
            self._init_translator()
@@ -210,9 +233,8 @@ class LocalDubbingPipeline:
        self._maybe_unload("_translator")

        report_progress("Generating dubbed speech", 0.50)
-        if self._tts is None or self.
-            self._init_tts(
-            self._tts_voice_clone = voice_clone
+        if self._tts is None or self._tts_language != target_lang:
+            self._init_tts(language=target_lang)
            self._tts_language = target_lang

        dubbed_segments: list[Audio] = []
@@ -246,17 +268,23 @@ class LocalDubbingPipeline:
            assert self._synchronizer is not None

        synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        del dubbed_segments

        report_progress("Assembling final audio", 0.90)
        total_duration = source_audio.metadata.duration_seconds
        dubbed_speech = self._synchronizer.assemble_with_timing(synchronized_segments, start_times, total_duration)
+        del synchronized_segments

-        if
-            background_sr =
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
            if dubbed_speech.metadata.sample_rate != background_sr:
                dubbed_speech = dubbed_speech.resample(background_sr)

-            final_audio =
+            final_audio = background_audio.overlay(dubbed_speech, position=0.0)
+            # Drop the local; in low_memory this releases the background
+            # buffer (~1.3 GB for 2h sources). In non-low_memory the same
+            # array is still held by separated_audio.background.
+            del background_audio
        else:
            final_audio = dubbed_speech

@@ -303,6 +331,7 @@ class LocalDubbingPipeline:

        separated_audio: SeparatedAudio | None = None
        vocal_audio = source_audio
+        background_audio: Audio | None = None

        if preserve_background:
            report_progress("Separating audio", 0.20)
@@ -312,6 +341,9 @@ class LocalDubbingPipeline:
            separated_audio = self._separator.separate(source_audio)
            self._maybe_unload("_separator")
            vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            if self.low_memory:
+                separated_audio = None

        report_progress("Extracting voice sample", 0.40)
        voice_sample: Audio | None = None
@@ -323,12 +355,15 @@ class LocalDubbingPipeline:

        if voice_sample is None:
            sample_duration = min(6.0, original_duration)
-
+            sliced = vocal_audio.slice(0, sample_duration)
+            # Copy so the short sample doesn't pin the full vocals array.
+            voice_sample = Audio(sliced.data.copy(), sliced.metadata)
+
+        del vocal_audio

        report_progress("Generating speech", 0.60)
-        if self._tts is None or self.
-            self._init_tts(
-            self._tts_voice_clone = True
+        if self._tts is None or self._tts_language != "en":
+            self._init_tts(language="en")
            self._tts_language = "en"

        generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
@@ -337,24 +372,24 @@ class LocalDubbingPipeline:

        report_progress("Assembling audio", 0.85)

-        if
-            background_sr =
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
            if generated_speech.metadata.sample_rate != background_sr:
                generated_speech = generated_speech.resample(background_sr)

-
-
-
-
-            silence_duration = speech_duration - background.metadata.duration_seconds
+            if background_audio.metadata.duration_seconds > speech_duration:
+                background_audio = background_audio.slice(0, speech_duration)
+            elif background_audio.metadata.duration_seconds < speech_duration:
+                silence_duration = speech_duration - background_audio.metadata.duration_seconds
                silence = Audio.silence(
                    duration=silence_duration,
                    sample_rate=background_sr,
-                    channels=
+                    channels=background_audio.metadata.channels,
                )
-
+                background_audio = background_audio.concat(silence)

-            final_audio =
+            final_audio = background_audio.overlay(generated_speech, position=0.0)
+            del background_audio
        else:
            final_audio = generated_speech

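The new comments in ``_extract_voice_samples`` and the revoice path hinge on a NumPy detail: slicing returns a view that keeps the entire parent buffer alive, so only an explicit ``.copy()`` lets the large vocals array actually be freed. A standalone NumPy sketch (not videopython code) of that behavior, sized to roughly match the ~1.3 GB / 2-hour figure quoted in the comments:

```python
import numpy as np

# ~1.27 GB float32 buffer, comparable to 2h of mono vocals at 44.1 kHz.
vocals = np.zeros(2 * 3600 * 44_100, dtype=np.float32)

view = vocals[: 6 * 44_100]           # a slice is a view; it pins the whole 1.27 GB base
sample = vocals[: 6 * 44_100].copy()  # an independent ~1 MB buffer

print(view.base is vocals)   # True  -> vocals cannot be freed while `view` is alive
print(sample.base is None)   # True  -> only the small copy stays alive
del vocals, view             # now the large buffer can actually be released
```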
videopython-0.26.5/src/videopython/ai/generation/audio.py

@@ -0,0 +1,156 @@
+"""Audio generation using local models."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
+from videopython.base.audio import Audio, AudioMetadata
+
+
+class TextToSpeech:
+    """Generates speech audio from text using Chatterbox Multilingual.
+
+    Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
+    provided to ``generate_audio``, the model clones that voice; otherwise it
+    falls back to Chatterbox's built-in default speaker.
+    """
+
+    SAMPLE_RATE: int = 24000
+
+    def __init__(
+        self,
+        voice: Audio | None = None,
+        device: str | None = None,
+        language: str = "en",
+    ):
+        self.voice = voice
+        self.device = device
+        self.language = language
+        self._model: Any = None
+
+    def _init_model(self) -> None:
+        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
+
+        requested_device = self.device
+        device = select_device(self.device, mps_allowed=False)
+
+        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+        self.device = device
+        log_device_initialization(
+            "TextToSpeech",
+            requested_device=requested_device,
+            resolved_device=device,
+        )
+
+    def generate_audio(
+        self,
+        text: str,
+        voice_sample: Audio | None = None,
+    ) -> Audio:
+        """Generate speech audio from text.
+
+        Args:
+            text: Text to synthesize.
+            voice_sample: Optional voice sample to clone. Falls back to the
+                instance's ``voice`` and then to Chatterbox's default speaker.
+        """
+        import tempfile
+        from pathlib import Path
+
+        import numpy as np
+
+        if self._model is None:
+            self._init_model()
+
+        effective_sample = voice_sample or self.voice
+        speaker_wav_path: Path | None = None
+
+        if effective_sample is not None:
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                effective_sample.save(f.name)
+                speaker_wav_path = Path(f.name)
+
+        try:
+            wav = self._model.generate(
+                text=text,
+                language_id=self.language,
+                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
+            )
+
+            audio_data = wav.cpu().float().numpy().squeeze()
+            if audio_data.ndim == 0:
+                audio_data = np.array([audio_data], dtype=np.float32)
+
+            metadata = AudioMetadata(
+                sample_rate=self.SAMPLE_RATE,
+                channels=1,
+                sample_width=2,
+                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
+                frame_count=len(audio_data),
+            )
+            return Audio(audio_data, metadata)
+        finally:
+            if speaker_wav_path is not None:
+                speaker_wav_path.unlink()
+
+    def unload(self) -> None:
+        """Release the TTS model so the next generate_audio() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        release_device_memory(self.device)
+
+
+class TextToMusic:
+    """Generates music from text descriptions using MusicGen."""
+
+    def __init__(self, device: str | None = None):
+        self.device = device
+        self._processor: Any = None
+        self._model: Any = None
+        self._device: str | None = None
+
+    def _init_local(self) -> None:
+        """Initialize local MusicGen model."""
+        import os
+
+        from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+        requested_device = self.device
+        self._device = select_device(self.device, mps_allowed=True)
+
+        model_name = "facebook/musicgen-small"
+        self._processor = AutoProcessor.from_pretrained(model_name)
+        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
+        self._model.to(self._device)
+        self.device = self._device
+        log_device_initialization(
+            "TextToMusic",
+            requested_device=requested_device,
+            resolved_device=self._device,
+        )
+
+    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
+        """Generate music audio from text description."""
+        if self._model is None:
+            self._init_local()
+
+        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
+        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
+        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
+        sampling_rate = self._model.config.audio_encoder.sampling_rate
+
+        audio_data = audio_values[0, 0].cpu().float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=sampling_rate,
+            channels=1,
+            sample_width=2,
+            duration_seconds=len(audio_data) / sampling_rate,
+            frame_count=len(audio_data),
+        )
+        return Audio(audio_data, metadata)
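A minimal usage sketch of the rewritten, Chatterbox-only ``TextToSpeech``; it assumes the optional ``chatterbox`` dependency is installed, and ``some_audio`` is a hypothetical ``Audio`` instance used only to illustrate voice cloning:

```python
from videopython.ai.generation.audio import TextToSpeech

# The Chatterbox model is loaded lazily on the first generate_audio() call.
tts = TextToSpeech(language="en")
speech = tts.generate_audio("Hello from the new Chatterbox-only TextToSpeech.")
print(speech.metadata.sample_rate)  # 24000 (TextToSpeech.SAMPLE_RATE)

# Optional voice cloning: pass any Audio object as the reference sample.
# cloned = tts.generate_audio("Same text, cloned voice.", voice_sample=some_audio)

tts.unload()  # free VRAM, as the low-memory dubbing pipeline does between stages
```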
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/separation.py

@@ -42,7 +42,15 @@ class AudioSeparator:
        )

    def _separate_local(self, audio: Audio) -> SeparatedAudio:
-        """Separate audio using local Demucs model.
+        """Separate audio using local Demucs model.
+
+        Keeps the input tensor on CPU and passes ``device=self.device`` to
+        ``apply_model`` so per-chunk compute runs on GPU while the full
+        ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+        sources this is the difference between OOM-on-GPU and running cleanly:
+        a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+        comfortable on a 32 GB host.
+        """
        import numpy as np
        import torch
        from demucs.apply import apply_model
@@ -65,61 +73,40 @@ class AudioSeparator:
            audio_data = audio_data.T

        wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-        wav = wav.to(self.device)

        with torch.no_grad():
            sources = apply_model(self._model, wav, device=self.device)

        sources_np = sources[0].cpu().numpy()
+        del sources

        stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+        vocals_idx = stem_names.index("vocals")
+        non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]

-
-
-
-
-            metadata = AudioMetadata(
-                sample_rate=target_sr,
-                channels=2,
-                sample_width=2,
-                duration_seconds=stem_data.shape[0] / target_sr,
-                frame_count=stem_data.shape[0],
-            )
-            stems[name] = Audio(stem_data.astype(np.float32), metadata)
-
-        vocals = stems["vocals"]
-
-        non_vocal_stems = [stems[name] for name in stem_names if name != "vocals"]
-        background_data = np.zeros_like(vocals.data)
-        for stem in non_vocal_stems:
-            background_data += stem.data
+        vocals_data = sources_np[vocals_idx].T
+        background_data = sources_np[non_vocal_indices].sum(axis=0).T
+        del sources_np

        max_val = np.max(np.abs(background_data))
        if max_val > 1.0:
-            background_data
-
-
-
-
-
-
-
-
-
-
-            music_data += stems[name].data
-
-        max_val = np.max(np.abs(music_data))
-        if max_val > 1.0:
-            music_data = music_data / max_val
-
-        music = Audio(music_data.astype(np.float32), vocals.metadata)
+            background_data /= max_val
+
+        metadata = AudioMetadata(
+            sample_rate=target_sr,
+            channels=2,
+            sample_width=2,
+            duration_seconds=vocals_data.shape[0] / target_sr,
+            frame_count=vocals_data.shape[0],
+        )
+        vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+        background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)

        return SeparatedAudio(
            vocals=vocals,
            background=background,
            original=audio,
-            music=
+            music=None,
            effects=None,
        )

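The ~10 GB figure in the new ``_separate_local`` docstring follows directly from the shape of the Demucs output kept in CPU RAM. A quick back-of-the-envelope check, assuming the default 4-stem htdemucs model and float32 samples:

```python
# Full Demucs output (stems, channels, samples) for a 2-hour stereo source
# at 44.1 kHz, float32, 4 stems.
stems, channels = 4, 2
samples = 2 * 3600 * 44_100
bytes_total = stems * channels * samples * 4      # float32 = 4 bytes
print(f"{bytes_total / 1e9:.1f} GB")              # ~10.2 GB in CPU RAM

# Per-stem slice (what vocals_data / background_data hold after indexing):
print(f"{channels * samples * 4 / 1e9:.1f} GB")   # ~2.5 GB each
```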
videopython-0.26.3/src/videopython/ai/generation/audio.py

@@ -1,215 +0,0 @@
-"""Audio generation using local models."""
-
-from __future__ import annotations
-
-from typing import Any
-
-from videopython.ai._device import log_device_initialization, release_device_memory, select_device
-from videopython.base.audio import Audio, AudioMetadata
-
-
-class TextToSpeech:
-    """Generates speech audio from text using local models.
-
-    Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
-    (`chatterbox`) for multilingual voice cloning.
-    """
-
-    SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
-
-    CHATTERBOX_SAMPLE_RATE: int = 24000
-
-    def __init__(
-        self,
-        model_size: str = "base",
-        voice: str | None = None,
-        device: str | None = None,
-        language: str = "en",
-    ):
-        if model_size not in self.SUPPORTED_LOCAL_MODELS:
-            raise ValueError(f"model_size must be one of {self.SUPPORTED_LOCAL_MODELS}, got '{model_size}'")
-
-        self.model_size = model_size
-        self.voice = voice
-        self.device = device
-        self.language = language
-        self._model: Any = None
-        self._processor: Any = None
-        self._chatterbox_model: Any = None
-
-    def _init_local(self) -> None:
-        """Initialize local Bark model."""
-        from transformers import AutoModel, AutoProcessor
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        model_name = "suno/bark" if self.model_size == "base" else "suno/bark-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = AutoModel.from_pretrained(model_name).to(device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _init_chatterbox(self) -> None:
-        """Initialize Chatterbox Multilingual model for voice cloning."""
-        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _generate_local(self, text: str, voice_preset: str | None) -> Audio:
-        """Generate speech using Bark."""
-        import torch
-
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
-        inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-
-        with torch.no_grad():
-            speech_values = self._model.generate(**inputs, do_sample=True)
-
-        audio_data = speech_values.cpu().float().numpy().squeeze()
-        sample_rate = self._model.generation_config.sample_rate
-
-        metadata = AudioMetadata(
-            sample_rate=sample_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sample_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)
-
-    def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
-        """Generate speech using Chatterbox Multilingual with voice cloning."""
-        import tempfile
-        from pathlib import Path
-
-        import numpy as np
-
-        if self._chatterbox_model is None:
-            self._init_chatterbox()
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            voice_sample.save(f.name)
-            speaker_wav_path = Path(f.name)
-
-        try:
-            wav = self._chatterbox_model.generate(
-                text=text,
-                language_id=self.language,
-                audio_prompt_path=str(speaker_wav_path),
-            )
-
-            audio_data = wav.cpu().float().numpy().squeeze()
-            if audio_data.ndim == 0:
-                audio_data = np.array([audio_data], dtype=np.float32)
-
-            sample_rate = self.CHATTERBOX_SAMPLE_RATE
-
-            metadata = AudioMetadata(
-                sample_rate=sample_rate,
-                channels=1,
-                sample_width=2,
-                duration_seconds=len(audio_data) / sample_rate,
-                frame_count=len(audio_data),
-            )
-            return Audio(audio_data, metadata)
-        finally:
-            speaker_wav_path.unlink()
-
-    def generate_audio(
-        self,
-        text: str,
-        voice_preset: str | None = None,
-        voice_sample: Audio | None = None,
-    ) -> Audio:
-        """Generate speech audio from text."""
-        effective_voice = voice_preset or self.voice
-
-        if self.model_size == "chatterbox" or voice_sample is not None:
-            if voice_sample is None:
-                raise ValueError(
-                    "voice_sample is required for Chatterbox voice cloning. "
-                    "Provide an Audio sample of the voice to clone."
-                )
-            return self._generate_chatterbox(text, voice_sample)
-
-        return self._generate_local(text, effective_voice)
-
-    def unload(self) -> None:
-        """Release the TTS model(s) so the next generate_audio() re-initializes.
-
-        Used by low-memory dubbing to free VRAM between pipeline stages.
-        """
-        self._model = None
-        self._processor = None
-        self._chatterbox_model = None
-        release_device_memory(self.device)
-
-
-class TextToMusic:
-    """Generates music from text descriptions using MusicGen."""
-
-    def __init__(self, device: str | None = None):
-        self.device = device
-        self._processor: Any = None
-        self._model: Any = None
-        self._device: str | None = None
-
-    def _init_local(self) -> None:
-        """Initialize local MusicGen model."""
-        import os
-
-        from transformers import AutoProcessor, MusicgenForConditionalGeneration
-
-        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
-
-        requested_device = self.device
-        self._device = select_device(self.device, mps_allowed=True)
-
-        model_name = "facebook/musicgen-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-        self._model.to(self._device)
-        self.device = self._device
-        log_device_initialization(
-            "TextToMusic",
-            requested_device=requested_device,
-            resolved_device=self._device,
-        )
-
-    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
-        """Generate music audio from text description."""
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
-        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
-        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
-        sampling_rate = self._model.config.audio_encoder.sampling_rate
-
-        audio_data = audio_values[0, 0].cpu().float().numpy()
-
-        metadata = AudioMetadata(
-            sample_rate=sampling_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sampling_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)