videopython 0.26.3__tar.gz → 0.26.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.3 → videopython-0.26.4}/PKG-INFO +1 -1
- {videopython-0.26.3 → videopython-0.26.4}/pyproject.toml +1 -1
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/dubbing/pipeline.py +6 -16
- videopython-0.26.4/src/videopython/ai/generation/audio.py +156 -0
- videopython-0.26.3/src/videopython/ai/generation/audio.py +0 -215
- {videopython-0.26.3 → videopython-0.26.4}/.gitignore +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/LICENSE +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/README.md +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/description.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/base/video.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.3 → videopython-0.26.4}/src/videopython/py.typed +0 -0
--- videopython-0.26.3/src/videopython/ai/dubbing/pipeline.py
+++ videopython-0.26.4/src/videopython/ai/dubbing/pipeline.py
@@ -37,7 +37,6 @@ class LocalDubbingPipeline:
         self._transcriber_diarization: bool | None = None
         self._translator: Any = None
         self._tts: Any = None
-        self._tts_voice_clone: bool | None = None
         self._tts_language: str | None = None
         self._separator: Any = None
         self._synchronizer: TimingSynchronizer | None = None
@@ -71,18 +70,11 @@ class LocalDubbingPipeline:
 
         self._translator = TextTranslator(device=self.device)
 
-    def _init_tts(self,
+    def _init_tts(self, language: str = "en") -> None:
         """Initialize the text-to-speech model."""
        from videopython.ai.generation.audio import TextToSpeech
 
-
-            self._tts = TextToSpeech(
-                model_size="chatterbox",
-                device=self.device,
-                language=language,
-            )
-        else:
-            self._tts = TextToSpeech(device=self.device, language=language)
+        self._tts = TextToSpeech(device=self.device, language=language)
 
     def _init_separator(self) -> None:
         """Initialize the audio separator."""
@@ -210,9 +202,8 @@ class LocalDubbingPipeline:
         self._maybe_unload("_translator")
 
         report_progress("Generating dubbed speech", 0.50)
-        if self._tts is None or self.
-            self._init_tts(
-            self._tts_voice_clone = voice_clone
+        if self._tts is None or self._tts_language != target_lang:
+            self._init_tts(language=target_lang)
         self._tts_language = target_lang
 
         dubbed_segments: list[Audio] = []
@@ -326,9 +317,8 @@ class LocalDubbingPipeline:
         voice_sample = vocal_audio.slice(0, sample_duration)
 
         report_progress("Generating speech", 0.60)
-        if self._tts is None or self.
-            self._init_tts(
-            self._tts_voice_clone = True
+        if self._tts is None or self._tts_language != "en":
+            self._init_tts(language="en")
         self._tts_language = "en"
 
         generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
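Net effect of the pipeline hunks above: the `voice_clone` switch is gone, `_init_tts` always constructs the Chatterbox-backed `TextToSpeech`, and the engine is re-created only when the target language changes. A minimal sketch of the resulting caching behavior (illustrative only; `TtsCache` and `get_tts` are names invented here, not pipeline API):

```python
from typing import Any


class TtsCache:
    """Sketch of the 0.26.4 caching logic visible in the hunks above."""

    def __init__(self, device: str | None = None) -> None:
        self.device = device
        self._tts: Any = None
        self._tts_language: str | None = None

    def get_tts(self, target_lang: str) -> Any:
        # Re-initialize only when no engine exists yet or the language changed;
        # a repeated call with the same language reuses the loaded model.
        if self._tts is None or self._tts_language != target_lang:
            from videopython.ai.generation.audio import TextToSpeech

            self._tts = TextToSpeech(device=self.device, language=target_lang)
        self._tts_language = target_lang
        return self._tts
```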
--- /dev/null
+++ videopython-0.26.4/src/videopython/ai/generation/audio.py
@@ -0,0 +1,156 @@
+"""Audio generation using local models."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
+from videopython.base.audio import Audio, AudioMetadata
+
+
+class TextToSpeech:
+    """Generates speech audio from text using Chatterbox Multilingual.
+
+    Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
+    provided to ``generate_audio``, the model clones that voice; otherwise it
+    falls back to Chatterbox's built-in default speaker.
+    """
+
+    SAMPLE_RATE: int = 24000
+
+    def __init__(
+        self,
+        voice: Audio | None = None,
+        device: str | None = None,
+        language: str = "en",
+    ):
+        self.voice = voice
+        self.device = device
+        self.language = language
+        self._model: Any = None
+
+    def _init_model(self) -> None:
+        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
+
+        requested_device = self.device
+        device = select_device(self.device, mps_allowed=False)
+
+        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+        self.device = device
+        log_device_initialization(
+            "TextToSpeech",
+            requested_device=requested_device,
+            resolved_device=device,
+        )
+
+    def generate_audio(
+        self,
+        text: str,
+        voice_sample: Audio | None = None,
+    ) -> Audio:
+        """Generate speech audio from text.
+
+        Args:
+            text: Text to synthesize.
+            voice_sample: Optional voice sample to clone. Falls back to the
+                instance's ``voice`` and then to Chatterbox's default speaker.
+        """
+        import tempfile
+        from pathlib import Path
+
+        import numpy as np
+
+        if self._model is None:
+            self._init_model()
+
+        effective_sample = voice_sample or self.voice
+        speaker_wav_path: Path | None = None
+
+        if effective_sample is not None:
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                effective_sample.save(f.name)
+            speaker_wav_path = Path(f.name)
+
+        try:
+            wav = self._model.generate(
+                text=text,
+                language_id=self.language,
+                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
+            )
+
+            audio_data = wav.cpu().float().numpy().squeeze()
+            if audio_data.ndim == 0:
+                audio_data = np.array([audio_data], dtype=np.float32)
+
+            metadata = AudioMetadata(
+                sample_rate=self.SAMPLE_RATE,
+                channels=1,
+                sample_width=2,
+                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
+                frame_count=len(audio_data),
+            )
+            return Audio(audio_data, metadata)
+        finally:
+            if speaker_wav_path is not None:
+                speaker_wav_path.unlink()
+
+    def unload(self) -> None:
+        """Release the TTS model so the next generate_audio() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        release_device_memory(self.device)
+
+
+class TextToMusic:
+    """Generates music from text descriptions using MusicGen."""
+
+    def __init__(self, device: str | None = None):
+        self.device = device
+        self._processor: Any = None
+        self._model: Any = None
+        self._device: str | None = None
+
+    def _init_local(self) -> None:
+        """Initialize local MusicGen model."""
+        import os
+
+        from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+        requested_device = self.device
+        self._device = select_device(self.device, mps_allowed=True)
+
+        model_name = "facebook/musicgen-small"
+        self._processor = AutoProcessor.from_pretrained(model_name)
+        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
+        self._model.to(self._device)
+        self.device = self._device
+        log_device_initialization(
+            "TextToMusic",
+            requested_device=requested_device,
+            resolved_device=self._device,
+        )
+
+    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
+        """Generate music audio from text description."""
+        if self._model is None:
+            self._init_local()
+
+        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
+        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
+        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
+        sampling_rate = self._model.config.audio_encoder.sampling_rate
+
+        audio_data = audio_values[0, 0].cpu().float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=sampling_rate,
+            channels=1,
+            sample_width=2,
+            duration_seconds=len(audio_data) / sampling_rate,
+            frame_count=len(audio_data),
+        )
+        return Audio(audio_data, metadata)
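For reference, a short usage sketch of the new module above. The `TextToSpeech` and `TextToMusic` calls follow the signatures in the diff; obtaining an `Audio` reference sample is left as a comment, since the diff only shows the `Audio(audio_data, metadata)` constructor and no file loader.

```python
from videopython.ai.generation.audio import TextToMusic, TextToSpeech

# Default speaker: in 0.26.4 no voice sample is required.
tts = TextToSpeech(language="en")  # device auto-selected via select_device()
speech = tts.generate_audio("Hello from videopython 0.26.4!")

# Voice cloning: pass an Audio sample per call, or set a default via voice=...
# reference = <an Audio instance, e.g. extracted from an existing video>
# cloned = tts.generate_audio("Same text, cloned voice.", voice_sample=reference)

tts.unload()  # frees VRAM between low-memory pipeline stages

music = TextToMusic().generate_audio("calm lo-fi piano loop", max_new_tokens=256)
```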
--- videopython-0.26.3/src/videopython/ai/generation/audio.py
+++ /dev/null
@@ -1,215 +0,0 @@
-"""Audio generation using local models."""
-
-from __future__ import annotations
-
-from typing import Any
-
-from videopython.ai._device import log_device_initialization, release_device_memory, select_device
-from videopython.base.audio import Audio, AudioMetadata
-
-
-class TextToSpeech:
-    """Generates speech audio from text using local models.
-
-    Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
-    (`chatterbox`) for multilingual voice cloning.
-    """
-
-    SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
-
-    CHATTERBOX_SAMPLE_RATE: int = 24000
-
-    def __init__(
-        self,
-        model_size: str = "base",
-        voice: str | None = None,
-        device: str | None = None,
-        language: str = "en",
-    ):
-        if model_size not in self.SUPPORTED_LOCAL_MODELS:
-            raise ValueError(f"model_size must be one of {self.SUPPORTED_LOCAL_MODELS}, got '{model_size}'")
-
-        self.model_size = model_size
-        self.voice = voice
-        self.device = device
-        self.language = language
-        self._model: Any = None
-        self._processor: Any = None
-        self._chatterbox_model: Any = None
-
-    def _init_local(self) -> None:
-        """Initialize local Bark model."""
-        from transformers import AutoModel, AutoProcessor
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        model_name = "suno/bark" if self.model_size == "base" else "suno/bark-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = AutoModel.from_pretrained(model_name).to(device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _init_chatterbox(self) -> None:
-        """Initialize Chatterbox Multilingual model for voice cloning."""
-        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _generate_local(self, text: str, voice_preset: str | None) -> Audio:
-        """Generate speech using Bark."""
-        import torch
-
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
-        inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-
-        with torch.no_grad():
-            speech_values = self._model.generate(**inputs, do_sample=True)
-
-        audio_data = speech_values.cpu().float().numpy().squeeze()
-        sample_rate = self._model.generation_config.sample_rate
-
-        metadata = AudioMetadata(
-            sample_rate=sample_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sample_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)
-
-    def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
-        """Generate speech using Chatterbox Multilingual with voice cloning."""
-        import tempfile
-        from pathlib import Path
-
-        import numpy as np
-
-        if self._chatterbox_model is None:
-            self._init_chatterbox()
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            voice_sample.save(f.name)
-            speaker_wav_path = Path(f.name)
-
-        try:
-            wav = self._chatterbox_model.generate(
-                text=text,
-                language_id=self.language,
-                audio_prompt_path=str(speaker_wav_path),
-            )
-
-            audio_data = wav.cpu().float().numpy().squeeze()
-            if audio_data.ndim == 0:
-                audio_data = np.array([audio_data], dtype=np.float32)
-
-            sample_rate = self.CHATTERBOX_SAMPLE_RATE
-
-            metadata = AudioMetadata(
-                sample_rate=sample_rate,
-                channels=1,
-                sample_width=2,
-                duration_seconds=len(audio_data) / sample_rate,
-                frame_count=len(audio_data),
-            )
-            return Audio(audio_data, metadata)
-        finally:
-            speaker_wav_path.unlink()
-
-    def generate_audio(
-        self,
-        text: str,
-        voice_preset: str | None = None,
-        voice_sample: Audio | None = None,
-    ) -> Audio:
-        """Generate speech audio from text."""
-        effective_voice = voice_preset or self.voice
-
-        if self.model_size == "chatterbox" or voice_sample is not None:
-            if voice_sample is None:
-                raise ValueError(
-                    "voice_sample is required for Chatterbox voice cloning. "
-                    "Provide an Audio sample of the voice to clone."
-                )
-            return self._generate_chatterbox(text, voice_sample)
-
-        return self._generate_local(text, effective_voice)
-
-    def unload(self) -> None:
-        """Release the TTS model(s) so the next generate_audio() re-initializes.
-
-        Used by low-memory dubbing to free VRAM between pipeline stages.
-        """
-        self._model = None
-        self._processor = None
-        self._chatterbox_model = None
-        release_device_memory(self.device)
-
-
-class TextToMusic:
-    """Generates music from text descriptions using MusicGen."""
-
-    def __init__(self, device: str | None = None):
-        self.device = device
-        self._processor: Any = None
-        self._model: Any = None
-        self._device: str | None = None
-
-    def _init_local(self) -> None:
-        """Initialize local MusicGen model."""
-        import os
-
-        from transformers import AutoProcessor, MusicgenForConditionalGeneration
-
-        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
-
-        requested_device = self.device
-        self._device = select_device(self.device, mps_allowed=True)
-
-        model_name = "facebook/musicgen-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-        self._model.to(self._device)
-        self.device = self._device
-        log_device_initialization(
-            "TextToMusic",
-            requested_device=requested_device,
-            resolved_device=self._device,
-        )
-
-    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
-        """Generate music audio from text description."""
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
-        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
-        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
-        sampling_rate = self._model.config.audio_encoder.sampling_rate
-
-        audio_data = audio_values[0, 0].cpu().float().numpy()
-
-        metadata = AudioMetadata(
-            sample_rate=sampling_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sampling_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)
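Taken together with the new file above, this removal is a breaking change to `TextToSpeech`: the `model_size` argument, the Bark backends (`suno/bark`, `suno/bark-small`), and the `voice_preset` parameter are gone, and `voice_sample` is no longer mandatory for Chatterbox. A hedged migration sketch for 0.26.3 callers (variable names are illustrative):

```python
from videopython.ai.generation.audio import TextToSpeech

text = "Guten Tag!"

# 0.26.3 (removed): backend chosen via model_size, cloning sample mandatory.
#   tts = TextToSpeech(model_size="chatterbox", language="de")
#   audio = tts.generate_audio(text, voice_sample=reference)
#   bark = TextToSpeech(model_size="base")  # Bark backend
#   audio = bark.generate_audio(text, voice_preset="v2/en_speaker_6")

# 0.26.4: Chatterbox only; cloning is optional, Bark presets have no equivalent.
tts = TextToSpeech(language="de")
audio = tts.generate_audio(text)  # built-in default speaker
# audio = tts.generate_audio(text, voice_sample=reference)  # optional cloning
```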