videopython 0.26.3__tar.gz → 0.26.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.3 → videopython-0.26.5}/PKG-INFO +1 -1
- {videopython-0.26.3 → videopython-0.26.5}/pyproject.toml +1 -1
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/dubber.py +23 -3
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/pipeline.py +70 -35
- videopython-0.26.5/src/videopython/ai/generation/audio.py +156 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/separation.py +27 -40
- videopython-0.26.3/src/videopython/ai/generation/audio.py +0 -215
- {videopython-0.26.3 → videopython-0.26.5}/.gitignore +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/LICENSE +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/README.md +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/description.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/video.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.3 → videopython-0.26.5}/src/videopython/py.typed +0 -0
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/dubber.py

@@ -8,6 +8,7 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable

from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
+from videopython.ai.dubbing.pipeline import WhisperModel

if TYPE_CHECKING:
    from videopython.base.video import Video
@@ -25,19 +26,38 @@ class VideoDubber:
            model is resident at a time. Trades per-run latency (~10-30s of
            extra model loads) for a much lower memory ceiling. Recommended for
            GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+        whisper_model: Whisper model size used for transcription. Larger models
+            give better accuracy at the cost of VRAM and latency. One of
+            ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+            Default ``small``.
    """

-    def __init__(
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
        self.device = device
        self.low_memory = low_memory
+        self.whisper_model = whisper_model
        self._local_pipeline: Any = None
        requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info(
+        logger.info(
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            requested,
+            low_memory,
+            whisper_model,
+        )

    def _init_local_pipeline(self) -> None:
        from videopython.ai.dubbing.pipeline import LocalDubbingPipeline

-        self._local_pipeline = LocalDubbingPipeline(
+        self._local_pipeline = LocalDubbingPipeline(
+            device=self.device,
+            low_memory=self.low_memory,
+            whisper_model=self.whisper_model,
+        )

    def dub(
        self,
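The only public-facing change in dubber.py is the new ``whisper_model`` constructor argument. A minimal usage sketch of the updated constructor, using only the arguments shown in this diff (the ``dub()`` call itself is unchanged here and omitted):

```python
from videopython.ai.dubbing.dubber import VideoDubber

# Pick a Whisper checkpoint to trade accuracy for VRAM and latency.
# Valid values per the new WhisperModel Literal:
# "tiny", "base", "small", "medium", "large", "turbo" (default "small").
dubber = VideoDubber(device="cuda", low_memory=True, whisper_model="base")
```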
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/pipeline.py

@@ -3,7 +3,7 @@
from __future__ import annotations

import logging
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Literal

from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -11,6 +11,8 @@ from videopython.ai.dubbing.timing import TimingSynchronizer
if TYPE_CHECKING:
    from videopython.base.audio import Audio

+WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+
logger = logging.getLogger(__name__)


@@ -23,21 +25,27 @@ class LocalDubbingPipeline:
    with <=12GB VRAM or hosts with <32GB RAM.
    """

-    def __init__(
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
        self.device = device
        self.low_memory = low_memory
+        self.whisper_model = whisper_model
        requested = device.lower() if isinstance(device, str) else "auto"
        logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
            requested,
            low_memory,
+            whisper_model,
        )

        self._transcriber: Any = None
        self._transcriber_diarization: bool | None = None
        self._translator: Any = None
        self._tts: Any = None
-        self._tts_voice_clone: bool | None = None
        self._tts_language: str | None = None
        self._separator: Any = None
        self._synchronizer: TimingSynchronizer | None = None
@@ -63,7 +71,11 @@ class LocalDubbingPipeline:
        """Initialize the transcription model."""
        from videopython.ai.understanding.audio import AudioToText

-        self._transcriber = AudioToText(
+        self._transcriber = AudioToText(
+            model_name=self.whisper_model,
+            device=self.device,
+            enable_diarization=enable_diarization,
+        )

    def _init_translator(self) -> None:
        """Initialize the translation model."""
@@ -71,18 +83,11 @@ class LocalDubbingPipeline:

        self._translator = TextTranslator(device=self.device)

-    def _init_tts(self,
+    def _init_tts(self, language: str = "en") -> None:
        """Initialize the text-to-speech model."""
        from videopython.ai.generation.audio import TextToSpeech

-
-            self._tts = TextToSpeech(
-                model_size="chatterbox",
-                device=self.device,
-                language=language,
-            )
-        else:
-            self._tts = TextToSpeech(device=self.device, language=language)
+        self._tts = TextToSpeech(device=self.device, language=language)

    def _init_separator(self) -> None:
        """Initialize the audio separator."""
@@ -102,6 +107,7 @@ class LocalDubbingPipeline:
        max_duration: float = 10.0,
    ) -> dict[str, Any]:
        """Extract voice samples for each speaker from the audio."""
+        from videopython.base.audio import Audio

        voice_samples: dict[str, Audio] = {}

@@ -128,7 +134,11 @@ class LocalDubbingPipeline:
        if best_segment is not None:
            start = best_segment.start
            end = min(best_segment.end, start + max_duration)
-
+            sliced = audio.slice(start, end)
+            # Audio.slice returns a numpy view into the source. Copy so the
+            # short voice sample doesn't keep the full vocals array (~1.3 GB
+            # for 2h sources) alive across translate + TTS.
+            voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)

        return voice_samples

@@ -183,6 +193,7 @@ class LocalDubbingPipeline:

        separated_audio: SeparatedAudio | None = None
        vocal_audio = source_audio
+        background_audio: Audio | None = None

        if preserve_background:
            report_progress("Separating audio", 0.15)
@@ -192,12 +203,24 @@ class LocalDubbingPipeline:
            separated_audio = self._separator.separate(source_audio)
            self._maybe_unload("_separator")
            vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            # In low_memory mode, drop the SeparatedAudio container so vocals
+            # and background can be released as soon as their last local
+            # reference goes (after voice-sample extraction and final overlay
+            # respectively). The result will report separated_audio=None.
+            if self.low_memory:
+                separated_audio = None

        voice_samples: dict[str, Audio] = {}
        if voice_clone:
            report_progress("Extracting voice samples", 0.25)
            voice_samples = self._extract_voice_samples(vocal_audio, transcription)

+        # vocals is no longer needed; voice_samples are independent copies.
+        # In low_memory mode this is the only ref keeping the buffer alive
+        # (separated_audio was dropped above), so dropping the local frees it.
+        del vocal_audio
+
        report_progress("Translating text", 0.35)
        if self._translator is None:
            self._init_translator()
@@ -210,9 +233,8 @@ class LocalDubbingPipeline:
        self._maybe_unload("_translator")

        report_progress("Generating dubbed speech", 0.50)
-        if self._tts is None or self.
-            self._init_tts(
-            self._tts_voice_clone = voice_clone
+        if self._tts is None or self._tts_language != target_lang:
+            self._init_tts(language=target_lang)
            self._tts_language = target_lang

        dubbed_segments: list[Audio] = []
@@ -246,17 +268,23 @@ class LocalDubbingPipeline:
            assert self._synchronizer is not None

        synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        del dubbed_segments

        report_progress("Assembling final audio", 0.90)
        total_duration = source_audio.metadata.duration_seconds
        dubbed_speech = self._synchronizer.assemble_with_timing(synchronized_segments, start_times, total_duration)
+        del synchronized_segments

-        if
-            background_sr =
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
            if dubbed_speech.metadata.sample_rate != background_sr:
                dubbed_speech = dubbed_speech.resample(background_sr)

-            final_audio =
+            final_audio = background_audio.overlay(dubbed_speech, position=0.0)
+            # Drop the local; in low_memory this releases the background
+            # buffer (~1.3 GB for 2h sources). In non-low_memory the same
+            # array is still held by separated_audio.background.
+            del background_audio
        else:
            final_audio = dubbed_speech

@@ -303,6 +331,7 @@ class LocalDubbingPipeline:

        separated_audio: SeparatedAudio | None = None
        vocal_audio = source_audio
+        background_audio: Audio | None = None

        if preserve_background:
            report_progress("Separating audio", 0.20)
@@ -312,6 +341,9 @@ class LocalDubbingPipeline:
            separated_audio = self._separator.separate(source_audio)
            self._maybe_unload("_separator")
            vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            if self.low_memory:
+                separated_audio = None

        report_progress("Extracting voice sample", 0.40)
        voice_sample: Audio | None = None
@@ -323,12 +355,15 @@ class LocalDubbingPipeline:

        if voice_sample is None:
            sample_duration = min(6.0, original_duration)
-
+            sliced = vocal_audio.slice(0, sample_duration)
+            # Copy so the short sample doesn't pin the full vocals array.
+            voice_sample = Audio(sliced.data.copy(), sliced.metadata)
+
+        del vocal_audio

        report_progress("Generating speech", 0.60)
-        if self._tts is None or self.
-            self._init_tts(
-            self._tts_voice_clone = True
+        if self._tts is None or self._tts_language != "en":
+            self._init_tts(language="en")
            self._tts_language = "en"

        generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
@@ -337,24 +372,24 @@ class LocalDubbingPipeline:

        report_progress("Assembling audio", 0.85)

-        if
-            background_sr =
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
            if generated_speech.metadata.sample_rate != background_sr:
                generated_speech = generated_speech.resample(background_sr)

-
-
-
-
-            silence_duration = speech_duration - background.metadata.duration_seconds
+            if background_audio.metadata.duration_seconds > speech_duration:
+                background_audio = background_audio.slice(0, speech_duration)
+            elif background_audio.metadata.duration_seconds < speech_duration:
+                silence_duration = speech_duration - background_audio.metadata.duration_seconds
                silence = Audio.silence(
                    duration=silence_duration,
                    sample_rate=background_sr,
-                    channels=
+                    channels=background_audio.metadata.channels,
                )
-
+                background_audio = background_audio.concat(silence)

-            final_audio =
+            final_audio = background_audio.overlay(generated_speech, position=0.0)
+            del background_audio
        else:
            final_audio = generated_speech

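The new comments in ``_extract_voice_samples`` and the revoice path hinge on a NumPy detail: slicing returns a view that keeps the entire parent buffer alive, so only an explicit ``.copy()`` lets the large vocals array actually be freed. A standalone NumPy sketch (not videopython code) of that behavior, sized to roughly match the ~1.3 GB / 2-hour figure quoted in the comments:

```python
import numpy as np

# ~1.27 GB float32 buffer, comparable to 2h of mono vocals at 44.1 kHz.
vocals = np.zeros(2 * 3600 * 44_100, dtype=np.float32)

view = vocals[: 6 * 44_100]           # a slice is a view; it pins the whole 1.27 GB base
sample = vocals[: 6 * 44_100].copy()  # an independent ~1 MB buffer

print(view.base is vocals)   # True  -> vocals cannot be freed while `view` is alive
print(sample.base is None)   # True  -> only the small copy stays alive
del vocals, view             # now the large buffer can actually be released
```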
videopython-0.26.5/src/videopython/ai/generation/audio.py

@@ -0,0 +1,156 @@
+"""Audio generation using local models."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
+from videopython.base.audio import Audio, AudioMetadata
+
+
+class TextToSpeech:
+    """Generates speech audio from text using Chatterbox Multilingual.
+
+    Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
+    provided to ``generate_audio``, the model clones that voice; otherwise it
+    falls back to Chatterbox's built-in default speaker.
+    """
+
+    SAMPLE_RATE: int = 24000
+
+    def __init__(
+        self,
+        voice: Audio | None = None,
+        device: str | None = None,
+        language: str = "en",
+    ):
+        self.voice = voice
+        self.device = device
+        self.language = language
+        self._model: Any = None
+
+    def _init_model(self) -> None:
+        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
+
+        requested_device = self.device
+        device = select_device(self.device, mps_allowed=False)
+
+        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+        self.device = device
+        log_device_initialization(
+            "TextToSpeech",
+            requested_device=requested_device,
+            resolved_device=device,
+        )
+
+    def generate_audio(
+        self,
+        text: str,
+        voice_sample: Audio | None = None,
+    ) -> Audio:
+        """Generate speech audio from text.
+
+        Args:
+            text: Text to synthesize.
+            voice_sample: Optional voice sample to clone. Falls back to the
+                instance's ``voice`` and then to Chatterbox's default speaker.
+        """
+        import tempfile
+        from pathlib import Path
+
+        import numpy as np
+
+        if self._model is None:
+            self._init_model()
+
+        effective_sample = voice_sample or self.voice
+        speaker_wav_path: Path | None = None
+
+        if effective_sample is not None:
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                effective_sample.save(f.name)
+                speaker_wav_path = Path(f.name)
+
+        try:
+            wav = self._model.generate(
+                text=text,
+                language_id=self.language,
+                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
+            )
+
+            audio_data = wav.cpu().float().numpy().squeeze()
+            if audio_data.ndim == 0:
+                audio_data = np.array([audio_data], dtype=np.float32)
+
+            metadata = AudioMetadata(
+                sample_rate=self.SAMPLE_RATE,
+                channels=1,
+                sample_width=2,
+                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
+                frame_count=len(audio_data),
+            )
+            return Audio(audio_data, metadata)
+        finally:
+            if speaker_wav_path is not None:
+                speaker_wav_path.unlink()
+
+    def unload(self) -> None:
+        """Release the TTS model so the next generate_audio() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        release_device_memory(self.device)
+
+
+class TextToMusic:
+    """Generates music from text descriptions using MusicGen."""
+
+    def __init__(self, device: str | None = None):
+        self.device = device
+        self._processor: Any = None
+        self._model: Any = None
+        self._device: str | None = None
+
+    def _init_local(self) -> None:
+        """Initialize local MusicGen model."""
+        import os
+
+        from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+        requested_device = self.device
+        self._device = select_device(self.device, mps_allowed=True)
+
+        model_name = "facebook/musicgen-small"
+        self._processor = AutoProcessor.from_pretrained(model_name)
+        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
+        self._model.to(self._device)
+        self.device = self._device
+        log_device_initialization(
+            "TextToMusic",
+            requested_device=requested_device,
+            resolved_device=self._device,
+        )
+
+    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
+        """Generate music audio from text description."""
+        if self._model is None:
+            self._init_local()
+
+        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
+        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
+        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
+        sampling_rate = self._model.config.audio_encoder.sampling_rate
+
+        audio_data = audio_values[0, 0].cpu().float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=sampling_rate,
+            channels=1,
+            sample_width=2,
+            duration_seconds=len(audio_data) / sampling_rate,
+            frame_count=len(audio_data),
+        )
+        return Audio(audio_data, metadata)
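A minimal usage sketch of the rewritten, Chatterbox-only ``TextToSpeech``; it assumes the optional ``chatterbox`` dependency is installed, and ``some_audio`` is a hypothetical ``Audio`` instance used only to illustrate voice cloning:

```python
from videopython.ai.generation.audio import TextToSpeech

# The Chatterbox model is loaded lazily on the first generate_audio() call.
tts = TextToSpeech(language="en")
speech = tts.generate_audio("Hello from the new Chatterbox-only TextToSpeech.")
print(speech.metadata.sample_rate)  # 24000 (TextToSpeech.SAMPLE_RATE)

# Optional voice cloning: pass any Audio object as the reference sample.
# cloned = tts.generate_audio("Same text, cloned voice.", voice_sample=some_audio)

tts.unload()  # free VRAM, as the low-memory dubbing pipeline does between stages
```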
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/separation.py

@@ -42,7 +42,15 @@ class AudioSeparator:
        )

    def _separate_local(self, audio: Audio) -> SeparatedAudio:
-        """Separate audio using local Demucs model.
+        """Separate audio using local Demucs model.
+
+        Keeps the input tensor on CPU and passes ``device=self.device`` to
+        ``apply_model`` so per-chunk compute runs on GPU while the full
+        ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+        sources this is the difference between OOM-on-GPU and running cleanly:
+        a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+        comfortable on a 32 GB host.
+        """
        import numpy as np
        import torch
        from demucs.apply import apply_model
@@ -65,61 +73,40 @@ class AudioSeparator:
            audio_data = audio_data.T

        wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-        wav = wav.to(self.device)

        with torch.no_grad():
            sources = apply_model(self._model, wav, device=self.device)

        sources_np = sources[0].cpu().numpy()
+        del sources

        stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+        vocals_idx = stem_names.index("vocals")
+        non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]

-
-
-
-
-            metadata = AudioMetadata(
-                sample_rate=target_sr,
-                channels=2,
-                sample_width=2,
-                duration_seconds=stem_data.shape[0] / target_sr,
-                frame_count=stem_data.shape[0],
-            )
-            stems[name] = Audio(stem_data.astype(np.float32), metadata)
-
-        vocals = stems["vocals"]
-
-        non_vocal_stems = [stems[name] for name in stem_names if name != "vocals"]
-        background_data = np.zeros_like(vocals.data)
-        for stem in non_vocal_stems:
-            background_data += stem.data
+        vocals_data = sources_np[vocals_idx].T
+        background_data = sources_np[non_vocal_indices].sum(axis=0).T
+        del sources_np

        max_val = np.max(np.abs(background_data))
        if max_val > 1.0:
-            background_data
-
-
-
-
-
-
-
-
-
-
-            music_data += stems[name].data
-
-        max_val = np.max(np.abs(music_data))
-        if max_val > 1.0:
-            music_data = music_data / max_val
-
-        music = Audio(music_data.astype(np.float32), vocals.metadata)
+            background_data /= max_val
+
+        metadata = AudioMetadata(
+            sample_rate=target_sr,
+            channels=2,
+            sample_width=2,
+            duration_seconds=vocals_data.shape[0] / target_sr,
+            frame_count=vocals_data.shape[0],
+        )
+        vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+        background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)

        return SeparatedAudio(
            vocals=vocals,
            background=background,
            original=audio,
-            music=
+            music=None,
            effects=None,
        )

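The ~10 GB figure in the new ``_separate_local`` docstring follows directly from the shape of the Demucs output kept in CPU RAM. A quick back-of-the-envelope check, assuming the default 4-stem htdemucs model and float32 samples:

```python
# Full Demucs output (stems, channels, samples) for a 2-hour stereo source
# at 44.1 kHz, float32, 4 stems.
stems, channels = 4, 2
samples = 2 * 3600 * 44_100
bytes_total = stems * channels * samples * 4      # float32 = 4 bytes
print(f"{bytes_total / 1e9:.1f} GB")              # ~10.2 GB in CPU RAM

# Per-stem slice (what vocals_data / background_data hold after indexing):
print(f"{channels * samples * 4 / 1e9:.1f} GB")   # ~2.5 GB each
```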
videopython-0.26.3/src/videopython/ai/generation/audio.py

@@ -1,215 +0,0 @@
-"""Audio generation using local models."""
-
-from __future__ import annotations
-
-from typing import Any
-
-from videopython.ai._device import log_device_initialization, release_device_memory, select_device
-from videopython.base.audio import Audio, AudioMetadata
-
-
-class TextToSpeech:
-    """Generates speech audio from text using local models.
-
-    Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
-    (`chatterbox`) for multilingual voice cloning.
-    """
-
-    SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
-
-    CHATTERBOX_SAMPLE_RATE: int = 24000
-
-    def __init__(
-        self,
-        model_size: str = "base",
-        voice: str | None = None,
-        device: str | None = None,
-        language: str = "en",
-    ):
-        if model_size not in self.SUPPORTED_LOCAL_MODELS:
-            raise ValueError(f"model_size must be one of {self.SUPPORTED_LOCAL_MODELS}, got '{model_size}'")
-
-        self.model_size = model_size
-        self.voice = voice
-        self.device = device
-        self.language = language
-        self._model: Any = None
-        self._processor: Any = None
-        self._chatterbox_model: Any = None
-
-    def _init_local(self) -> None:
-        """Initialize local Bark model."""
-        from transformers import AutoModel, AutoProcessor
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        model_name = "suno/bark" if self.model_size == "base" else "suno/bark-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = AutoModel.from_pretrained(model_name).to(device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _init_chatterbox(self) -> None:
-        """Initialize Chatterbox Multilingual model for voice cloning."""
-        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _generate_local(self, text: str, voice_preset: str | None) -> Audio:
-        """Generate speech using Bark."""
-        import torch
-
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
-        inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-
-        with torch.no_grad():
-            speech_values = self._model.generate(**inputs, do_sample=True)
-
-        audio_data = speech_values.cpu().float().numpy().squeeze()
-        sample_rate = self._model.generation_config.sample_rate
-
-        metadata = AudioMetadata(
-            sample_rate=sample_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sample_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)
-
-    def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
-        """Generate speech using Chatterbox Multilingual with voice cloning."""
-        import tempfile
-        from pathlib import Path
-
-        import numpy as np
-
-        if self._chatterbox_model is None:
-            self._init_chatterbox()
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            voice_sample.save(f.name)
-            speaker_wav_path = Path(f.name)
-
-        try:
-            wav = self._chatterbox_model.generate(
-                text=text,
-                language_id=self.language,
-                audio_prompt_path=str(speaker_wav_path),
-            )
-
-            audio_data = wav.cpu().float().numpy().squeeze()
-            if audio_data.ndim == 0:
-                audio_data = np.array([audio_data], dtype=np.float32)
-
-            sample_rate = self.CHATTERBOX_SAMPLE_RATE
-
-            metadata = AudioMetadata(
-                sample_rate=sample_rate,
-                channels=1,
-                sample_width=2,
-                duration_seconds=len(audio_data) / sample_rate,
-                frame_count=len(audio_data),
-            )
-            return Audio(audio_data, metadata)
-        finally:
-            speaker_wav_path.unlink()
-
-    def generate_audio(
-        self,
-        text: str,
-        voice_preset: str | None = None,
-        voice_sample: Audio | None = None,
-    ) -> Audio:
-        """Generate speech audio from text."""
-        effective_voice = voice_preset or self.voice
-
-        if self.model_size == "chatterbox" or voice_sample is not None:
-            if voice_sample is None:
-                raise ValueError(
-                    "voice_sample is required for Chatterbox voice cloning. "
-                    "Provide an Audio sample of the voice to clone."
-                )
-            return self._generate_chatterbox(text, voice_sample)
-
-        return self._generate_local(text, effective_voice)
-
-    def unload(self) -> None:
-        """Release the TTS model(s) so the next generate_audio() re-initializes.
-
-        Used by low-memory dubbing to free VRAM between pipeline stages.
-        """
-        self._model = None
-        self._processor = None
-        self._chatterbox_model = None
-        release_device_memory(self.device)
-
-
-class TextToMusic:
-    """Generates music from text descriptions using MusicGen."""
-
-    def __init__(self, device: str | None = None):
-        self.device = device
-        self._processor: Any = None
-        self._model: Any = None
-        self._device: str | None = None
-
-    def _init_local(self) -> None:
-        """Initialize local MusicGen model."""
-        import os
-
-        from transformers import AutoProcessor, MusicgenForConditionalGeneration
-
-        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
-
-        requested_device = self.device
-        self._device = select_device(self.device, mps_allowed=True)
-
-        model_name = "facebook/musicgen-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-        self._model.to(self._device)
-        self.device = self._device
-        log_device_initialization(
-            "TextToMusic",
-            requested_device=requested_device,
-            resolved_device=self._device,
-        )
-
-    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
-        """Generate music audio from text description."""
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
-        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
-        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
-        sampling_rate = self._model.config.audio_encoder.sampling_rate
-
-        audio_data = audio_values[0, 0].cpu().float().numpy()
-
-        metadata = AudioMetadata(
-            sample_rate=sampling_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sampling_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)