videopython 0.26.2__tar.gz → 0.26.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.2 → videopython-0.26.4}/PKG-INFO +1 -1
- {videopython-0.26.2 → videopython-0.26.4}/pyproject.toml +1 -1
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/dubber.py +82 -2
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/pipeline.py +19 -26
- videopython-0.26.4/src/videopython/ai/dubbing/remux.py +73 -0
- videopython-0.26.4/src/videopython/ai/generation/audio.py +156 -0
- videopython-0.26.2/src/videopython/ai/generation/audio.py +0 -215
- {videopython-0.26.2 → videopython-0.26.4}/.gitignore +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/LICENSE +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/README.md +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/description.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/video.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.2 → videopython-0.26.4}/src/videopython/py.typed +0 -0
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
6
8
|
from typing import TYPE_CHECKING, Any, Callable
|
|
7
9
|
|
|
8
10
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
|
|
@@ -60,7 +62,7 @@ class VideoDubber:
|
|
|
60
62
|
self._init_local_pipeline()
|
|
61
63
|
|
|
62
64
|
return self._local_pipeline.process(
|
|
63
|
-
|
|
65
|
+
source_audio=video.audio,
|
|
64
66
|
target_lang=target_lang,
|
|
65
67
|
source_lang=source_lang,
|
|
66
68
|
preserve_background=preserve_background,
|
|
@@ -99,6 +101,84 @@ class VideoDubber:
|
|
|
99
101
|
)
|
|
100
102
|
return video.add_audio(result.dubbed_audio, overlay=False)
|
|
101
103
|
|
|
104
|
+
def dub_file(
|
|
105
|
+
self,
|
|
106
|
+
input_path: str | Path,
|
|
107
|
+
output_path: str | Path,
|
|
108
|
+
target_lang: str,
|
|
109
|
+
source_lang: str | None = None,
|
|
110
|
+
preserve_background: bool = True,
|
|
111
|
+
voice_clone: bool = True,
|
|
112
|
+
enable_diarization: bool = False,
|
|
113
|
+
progress_callback: Callable[[str, float], None] | None = None,
|
|
114
|
+
transcription: Any = None,
|
|
115
|
+
) -> DubbingResult:
|
|
116
|
+
"""Dub a video file in place on disk without loading video frames into memory.
|
|
117
|
+
|
|
118
|
+
Extracts the audio track via ffmpeg, runs the dubbing pipeline on the
|
|
119
|
+
audio only, then muxes the dubbed audio back into the source video
|
|
120
|
+
using ffmpeg stream-copy (no video re-encode). Peak memory is bounded
|
|
121
|
+
by model weights and the audio track — independent of video length and
|
|
122
|
+
resolution.
|
|
123
|
+
|
|
124
|
+
Use this instead of ``dub_and_replace`` when the source video is long
|
|
125
|
+
or high-resolution and you don't need frame-level access in Python.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
input_path: Path to the source video file.
|
|
129
|
+
output_path: Path to write the dubbed video. Overwritten if it exists.
|
|
130
|
+
target_lang: Target language code (e.g. ``"es"``, ``"fr"``).
|
|
131
|
+
source_lang: Source language code, or ``None`` to auto-detect.
|
|
132
|
+
preserve_background: Preserve background music/effects via source separation.
|
|
133
|
+
voice_clone: Clone the source speaker's voice for the dubbed track.
|
|
134
|
+
enable_diarization: Enable speaker diarization for per-speaker voice cloning.
|
|
135
|
+
progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
|
|
136
|
+
transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
``DubbingResult`` with the dubbed audio, translated segments, and
|
|
140
|
+
source transcription. The output video is written to ``output_path``.
|
|
141
|
+
"""
|
|
142
|
+
from videopython.ai.dubbing.remux import replace_audio_stream
|
|
143
|
+
from videopython.base.audio import Audio
|
|
144
|
+
|
|
145
|
+
input_path = Path(input_path)
|
|
146
|
+
output_path = Path(output_path)
|
|
147
|
+
|
|
148
|
+
if not input_path.exists():
|
|
149
|
+
raise FileNotFoundError(f"Input video not found: {input_path}")
|
|
150
|
+
|
|
151
|
+
logger.info("dub_file: loading audio from %s", input_path)
|
|
152
|
+
source_audio = Audio.from_path(input_path)
|
|
153
|
+
|
|
154
|
+
if self._local_pipeline is None:
|
|
155
|
+
self._init_local_pipeline()
|
|
156
|
+
|
|
157
|
+
result = self._local_pipeline.process(
|
|
158
|
+
source_audio=source_audio,
|
|
159
|
+
target_lang=target_lang,
|
|
160
|
+
source_lang=source_lang,
|
|
161
|
+
preserve_background=preserve_background,
|
|
162
|
+
voice_clone=voice_clone,
|
|
163
|
+
enable_diarization=enable_diarization,
|
|
164
|
+
progress_callback=progress_callback,
|
|
165
|
+
transcription=transcription,
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
|
169
|
+
dubbed_audio_path = Path(tmp.name)
|
|
170
|
+
try:
|
|
171
|
+
result.dubbed_audio.save(dubbed_audio_path)
|
|
172
|
+
replace_audio_stream(
|
|
173
|
+
video_path=input_path,
|
|
174
|
+
audio_path=dubbed_audio_path,
|
|
175
|
+
output_path=output_path,
|
|
176
|
+
)
|
|
177
|
+
finally:
|
|
178
|
+
dubbed_audio_path.unlink(missing_ok=True)
|
|
179
|
+
|
|
180
|
+
return result
|
|
181
|
+
|
|
102
182
|
def revoice(
|
|
103
183
|
self,
|
|
104
184
|
video: Video,
|
|
@@ -111,7 +191,7 @@ class VideoDubber:
|
|
|
111
191
|
self._init_local_pipeline()
|
|
112
192
|
|
|
113
193
|
return self._local_pipeline.revoice(
|
|
114
|
-
|
|
194
|
+
source_audio=video.audio,
|
|
115
195
|
text=text,
|
|
116
196
|
preserve_background=preserve_background,
|
|
117
197
|
progress_callback=progress_callback,
|
|
@@ -9,7 +9,7 @@ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, Separate
|
|
|
9
9
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
|
-
from videopython.base.
|
|
12
|
+
from videopython.base.audio import Audio
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
15
15
|
|
|
@@ -37,7 +37,6 @@ class LocalDubbingPipeline:
|
|
|
37
37
|
self._transcriber_diarization: bool | None = None
|
|
38
38
|
self._translator: Any = None
|
|
39
39
|
self._tts: Any = None
|
|
40
|
-
self._tts_voice_clone: bool | None = None
|
|
41
40
|
self._tts_language: str | None = None
|
|
42
41
|
self._separator: Any = None
|
|
43
42
|
self._synchronizer: TimingSynchronizer | None = None
|
|
@@ -71,18 +70,11 @@ class LocalDubbingPipeline:
|
|
|
71
70
|
|
|
72
71
|
self._translator = TextTranslator(device=self.device)
|
|
73
72
|
|
|
74
|
-
def _init_tts(self,
|
|
73
|
+
def _init_tts(self, language: str = "en") -> None:
|
|
75
74
|
"""Initialize the text-to-speech model."""
|
|
76
75
|
from videopython.ai.generation.audio import TextToSpeech
|
|
77
76
|
|
|
78
|
-
|
|
79
|
-
self._tts = TextToSpeech(
|
|
80
|
-
model_size="chatterbox",
|
|
81
|
-
device=self.device,
|
|
82
|
-
language=language,
|
|
83
|
-
)
|
|
84
|
-
else:
|
|
85
|
-
self._tts = TextToSpeech(device=self.device, language=language)
|
|
77
|
+
self._tts = TextToSpeech(device=self.device, language=language)
|
|
86
78
|
|
|
87
79
|
def _init_separator(self) -> None:
|
|
88
80
|
"""Initialize the audio separator."""
|
|
@@ -102,7 +94,6 @@ class LocalDubbingPipeline:
|
|
|
102
94
|
max_duration: float = 10.0,
|
|
103
95
|
) -> dict[str, Any]:
|
|
104
96
|
"""Extract voice samples for each speaker from the audio."""
|
|
105
|
-
from videopython.base.audio import Audio
|
|
106
97
|
|
|
107
98
|
voice_samples: dict[str, Audio] = {}
|
|
108
99
|
|
|
@@ -135,7 +126,7 @@ class LocalDubbingPipeline:
|
|
|
135
126
|
|
|
136
127
|
def process(
|
|
137
128
|
self,
|
|
138
|
-
|
|
129
|
+
source_audio: Audio,
|
|
139
130
|
target_lang: str,
|
|
140
131
|
source_lang: str | None = None,
|
|
141
132
|
preserve_background: bool = True,
|
|
@@ -144,22 +135,22 @@ class LocalDubbingPipeline:
|
|
|
144
135
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
145
136
|
transcription: Any | None = None,
|
|
146
137
|
) -> DubbingResult:
|
|
147
|
-
"""
|
|
138
|
+
"""Run the dubbing pipeline against the given source audio.
|
|
148
139
|
|
|
149
140
|
Args:
|
|
141
|
+
source_audio: Source audio track to dub. Callers with a ``Video``
|
|
142
|
+
object should pass ``video.audio``; callers with only a file path
|
|
143
|
+
can use ``Audio.from_path(path)`` to avoid loading video frames.
|
|
150
144
|
transcription: Optional pre-computed Transcription object. When provided,
|
|
151
145
|
the internal Whisper transcription step is skipped (saving time and VRAM).
|
|
152
146
|
Must be a ``videopython.base.text.transcription.Transcription`` instance
|
|
153
147
|
with populated ``segments``.
|
|
154
148
|
"""
|
|
155
|
-
from videopython.base.audio import Audio
|
|
156
149
|
|
|
157
150
|
def report_progress(stage: str, progress: float) -> None:
|
|
158
151
|
if progress_callback:
|
|
159
152
|
progress_callback(stage, progress)
|
|
160
153
|
|
|
161
|
-
source_audio = video.audio
|
|
162
|
-
|
|
163
154
|
if transcription is not None:
|
|
164
155
|
report_progress("Using provided transcription", 0.05)
|
|
165
156
|
else:
|
|
@@ -211,9 +202,8 @@ class LocalDubbingPipeline:
|
|
|
211
202
|
self._maybe_unload("_translator")
|
|
212
203
|
|
|
213
204
|
report_progress("Generating dubbed speech", 0.50)
|
|
214
|
-
if self._tts is None or self.
|
|
215
|
-
self._init_tts(
|
|
216
|
-
self._tts_voice_clone = voice_clone
|
|
205
|
+
if self._tts is None or self._tts_language != target_lang:
|
|
206
|
+
self._init_tts(language=target_lang)
|
|
217
207
|
self._tts_language = target_lang
|
|
218
208
|
|
|
219
209
|
dubbed_segments: list[Audio] = []
|
|
@@ -275,19 +265,23 @@ class LocalDubbingPipeline:
|
|
|
275
265
|
|
|
276
266
|
def revoice(
|
|
277
267
|
self,
|
|
278
|
-
|
|
268
|
+
source_audio: Audio,
|
|
279
269
|
text: str,
|
|
280
270
|
preserve_background: bool = True,
|
|
281
271
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
282
272
|
) -> RevoiceResult:
|
|
283
|
-
"""Replace speech in
|
|
273
|
+
"""Replace speech in audio with new text using voice cloning.
|
|
274
|
+
|
|
275
|
+
Args:
|
|
276
|
+
source_audio: Source audio track to revoice. Callers with a ``Video``
|
|
277
|
+
object should pass ``video.audio``.
|
|
278
|
+
"""
|
|
284
279
|
from videopython.base.audio import Audio
|
|
285
280
|
|
|
286
281
|
def report_progress(stage: str, progress: float) -> None:
|
|
287
282
|
if progress_callback:
|
|
288
283
|
progress_callback(stage, progress)
|
|
289
284
|
|
|
290
|
-
source_audio = video.audio
|
|
291
285
|
original_duration = source_audio.metadata.duration_seconds
|
|
292
286
|
|
|
293
287
|
report_progress("Analyzing audio", 0.05)
|
|
@@ -323,9 +317,8 @@ class LocalDubbingPipeline:
|
|
|
323
317
|
voice_sample = vocal_audio.slice(0, sample_duration)
|
|
324
318
|
|
|
325
319
|
report_progress("Generating speech", 0.60)
|
|
326
|
-
if self._tts is None or self.
|
|
327
|
-
self._init_tts(
|
|
328
|
-
self._tts_voice_clone = True
|
|
320
|
+
if self._tts is None or self._tts_language != "en":
|
|
321
|
+
self._init_tts(language="en")
|
|
329
322
|
self._tts_language = "en"
|
|
330
323
|
|
|
331
324
|
generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""ffmpeg helper for replacing a video file's audio track without re-encoding video."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RemuxError(RuntimeError):
|
|
13
|
+
"""ffmpeg failed while replacing an audio stream."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def replace_audio_stream(
|
|
17
|
+
video_path: str | Path,
|
|
18
|
+
audio_path: str | Path,
|
|
19
|
+
output_path: str | Path,
|
|
20
|
+
audio_codec: str = "aac",
|
|
21
|
+
audio_bitrate: str = "192k",
|
|
22
|
+
) -> None:
|
|
23
|
+
"""Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
|
|
24
|
+
|
|
25
|
+
Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
|
|
26
|
+
``-shortest`` trims to the shorter of the two streams so the output duration
|
|
27
|
+
matches the source video when the dubbed audio is slightly longer.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
video_path: Source video file (video stream is copied unchanged).
|
|
31
|
+
audio_path: Audio file to use as the new audio track.
|
|
32
|
+
output_path: Destination file. Overwritten if it exists.
|
|
33
|
+
audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
|
|
34
|
+
audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
|
|
35
|
+
|
|
36
|
+
Raises:
|
|
37
|
+
FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
|
|
38
|
+
RemuxError: If ffmpeg returns a non-zero exit code.
|
|
39
|
+
"""
|
|
40
|
+
video_path = Path(video_path)
|
|
41
|
+
audio_path = Path(audio_path)
|
|
42
|
+
output_path = Path(output_path)
|
|
43
|
+
|
|
44
|
+
if not video_path.exists():
|
|
45
|
+
raise FileNotFoundError(f"Video file not found: {video_path}")
|
|
46
|
+
if not audio_path.exists():
|
|
47
|
+
raise FileNotFoundError(f"Audio file not found: {audio_path}")
|
|
48
|
+
|
|
49
|
+
cmd = [
|
|
50
|
+
"ffmpeg",
|
|
51
|
+
"-y",
|
|
52
|
+
"-i",
|
|
53
|
+
str(video_path),
|
|
54
|
+
"-i",
|
|
55
|
+
str(audio_path),
|
|
56
|
+
"-map",
|
|
57
|
+
"0:v:0",
|
|
58
|
+
"-map",
|
|
59
|
+
"1:a:0",
|
|
60
|
+
"-c:v",
|
|
61
|
+
"copy",
|
|
62
|
+
"-c:a",
|
|
63
|
+
audio_codec,
|
|
64
|
+
"-b:a",
|
|
65
|
+
audio_bitrate,
|
|
66
|
+
"-shortest",
|
|
67
|
+
str(output_path),
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
|
|
71
|
+
result = subprocess.run(cmd, capture_output=True)
|
|
72
|
+
if result.returncode != 0:
|
|
73
|
+
raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Audio generation using local models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
|
+
from videopython.base.audio import Audio, AudioMetadata
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TextToSpeech:
|
|
12
|
+
"""Generates speech audio from text using Chatterbox Multilingual.
|
|
13
|
+
|
|
14
|
+
Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
|
|
15
|
+
provided to ``generate_audio``, the model clones that voice; otherwise it
|
|
16
|
+
falls back to Chatterbox's built-in default speaker.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
SAMPLE_RATE: int = 24000
|
|
20
|
+
|
|
21
|
+
def __init__(
|
|
22
|
+
self,
|
|
23
|
+
voice: Audio | None = None,
|
|
24
|
+
device: str | None = None,
|
|
25
|
+
language: str = "en",
|
|
26
|
+
):
|
|
27
|
+
self.voice = voice
|
|
28
|
+
self.device = device
|
|
29
|
+
self.language = language
|
|
30
|
+
self._model: Any = None
|
|
31
|
+
|
|
32
|
+
def _init_model(self) -> None:
|
|
33
|
+
from chatterbox.mtl_tts import ChatterboxMultilingualTTS # type: ignore[import-untyped]
|
|
34
|
+
|
|
35
|
+
requested_device = self.device
|
|
36
|
+
device = select_device(self.device, mps_allowed=False)
|
|
37
|
+
|
|
38
|
+
self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
|
|
39
|
+
self.device = device
|
|
40
|
+
log_device_initialization(
|
|
41
|
+
"TextToSpeech",
|
|
42
|
+
requested_device=requested_device,
|
|
43
|
+
resolved_device=device,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
def generate_audio(
|
|
47
|
+
self,
|
|
48
|
+
text: str,
|
|
49
|
+
voice_sample: Audio | None = None,
|
|
50
|
+
) -> Audio:
|
|
51
|
+
"""Generate speech audio from text.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
text: Text to synthesize.
|
|
55
|
+
voice_sample: Optional voice sample to clone. Falls back to the
|
|
56
|
+
instance's ``voice`` and then to Chatterbox's default speaker.
|
|
57
|
+
"""
|
|
58
|
+
import tempfile
|
|
59
|
+
from pathlib import Path
|
|
60
|
+
|
|
61
|
+
import numpy as np
|
|
62
|
+
|
|
63
|
+
if self._model is None:
|
|
64
|
+
self._init_model()
|
|
65
|
+
|
|
66
|
+
effective_sample = voice_sample or self.voice
|
|
67
|
+
speaker_wav_path: Path | None = None
|
|
68
|
+
|
|
69
|
+
if effective_sample is not None:
|
|
70
|
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
|
71
|
+
effective_sample.save(f.name)
|
|
72
|
+
speaker_wav_path = Path(f.name)
|
|
73
|
+
|
|
74
|
+
try:
|
|
75
|
+
wav = self._model.generate(
|
|
76
|
+
text=text,
|
|
77
|
+
language_id=self.language,
|
|
78
|
+
audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
audio_data = wav.cpu().float().numpy().squeeze()
|
|
82
|
+
if audio_data.ndim == 0:
|
|
83
|
+
audio_data = np.array([audio_data], dtype=np.float32)
|
|
84
|
+
|
|
85
|
+
metadata = AudioMetadata(
|
|
86
|
+
sample_rate=self.SAMPLE_RATE,
|
|
87
|
+
channels=1,
|
|
88
|
+
sample_width=2,
|
|
89
|
+
duration_seconds=len(audio_data) / self.SAMPLE_RATE,
|
|
90
|
+
frame_count=len(audio_data),
|
|
91
|
+
)
|
|
92
|
+
return Audio(audio_data, metadata)
|
|
93
|
+
finally:
|
|
94
|
+
if speaker_wav_path is not None:
|
|
95
|
+
speaker_wav_path.unlink()
|
|
96
|
+
|
|
97
|
+
def unload(self) -> None:
|
|
98
|
+
"""Release the TTS model so the next generate_audio() re-initializes.
|
|
99
|
+
|
|
100
|
+
Used by low-memory dubbing to free VRAM between pipeline stages.
|
|
101
|
+
"""
|
|
102
|
+
self._model = None
|
|
103
|
+
release_device_memory(self.device)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class TextToMusic:
|
|
107
|
+
"""Generates music from text descriptions using MusicGen."""
|
|
108
|
+
|
|
109
|
+
def __init__(self, device: str | None = None):
|
|
110
|
+
self.device = device
|
|
111
|
+
self._processor: Any = None
|
|
112
|
+
self._model: Any = None
|
|
113
|
+
self._device: str | None = None
|
|
114
|
+
|
|
115
|
+
def _init_local(self) -> None:
|
|
116
|
+
"""Initialize local MusicGen model."""
|
|
117
|
+
import os
|
|
118
|
+
|
|
119
|
+
from transformers import AutoProcessor, MusicgenForConditionalGeneration
|
|
120
|
+
|
|
121
|
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
122
|
+
|
|
123
|
+
requested_device = self.device
|
|
124
|
+
self._device = select_device(self.device, mps_allowed=True)
|
|
125
|
+
|
|
126
|
+
model_name = "facebook/musicgen-small"
|
|
127
|
+
self._processor = AutoProcessor.from_pretrained(model_name)
|
|
128
|
+
self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
|
|
129
|
+
self._model.to(self._device)
|
|
130
|
+
self.device = self._device
|
|
131
|
+
log_device_initialization(
|
|
132
|
+
"TextToMusic",
|
|
133
|
+
requested_device=requested_device,
|
|
134
|
+
resolved_device=self._device,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
|
|
138
|
+
"""Generate music audio from text description."""
|
|
139
|
+
if self._model is None:
|
|
140
|
+
self._init_local()
|
|
141
|
+
|
|
142
|
+
inputs = self._processor(text=[text], padding=True, return_tensors="pt")
|
|
143
|
+
inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
|
|
144
|
+
audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
|
|
145
|
+
sampling_rate = self._model.config.audio_encoder.sampling_rate
|
|
146
|
+
|
|
147
|
+
audio_data = audio_values[0, 0].cpu().float().numpy()
|
|
148
|
+
|
|
149
|
+
metadata = AudioMetadata(
|
|
150
|
+
sample_rate=sampling_rate,
|
|
151
|
+
channels=1,
|
|
152
|
+
sample_width=2,
|
|
153
|
+
duration_seconds=len(audio_data) / sampling_rate,
|
|
154
|
+
frame_count=len(audio_data),
|
|
155
|
+
)
|
|
156
|
+
return Audio(audio_data, metadata)
|
|
@@ -1,215 +0,0 @@
|
|
|
1
|
-
"""Audio generation using local models."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from typing import Any
|
|
6
|
-
|
|
7
|
-
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
|
-
from videopython.base.audio import Audio, AudioMetadata
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class TextToSpeech:
|
|
12
|
-
"""Generates speech audio from text using local models.
|
|
13
|
-
|
|
14
|
-
Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
|
|
15
|
-
(`chatterbox`) for multilingual voice cloning.
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
|
|
19
|
-
|
|
20
|
-
CHATTERBOX_SAMPLE_RATE: int = 24000
|
|
21
|
-
|
|
22
|
-
def __init__(
|
|
23
|
-
self,
|
|
24
|
-
model_size: str = "base",
|
|
25
|
-
voice: str | None = None,
|
|
26
|
-
device: str | None = None,
|
|
27
|
-
language: str = "en",
|
|
28
|
-
):
|
|
29
|
-
if model_size not in self.SUPPORTED_LOCAL_MODELS:
|
|
30
|
-
raise ValueError(f"model_size must be one of {self.SUPPORTED_LOCAL_MODELS}, got '{model_size}'")
|
|
31
|
-
|
|
32
|
-
self.model_size = model_size
|
|
33
|
-
self.voice = voice
|
|
34
|
-
self.device = device
|
|
35
|
-
self.language = language
|
|
36
|
-
self._model: Any = None
|
|
37
|
-
self._processor: Any = None
|
|
38
|
-
self._chatterbox_model: Any = None
|
|
39
|
-
|
|
40
|
-
def _init_local(self) -> None:
|
|
41
|
-
"""Initialize local Bark model."""
|
|
42
|
-
from transformers import AutoModel, AutoProcessor
|
|
43
|
-
|
|
44
|
-
requested_device = self.device
|
|
45
|
-
device = select_device(self.device, mps_allowed=False)
|
|
46
|
-
|
|
47
|
-
model_name = "suno/bark" if self.model_size == "base" else "suno/bark-small"
|
|
48
|
-
self._processor = AutoProcessor.from_pretrained(model_name)
|
|
49
|
-
self._model = AutoModel.from_pretrained(model_name).to(device)
|
|
50
|
-
self.device = device
|
|
51
|
-
log_device_initialization(
|
|
52
|
-
"TextToSpeech",
|
|
53
|
-
requested_device=requested_device,
|
|
54
|
-
resolved_device=device,
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
def _init_chatterbox(self) -> None:
|
|
58
|
-
"""Initialize Chatterbox Multilingual model for voice cloning."""
|
|
59
|
-
from chatterbox.mtl_tts import ChatterboxMultilingualTTS # type: ignore[import-untyped]
|
|
60
|
-
|
|
61
|
-
requested_device = self.device
|
|
62
|
-
device = select_device(self.device, mps_allowed=False)
|
|
63
|
-
|
|
64
|
-
self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
|
|
65
|
-
self.device = device
|
|
66
|
-
log_device_initialization(
|
|
67
|
-
"TextToSpeech",
|
|
68
|
-
requested_device=requested_device,
|
|
69
|
-
resolved_device=device,
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
def _generate_local(self, text: str, voice_preset: str | None) -> Audio:
|
|
73
|
-
"""Generate speech using Bark."""
|
|
74
|
-
import torch
|
|
75
|
-
|
|
76
|
-
if self._model is None:
|
|
77
|
-
self._init_local()
|
|
78
|
-
|
|
79
|
-
inputs = self._processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
|
|
80
|
-
inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
|
|
81
|
-
|
|
82
|
-
with torch.no_grad():
|
|
83
|
-
speech_values = self._model.generate(**inputs, do_sample=True)
|
|
84
|
-
|
|
85
|
-
audio_data = speech_values.cpu().float().numpy().squeeze()
|
|
86
|
-
sample_rate = self._model.generation_config.sample_rate
|
|
87
|
-
|
|
88
|
-
metadata = AudioMetadata(
|
|
89
|
-
sample_rate=sample_rate,
|
|
90
|
-
channels=1,
|
|
91
|
-
sample_width=2,
|
|
92
|
-
duration_seconds=len(audio_data) / sample_rate,
|
|
93
|
-
frame_count=len(audio_data),
|
|
94
|
-
)
|
|
95
|
-
return Audio(audio_data, metadata)
|
|
96
|
-
|
|
97
|
-
def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
|
|
98
|
-
"""Generate speech using Chatterbox Multilingual with voice cloning."""
|
|
99
|
-
import tempfile
|
|
100
|
-
from pathlib import Path
|
|
101
|
-
|
|
102
|
-
import numpy as np
|
|
103
|
-
|
|
104
|
-
if self._chatterbox_model is None:
|
|
105
|
-
self._init_chatterbox()
|
|
106
|
-
|
|
107
|
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
|
108
|
-
voice_sample.save(f.name)
|
|
109
|
-
speaker_wav_path = Path(f.name)
|
|
110
|
-
|
|
111
|
-
try:
|
|
112
|
-
wav = self._chatterbox_model.generate(
|
|
113
|
-
text=text,
|
|
114
|
-
language_id=self.language,
|
|
115
|
-
audio_prompt_path=str(speaker_wav_path),
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
audio_data = wav.cpu().float().numpy().squeeze()
|
|
119
|
-
if audio_data.ndim == 0:
|
|
120
|
-
audio_data = np.array([audio_data], dtype=np.float32)
|
|
121
|
-
|
|
122
|
-
sample_rate = self.CHATTERBOX_SAMPLE_RATE
|
|
123
|
-
|
|
124
|
-
metadata = AudioMetadata(
|
|
125
|
-
sample_rate=sample_rate,
|
|
126
|
-
channels=1,
|
|
127
|
-
sample_width=2,
|
|
128
|
-
duration_seconds=len(audio_data) / sample_rate,
|
|
129
|
-
frame_count=len(audio_data),
|
|
130
|
-
)
|
|
131
|
-
return Audio(audio_data, metadata)
|
|
132
|
-
finally:
|
|
133
|
-
speaker_wav_path.unlink()
|
|
134
|
-
|
|
135
|
-
def generate_audio(
|
|
136
|
-
self,
|
|
137
|
-
text: str,
|
|
138
|
-
voice_preset: str | None = None,
|
|
139
|
-
voice_sample: Audio | None = None,
|
|
140
|
-
) -> Audio:
|
|
141
|
-
"""Generate speech audio from text."""
|
|
142
|
-
effective_voice = voice_preset or self.voice
|
|
143
|
-
|
|
144
|
-
if self.model_size == "chatterbox" or voice_sample is not None:
|
|
145
|
-
if voice_sample is None:
|
|
146
|
-
raise ValueError(
|
|
147
|
-
"voice_sample is required for Chatterbox voice cloning. "
|
|
148
|
-
"Provide an Audio sample of the voice to clone."
|
|
149
|
-
)
|
|
150
|
-
return self._generate_chatterbox(text, voice_sample)
|
|
151
|
-
|
|
152
|
-
return self._generate_local(text, effective_voice)
|
|
153
|
-
|
|
154
|
-
def unload(self) -> None:
|
|
155
|
-
"""Release the TTS model(s) so the next generate_audio() re-initializes.
|
|
156
|
-
|
|
157
|
-
Used by low-memory dubbing to free VRAM between pipeline stages.
|
|
158
|
-
"""
|
|
159
|
-
self._model = None
|
|
160
|
-
self._processor = None
|
|
161
|
-
self._chatterbox_model = None
|
|
162
|
-
release_device_memory(self.device)
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
class TextToMusic:
|
|
166
|
-
"""Generates music from text descriptions using MusicGen."""
|
|
167
|
-
|
|
168
|
-
def __init__(self, device: str | None = None):
|
|
169
|
-
self.device = device
|
|
170
|
-
self._processor: Any = None
|
|
171
|
-
self._model: Any = None
|
|
172
|
-
self._device: str | None = None
|
|
173
|
-
|
|
174
|
-
def _init_local(self) -> None:
|
|
175
|
-
"""Initialize local MusicGen model."""
|
|
176
|
-
import os
|
|
177
|
-
|
|
178
|
-
from transformers import AutoProcessor, MusicgenForConditionalGeneration
|
|
179
|
-
|
|
180
|
-
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
181
|
-
|
|
182
|
-
requested_device = self.device
|
|
183
|
-
self._device = select_device(self.device, mps_allowed=True)
|
|
184
|
-
|
|
185
|
-
model_name = "facebook/musicgen-small"
|
|
186
|
-
self._processor = AutoProcessor.from_pretrained(model_name)
|
|
187
|
-
self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
|
|
188
|
-
self._model.to(self._device)
|
|
189
|
-
self.device = self._device
|
|
190
|
-
log_device_initialization(
|
|
191
|
-
"TextToMusic",
|
|
192
|
-
requested_device=requested_device,
|
|
193
|
-
resolved_device=self._device,
|
|
194
|
-
)
|
|
195
|
-
|
|
196
|
-
def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
|
|
197
|
-
"""Generate music audio from text description."""
|
|
198
|
-
if self._model is None:
|
|
199
|
-
self._init_local()
|
|
200
|
-
|
|
201
|
-
inputs = self._processor(text=[text], padding=True, return_tensors="pt")
|
|
202
|
-
inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
|
|
203
|
-
audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
|
|
204
|
-
sampling_rate = self._model.config.audio_encoder.sampling_rate
|
|
205
|
-
|
|
206
|
-
audio_data = audio_values[0, 0].cpu().float().numpy()
|
|
207
|
-
|
|
208
|
-
metadata = AudioMetadata(
|
|
209
|
-
sample_rate=sampling_rate,
|
|
210
|
-
channels=1,
|
|
211
|
-
sample_width=2,
|
|
212
|
-
duration_seconds=len(audio_data) / sampling_rate,
|
|
213
|
-
frame_count=len(audio_data),
|
|
214
|
-
)
|
|
215
|
-
return Audio(audio_data, metadata)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|