videopython 0.26.4__tar.gz → 0.26.6__tar.gz
This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries; it is provided for informational purposes only.
- {videopython-0.26.4 → videopython-0.26.6}/PKG-INFO +1 -1
- {videopython-0.26.4 → videopython-0.26.6}/pyproject.toml +1 -1
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/dubber.py +40 -9
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/pipeline.py +102 -20
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/audio.py +42 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/separation.py +27 -40
- {videopython-0.26.4 → videopython-0.26.6}/.gitignore +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/LICENSE +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/README.md +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/description.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/video.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.4 → videopython-0.26.6}/src/videopython/py.typed +0 -0
src/videopython/ai/dubbing/dubber.py

@@ -8,6 +8,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
 
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
+from videopython.ai.dubbing.pipeline import WhisperModel
 
 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -25,19 +26,38 @@ class VideoDubber:
             model is resident at a time. Trades per-run latency (~10-30s of
             extra model loads) for a much lower memory ceiling. Recommended for
             GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+        whisper_model: Whisper model size used for transcription. Larger models
+            give better accuracy at the cost of VRAM and latency. One of
+            ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+            Default ``small``.
     """
 
-    def __init__(self, device: str | None = None, low_memory: bool = False):
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
         self.device = device
         self.low_memory = low_memory
+        self.whisper_model = whisper_model
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
+        logger.info(
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            requested,
+            low_memory,
+            whisper_model,
+        )
 
     def _init_local_pipeline(self) -> None:
        from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
 
-        self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
+        self._local_pipeline = LocalDubbingPipeline(
+            device=self.device,
+            low_memory=self.low_memory,
+            whisper_model=self.whisper_model,
+        )
 
     def dub(
         self,
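For orientation, a minimal usage sketch of the constructor surface this change produces. The import path follows the file list above; assume the usual videopython AI extras are installed.

```python
from videopython.ai.dubbing.dubber import VideoDubber

# Larger Whisper checkpoints buy accuracy at the cost of VRAM and latency;
# low_memory keeps only one model resident at a time (see the docstring above).
dubber = VideoDubber(device="cuda", low_memory=True, whisper_model="medium")
```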
@@ -54,9 +74,14 @@ class VideoDubber:
 
         Args:
             enable_diarization: Enable speaker diarization to clone each speaker's
-                voice separately.
-
-
+                voice separately. With ``transcription=None``, runs alongside Whisper.
+                With a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
         """
         if self._local_pipeline is None:
             self._init_local_pipeline()
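The docstring spells out three combinations of ``transcription`` and ``enable_diarization``. Continuing the sketch above, assuming a loaded ``video`` and a pre-computed ``Transcription`` named ``edited``; any other required arguments are elided here:

```python
dubber.dub(video=video)                        # Whisper transcribes (plus pyannote if requested)
dubber.dub(video=video, transcription=edited)  # skip Whisper; speakers, if any, come from `edited`
dubber.dub(video=video, transcription=edited,
           enable_diarization=True)            # pyannote adds speakers to `edited`'s words
```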
@@ -86,8 +111,10 @@ class VideoDubber:
         """Dub a video and return a new video with the dubbed audio.
 
         Args:
-            transcription: Optional pre-computed Transcription
-                the
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. See ``dub()`` for the interaction with
+                ``enable_diarization``.
         """
         result = self.dub(
             video=video,
@@ -132,8 +159,12 @@ class VideoDubber:
             preserve_background: Preserve background music/effects via source separation.
             voice_clone: Clone the source speaker's voice for the dubbed track.
             enable_diarization: Enable speaker diarization for per-speaker voice cloning.
+                See ``dub()`` for the interaction with ``transcription``.
             progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
-            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
 
         Returns:
             ``DubbingResult`` with the dubbed audio, translated segments, and
src/videopython/ai/dubbing/pipeline.py

@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
 from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -11,6 +11,8 @@ from videopython.ai.dubbing.timing import TimingSynchronizer
 if TYPE_CHECKING:
     from videopython.base.audio import Audio
 
+WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+
 logger = logging.getLogger(__name__)
 
 
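The ``Literal`` alias gives static checkers something to verify: passing an unsupported size is flagged before any model download starts. A self-contained sketch:

```python
from typing import Literal

WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]

def init_transcriber(model_name: WhisperModel) -> None:
    print(f"would load whisper-{model_name}")

init_transcriber("small")     # OK
init_transcriber("gigantic")  # runs, but mypy/pyright reject it statically
```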
@@ -23,14 +25,21 @@ class LocalDubbingPipeline:
     with <=12GB VRAM or hosts with <32GB RAM.
     """
 
-    def __init__(self, device: str | None = None, low_memory: bool = False):
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
         self.device = device
         self.low_memory = low_memory
+        self.whisper_model = whisper_model
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
             requested,
             low_memory,
+            whisper_model,
         )
 
         self._transcriber: Any = None
@@ -62,7 +71,11 @@ class LocalDubbingPipeline:
         """Initialize the transcription model."""
         from videopython.ai.understanding.audio import AudioToText
 
-        self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
+        self._transcriber = AudioToText(
+            model_name=self.whisper_model,
+            device=self.device,
+            enable_diarization=enable_diarization,
+        )
 
     def _init_translator(self) -> None:
         """Initialize the translation model."""
@@ -94,6 +107,7 @@ class LocalDubbingPipeline:
         max_duration: float = 10.0,
     ) -> dict[str, Any]:
         """Extract voice samples for each speaker from the audio."""
+        from videopython.base.audio import Audio
 
         voice_samples: dict[str, Audio] = {}
 
@@ -120,7 +134,11 @@ class LocalDubbingPipeline:
             if best_segment is not None:
                 start = best_segment.start
                 end = min(best_segment.end, start + max_duration)
-                voice_samples[speaker] = audio.slice(start, end)
+                sliced = audio.slice(start, end)
+                # Audio.slice returns a numpy view into the source. Copy so the
+                # short voice sample doesn't keep the full vocals array (~1.3 GB
+                # for 2h sources) alive across translate + TTS.
+                voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
 
         return voice_samples
 
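The view-vs-copy distinction the new comment relies on is standard numpy behavior: a basic slice shares the parent's buffer and keeps it alive through ``.base``. A small self-contained demonstration (the array here is a tiny stand-in for the ~1.3 GB vocals buffer):

```python
import numpy as np

vocals = np.zeros(1_000_000, dtype=np.float32)  # stand-in for the large vocals buffer

view = vocals[:4_410]           # basic slice: a view, shares vocals' memory
sample = vocals[:4_410].copy()  # independent buffer, ~17 KB

del vocals
# `view` still pins the whole parent allocation via its .base reference;
# `sample` owns its data and pins nothing.
assert view.base is not None
assert sample.base is None
```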
@@ -144,7 +162,16 @@ class LocalDubbingPipeline:
             transcription: Optional pre-computed Transcription object. When provided,
                 the internal Whisper transcription step is skipped (saving time and VRAM).
                 Must be a ``videopython.base.text.transcription.Transcription`` instance
-                with populated ``segments``.
+                with populated ``segments``. Speaker labels on the supplied transcription
+                drive per-speaker voice cloning. If the supplied transcription has no
+                speakers and ``enable_diarization=True``, pyannote is run standalone on
+                ``source_audio`` and speakers are attached to the supplied words
+                (requires word-level timings).
+            enable_diarization: When True, run speaker diarization to enable per-speaker
+                voice cloning. With ``transcription=None``, runs alongside Whisper. With
+                a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
         """
 
         def report_progress(stage: str, progress: float) -> None:
@@ -153,6 +180,34 @@ class LocalDubbingPipeline:
 
         if transcription is not None:
             report_progress("Using provided transcription", 0.05)
+            if transcription.speakers:
+                logger.info(
+                    "Using provided transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+                if enable_diarization:
+                    logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
+            elif enable_diarization:
+                report_progress("Diarizing supplied transcription", 0.10)
+                if self._transcriber is None or self._transcriber_diarization is not True:
+                    self._init_transcriber(enable_diarization=True)
+                    self._transcriber_diarization = True
+                transcription = self._transcriber.diarize_transcription(source_audio, transcription)
+                self._maybe_unload("_transcriber")
+                logger.info(
+                    "Diarized supplied transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+            else:
+                logger.info(
+                    "Using provided transcription: %d segment(s), no speaker labels. "
+                    "All segments will share a single voice clone. Pass "
+                    "enable_diarization=True to add per-speaker labels, or "
+                    "voice_clone=False to use the default TTS voice.",
+                    len(transcription.segments),
+                )
         else:
             report_progress("Transcribing audio", 0.05)
             if self._transcriber is None or self._transcriber_diarization != enable_diarization:
@@ -175,6 +230,7 @@ class LocalDubbingPipeline:
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
+        background_audio: Audio | None = None
 
         if preserve_background:
             report_progress("Separating audio", 0.15)
@@ -184,12 +240,24 @@ class LocalDubbingPipeline:
             separated_audio = self._separator.separate(source_audio)
             self._maybe_unload("_separator")
             vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            # In low_memory mode, drop the SeparatedAudio container so vocals
+            # and background can be released as soon as their last local
+            # reference goes (after voice-sample extraction and final overlay
+            # respectively). The result will report separated_audio=None.
+            if self.low_memory:
+                separated_audio = None
 
         voice_samples: dict[str, Audio] = {}
         if voice_clone:
             report_progress("Extracting voice samples", 0.25)
             voice_samples = self._extract_voice_samples(vocal_audio, transcription)
 
+        # vocals is no longer needed; voice_samples are independent copies.
+        # In low_memory mode this is the only ref keeping the buffer alive
+        # (separated_audio was dropped above), so dropping the local frees it.
+        del vocal_audio
+
         report_progress("Translating text", 0.35)
         if self._translator is None:
             self._init_translator()
@@ -237,17 +305,23 @@ class LocalDubbingPipeline:
         assert self._synchronizer is not None
 
         synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        del dubbed_segments
 
         report_progress("Assembling final audio", 0.90)
         total_duration = source_audio.metadata.duration_seconds
         dubbed_speech = self._synchronizer.assemble_with_timing(synchronized_segments, start_times, total_duration)
+        del synchronized_segments
 
-        if separated_audio is not None:
-            background_sr = separated_audio.background.metadata.sample_rate
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
             if dubbed_speech.metadata.sample_rate != background_sr:
                 dubbed_speech = dubbed_speech.resample(background_sr)
 
-            final_audio = separated_audio.background.overlay(dubbed_speech, position=0.0)
+            final_audio = background_audio.overlay(dubbed_speech, position=0.0)
+            # Drop the local; in low_memory this releases the background
+            # buffer (~1.3 GB for 2h sources). In non-low_memory the same
+            # array is still held by separated_audio.background.
+            del background_audio
         else:
             final_audio = dubbed_speech
 
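The sprinkled ``del`` statements work because CPython frees an object the moment its reference count reaches zero; once ``separated_audio`` has been dropped in low_memory mode, these locals really are the last references. A self-contained illustration:

```python
big = bytearray(100 * 2**20)     # stand-in for a ~100 MiB audio buffer
container = {"background": big}  # like SeparatedAudio holding the array

del big
# Still alive: `container` holds the last reference.
assert len(container["background"]) == 100 * 2**20

container = None  # refcount hits zero -> CPython frees the buffer immediately
```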
@@ -294,6 +368,7 @@ class LocalDubbingPipeline:
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
+        background_audio: Audio | None = None
 
         if preserve_background:
             report_progress("Separating audio", 0.20)
@@ -303,6 +378,9 @@ class LocalDubbingPipeline:
             separated_audio = self._separator.separate(source_audio)
             self._maybe_unload("_separator")
             vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            if self.low_memory:
+                separated_audio = None
 
         report_progress("Extracting voice sample", 0.40)
         voice_sample: Audio | None = None
@@ -314,7 +392,11 @@ class LocalDubbingPipeline:
 
         if voice_sample is None:
             sample_duration = min(6.0, original_duration)
-            voice_sample = vocal_audio.slice(0, sample_duration)
+            sliced = vocal_audio.slice(0, sample_duration)
+            # Copy so the short sample doesn't pin the full vocals array.
+            voice_sample = Audio(sliced.data.copy(), sliced.metadata)
+
+        del vocal_audio
 
         report_progress("Generating speech", 0.60)
         if self._tts is None or self._tts_language != "en":
@@ -327,24 +409,24 @@ class LocalDubbingPipeline:
 
         report_progress("Assembling audio", 0.85)
 
-        if separated_audio is not None:
-            background_sr = separated_audio.background.metadata.sample_rate
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
             if generated_speech.metadata.sample_rate != background_sr:
                 generated_speech = generated_speech.resample(background_sr)
 
-            background = separated_audio.background
-            if background.metadata.duration_seconds > speech_duration:
-                background = background.slice(0, speech_duration)
-            elif background.metadata.duration_seconds < speech_duration:
-                silence_duration = speech_duration - background.metadata.duration_seconds
+            if background_audio.metadata.duration_seconds > speech_duration:
+                background_audio = background_audio.slice(0, speech_duration)
+            elif background_audio.metadata.duration_seconds < speech_duration:
+                silence_duration = speech_duration - background_audio.metadata.duration_seconds
                 silence = Audio.silence(
                     duration=silence_duration,
                     sample_rate=background_sr,
-                    channels=background.metadata.channels,
+                    channels=background_audio.metadata.channels,
                 )
-                background = background.concat(silence)
+                background_audio = background_audio.concat(silence)
 
-            final_audio = background.overlay(generated_speech, position=0.0)
+            final_audio = background_audio.overlay(generated_speech, position=0.0)
+            del background_audio
         else:
             final_audio = generated_speech
 
src/videopython/ai/understanding/audio.py

@@ -130,6 +130,48 @@ class AudioToText:
         )
         return result
 
+    def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
+        """Attach speaker labels to a pre-computed transcription using pyannote.
+
+        Useful when callers have a transcription (e.g. pre-computed and edited)
+        but no speakers, and want per-speaker voice cloning in dubbing without
+        re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
+        speakers onto the supplied transcription's words.
+
+        Requires word-level timings: at least one segment must contain more
+        than one word. Transcriptions loaded from SRT (one synthetic word per
+        segment) will not produce useful speakers and are rejected.
+        """
+        import numpy as np
+        import torch
+
+        all_words: list[TranscriptionWord] = list(transcription.words)
+        if not all_words:
+            raise ValueError("Cannot diarize a transcription with no words.")
+
+        if not any(len(seg.words) > 1 for seg in transcription.segments):
+            raise ValueError(
+                "Cannot diarize a transcription without word-level timings. "
+                "Supplied transcription has at most one word per segment "
+                "(e.g. loaded from SRT). Provide a transcription with "
+                "word-level timings, or omit `transcription` to let the "
+                "pipeline transcribe and diarize from scratch."
+            )
+
+        if self._diarization_pipeline is None:
+            self._init_diarization()
+
+        import whisper
+
+        audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+        waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
+        diarization_result = self._diarization_pipeline(
+            {"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
+        )
+
+        all_words = self._assign_speakers_to_words(all_words, diarization_result)
+        return Transcription(words=all_words, language=transcription.language)
+
     def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
         """Transcribe with word timestamps and assign speakers via pyannote."""
         import numpy as np
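A usage sketch for the new method. ``audio`` and ``edited`` are assumptions: a videopython ``Audio`` and a ``Transcription`` carrying word-level timings (not one loaded from SRT). The constructor arguments mirror the pipeline change above.

```python
from videopython.ai.understanding.audio import AudioToText

transcriber = AudioToText(model_name="small", device="cuda", enable_diarization=True)

# Attach pyannote speaker labels without re-running Whisper.
with_speakers = transcriber.diarize_transcription(audio, edited)
print(with_speakers.speakers)  # e.g. speaker labels like "SPEAKER_00", "SPEAKER_01"
```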
src/videopython/ai/understanding/separation.py

@@ -42,7 +42,15 @@ class AudioSeparator:
         )
 
     def _separate_local(self, audio: Audio) -> SeparatedAudio:
-        """Separate audio using local Demucs model."""
+        """Separate audio using local Demucs model.
+
+        Keeps the input tensor on CPU and passes ``device=self.device`` to
+        ``apply_model`` so per-chunk compute runs on GPU while the full
+        ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+        sources this is the difference between OOM-on-GPU and running cleanly:
+        a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+        comfortable on a 32 GB host.
+        """
         import numpy as np
         import torch
         from demucs.apply import apply_model
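The docstring's ~10 GB figure checks out; the arithmetic, as a self-contained snippet (the stem count assumes the 4-stem htdemucs layout):

```python
seconds = 2 * 60 * 60        # 2 h source
samples = seconds * 44_100   # 44.1 kHz sample rate
channels = 2                 # stereo
stems = 4                    # htdemucs: drums, bass, other, vocals
bytes_per_float32 = 4

total = samples * channels * stems * bytes_per_float32
print(f"{total / 2**30:.1f} GiB")  # -> 9.5 GiB, i.e. ~10 GB
```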
@@ -65,61 +73,40 @@ class AudioSeparator:
             audio_data = audio_data.T
 
         wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-        wav = wav.to(self.device)
 
         with torch.no_grad():
             sources = apply_model(self._model, wav, device=self.device)
 
         sources_np = sources[0].cpu().numpy()
+        del sources
 
         stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+        vocals_idx = stem_names.index("vocals")
+        non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]
 
-        [… 4 removed lines (per-stem loop setup) not recoverable from this diff view …]
-            metadata = AudioMetadata(
-                sample_rate=target_sr,
-                channels=2,
-                sample_width=2,
-                duration_seconds=stem_data.shape[0] / target_sr,
-                frame_count=stem_data.shape[0],
-            )
-            stems[name] = Audio(stem_data.astype(np.float32), metadata)
-
-        vocals = stems["vocals"]
-
-        non_vocal_stems = [stems[name] for name in stem_names if name != "vocals"]
-        background_data = np.zeros_like(vocals.data)
-        for stem in non_vocal_stems:
-            background_data += stem.data
+        vocals_data = sources_np[vocals_idx].T
+        background_data = sources_np[non_vocal_indices].sum(axis=0).T
+        del sources_np
 
         max_val = np.max(np.abs(background_data))
         if max_val > 1.0:
-            background_data = background_data / max_val
-        [… 10 removed lines (old background/music assembly) not recoverable from this diff view …]
-            music_data += stems[name].data
-
-        max_val = np.max(np.abs(music_data))
-        if max_val > 1.0:
-            music_data = music_data / max_val
-
-        music = Audio(music_data.astype(np.float32), vocals.metadata)
+            background_data /= max_val
+
+        metadata = AudioMetadata(
+            sample_rate=target_sr,
+            channels=2,
+            sample_width=2,
+            duration_seconds=vocals_data.shape[0] / target_sr,
+            frame_count=vocals_data.shape[0],
+        )
+        vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+        background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)
 
         return SeparatedAudio(
             vocals=vocals,
             background=background,
             original=audio,
-            music=music,
+            music=None,
             effects=None,
         )
 
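The rewrite replaces per-stem ``Audio`` construction plus an accumulation loop with one fancy-indexed sum over the Demucs output array. The shape mechanics, self-contained (stem order illustrative):

```python
import numpy as np

# Mirrors Demucs output for one batch item: (stems, channels, samples).
sources_np = np.random.rand(4, 2, 1_000).astype(np.float32)
stem_names = ["drums", "bass", "other", "vocals"]

vocals_idx = stem_names.index("vocals")
non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]

vocals_data = sources_np[vocals_idx].T                         # (samples, channels)
background_data = sources_np[non_vocal_indices].sum(axis=0).T  # one temporary, no per-stem copies

assert vocals_data.shape == background_data.shape == (1_000, 2)
```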
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|