videopython 0.26.6__tar.gz → 0.26.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.6 → videopython-0.26.8}/PKG-INFO +1 -1
- {videopython-0.26.6 → videopython-0.26.8}/pyproject.toml +1 -1
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/dubber.py +9 -13
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/pipeline.py +98 -20
- videopython-0.26.8/src/videopython/ai/dubbing/remux.py +159 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/timing.py +46 -18
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/audio.py +24 -9
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/translation.py +27 -5
- videopython-0.26.8/src/videopython/ai/understanding/separation.py +304 -0
- videopython-0.26.6/src/videopython/ai/dubbing/remux.py +0 -73
- videopython-0.26.6/src/videopython/ai/understanding/separation.py +0 -131
- {videopython-0.26.6 → videopython-0.26.8}/.gitignore +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/LICENSE +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/README.md +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/description.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/video.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.6 → videopython-0.26.8}/src/videopython/py.typed +0 -0
{videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/dubber.py

@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import logging
-import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
 
@@ -170,7 +169,7 @@ class VideoDubber:
         ``DubbingResult`` with the dubbed audio, translated segments, and
         source transcription. The output video is written to ``output_path``.
         """
-        from videopython.ai.dubbing.remux import replace_audio_stream
+        from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
         from videopython.base.audio import Audio
 
         input_path = Path(input_path)
@@ -196,17 +195,14 @@ class VideoDubber:
             transcription=transcription,
         )
 
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            dubbed_audio_path = Path(f.name)
-        try:
-            result.dubbed_audio.save(dubbed_audio_path)
-            replace_audio_stream(
-                video_path=input_path,
-                audio_path=dubbed_audio_path,
-                output_path=output_path,
-            )
-        finally:
-            dubbed_audio_path.unlink(missing_ok=True)
+        # Stream the dubbed Audio directly into ffmpeg via stdin instead of
+        # going through a temp WAV on disk. For a 2h dub the temp file would
+        # be ~10 GB written-then-read; the streaming path drops both copies.
+        replace_audio_stream_from_audio(
+            video_path=input_path,
+            audio=result.dubbed_audio,
+            output_path=output_path,
+        )
 
         return result
 
{videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/pipeline.py

@@ -3,14 +3,45 @@
 from __future__ import annotations
 
 import logging
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Literal
 
+import numpy as np
+
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
 from videopython.ai.dubbing.timing import TimingSynchronizer
 
 if TYPE_CHECKING:
     from videopython.base.audio import Audio
 
+
+def _peak_match(target: Audio, reference: Audio) -> Audio:
+    """Scale ``target`` so its peak amplitude matches ``reference``.
+
+    Demucs background normalization and the timing-assembler peak guard
+    each clamp at 1.0 instead of restoring headroom, so a dubbed mix
+    typically lands quieter than the source — perceptually "thinner."
+    A single peak match recovers most of that drift without LUFS deps.
+
+    No-op when either side has zero peak (silent input or all-silent dub).
+    The new ``Audio`` shares no buffer with ``target``.
+    """
+    from videopython.base.audio import Audio as _Audio
+
+    target_peak = float(np.max(np.abs(target.data))) if target.data.size else 0.0
+    reference_peak = float(np.max(np.abs(reference.data))) if reference.data.size else 0.0
+
+    if target_peak <= 0.0 or reference_peak <= 0.0:
+        return target
+
+    scale = reference_peak / target_peak
+    if abs(scale - 1.0) < 1e-3:
+        return target
+
+    return _Audio(target.data * scale, target.metadata)
+
+
 WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
 
 logger = logging.getLogger(__name__)
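The `_peak_match` arithmetic is easy to sanity-check by hand. Below is a minimal sketch with plain numpy arrays standing in for `Audio.data` (the `Audio` wrapper and its metadata are elided); note this equalizes peak amplitude, a cheap proxy for loudness rather than a LUFS match:

```python
import numpy as np

# Stand-ins for Audio.data: a dub that peaks at 0.45 after the 1.0-clamped
# mixing stages, against a source that peaks at 0.9.
dub = np.array([0.1, -0.45, 0.3], dtype=np.float32)
source = np.array([0.9, -0.2, 0.5], dtype=np.float32)

scale = np.max(np.abs(source)) / np.max(np.abs(dub))  # 0.9 / 0.45 = 2.0
matched = dub * scale

assert np.isclose(np.max(np.abs(matched)), np.max(np.abs(source)))  # peaks now equal
print(matched)  # [ 0.2 -0.9  0.6]
```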
@@ -237,7 +268,19 @@ class LocalDubbingPipeline:
         if self._separator is None:
             self._init_separator()
 
-        separated_audio = self._separator.separate(source_audio)
+        # Limit Demucs to the speech-bearing portion of the audio. The
+        # transcription has already located every speech region; running
+        # source separation outside those is pure overhead (no vocals to
+        # isolate). On talk-heavy sources with silence/music gaps this
+        # roughly halves separation time. When speech covers most of the
+        # track separate_regions falls back to a full-track separate().
+        from videopython.ai.understanding.separation import _merge_regions
+
+        speech_regions = _merge_regions(
+            [(s.start, s.end) for s in transcription.segments],
+            audio_duration=source_audio.metadata.duration_seconds,
+        )
+        separated_audio = self._separator.separate_regions(source_audio, speech_regions)
         self._maybe_unload("_separator")
         vocal_audio = separated_audio.vocals
         background_audio = separated_audio.background
@@ -278,24 +321,46 @@ class LocalDubbingPipeline:
         target_durations: list[float] = []
         start_times: list[float] = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Encode each speaker's voice sample to a temp WAV exactly once and
+        # reuse the path across every segment for that speaker. Without this
+        # cache, TextToSpeech.generate_audio re-encodes the same voice sample
+        # on every call (one temp WAV write + delete per segment), which is
+        # pure overhead for long dubs with many segments per speaker.
+        speaker_wav_paths: dict[str, Path] = {}
+        try:
+            if voice_clone:
+                for speaker, sample in voice_samples.items():
+                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                        sample.save(f.name)
+                    speaker_wav_paths[speaker] = Path(f.name)
+
+            for i, segment in enumerate(translated_segments):
+                if segment.duration < 0.1:
+                    continue
+                # Translation filter (translation.py:_is_translatable_text)
+                # leaves translated_text="" for punctuation-only or empty
+                # segments. Don't TTS those — saves a model call and avoids
+                # injecting hallucinated speech into the dubbed track.
+                if not segment.translated_text.strip():
+                    continue
+
+                progress = 0.50 + (0.30 * (i / len(translated_segments)))
+                report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
+
+                speaker = segment.speaker or "speaker_0"
+                cached_path = speaker_wav_paths.get(speaker) if voice_clone else None
+
+                if cached_path is not None:
+                    dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
+                else:
+                    dubbed_audio = self._tts.generate_audio(segment.translated_text)
+
+                dubbed_segments.append(dubbed_audio)
+                target_durations.append(segment.duration)
+                start_times.append(segment.start)
+        finally:
+            for path in speaker_wav_paths.values():
+                path.unlink(missing_ok=True)
 
         self._maybe_unload("_tts")
 
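The once-per-speaker cache above is a general pattern, and it can be shown in isolation. A minimal runnable sketch with a hypothetical `FakeSample` class standing in for `Audio` (its `save` mimics `Audio.save(path)`):

```python
import tempfile
from pathlib import Path

class FakeSample:
    """Stand-in for an Audio voice sample; save() mimics Audio.save(path)."""
    def __init__(self, payload: bytes) -> None:
        self.payload = payload
    def save(self, path: str) -> None:
        Path(path).write_bytes(self.payload)

voice_samples = {"speaker_0": FakeSample(b"a"), "speaker_1": FakeSample(b"b")}
segments = ["speaker_0", "speaker_0", "speaker_1", "speaker_0"]  # 4 segments, 2 speakers

speaker_wav_paths: dict[str, Path] = {}
try:
    # One encode per speaker (2 writes) instead of one per segment (4 writes).
    for speaker, sample in voice_samples.items():
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sample.save(f.name)
        speaker_wav_paths[speaker] = Path(f.name)

    for speaker in segments:
        cached = speaker_wav_paths[speaker]  # reused across this speaker's segments
        assert cached.exists()
finally:
    for path in speaker_wav_paths.values():
        path.unlink(missing_ok=True)
```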
@@ -325,6 +390,11 @@ class LocalDubbingPipeline:
         else:
             final_audio = dubbed_speech
 
+        # Peak-match against the source so the dub doesn't land quieter
+        # than the original. Done last so it captures both vocals+background
+        # mixes and speech-only outputs uniformly.
+        final_audio = _peak_match(final_audio, source_audio)
+
         report_progress("Complete", 1.0)
 
         return DubbingResult(
@@ -375,7 +445,13 @@ class LocalDubbingPipeline:
         if self._separator is None:
             self._init_separator()
 
-        separated_audio = self._separator.separate(source_audio)
+        from videopython.ai.understanding.separation import _merge_regions
+
+        speech_regions = _merge_regions(
+            [(s.start, s.end) for s in transcription.segments],
+            audio_duration=source_audio.metadata.duration_seconds,
+        )
+        separated_audio = self._separator.separate_regions(source_audio, speech_regions)
         self._maybe_unload("_separator")
         vocal_audio = separated_audio.vocals
         background_audio = separated_audio.background
@@ -430,6 +506,8 @@ class LocalDubbingPipeline:
         else:
             final_audio = generated_speech
 
+        final_audio = _peak_match(final_audio, source_audio)
+
         report_progress("Complete", 1.0)
 
         return RevoiceResult(
videopython-0.26.8/src/videopython/ai/dubbing/remux.py

@@ -0,0 +1,159 @@
+"""ffmpeg helper for replacing a video file's audio track without re-encoding video."""
+
+from __future__ import annotations
+
+import io
+import logging
+import subprocess
+import wave
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from videopython.base.audio import Audio
+
+logger = logging.getLogger(__name__)
+
+
+class RemuxError(RuntimeError):
+    """ffmpeg failed while replacing an audio stream."""
+
+
+def replace_audio_stream(
+    video_path: str | Path,
+    audio_path: str | Path,
+    output_path: str | Path,
+    audio_codec: str = "aac",
+    audio_bitrate: str = "192k",
+) -> None:
+    """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
+
+    Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
+    ``-shortest`` trims to the shorter of the two streams so the output duration
+    matches the source video when the dubbed audio is slightly longer.
+
+    Args:
+        video_path: Source video file (video stream is copied unchanged).
+        audio_path: Audio file to use as the new audio track.
+        output_path: Destination file. Overwritten if it exists.
+        audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
+        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+
+    Raises:
+        FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
+        RemuxError: If ffmpeg returns a non-zero exit code.
+    """
+    video_path = Path(video_path)
+    audio_path = Path(audio_path)
+    output_path = Path(output_path)
+
+    if not video_path.exists():
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+    if not audio_path.exists():
+        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        str(video_path),
+        "-i",
+        str(audio_path),
+        "-map",
+        "0:v:0",
+        "-map",
+        "1:a:0",
+        "-c:v",
+        "copy",
+        "-c:a",
+        audio_codec,
+        "-b:a",
+        audio_bitrate,
+        "-shortest",
+        str(output_path),
+    ]
+
+    logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode != 0:
+        raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
+
+
+def replace_audio_stream_from_audio(
+    video_path: str | Path,
+    audio: Audio,
+    output_path: str | Path,
+    audio_codec: str = "aac",
+    audio_bitrate: str = "192k",
+) -> None:
+    """Like ``replace_audio_stream`` but takes an in-memory ``Audio`` and pipes WAV to ffmpeg.
+
+    Avoids the ``Audio.save -> read-from-disk -> ffmpeg`` round-trip used by
+    the path-based variant: we serialize the WAV in memory and feed it to
+    ffmpeg via stdin. For long dubs this saves a full WAV write+read of the
+    output audio (~10 GB for a 2h source).
+
+    Args:
+        video_path: Source video file (video stream is copied unchanged).
+        audio: ``Audio`` instance to mux in as the new audio track.
+        output_path: Destination file. Overwritten if it exists.
+        audio_codec: ffmpeg audio codec name. Defaults to ``aac``.
+        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist.
+        RemuxError: If ffmpeg returns a non-zero exit code.
+    """
+    video_path = Path(video_path)
+    output_path = Path(output_path)
+
+    if not video_path.exists():
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    # Serialize Audio to WAV bytes in memory. Mirrors Audio.save's WAV writer:
+    # int16 samples, header from metadata. We stream these bytes to ffmpeg's
+    # stdin as the second input (the first is the video file on disk).
+    int_data = (audio.data * np.iinfo(np.int16).max).astype(np.int16)
+    wav_io = io.BytesIO()
+    with wave.open(wav_io, "wb") as wav_file:
+        wav_file.setnchannels(audio.metadata.channels)
+        wav_file.setsampwidth(audio.metadata.sample_width)
+        wav_file.setframerate(audio.metadata.sample_rate)
+        wav_file.writeframes(int_data.tobytes())
+    wav_bytes = wav_io.getvalue()
+
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        str(video_path),
+        "-f",
+        "wav",
+        "-i",
+        "-",
+        "-map",
+        "0:v:0",
+        "-map",
+        "1:a:0",
+        "-c:v",
+        "copy",
+        "-c:a",
+        audio_codec,
+        "-b:a",
+        audio_bitrate,
+        "-shortest",
+        str(output_path),
+    ]
+
+    logger.info(
+        "replace_audio_stream_from_audio: %s + <stdin wav %d bytes> -> %s",
+        video_path,
+        len(wav_bytes),
+        output_path,
+    )
+    process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+    _, stderr = process.communicate(wav_bytes)
+    if process.returncode != 0:
+        raise RemuxError(f"ffmpeg failed (exit {process.returncode}): {stderr.decode(errors='replace')}")
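A usage sketch of the new entry point, assuming videopython 0.26.8 is installed and ffmpeg is on PATH. `input.mp4` and `out.mp4` are placeholder paths, and the `Audio`/`AudioMetadata` construction mirrors how those types are used elsewhere in this diff:

```python
import numpy as np

from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
from videopython.base.audio import Audio, AudioMetadata

# Build a 1 s mono 440 Hz tone as the replacement track.
sr = 44100
t = np.arange(sr, dtype=np.float32) / sr
data = 0.5 * np.sin(2 * np.pi * 440.0 * t)
metadata = AudioMetadata(
    sample_rate=sr,
    channels=1,
    sample_width=2,
    duration_seconds=1.0,
    frame_count=sr,
)

# "input.mp4"/"out.mp4" are placeholders. -c:v copy keeps the video bits
# untouched; -shortest trims the output to the video's duration.
replace_audio_stream_from_audio("input.mp4", Audio(data, metadata), "out.mp4")
```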
{videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/timing.py

@@ -4,7 +4,9 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 
-from videopython.base.audio import Audio
+import numpy as np
+
+from videopython.base.audio import Audio, AudioMetadata
 
 
 @dataclass
@@ -181,32 +183,58 @@ class TimingSynchronizer:
         if len(audio_segments) != len(start_times):
             raise ValueError(f"Length mismatch: {len(audio_segments)} segments vs {len(start_times)} start times")
 
+        for start_time in start_times:
+            if start_time < 0:
+                raise ValueError(f"Invalid start time: {start_time}")
+
         if not audio_segments:
             return Audio.create_silent(total_duration, stereo=False)
 
-        #
+        # Single-pass assembler: allocate one mono float32 buffer and add each
+        # segment in place at its start sample. The previous implementation
+        # called Audio.overlay() per segment, which allocates np.zeros and
+        # copies the full track on every call — O(N * total_samples) memory
+        # traffic. For long dubs (thousands of segments) this loop dominated
+        # wall time and peak RAM.
         sample_rate = audio_segments[0].metadata.sample_rate
-
-
-
-
-        #
+        base_samples = max(int(total_duration * sample_rate), 0)
+
+        # Pre-normalize each segment to (mono, target sample rate) and compute
+        # placement bounds so the output buffer is sized to fit any segment
+        # that runs past total_duration (mirrors Audio.overlay's extend-on-OOB
+        # behavior so we don't silently truncate speech).
+        normalized: list[tuple[int, np.ndarray]] = []
+        end_sample = base_samples
         for audio, start_time in zip(audio_segments, start_times):
-            if start_time < 0:
-                raise ValueError(f"Invalid start time: {start_time}")
-
-            # Resample if needed
             if audio.metadata.sample_rate != sample_rate:
                 audio = audio.resample(sample_rate)
-
-            # Convert to mono if needed
             if audio.metadata.channels > 1:
                 audio = audio.to_mono()
-
-
-
-
-
+            start_sample = int(np.ceil(start_time * sample_rate))
+            seg_data = audio.data
+            normalized.append((start_sample, seg_data))
+            end_sample = max(end_sample, start_sample + len(seg_data))
+
+        output = np.zeros(end_sample, dtype=np.float32)
+        for start_sample, seg_data in normalized:
+            stop = start_sample + len(seg_data)
+            output[start_sample:stop] += seg_data
+
+        # Single post-mix peak guard, equivalent to Audio.overlay's per-call
+        # rescale collapsed into one pass. For non-overlapping dub segments
+        # this is a no-op; only the rare overlap case touches it.
+        max_amplitude = float(np.max(np.abs(output))) if output.size else 0.0
+        if max_amplitude > 1.0:
+            output /= max_amplitude
+
+        metadata = AudioMetadata(
+            sample_rate=sample_rate,
+            channels=1,
+            sample_width=audio_segments[0].metadata.sample_width,
+            duration_seconds=end_sample / sample_rate,
+            frame_count=end_sample,
+        )
+        return Audio(output, metadata)
 
     def check_overlaps(
         self,
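The buffer-sizing and peak-guard logic of the new assembler can be checked in isolation. Here is a minimal numpy sketch (no `Audio` objects) with a deliberately tiny sample rate; the second segment runs past `total_duration`, so the buffer extends rather than truncating:

```python
import numpy as np

sample_rate = 4  # tiny rate keeps the example readable
total_duration = 2.0  # seconds -> 8-sample base buffer
segments = [np.full(3, 0.6, dtype=np.float32),   # starts at 0.0 s
            np.full(4, 0.6, dtype=np.float32)]   # starts at 1.5 s, runs past 2.0 s
start_times = [0.0, 1.5]

base = int(total_duration * sample_rate)                        # 8
placed = [int(np.ceil(t * sample_rate)) for t in start_times]   # [0, 6]
end = max([base] + [p + len(s) for p, s in zip(placed, segments)])  # 10: extended, not truncated

out = np.zeros(end, dtype=np.float32)
for p, s in zip(placed, segments):
    out[p:p + len(s)] += s  # single pass, no full-track copy per segment

peak = float(np.max(np.abs(out)))
if peak > 1.0:  # only triggers when segments overlap
    out /= peak
print(out)  # [0.6 0.6 0.6 0.  0.  0.  0.6 0.6 0.6 0.6]
```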
{videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/audio.py

@@ -2,11 +2,14 @@
 
 from __future__ import annotations
 
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.audio import Audio, AudioMetadata
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
 
 class TextToSpeech:
     """Generates speech audio from text using Chatterbox Multilingual.
@@ -47,6 +50,7 @@ class TextToSpeech:
         self,
         text: str,
         voice_sample: Audio | None = None,
+        voice_sample_path: str | Path | None = None,
     ) -> Audio:
         """Generate speech audio from text.
 
@@ -54,6 +58,12 @@ class TextToSpeech:
             text: Text to synthesize.
             voice_sample: Optional voice sample to clone. Falls back to the
                 instance's ``voice`` and then to Chatterbox's default speaker.
+            voice_sample_path: Optional pre-encoded WAV path to use directly as
+                the speaker prompt. Skips the per-call temp-WAV encode that
+                ``voice_sample`` would otherwise trigger. When set, takes
+                precedence over ``voice_sample`` and ``self.voice``. Used by
+                the dubbing pipeline to encode each speaker's sample once and
+                reuse it across all of that speaker's segments.
         """
         import tempfile
         from pathlib import Path
@@ -63,13 +73,18 @@ class TextToSpeech:
         if self._model is None:
             self._init_model()
 
-        effective_sample = voice_sample or self.voice
         speaker_wav_path: Path | None = None
-
-        if effective_sample is not None:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                effective_sample.save(f.name)
-                speaker_wav_path = Path(f.name)
+        cleanup_path = False
+
+        if voice_sample_path is not None:
+            speaker_wav_path = Path(voice_sample_path)
+        else:
+            effective_sample = voice_sample or self.voice
+            if effective_sample is not None:
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                    effective_sample.save(f.name)
+                speaker_wav_path = Path(f.name)
+                cleanup_path = True
 
         try:
             wav = self._model.generate(
@@ -91,8 +106,8 @@ class TextToSpeech:
             )
             return Audio(audio_data, metadata)
         finally:
-            if speaker_wav_path is not None:
-                speaker_wav_path.unlink()
+            if cleanup_path and speaker_wav_path is not None:
+                speaker_wav_path.unlink(missing_ok=True)
 
     def unload(self) -> None:
         """Release the TTS model so the next generate_audio() re-initializes.
{videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/translation.py

@@ -8,6 +8,17 @@ from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.ai.dubbing.models import TranslatedSegment
 from videopython.base.text.transcription import TranscriptionSegment
 
+
+def _is_translatable_text(text: str) -> bool:
+    """Return True if text has enough content to be worth translating.
+
+    Whisper routinely emits punctuation-only or single-character segments
+    (" .", "...", "?", "♪") that MarianMT can hallucinate full sentences
+    from. Require at least 2 alphanumeric characters to filter these out.
+    """
+    return sum(1 for c in text if c.isalnum()) >= 2
+
+
 LANGUAGE_NAMES = {
     "en": "English",
     "es": "Spanish",
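The 2-alphanumeric threshold is easy to exercise. The one-liner is repeated here verbatim so the checks run standalone:

```python
def _is_translatable_text(text: str) -> bool:
    return sum(1 for c in text if c.isalnum()) >= 2

assert not _is_translatable_text(" .")    # punctuation-only Whisper artifact
assert not _is_translatable_text("...")
assert not _is_translatable_text("♪")     # music note, zero alphanumerics
assert not _is_translatable_text("I")     # a single alphanumeric is still filtered
assert _is_translatable_text("OK")        # two alphanumerics pass
assert _is_translatable_text("¿Qué?")     # accented letters count as alphanumeric
```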
@@ -159,17 +170,28 @@ class TextTranslator:
         target_lang: str,
         source_lang: str | None = None,
     ) -> list[TranslatedSegment]:
-        """Translate transcription segments while preserving timing/speaker info."""
+        """Translate transcription segments while preserving timing/speaker info.
+
+        Segments whose text is empty or contains fewer than 2 alphanumeric
+        characters are not sent to the model — they receive
+        ``translated_text=""`` instead. This avoids MarianMT hallucinating
+        full sentences from " .", "...", or single-token Whisper segments,
+        which would otherwise be TTS'd into the dubbed track.
+        """
         effective_source = source_lang or "en"
-
-
+
+        translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
+        translatable_texts = [segments[i].text for i in translatable_indices]
+        translated_texts = self.translate_batch(translatable_texts, target_lang, source_lang)
+
+        translation_map: dict[int, str] = dict(zip(translatable_indices, translated_texts))
 
         translated_segments = []
-        for
+        for i, segment in enumerate(segments):
             translated_segments.append(
                 TranslatedSegment(
                     original_segment=segment,
-                    translated_text=
+                    translated_text=translation_map.get(i, ""),
                     source_lang=effective_source,
                     target_lang=target_lang,
                     speaker=segment.speaker,
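The index-map bookkeeping in `translate_segments` can be demonstrated standalone. A minimal sketch where an uppercasing stub stands in for the MarianMT `translate_batch` call:

```python
def _is_translatable_text(text: str) -> bool:
    return sum(1 for c in text if c.isalnum()) >= 2

def translate_batch(texts: list[str]) -> list[str]:
    return [t.upper() for t in texts]  # stub standing in for the MarianMT call

segments = ["Hello there", " .", "How are you?", "..."]

idx = [i for i, t in enumerate(segments) if _is_translatable_text(t)]  # [0, 2]
translated = translate_batch([segments[i] for i in idx])               # 2 model calls, not 4
translation_map = dict(zip(idx, translated))

# Untranslatable segments get "", which the TTS stage then skips.
result = [translation_map.get(i, "") for i in range(len(segments))]
print(result)  # ['HELLO THERE', '', 'HOW ARE YOU?', '']
```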
videopython-0.26.8/src/videopython/ai/understanding/separation.py

@@ -0,0 +1,304 @@
+"""Audio source separation using local Demucs models."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
+from videopython.ai.dubbing.models import SeparatedAudio
+from videopython.base.audio import Audio, AudioMetadata
+
+logger = logging.getLogger(__name__)
+
+
+def _merge_regions(
+    regions: list[tuple[float, float]],
+    audio_duration: float,
+    pad: float = 0.5,
+    merge_gap: float = 1.0,
+) -> list[tuple[float, float]]:
+    """Merge overlapping/adjacent (start, end) ranges and pad each side.
+
+    Args:
+        regions: Speech regions in seconds. Order does not matter.
+        audio_duration: Total audio duration; output is clamped to ``[0, audio_duration]``.
+        pad: Seconds added to each side. Demucs needs context to separate
+            cleanly at boundaries; 0.5s avoids clipped onsets/decays.
+        merge_gap: Adjacent regions whose padded edges are within this
+            many seconds are merged. Avoids running Demucs on very short
+            slices (where its temporal context isn't there).
+
+    Returns:
+        Sorted list of non-overlapping (start, end) regions covering the
+        speech-bearing portion of the audio.
+    """
+    if not regions:
+        return []
+
+    sorted_regions = sorted(regions)
+
+    merged: list[tuple[float, float]] = []
+    for start, end in sorted_regions:
+        if end <= start:
+            continue
+        padded_start = max(0.0, start - pad)
+        padded_end = min(audio_duration, end + pad)
+        if padded_start >= audio_duration or padded_end <= 0.0:
+            continue
+
+        if merged and padded_start - merged[-1][1] <= merge_gap:
+            merged[-1] = (merged[-1][0], max(merged[-1][1], padded_end))
+        else:
+            merged.append((padded_start, padded_end))
+
+    return merged
+
+
+class AudioSeparator:
+    """Separates audio into vocals and background components using Demucs."""
+
+    SUPPORTED_MODELS: list[str] = ["htdemucs", "htdemucs_ft", "htdemucs_6s", "mdx_extra"]
+    STEM_NAMES = ["drums", "bass", "other", "vocals"]
+    STEM_NAMES_6S = ["drums", "bass", "other", "vocals", "guitar", "piano"]
+
+    def __init__(self, model_name: str = "htdemucs", device: str | None = None):
+        if model_name not in self.SUPPORTED_MODELS:
+            raise ValueError(f"Model '{model_name}' not supported. Supported: {self.SUPPORTED_MODELS}")
+
+        self.model_name = model_name
+        self.device = device
+        self._model: Any = None
+
+    def _init_local(self) -> None:
+        """Initialize local Demucs model."""
+        from demucs.pretrained import get_model
+
+        requested_device = self.device
+        device = select_device(self.device, mps_allowed=False)
+
+        self._model = get_model(self.model_name)
+        self._model.to(device)
+        self._model.eval()
+        self.device = device
+        log_device_initialization(
+            "AudioSeparator",
+            requested_device=requested_device,
+            resolved_device=device,
+        )
+
+    def _separate_local(self, audio: Audio) -> SeparatedAudio:
+        """Separate audio using local Demucs model.
+
+        Keeps the input tensor on CPU and passes ``device=self.device`` to
+        ``apply_model`` so per-chunk compute runs on GPU while the full
+        ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+        sources this is the difference between OOM-on-GPU and running cleanly:
+        a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+        comfortable on a 32 GB host.
+        """
+        import numpy as np
+        import torch
+        from demucs.apply import apply_model
+
+        if self._model is None:
+            self._init_local()
+
+        target_sr = self._model.samplerate
+
+        if audio.metadata.channels == 1:
+            audio = audio._to_stereo()
+
+        if audio.metadata.sample_rate != target_sr:
+            audio = audio.resample(target_sr)
+
+        audio_data = audio.data
+        if audio_data.ndim == 1:
+            audio_data = np.stack([audio_data, audio_data])
+        elif audio_data.ndim == 2:
+            audio_data = audio_data.T
+
+        wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
+
+        with torch.no_grad():
+            sources = apply_model(self._model, wav, device=self.device)
+
+        sources_np = sources[0].cpu().numpy()
+        del sources
+
+        stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+        vocals_idx = stem_names.index("vocals")
+        non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]
+
+        vocals_data = sources_np[vocals_idx].T
+        background_data = sources_np[non_vocal_indices].sum(axis=0).T
+        del sources_np
+
+        max_val = np.max(np.abs(background_data))
+        if max_val > 1.0:
+            background_data /= max_val
+
+        metadata = AudioMetadata(
+            sample_rate=target_sr,
+            channels=2,
+            sample_width=2,
+            duration_seconds=vocals_data.shape[0] / target_sr,
+            frame_count=vocals_data.shape[0],
+        )
+        vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+        background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)
+
+        return SeparatedAudio(
+            vocals=vocals,
+            background=background,
+            original=audio,
+            music=None,
+            effects=None,
+        )
+
+    def separate(self, audio: Audio) -> SeparatedAudio:
+        """Separate audio into vocals and background components."""
+        return self._separate_local(audio)
+
+    def separate_regions(
+        self,
+        audio: Audio,
+        regions: list[tuple[float, float]],
+        full_separation_threshold: float = 0.9,
+    ) -> SeparatedAudio:
+        """Separate only the given (start, end) regions; pass the rest through.
+
+        Demucs is the slowest stage of the dubbing pipeline. On talk-heavy
+        sources (podcasts, interviews) most of the track is speech, but
+        long pauses, silence, or music-only stretches don't need vocal
+        isolation — there's nothing to isolate. We run Demucs only on the
+        speech-bearing regions and treat the rest as pure background.
+
+        Output is full-length: vocals are silent outside the given
+        regions; background is the original audio outside the given
+        regions and the Demucs-separated background inside.
+
+        Args:
+            audio: Source audio (typically the full track).
+            regions: List of ``(start, end)`` second pairs marking
+                speech-bearing portions. Caller is responsible for
+                merging/padding (use ``_merge_regions``).
+            full_separation_threshold: If the regions cover more than
+                this fraction of the audio, fall back to full-track
+                ``separate()`` since per-region slicing+stitching
+                overhead would exceed the savings. Default 0.9.
+
+        Returns:
+            ``SeparatedAudio`` with full-length vocals and background.
+        """
+        import numpy as np
+
+        if not regions:
+            logger.info("separate_regions: no regions, returning silent vocals over original audio")
+            return self._passthrough_separation(audio)
+
+        total_duration = audio.metadata.duration_seconds
+        speech_duration = sum(end - start for start, end in regions)
+        if total_duration > 0 and speech_duration / total_duration >= full_separation_threshold:
+            logger.info(
+                "separate_regions: speech covers %.0f%% of audio (>=%.0f%%), using full-track separation",
+                speech_duration / total_duration * 100,
+                full_separation_threshold * 100,
+            )
+            return self._separate_local(audio)
+
+        logger.info(
+            "separate_regions: separating %.1fs of speech across %d region(s) (full duration: %.1fs)",
+            speech_duration,
+            len(regions),
+            total_duration,
+        )
+
+        # Build full-length output buffers. Background defaults to the
+        # original audio (so non-speech gaps pass through unchanged); vocals
+        # default to silence (no speech to isolate outside the regions).
+        # Both are stereo to match the full-track separation contract.
+        sr = audio.metadata.sample_rate
+        stereo_audio = audio if audio.metadata.channels == 2 else audio._to_stereo()
+
+        total_samples = len(stereo_audio.data)
+        vocals_full = np.zeros((total_samples, 2), dtype=np.float32)
+        background_full = stereo_audio.data.astype(np.float32, copy=True)
+
+        for start, end in regions:
+            chunk = audio.slice(start, end)
+            separated_chunk = self._separate_local(chunk)
+            chunk_vocals = separated_chunk.vocals.data
+            chunk_background = separated_chunk.background.data
+
+            # Demucs operates at its model sample rate (typically 44.1 kHz)
+            # and returns stereo. The slice of `audio` we passed in may have
+            # been resampled inside _separate_local, so resample the chunk
+            # outputs back to the source sample rate before splicing.
+            chunk_sr = separated_chunk.vocals.metadata.sample_rate
+            if chunk_sr != sr:
+                chunk_vocals = separated_chunk.vocals.resample(sr).data
+                chunk_background = separated_chunk.background.resample(sr).data
+
+            start_sample = int(start * sr)
+            end_sample = min(start_sample + len(chunk_vocals), total_samples)
+            length = end_sample - start_sample
+            if length <= 0:
+                continue
+
+            vocals_full[start_sample:end_sample] = chunk_vocals[:length]
+            background_full[start_sample:end_sample] = chunk_background[:length]
+
+        metadata = AudioMetadata(
+            sample_rate=sr,
+            channels=2,
+            sample_width=audio.metadata.sample_width,
+            duration_seconds=total_samples / sr,
+            frame_count=total_samples,
+        )
+        vocals = Audio(np.ascontiguousarray(vocals_full, dtype=np.float32), metadata)
+        background = Audio(np.ascontiguousarray(background_full, dtype=np.float32), metadata)
+
+        return SeparatedAudio(
+            vocals=vocals,
+            background=background,
+            original=stereo_audio,
+            music=None,
+            effects=None,
+        )
+
+    def _passthrough_separation(self, audio: Audio) -> SeparatedAudio:
+        """Return the original audio as background with silent vocals.
+
+        Used when no speech regions are present — there's nothing to
+        separate, so the entire signal is background by definition.
+        """
+        import numpy as np
+
+        stereo_audio = audio if audio.metadata.channels == 2 else audio._to_stereo()
+        silent_vocals_data = np.zeros_like(stereo_audio.data, dtype=np.float32)
+        vocals = Audio(silent_vocals_data, stereo_audio.metadata)
+
+        return SeparatedAudio(
+            vocals=vocals,
+            background=stereo_audio,
+            original=stereo_audio,
+            music=None,
+            effects=None,
+        )
+
+    def extract_vocals(self, audio: Audio) -> Audio:
+        """Convenience method to extract only vocals from audio."""
+        return self.separate(audio).vocals
+
+    def extract_background(self, audio: Audio) -> Audio:
+        """Convenience method to extract only background from audio."""
+        return self.separate(audio).background
+
+    def unload(self) -> None:
+        """Release the Demucs model so the next separate() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        release_device_memory(self.device)
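Concrete numbers through `_merge_regions` with its defaults (`pad=0.5`, `merge_gap=1.0`), assuming videopython 0.26.8 is importable:

```python
from videopython.ai.understanding.separation import _merge_regions

# Whisper found speech at 0-10 s, 10.8-30 s, and 95-100 s in a 120 s track.
regions = [(0.0, 10.0), (10.8, 30.0), (95.0, 100.0)]

merged = _merge_regions(regions, audio_duration=120.0)
print(merged)  # [(0.0, 30.5), (94.5, 100.5)]
# The first two regions pad to (0.0, 10.5) and (10.3, 30.5); their padded gap
# is negative, well under merge_gap=1.0, so they fuse. The third stays apart
# (94.5 - 30.5 = 64.0 s gap). Demucs then runs on 36.5 s instead of 120 s.
```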
videopython-0.26.6/src/videopython/ai/dubbing/remux.py

@@ -1,73 +0,0 @@
-"""ffmpeg helper for replacing a video file's audio track without re-encoding video."""
-
-from __future__ import annotations
-
-import logging
-import subprocess
-from pathlib import Path
-
-logger = logging.getLogger(__name__)
-
-
-class RemuxError(RuntimeError):
-    """ffmpeg failed while replacing an audio stream."""
-
-
-def replace_audio_stream(
-    video_path: str | Path,
-    audio_path: str | Path,
-    output_path: str | Path,
-    audio_codec: str = "aac",
-    audio_bitrate: str = "192k",
-) -> None:
-    """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
-
-    Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
-    ``-shortest`` trims to the shorter of the two streams so the output duration
-    matches the source video when the dubbed audio is slightly longer.
-
-    Args:
-        video_path: Source video file (video stream is copied unchanged).
-        audio_path: Audio file to use as the new audio track.
-        output_path: Destination file. Overwritten if it exists.
-        audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
-        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
-
-    Raises:
-        FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
-        RemuxError: If ffmpeg returns a non-zero exit code.
-    """
-    video_path = Path(video_path)
-    audio_path = Path(audio_path)
-    output_path = Path(output_path)
-
-    if not video_path.exists():
-        raise FileNotFoundError(f"Video file not found: {video_path}")
-    if not audio_path.exists():
-        raise FileNotFoundError(f"Audio file not found: {audio_path}")
-
-    cmd = [
-        "ffmpeg",
-        "-y",
-        "-i",
-        str(video_path),
-        "-i",
-        str(audio_path),
-        "-map",
-        "0:v:0",
-        "-map",
-        "1:a:0",
-        "-c:v",
-        "copy",
-        "-c:a",
-        audio_codec,
-        "-b:a",
-        audio_bitrate,
-        "-shortest",
-        str(output_path),
-    ]
-
-    logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
-    result = subprocess.run(cmd, capture_output=True)
-    if result.returncode != 0:
-        raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
videopython-0.26.6/src/videopython/ai/understanding/separation.py

@@ -1,131 +0,0 @@
-"""Audio source separation using local Demucs models."""
-
-from __future__ import annotations
-
-from typing import Any
-
-from videopython.ai._device import log_device_initialization, release_device_memory, select_device
-from videopython.ai.dubbing.models import SeparatedAudio
-from videopython.base.audio import Audio, AudioMetadata
-
-
-class AudioSeparator:
-    """Separates audio into vocals and background components using Demucs."""
-
-    SUPPORTED_MODELS: list[str] = ["htdemucs", "htdemucs_ft", "htdemucs_6s", "mdx_extra"]
-    STEM_NAMES = ["drums", "bass", "other", "vocals"]
-    STEM_NAMES_6S = ["drums", "bass", "other", "vocals", "guitar", "piano"]
-
-    def __init__(self, model_name: str = "htdemucs", device: str | None = None):
-        if model_name not in self.SUPPORTED_MODELS:
-            raise ValueError(f"Model '{model_name}' not supported. Supported: {self.SUPPORTED_MODELS}")
-
-        self.model_name = model_name
-        self.device = device
-        self._model: Any = None
-
-    def _init_local(self) -> None:
-        """Initialize local Demucs model."""
-        from demucs.pretrained import get_model
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        self._model = get_model(self.model_name)
-        self._model.to(device)
-        self._model.eval()
-        self.device = device
-        log_device_initialization(
-            "AudioSeparator",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _separate_local(self, audio: Audio) -> SeparatedAudio:
-        """Separate audio using local Demucs model.
-
-        Keeps the input tensor on CPU and passes ``device=self.device`` to
-        ``apply_model`` so per-chunk compute runs on GPU while the full
-        ``(stems, channels, samples)`` output is stored in CPU RAM. For long
-        sources this is the difference between OOM-on-GPU and running cleanly:
-        a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
-        comfortable on a 32 GB host.
-        """
-        import numpy as np
-        import torch
-        from demucs.apply import apply_model
-
-        if self._model is None:
-            self._init_local()
-
-        target_sr = self._model.samplerate
-
-        if audio.metadata.channels == 1:
-            audio = audio._to_stereo()
-
-        if audio.metadata.sample_rate != target_sr:
-            audio = audio.resample(target_sr)
-
-        audio_data = audio.data
-        if audio_data.ndim == 1:
-            audio_data = np.stack([audio_data, audio_data])
-        elif audio_data.ndim == 2:
-            audio_data = audio_data.T
-
-        wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-
-        with torch.no_grad():
-            sources = apply_model(self._model, wav, device=self.device)
-
-        sources_np = sources[0].cpu().numpy()
-        del sources
-
-        stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
-        vocals_idx = stem_names.index("vocals")
-        non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]
-
-        vocals_data = sources_np[vocals_idx].T
-        background_data = sources_np[non_vocal_indices].sum(axis=0).T
-        del sources_np
-
-        max_val = np.max(np.abs(background_data))
-        if max_val > 1.0:
-            background_data /= max_val
-
-        metadata = AudioMetadata(
-            sample_rate=target_sr,
-            channels=2,
-            sample_width=2,
-            duration_seconds=vocals_data.shape[0] / target_sr,
-            frame_count=vocals_data.shape[0],
-        )
-        vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
-        background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)
-
-        return SeparatedAudio(
-            vocals=vocals,
-            background=background,
-            original=audio,
-            music=None,
-            effects=None,
-        )
-
-    def separate(self, audio: Audio) -> SeparatedAudio:
-        """Separate audio into vocals and background components."""
-        return self._separate_local(audio)
-
-    def extract_vocals(self, audio: Audio) -> Audio:
-        """Convenience method to extract only vocals from audio."""
-        return self.separate(audio).vocals
-
-    def extract_background(self, audio: Audio) -> Audio:
-        """Convenience method to extract only background from audio."""
-        return self.separate(audio).background
-
-    def unload(self) -> None:
-        """Release the Demucs model so the next separate() re-initializes.
-
-        Used by low-memory dubbing to free VRAM between pipeline stages.
-        """
-        self._model = None
-        release_device_memory(self.device)