videopython 0.26.6__tar.gz → 0.26.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {videopython-0.26.6 → videopython-0.26.8}/PKG-INFO +1 -1
  2. {videopython-0.26.6 → videopython-0.26.8}/pyproject.toml +1 -1
  3. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/dubber.py +9 -13
  4. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/pipeline.py +98 -20
  5. videopython-0.26.8/src/videopython/ai/dubbing/remux.py +159 -0
  6. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/timing.py +46 -18
  7. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/audio.py +24 -9
  8. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/translation.py +27 -5
  9. videopython-0.26.8/src/videopython/ai/understanding/separation.py +304 -0
  10. videopython-0.26.6/src/videopython/ai/dubbing/remux.py +0 -73
  11. videopython-0.26.6/src/videopython/ai/understanding/separation.py +0 -131
  12. {videopython-0.26.6 → videopython-0.26.8}/.gitignore +0 -0
  13. {videopython-0.26.6 → videopython-0.26.8}/LICENSE +0 -0
  14. {videopython-0.26.6 → videopython-0.26.8}/README.md +0 -0
  15. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/__init__.py +0 -0
  16. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/__init__.py +0 -0
  17. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/_device.py +0 -0
  18. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/__init__.py +0 -0
  19. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/dubbing/models.py +0 -0
  20. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/__init__.py +0 -0
  21. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/image.py +0 -0
  22. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/generation/video.py +0 -0
  23. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/registry.py +0 -0
  24. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/__init__.py +0 -0
  25. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/inpainter.py +0 -0
  26. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/models.py +0 -0
  27. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/segmenter.py +0 -0
  28. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/swapping/swapper.py +0 -0
  29. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/transforms.py +0 -0
  30. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/__init__.py +0 -0
  31. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/audio.py +0 -0
  32. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/image.py +0 -0
  33. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/understanding/temporal.py +0 -0
  34. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/ai/video_analysis.py +0 -0
  35. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/__init__.py +0 -0
  36. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/audio/__init__.py +0 -0
  37. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/audio/analysis.py +0 -0
  38. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/audio/audio.py +0 -0
  39. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/combine.py +0 -0
  40. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/description.py +0 -0
  41. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/effects.py +0 -0
  42. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/exceptions.py +0 -0
  43. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/progress.py +0 -0
  44. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/registry.py +0 -0
  45. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/scene.py +0 -0
  46. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/streaming.py +0 -0
  47. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/text/__init__.py +0 -0
  48. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/text/overlay.py +0 -0
  49. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/text/transcription.py +0 -0
  50. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/transforms.py +0 -0
  51. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/transitions.py +0 -0
  52. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/utils.py +0 -0
  53. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/base/video.py +0 -0
  54. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/__init__.py +0 -0
  55. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/multicam.py +0 -0
  56. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/premiere_xml.py +0 -0
  57. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/editing/video_edit.py +0 -0
  58. {videopython-0.26.6 → videopython-0.26.8}/src/videopython/py.typed +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videopython
- Version: 0.26.6
+ Version: 0.26.8
  Summary: Minimal video generation and processing library.
  Project-URL: Homepage, https://videopython.com
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "videopython"
- version = "0.26.6"
+ version = "0.26.8"
  description = "Minimal video generation and processing library."
  authors = [
      { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
src/videopython/ai/dubbing/dubber.py
@@ -3,7 +3,6 @@
  from __future__ import annotations

  import logging
- import tempfile
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable

@@ -170,7 +169,7 @@ class VideoDubber:
          ``DubbingResult`` with the dubbed audio, translated segments, and
          source transcription. The output video is written to ``output_path``.
          """
-         from videopython.ai.dubbing.remux import replace_audio_stream
+         from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
          from videopython.base.audio import Audio

          input_path = Path(input_path)
@@ -196,17 +195,14 @@ class VideoDubber:
              transcription=transcription,
          )

-         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-             dubbed_audio_path = Path(tmp.name)
-         try:
-             result.dubbed_audio.save(dubbed_audio_path)
-             replace_audio_stream(
-                 video_path=input_path,
-                 audio_path=dubbed_audio_path,
-                 output_path=output_path,
-             )
-         finally:
-             dubbed_audio_path.unlink(missing_ok=True)
+         # Stream the dubbed Audio directly into ffmpeg via stdin instead of
+         # going through a temp WAV on disk. For a 2h dub the temp file would
+         # be ~10 GB written-then-read; the streaming path drops both copies.
+         replace_audio_stream_from_audio(
+             video_path=input_path,
+             audio=result.dubbed_audio,
+             output_path=output_path,
+         )

          return result

src/videopython/ai/dubbing/pipeline.py
@@ -3,14 +3,45 @@
  from __future__ import annotations

  import logging
+ import tempfile
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable, Literal

+ import numpy as np
+
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
  from videopython.ai.dubbing.timing import TimingSynchronizer

  if TYPE_CHECKING:
      from videopython.base.audio import Audio

+
+ def _peak_match(target: Audio, reference: Audio) -> Audio:
+     """Scale ``target`` so its peak amplitude matches ``reference``.
+
+     Demucs background normalization and the timing-assembler peak guard
+     each clamp at 1.0 instead of restoring headroom, so a dubbed mix
+     typically lands quieter than the source — perceptually "thinner."
+     A single peak match recovers most of that drift without LUFS deps.
+
+     No-op when either side has zero peak (silent input or all-silent dub).
+     The new ``Audio`` shares no buffer with ``target``.
+     """
+     from videopython.base.audio import Audio as _Audio
+
+     target_peak = float(np.max(np.abs(target.data))) if target.data.size else 0.0
+     reference_peak = float(np.max(np.abs(reference.data))) if reference.data.size else 0.0
+
+     if target_peak <= 0.0 or reference_peak <= 0.0:
+         return target
+
+     scale = reference_peak / target_peak
+     if abs(scale - 1.0) < 1e-3:
+         return target
+
+     return _Audio(target.data * scale, target.metadata)
+
+
  WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]

  logger = logging.getLogger(__name__)
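
A quick numeric sketch of what _peak_match does (values are illustrative, not from the package):

    import numpy as np

    # Source peaks at 0.95, dubbed mix at 0.60: the rescale factor is
    # 0.95 / 0.60 ≈ 1.583, so the matched dub peaks at 0.95 like the source.
    reference = np.array([0.2, -0.95, 0.5], dtype=np.float32)
    target = np.array([0.1, -0.60, 0.3], dtype=np.float32)
    scale = np.max(np.abs(reference)) / np.max(np.abs(target))
    matched = target * scale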
@@ -237,7 +268,19 @@ class LocalDubbingPipeline:
          if self._separator is None:
              self._init_separator()

-         separated_audio = self._separator.separate(source_audio)
+         # Limit Demucs to the speech-bearing portion of the audio. The
+         # transcription has already located every speech region; running
+         # source separation outside those is pure overhead (no vocals to
+         # isolate). On talk-heavy sources with silence/music gaps this
+         # roughly halves separation time. When speech covers most of the
+         # track separate_regions falls back to a full-track separate().
+         from videopython.ai.understanding.separation import _merge_regions
+
+         speech_regions = _merge_regions(
+             [(s.start, s.end) for s in transcription.segments],
+             audio_duration=source_audio.metadata.duration_seconds,
+         )
+         separated_audio = self._separator.separate_regions(source_audio, speech_regions)
          self._maybe_unload("_separator")
          vocal_audio = separated_audio.vocals
          background_audio = separated_audio.background
@@ -278,24 +321,46 @@ class LocalDubbingPipeline:
          target_durations: list[float] = []
          start_times: list[float] = []

-         for i, segment in enumerate(translated_segments):
-             if segment.duration < 0.1:
-                 continue
-
-             progress = 0.50 + (0.30 * (i / len(translated_segments)))
-             report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
-
-             speaker = segment.speaker or "speaker_0"
-             voice_sample = voice_samples.get(speaker)
-
-             if voice_clone and voice_sample is not None:
-                 dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample=voice_sample)
-             else:
-                 dubbed_audio = self._tts.generate_audio(segment.translated_text)
-
-             dubbed_segments.append(dubbed_audio)
-             target_durations.append(segment.duration)
-             start_times.append(segment.start)
+         # Encode each speaker's voice sample to a temp WAV exactly once and
+         # reuse the path across every segment for that speaker. Without this
+         # cache, TextToSpeech.generate_audio re-encodes the same voice sample
+         # on every call (one temp WAV write + delete per segment), which is
+         # pure overhead for long dubs with many segments per speaker.
+         speaker_wav_paths: dict[str, Path] = {}
+         try:
+             if voice_clone:
+                 for speaker, sample in voice_samples.items():
+                     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                         sample.save(f.name)
+                     speaker_wav_paths[speaker] = Path(f.name)
+
+             for i, segment in enumerate(translated_segments):
+                 if segment.duration < 0.1:
+                     continue
+                 # Translation filter (translation.py:_is_translatable_text)
+                 # leaves translated_text="" for punctuation-only or empty
+                 # segments. Don't TTS those — saves a model call and avoids
+                 # injecting hallucinated speech into the dubbed track.
+                 if not segment.translated_text.strip():
+                     continue
+
+                 progress = 0.50 + (0.30 * (i / len(translated_segments)))
+                 report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
+
+                 speaker = segment.speaker or "speaker_0"
+                 cached_path = speaker_wav_paths.get(speaker) if voice_clone else None
+
+                 if cached_path is not None:
+                     dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
+                 else:
+                     dubbed_audio = self._tts.generate_audio(segment.translated_text)
+
+                 dubbed_segments.append(dubbed_audio)
+                 target_durations.append(segment.duration)
+                 start_times.append(segment.start)
+         finally:
+             for path in speaker_wav_paths.values():
+                 path.unlink(missing_ok=True)

          self._maybe_unload("_tts")

@@ -325,6 +390,11 @@ class LocalDubbingPipeline:
          else:
              final_audio = dubbed_speech

+         # Peak-match against the source so the dub doesn't land quieter
+         # than the original. Done last so it captures both vocals+background
+         # mixes and speech-only outputs uniformly.
+         final_audio = _peak_match(final_audio, source_audio)
+
          report_progress("Complete", 1.0)

          return DubbingResult(
@@ -375,7 +445,13 @@ class LocalDubbingPipeline:
          if self._separator is None:
              self._init_separator()

-         separated_audio = self._separator.separate(source_audio)
+         from videopython.ai.understanding.separation import _merge_regions
+
+         speech_regions = _merge_regions(
+             [(s.start, s.end) for s in transcription.segments],
+             audio_duration=source_audio.metadata.duration_seconds,
+         )
+         separated_audio = self._separator.separate_regions(source_audio, speech_regions)
          self._maybe_unload("_separator")
          vocal_audio = separated_audio.vocals
          background_audio = separated_audio.background
@@ -430,6 +506,8 @@ class LocalDubbingPipeline:
          else:
              final_audio = generated_speech

+         final_audio = _peak_match(final_audio, source_audio)
+
          report_progress("Complete", 1.0)

          return RevoiceResult(
src/videopython/ai/dubbing/remux.py (new file)
@@ -0,0 +1,159 @@
+ """ffmpeg helper for replacing a video file's audio track without re-encoding video."""
+
+ from __future__ import annotations
+
+ import io
+ import logging
+ import subprocess
+ import wave
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+
+ if TYPE_CHECKING:
+     from videopython.base.audio import Audio
+
+ logger = logging.getLogger(__name__)
+
+
+ class RemuxError(RuntimeError):
+     """ffmpeg failed while replacing an audio stream."""
+
+
+ def replace_audio_stream(
+     video_path: str | Path,
+     audio_path: str | Path,
+     output_path: str | Path,
+     audio_codec: str = "aac",
+     audio_bitrate: str = "192k",
+ ) -> None:
+     """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
+
+     Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
+     ``-shortest`` trims to the shorter of the two streams so the output duration
+     matches the source video when the dubbed audio is slightly longer.
+
+     Args:
+         video_path: Source video file (video stream is copied unchanged).
+         audio_path: Audio file to use as the new audio track.
+         output_path: Destination file. Overwritten if it exists.
+         audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
+         audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+
+     Raises:
+         FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
+         RemuxError: If ffmpeg returns a non-zero exit code.
+     """
+     video_path = Path(video_path)
+     audio_path = Path(audio_path)
+     output_path = Path(output_path)
+
+     if not video_path.exists():
+         raise FileNotFoundError(f"Video file not found: {video_path}")
+     if not audio_path.exists():
+         raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+     cmd = [
+         "ffmpeg",
+         "-y",
+         "-i",
+         str(video_path),
+         "-i",
+         str(audio_path),
+         "-map",
+         "0:v:0",
+         "-map",
+         "1:a:0",
+         "-c:v",
+         "copy",
+         "-c:a",
+         audio_codec,
+         "-b:a",
+         audio_bitrate,
+         "-shortest",
+         str(output_path),
+     ]
+
+     logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
+     result = subprocess.run(cmd, capture_output=True)
+     if result.returncode != 0:
+         raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
+
+
+ def replace_audio_stream_from_audio(
+     video_path: str | Path,
+     audio: Audio,
+     output_path: str | Path,
+     audio_codec: str = "aac",
+     audio_bitrate: str = "192k",
+ ) -> None:
+     """Like ``replace_audio_stream`` but takes an in-memory ``Audio`` and pipes WAV to ffmpeg.
+
+     Avoids the ``Audio.save -> read-from-disk -> ffmpeg`` round-trip used by
+     the path-based variant: we serialize the WAV in memory and feed it to
+     ffmpeg via stdin. For long dubs this saves a full WAV write+read of the
+     output audio (~10 GB for a 2h source).
+
+     Args:
+         video_path: Source video file (video stream is copied unchanged).
+         audio: ``Audio`` instance to mux in as the new audio track.
+         output_path: Destination file. Overwritten if it exists.
+         audio_codec: ffmpeg audio codec name. Defaults to ``aac``.
+         audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+
+     Raises:
+         FileNotFoundError: If ``video_path`` does not exist.
+         RemuxError: If ffmpeg returns a non-zero exit code.
+     """
+     video_path = Path(video_path)
+     output_path = Path(output_path)
+
+     if not video_path.exists():
+         raise FileNotFoundError(f"Video file not found: {video_path}")
+
+     # Serialize Audio to WAV bytes in memory. Mirrors Audio.save's WAV writer:
+     # int16 samples, header from metadata. We stream these bytes to ffmpeg's
+     # stdin as the second input (the first is the video file on disk).
+     int_data = (audio.data * np.iinfo(np.int16).max).astype(np.int16)
+     wav_io = io.BytesIO()
+     with wave.open(wav_io, "wb") as wav_file:
+         wav_file.setnchannels(audio.metadata.channels)
+         wav_file.setsampwidth(audio.metadata.sample_width)
+         wav_file.setframerate(audio.metadata.sample_rate)
+         wav_file.writeframes(int_data.tobytes())
+     wav_bytes = wav_io.getvalue()
+
+     cmd = [
+         "ffmpeg",
+         "-y",
+         "-i",
+         str(video_path),
+         "-f",
+         "wav",
+         "-i",
+         "-",
+         "-map",
+         "0:v:0",
+         "-map",
+         "1:a:0",
+         "-c:v",
+         "copy",
+         "-c:a",
+         audio_codec,
+         "-b:a",
+         audio_bitrate,
+         "-shortest",
+         str(output_path),
+     ]
+
+     logger.info(
+         "replace_audio_stream_from_audio: %s + <stdin wav %d bytes> -> %s",
+         video_path,
+         len(wav_bytes),
+         output_path,
+     )
+     process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+     _, stderr = process.communicate(wav_bytes)
+     if process.returncode != 0:
+         raise RemuxError(f"ffmpeg failed (exit {process.returncode}): {stderr.decode(errors='replace')}")
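
A hedged usage sketch of the streaming variant (file names and the earlier pipeline step are placeholders; only replace_audio_stream_from_audio and DubbingResult.dubbed_audio appear in this diff):

    from videopython.ai.dubbing.remux import replace_audio_stream_from_audio

    # result is a DubbingResult produced earlier by the dubbing pipeline.
    replace_audio_stream_from_audio(
        video_path="input.mp4",      # video stream is copied, not re-encoded
        audio=result.dubbed_audio,   # in-memory Audio, piped to ffmpeg stdin
        output_path="output.mp4",    # gets AAC audio at 192k by default
    )

    # Equivalent hand-run ffmpeg invocation, with the WAV arriving on stdin:
    #   ffmpeg -y -i input.mp4 -f wav -i - -map 0:v:0 -map 1:a:0 \
    #          -c:v copy -c:a aac -b:a 192k -shortest output.mp4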
src/videopython/ai/dubbing/timing.py
@@ -4,7 +4,9 @@ from __future__ import annotations

  from dataclasses import dataclass

- from videopython.base.audio import Audio
+ import numpy as np
+
+ from videopython.base.audio import Audio, AudioMetadata


  @dataclass
@@ -181,32 +183,58 @@ class TimingSynchronizer:
          if len(audio_segments) != len(start_times):
              raise ValueError(f"Length mismatch: {len(audio_segments)} segments vs {len(start_times)} start times")

+         for start_time in start_times:
+             if start_time < 0:
+                 raise ValueError(f"Invalid start time: {start_time}")
+
          if not audio_segments:
              return Audio.create_silent(total_duration, stereo=False)

-         # Determine sample rate from first segment
+         # Single-pass assembler: allocate one mono float32 buffer and add each
+         # segment in place at its start sample. The previous implementation
+         # called Audio.overlay() per segment, which allocates np.zeros and
+         # copies the full track on every call — O(N * total_samples) memory
+         # traffic. For long dubs (thousands of segments) this loop dominated
+         # wall time and peak RAM.
          sample_rate = audio_segments[0].metadata.sample_rate
-
-         # Create base silent track
-         output = Audio.create_silent(total_duration, stereo=False, sample_rate=sample_rate)
-
-         # Overlay each segment at its start time
+         base_samples = max(int(total_duration * sample_rate), 0)
+
+         # Pre-normalize each segment to (mono, target sample rate) and compute
+         # placement bounds so the output buffer is sized to fit any segment
+         # that runs past total_duration (mirrors Audio.overlay's extend-on-OOB
+         # behavior so we don't silently truncate speech).
+         normalized: list[tuple[int, np.ndarray]] = []
+         end_sample = base_samples
          for audio, start_time in zip(audio_segments, start_times):
-             if start_time < 0:
-                 raise ValueError(f"Invalid start time: {start_time}")
-
-             # Resample if needed
              if audio.metadata.sample_rate != sample_rate:
                  audio = audio.resample(sample_rate)
-
-             # Convert to mono if needed
              if audio.metadata.channels > 1:
                  audio = audio.to_mono()
-
-             # Overlay at position
-             output = output.overlay(audio, position=start_time)
-
-         return output
+             start_sample = int(np.ceil(start_time * sample_rate))
+             seg_data = audio.data
+             normalized.append((start_sample, seg_data))
+             end_sample = max(end_sample, start_sample + len(seg_data))
+
+         output = np.zeros(end_sample, dtype=np.float32)
+         for start_sample, seg_data in normalized:
+             stop = start_sample + len(seg_data)
+             output[start_sample:stop] += seg_data
+
+         # Single post-mix peak guard, equivalent to Audio.overlay's per-call
+         # rescale collapsed into one pass. For non-overlapping dub segments
+         # this is a no-op; only the rare overlap case touches it.
+         max_amplitude = float(np.max(np.abs(output))) if output.size else 0.0
+         if max_amplitude > 1.0:
+             output /= max_amplitude
+
+         metadata = AudioMetadata(
+             sample_rate=sample_rate,
+             channels=1,
+             sample_width=audio_segments[0].metadata.sample_width,
+             duration_seconds=end_sample / sample_rate,
+             frame_count=end_sample,
+         )
+         return Audio(output, metadata)

      def check_overlaps(
          self,
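
A minimal standalone sketch of the single-pass scatter-add the new assembler uses (toy sizes, independent of the package):

    import numpy as np

    sample_rate = 4  # toy rate so the buffers stay readable
    segments = [np.ones(4, dtype=np.float32), np.full(4, 0.5, dtype=np.float32)]
    start_times = [0.0, 1.5]

    # Size the buffer to fit the furthest-reaching segment, then add each
    # segment in place: one pass, no per-segment full-track copies.
    starts = [int(np.ceil(t * sample_rate)) for t in start_times]
    end = max(s + len(seg) for s, seg in zip(starts, segments))
    out = np.zeros(end, dtype=np.float32)
    for s, seg in zip(starts, segments):
        out[s:s + len(seg)] += seg

    peak = np.max(np.abs(out))
    if peak > 1.0:  # single post-mix guard, as in the diff
        out /= peak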
src/videopython/ai/generation/audio.py
@@ -2,11 +2,14 @@

  from __future__ import annotations

- from typing import Any
+ from typing import TYPE_CHECKING, Any

  from videopython.ai._device import log_device_initialization, release_device_memory, select_device
  from videopython.base.audio import Audio, AudioMetadata

+ if TYPE_CHECKING:
+     from pathlib import Path
+

  class TextToSpeech:
      """Generates speech audio from text using Chatterbox Multilingual.
@@ -47,6 +50,7 @@ class TextToSpeech:
          self,
          text: str,
          voice_sample: Audio | None = None,
+         voice_sample_path: str | Path | None = None,
      ) -> Audio:
          """Generate speech audio from text.

@@ -54,6 +58,12 @@ class TextToSpeech:
              text: Text to synthesize.
              voice_sample: Optional voice sample to clone. Falls back to the
                  instance's ``voice`` and then to Chatterbox's default speaker.
+             voice_sample_path: Optional pre-encoded WAV path to use directly as
+                 the speaker prompt. Skips the per-call temp-WAV encode that
+                 ``voice_sample`` would otherwise trigger. When set, takes
+                 precedence over ``voice_sample`` and ``self.voice``. Used by
+                 the dubbing pipeline to encode each speaker's sample once and
+                 reuse it across all of that speaker's segments.
          """
          import tempfile
          from pathlib import Path
@@ -63,13 +73,18 @@ class TextToSpeech:
          if self._model is None:
              self._init_model()

-         effective_sample = voice_sample or self.voice
          speaker_wav_path: Path | None = None
-
-         if effective_sample is not None:
-             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                 effective_sample.save(f.name)
-             speaker_wav_path = Path(f.name)
+         cleanup_path = False
+
+         if voice_sample_path is not None:
+             speaker_wav_path = Path(voice_sample_path)
+         else:
+             effective_sample = voice_sample or self.voice
+             if effective_sample is not None:
+                 with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                     effective_sample.save(f.name)
+                 speaker_wav_path = Path(f.name)
+                 cleanup_path = True

          try:
              wav = self._model.generate(
@@ -91,8 +106,8 @@ class TextToSpeech:
              )
              return Audio(audio_data, metadata)
          finally:
-             if speaker_wav_path is not None:
-                 speaker_wav_path.unlink()
+             if cleanup_path and speaker_wav_path is not None:
+                 speaker_wav_path.unlink(missing_ok=True)

      def unload(self) -> None:
          """Release the TTS model so the next generate_audio() re-initializes.
src/videopython/ai/generation/translation.py
@@ -8,6 +8,17 @@ from videopython.ai._device import log_device_initialization, release_device_memory, select_device
  from videopython.ai.dubbing.models import TranslatedSegment
  from videopython.base.text.transcription import TranscriptionSegment

+
+ def _is_translatable_text(text: str) -> bool:
+     """Return True if text has enough content to be worth translating.
+
+     Whisper routinely emits punctuation-only or single-character segments
+     (" .", "...", "?", "♪") that MarianMT can hallucinate full sentences
+     from. Require at least 2 alphanumeric characters to filter these out.
+     """
+     return sum(1 for c in text if c.isalnum()) >= 2
+
+
  LANGUAGE_NAMES = {
      "en": "English",
      "es": "Spanish",
@@ -159,17 +170,28 @@ class TextTranslator:
          target_lang: str,
          source_lang: str | None = None,
      ) -> list[TranslatedSegment]:
-         """Translate transcription segments while preserving timing/speaker info."""
+         """Translate transcription segments while preserving timing/speaker info.
+
+         Segments whose text is empty or contains fewer than 2 alphanumeric
+         characters are not sent to the model — they receive
+         ``translated_text=""`` instead. This avoids MarianMT hallucinating
+         full sentences from " .", "...", or single-token Whisper segments,
+         which would otherwise be TTS'd into the dubbed track.
+         """
          effective_source = source_lang or "en"
-         texts = [segment.text for segment in segments]
-         translated_texts = self.translate_batch(texts, target_lang, source_lang)
+
+         translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
+         translatable_texts = [segments[i].text for i in translatable_indices]
+         translated_texts = self.translate_batch(translatable_texts, target_lang, source_lang)
+
+         translation_map: dict[int, str] = dict(zip(translatable_indices, translated_texts))

          translated_segments = []
-         for segment, translated_text in zip(segments, translated_texts):
+         for i, segment in enumerate(segments):
              translated_segments.append(
                  TranslatedSegment(
                      original_segment=segment,
-                     translated_text=translated_text,
+                     translated_text=translation_map.get(i, ""),
                      source_lang=effective_source,
                      target_lang=target_lang,
                      speaker=segment.speaker,
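
How the filter lands on typical Whisper output, straight from the two-alphanumeric rule:

    _is_translatable_text(" .")      # False: no alphanumeric characters
    _is_translatable_text("...")     # False
    _is_translatable_text("♪")       # False
    _is_translatable_text("a")       # False: only one alphanumeric
    _is_translatable_text("ok")      # True
    _is_translatable_text("Hello.")  # True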
src/videopython/ai/understanding/separation.py (new file)
@@ -0,0 +1,304 @@
+ """Audio source separation using local Demucs models."""
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any
+
+ from videopython.ai._device import log_device_initialization, release_device_memory, select_device
+ from videopython.ai.dubbing.models import SeparatedAudio
+ from videopython.base.audio import Audio, AudioMetadata
+
+ logger = logging.getLogger(__name__)
+
+
+ def _merge_regions(
+     regions: list[tuple[float, float]],
+     audio_duration: float,
+     pad: float = 0.5,
+     merge_gap: float = 1.0,
+ ) -> list[tuple[float, float]]:
+     """Merge overlapping/adjacent (start, end) ranges and pad each side.
+
+     Args:
+         regions: Speech regions in seconds. Order does not matter.
+         audio_duration: Total audio duration; output is clamped to ``[0, audio_duration]``.
+         pad: Seconds added to each side. Demucs needs context to separate
+             cleanly at boundaries; 0.5s avoids clipped onsets/decays.
+         merge_gap: Adjacent regions whose padded edges are within this
+             many seconds are merged. Avoids running Demucs on very short
+             slices (where its temporal context isn't there).
+
+     Returns:
+         Sorted list of non-overlapping (start, end) regions covering the
+         speech-bearing portion of the audio.
+     """
+     if not regions:
+         return []
+
+     sorted_regions = sorted(regions)
+
+     merged: list[tuple[float, float]] = []
+     for start, end in sorted_regions:
+         if end <= start:
+             continue
+         padded_start = max(0.0, start - pad)
+         padded_end = min(audio_duration, end + pad)
+         if padded_start >= audio_duration or padded_end <= 0.0:
+             continue
+
+         if merged and padded_start - merged[-1][1] <= merge_gap:
+             merged[-1] = (merged[-1][0], max(merged[-1][1], padded_end))
+         else:
+             merged.append((padded_start, padded_end))
+
+     return merged
+
+
+ class AudioSeparator:
+     """Separates audio into vocals and background components using Demucs."""
+
+     SUPPORTED_MODELS: list[str] = ["htdemucs", "htdemucs_ft", "htdemucs_6s", "mdx_extra"]
+     STEM_NAMES = ["drums", "bass", "other", "vocals"]
+     STEM_NAMES_6S = ["drums", "bass", "other", "vocals", "guitar", "piano"]
+
+     def __init__(self, model_name: str = "htdemucs", device: str | None = None):
+         if model_name not in self.SUPPORTED_MODELS:
+             raise ValueError(f"Model '{model_name}' not supported. Supported: {self.SUPPORTED_MODELS}")
+
+         self.model_name = model_name
+         self.device = device
+         self._model: Any = None
+
+     def _init_local(self) -> None:
+         """Initialize local Demucs model."""
+         from demucs.pretrained import get_model
+
+         requested_device = self.device
+         device = select_device(self.device, mps_allowed=False)
+
+         self._model = get_model(self.model_name)
+         self._model.to(device)
+         self._model.eval()
+         self.device = device
+         log_device_initialization(
+             "AudioSeparator",
+             requested_device=requested_device,
+             resolved_device=device,
+         )
+
+     def _separate_local(self, audio: Audio) -> SeparatedAudio:
+         """Separate audio using local Demucs model.
+
+         Keeps the input tensor on CPU and passes ``device=self.device`` to
+         ``apply_model`` so per-chunk compute runs on GPU while the full
+         ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+         sources this is the difference between OOM-on-GPU and running cleanly:
+         a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+         comfortable on a 32 GB host.
+         """
+         import numpy as np
+         import torch
+         from demucs.apply import apply_model
+
+         if self._model is None:
+             self._init_local()
+
+         target_sr = self._model.samplerate
+
+         if audio.metadata.channels == 1:
+             audio = audio._to_stereo()
+
+         if audio.metadata.sample_rate != target_sr:
+             audio = audio.resample(target_sr)
+
+         audio_data = audio.data
+         if audio_data.ndim == 1:
+             audio_data = np.stack([audio_data, audio_data])
+         elif audio_data.ndim == 2:
+             audio_data = audio_data.T
+
+         wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
+
+         with torch.no_grad():
+             sources = apply_model(self._model, wav, device=self.device)
+
+         sources_np = sources[0].cpu().numpy()
+         del sources
+
+         stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+         vocals_idx = stem_names.index("vocals")
+         non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]
+
+         vocals_data = sources_np[vocals_idx].T
+         background_data = sources_np[non_vocal_indices].sum(axis=0).T
+         del sources_np
+
+         max_val = np.max(np.abs(background_data))
+         if max_val > 1.0:
+             background_data /= max_val
+
+         metadata = AudioMetadata(
+             sample_rate=target_sr,
+             channels=2,
+             sample_width=2,
+             duration_seconds=vocals_data.shape[0] / target_sr,
+             frame_count=vocals_data.shape[0],
+         )
+         vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+         background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)
+
+         return SeparatedAudio(
+             vocals=vocals,
+             background=background,
+             original=audio,
+             music=None,
+             effects=None,
+         )
+
+     def separate(self, audio: Audio) -> SeparatedAudio:
+         """Separate audio into vocals and background components."""
+         return self._separate_local(audio)
+
+     def separate_regions(
+         self,
+         audio: Audio,
+         regions: list[tuple[float, float]],
+         full_separation_threshold: float = 0.9,
+     ) -> SeparatedAudio:
+         """Separate only the given (start, end) regions; pass the rest through.
+
+         Demucs is the slowest stage of the dubbing pipeline. On talk-heavy
+         sources (podcasts, interviews) most of the track is speech, but
+         long pauses, silence, or music-only stretches don't need vocal
+         isolation — there's nothing to isolate. We run Demucs only on the
+         speech-bearing regions and treat the rest as pure background.
+
+         Output is full-length: vocals are silent outside the given
+         regions; background is the original audio outside the given
+         regions and the Demucs-separated background inside.
+
+         Args:
+             audio: Source audio (typically the full track).
+             regions: List of ``(start, end)`` second pairs marking
+                 speech-bearing portions. Caller is responsible for
+                 merging/padding (use ``_merge_regions``).
+             full_separation_threshold: If the regions cover more than
+                 this fraction of the audio, fall back to full-track
+                 ``separate()`` since per-region slicing+stitching
+                 overhead would exceed the savings. Default 0.9.
+
+         Returns:
+             ``SeparatedAudio`` with full-length vocals and background.
+         """
+         import numpy as np
+
+         if not regions:
+             logger.info("separate_regions: no regions, returning silent vocals over original audio")
+             return self._passthrough_separation(audio)
+
+         total_duration = audio.metadata.duration_seconds
+         speech_duration = sum(end - start for start, end in regions)
+         if total_duration > 0 and speech_duration / total_duration >= full_separation_threshold:
+             logger.info(
+                 "separate_regions: speech covers %.0f%% of audio (>=%.0f%%), using full-track separation",
+                 speech_duration / total_duration * 100,
+                 full_separation_threshold * 100,
+             )
+             return self._separate_local(audio)
+
+         logger.info(
+             "separate_regions: separating %.1fs of speech across %d region(s) (full duration: %.1fs)",
+             speech_duration,
+             len(regions),
+             total_duration,
+         )
+
+         # Build full-length output buffers. Background defaults to the
+         # original audio (so non-speech gaps pass through unchanged); vocals
+         # default to silence (no speech to isolate outside the regions).
+         # Both are stereo to match the full-track separation contract.
+         sr = audio.metadata.sample_rate
+         stereo_audio = audio if audio.metadata.channels == 2 else audio._to_stereo()
+
+         total_samples = len(stereo_audio.data)
+         vocals_full = np.zeros((total_samples, 2), dtype=np.float32)
+         background_full = stereo_audio.data.astype(np.float32, copy=True)
+
+         for start, end in regions:
+             chunk = audio.slice(start, end)
+             separated_chunk = self._separate_local(chunk)
+             chunk_vocals = separated_chunk.vocals.data
+             chunk_background = separated_chunk.background.data
+
+             # Demucs operates at its model sample rate (typically 44.1 kHz)
+             # and returns stereo. The slice of `audio` we passed in may have
+             # been resampled inside _separate_local, so resample the chunk
+             # outputs back to the source sample rate before splicing.
+             chunk_sr = separated_chunk.vocals.metadata.sample_rate
+             if chunk_sr != sr:
+                 chunk_vocals = separated_chunk.vocals.resample(sr).data
+                 chunk_background = separated_chunk.background.resample(sr).data
+
+             start_sample = int(start * sr)
+             end_sample = min(start_sample + len(chunk_vocals), total_samples)
+             length = end_sample - start_sample
+             if length <= 0:
+                 continue
+
+             vocals_full[start_sample:end_sample] = chunk_vocals[:length]
+             background_full[start_sample:end_sample] = chunk_background[:length]
+
+         metadata = AudioMetadata(
+             sample_rate=sr,
+             channels=2,
+             sample_width=audio.metadata.sample_width,
+             duration_seconds=total_samples / sr,
+             frame_count=total_samples,
+         )
+         vocals = Audio(np.ascontiguousarray(vocals_full, dtype=np.float32), metadata)
+         background = Audio(np.ascontiguousarray(background_full, dtype=np.float32), metadata)
+
+         return SeparatedAudio(
+             vocals=vocals,
+             background=background,
+             original=stereo_audio,
+             music=None,
+             effects=None,
+         )
+
+     def _passthrough_separation(self, audio: Audio) -> SeparatedAudio:
+         """Return the original audio as background with silent vocals.
+
+         Used when no speech regions are present — there's nothing to
+         separate, so the entire signal is background by definition.
+         """
+         import numpy as np
+
+         stereo_audio = audio if audio.metadata.channels == 2 else audio._to_stereo()
+         silent_vocals_data = np.zeros_like(stereo_audio.data, dtype=np.float32)
+         vocals = Audio(silent_vocals_data, stereo_audio.metadata)
+
+         return SeparatedAudio(
+             vocals=vocals,
+             background=stereo_audio,
+             original=stereo_audio,
+             music=None,
+             effects=None,
+         )
+
+     def extract_vocals(self, audio: Audio) -> Audio:
+         """Convenience method to extract only vocals from audio."""
+         return self.separate(audio).vocals
+
+     def extract_background(self, audio: Audio) -> Audio:
+         """Convenience method to extract only background from audio."""
+         return self.separate(audio).background
+
+     def unload(self) -> None:
+         """Release the Demucs model so the next separate() re-initializes.
+
+         Used by low-memory dubbing to free VRAM between pipeline stages.
+         """
+         self._model = None
+         release_device_memory(self.device)
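
A worked example of _merge_regions with the default pad=0.5 and merge_gap=1.0 (region values are illustrative):

    from videopython.ai.understanding.separation import _merge_regions

    regions = [(4.8, 6.0), (3.0, 4.0), (20.0, 21.0)]  # unsorted on purpose
    merged = _merge_regions(regions, audio_duration=30.0)

    # Sorted and padded: (2.5, 4.5), (4.3, 6.5), (19.5, 21.5).
    # 4.3 - 4.5 <= 1.0, so the first two merge into (2.5, 6.5);
    # 19.5 - 6.5 = 13.0 > 1.0, so the last region stays separate.
    assert merged == [(2.5, 6.5), (19.5, 21.5)]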
src/videopython/ai/dubbing/remux.py (old file, removed; superseded by the rewritten remux.py above)
@@ -1,73 +0,0 @@
- """ffmpeg helper for replacing a video file's audio track without re-encoding video."""
-
- from __future__ import annotations
-
- import logging
- import subprocess
- from pathlib import Path
-
- logger = logging.getLogger(__name__)
-
-
- class RemuxError(RuntimeError):
-     """ffmpeg failed while replacing an audio stream."""
-
-
- def replace_audio_stream(
-     video_path: str | Path,
-     audio_path: str | Path,
-     output_path: str | Path,
-     audio_codec: str = "aac",
-     audio_bitrate: str = "192k",
- ) -> None:
-     """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
-
-     Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
-     ``-shortest`` trims to the shorter of the two streams so the output duration
-     matches the source video when the dubbed audio is slightly longer.
-
-     Args:
-         video_path: Source video file (video stream is copied unchanged).
-         audio_path: Audio file to use as the new audio track.
-         output_path: Destination file. Overwritten if it exists.
-         audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
-         audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
-
-     Raises:
-         FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
-         RemuxError: If ffmpeg returns a non-zero exit code.
-     """
-     video_path = Path(video_path)
-     audio_path = Path(audio_path)
-     output_path = Path(output_path)
-
-     if not video_path.exists():
-         raise FileNotFoundError(f"Video file not found: {video_path}")
-     if not audio_path.exists():
-         raise FileNotFoundError(f"Audio file not found: {audio_path}")
-
-     cmd = [
-         "ffmpeg",
-         "-y",
-         "-i",
-         str(video_path),
-         "-i",
-         str(audio_path),
-         "-map",
-         "0:v:0",
-         "-map",
-         "1:a:0",
-         "-c:v",
-         "copy",
-         "-c:a",
-         audio_codec,
-         "-b:a",
-         audio_bitrate,
-         "-shortest",
-         str(output_path),
-     ]
-
-     logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
-     result = subprocess.run(cmd, capture_output=True)
-     if result.returncode != 0:
-         raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
src/videopython/ai/understanding/separation.py (old file, removed; superseded by the rewritten separation.py above)
@@ -1,131 +0,0 @@
- """Audio source separation using local Demucs models."""
-
- from __future__ import annotations
-
- from typing import Any
-
- from videopython.ai._device import log_device_initialization, release_device_memory, select_device
- from videopython.ai.dubbing.models import SeparatedAudio
- from videopython.base.audio import Audio, AudioMetadata
-
-
- class AudioSeparator:
-     """Separates audio into vocals and background components using Demucs."""
-
-     SUPPORTED_MODELS: list[str] = ["htdemucs", "htdemucs_ft", "htdemucs_6s", "mdx_extra"]
-     STEM_NAMES = ["drums", "bass", "other", "vocals"]
-     STEM_NAMES_6S = ["drums", "bass", "other", "vocals", "guitar", "piano"]
-
-     def __init__(self, model_name: str = "htdemucs", device: str | None = None):
-         if model_name not in self.SUPPORTED_MODELS:
-             raise ValueError(f"Model '{model_name}' not supported. Supported: {self.SUPPORTED_MODELS}")
-
-         self.model_name = model_name
-         self.device = device
-         self._model: Any = None
-
-     def _init_local(self) -> None:
-         """Initialize local Demucs model."""
-         from demucs.pretrained import get_model
-
-         requested_device = self.device
-         device = select_device(self.device, mps_allowed=False)
-
-         self._model = get_model(self.model_name)
-         self._model.to(device)
-         self._model.eval()
-         self.device = device
-         log_device_initialization(
-             "AudioSeparator",
-             requested_device=requested_device,
-             resolved_device=device,
-         )
-
-     def _separate_local(self, audio: Audio) -> SeparatedAudio:
-         """Separate audio using local Demucs model.
-
-         Keeps the input tensor on CPU and passes ``device=self.device`` to
-         ``apply_model`` so per-chunk compute runs on GPU while the full
-         ``(stems, channels, samples)`` output is stored in CPU RAM. For long
-         sources this is the difference between OOM-on-GPU and running cleanly:
-         a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
-         comfortable on a 32 GB host.
-         """
-         import numpy as np
-         import torch
-         from demucs.apply import apply_model
-
-         if self._model is None:
-             self._init_local()
-
-         target_sr = self._model.samplerate
-
-         if audio.metadata.channels == 1:
-             audio = audio._to_stereo()
-
-         if audio.metadata.sample_rate != target_sr:
-             audio = audio.resample(target_sr)
-
-         audio_data = audio.data
-         if audio_data.ndim == 1:
-             audio_data = np.stack([audio_data, audio_data])
-         elif audio_data.ndim == 2:
-             audio_data = audio_data.T
-
-         wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-
-         with torch.no_grad():
-             sources = apply_model(self._model, wav, device=self.device)
-
-         sources_np = sources[0].cpu().numpy()
-         del sources
-
-         stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
-         vocals_idx = stem_names.index("vocals")
-         non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]
-
-         vocals_data = sources_np[vocals_idx].T
-         background_data = sources_np[non_vocal_indices].sum(axis=0).T
-         del sources_np
-
-         max_val = np.max(np.abs(background_data))
-         if max_val > 1.0:
-             background_data /= max_val
-
-         metadata = AudioMetadata(
-             sample_rate=target_sr,
-             channels=2,
-             sample_width=2,
-             duration_seconds=vocals_data.shape[0] / target_sr,
-             frame_count=vocals_data.shape[0],
-         )
-         vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
-         background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)
-
-         return SeparatedAudio(
-             vocals=vocals,
-             background=background,
-             original=audio,
-             music=None,
-             effects=None,
-         )
-
-     def separate(self, audio: Audio) -> SeparatedAudio:
-         """Separate audio into vocals and background components."""
-         return self._separate_local(audio)
-
-     def extract_vocals(self, audio: Audio) -> Audio:
-         """Convenience method to extract only vocals from audio."""
-         return self.separate(audio).vocals
-
-     def extract_background(self, audio: Audio) -> Audio:
-         """Convenience method to extract only background from audio."""
-         return self.separate(audio).background
-
-     def unload(self) -> None:
-         """Release the Demucs model so the next separate() re-initializes.
-
-         Used by low-memory dubbing to free VRAM between pipeline stages.
-         """
-         self._model = None
-         release_device_memory(self.device)