videopython 0.26.4__tar.gz → 0.26.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.26.4 → videopython-0.26.6}/PKG-INFO +1 -1
  2. {videopython-0.26.4 → videopython-0.26.6}/pyproject.toml +1 -1
  3. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/dubber.py +40 -9
  4. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/pipeline.py +102 -20
  5. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/audio.py +42 -0
  6. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/separation.py +27 -40
  7. {videopython-0.26.4 → videopython-0.26.6}/.gitignore +0 -0
  8. {videopython-0.26.4 → videopython-0.26.6}/LICENSE +0 -0
  9. {videopython-0.26.4 → videopython-0.26.6}/README.md +0 -0
  10. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/__init__.py +0 -0
  11. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/__init__.py +0 -0
  12. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/_device.py +0 -0
  13. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/__init__.py +0 -0
  14. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/models.py +0 -0
  15. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/remux.py +0 -0
  16. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/dubbing/timing.py +0 -0
  17. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/__init__.py +0 -0
  18. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/audio.py +0 -0
  19. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/translation.py +0 -0
  21. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/generation/video.py +0 -0
  22. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/registry.py +0 -0
  23. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/__init__.py +0 -0
  24. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/inpainter.py +0 -0
  25. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/models.py +0 -0
  26. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/segmenter.py +0 -0
  27. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/swapping/swapper.py +0 -0
  28. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/transforms.py +0 -0
  29. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/__init__.py +0 -0
  30. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/image.py +0 -0
  31. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/understanding/temporal.py +0 -0
  32. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/ai/video_analysis.py +0 -0
  33. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/description.py +0 -0
  39. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/base/video.py +0 -0
  52. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.26.4 → videopython-0.26.6}/src/videopython/py.typed +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videopython
- Version: 0.26.4
+ Version: 0.26.6
  Summary: Minimal video generation and processing library.
  Project-URL: Homepage, https://videopython.com
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "videopython"
- version = "0.26.4"
+ version = "0.26.6"
  description = "Minimal video generation and processing library."
  authors = [
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

src/videopython/ai/dubbing/dubber.py
@@ -8,6 +8,7 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable

  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
+ from videopython.ai.dubbing.pipeline import WhisperModel

  if TYPE_CHECKING:
  from videopython.base.video import Video
@@ -25,19 +26,38 @@ class VideoDubber:
  model is resident at a time. Trades per-run latency (~10-30s of
  extra model loads) for a much lower memory ceiling. Recommended for
  GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+ whisper_model: Whisper model size used for transcription. Larger models
+ give better accuracy at the cost of VRAM and latency. One of
+ ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+ Default ``small``.
  """

- def __init__(self, device: str | None = None, low_memory: bool = False):
+ def __init__(
+ self,
+ device: str | None = None,
+ low_memory: bool = False,
+ whisper_model: WhisperModel = "small",
+ ):
  self.device = device
  self.low_memory = low_memory
+ self.whisper_model = whisper_model
  self._local_pipeline: Any = None
  requested = device.lower() if isinstance(device, str) else "auto"
- logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
+ logger.info(
+ "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+ requested,
+ low_memory,
+ whisper_model,
+ )

  def _init_local_pipeline(self) -> None:
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline

- self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
+ self._local_pipeline = LocalDubbingPipeline(
+ device=self.device,
+ low_memory=self.low_memory,
+ whisper_model=self.whisper_model,
+ )

  def dub(
  self,
@@ -54,9 +74,14 @@ class VideoDubber:

  Args:
  enable_diarization: Enable speaker diarization to clone each speaker's
- voice separately. Requires additional VRAM for the diarization model.
- transcription: Optional pre-computed Transcription object. When provided,
- the internal Whisper transcription step is skipped.
+ voice separately. With ``transcription=None``, runs alongside Whisper.
+ With a supplied ``transcription`` that has no speakers, runs pyannote
+ standalone and overlays speakers onto the supplied words. Ignored when
+ the supplied transcription already has speaker labels.
+ transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+ step. Speaker labels on the supplied transcription drive per-speaker
+ voice cloning. If it has no speakers, pass ``enable_diarization=True``
+ to add them via pyannote (requires word-level timings).
  """
  if self._local_pipeline is None:
  self._init_local_pipeline()
@@ -86,8 +111,10 @@ class VideoDubber:
  """Dub a video and return a new video with the dubbed audio.

  Args:
- transcription: Optional pre-computed Transcription object. When provided,
- the internal Whisper transcription step is skipped.
+ transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+ step. Speaker labels on the supplied transcription drive per-speaker
+ voice cloning. See ``dub()`` for the interaction with
+ ``enable_diarization``.
  """
  result = self.dub(
  video=video,
@@ -132,8 +159,12 @@ class VideoDubber:
  preserve_background: Preserve background music/effects via source separation.
  voice_clone: Clone the source speaker's voice for the dubbed track.
  enable_diarization: Enable speaker diarization for per-speaker voice cloning.
+ See ``dub()`` for the interaction with ``transcription``.
  progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
- transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.
+ transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+ step. Speaker labels on the supplied transcription drive per-speaker
+ voice cloning. If it has no speakers, pass ``enable_diarization=True``
+ to add them via pyannote (requires word-level timings).

  Returns:
  ``DubbingResult`` with the dubbed audio, translated segments, and
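
Taken together, the dubber.py changes surface the Whisper size and the diarization behaviour directly on VideoDubber. A minimal sketch of how the new parameters combine, using only arguments visible in this diff; the video object, the device string, and any further required dub() arguments (such as the target language) are assumptions and are omitted or marked as such:

    from videopython.ai.dubbing.dubber import VideoDubber

    # `video` is assumed to be an already-loaded videopython Video; other required
    # dub() arguments (e.g. the target language) are not shown in this diff and
    # are omitted here.
    dubber = VideoDubber(
        device="cuda",           # assumed device string; None auto-selects
        low_memory=True,         # keep only one model resident at a time
        whisper_model="medium",  # new in 0.26.6, default is "small"
    )

    # No transcription supplied: Whisper runs, diarization runs alongside it.
    result = dubber.dub(video=video, enable_diarization=True)

    # Pre-computed transcription without speakers: pyannote runs standalone and
    # overlays speakers onto the supplied words (requires word-level timings).
    result = dubber.dub(video=video, transcription=my_transcription, enable_diarization=True)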

src/videopython/ai/dubbing/pipeline.py
@@ -3,7 +3,7 @@
  from __future__ import annotations

  import logging
- from typing import TYPE_CHECKING, Any, Callable
+ from typing import TYPE_CHECKING, Any, Callable, Literal

  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
  from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -11,6 +11,8 @@ from videopython.ai.dubbing.timing import TimingSynchronizer
  if TYPE_CHECKING:
  from videopython.base.audio import Audio

+ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+
  logger = logging.getLogger(__name__)

@@ -23,14 +25,21 @@ class LocalDubbingPipeline:
  with <=12GB VRAM or hosts with <32GB RAM.
  """

- def __init__(self, device: str | None = None, low_memory: bool = False):
+ def __init__(
+ self,
+ device: str | None = None,
+ low_memory: bool = False,
+ whisper_model: WhisperModel = "small",
+ ):
  self.device = device
  self.low_memory = low_memory
+ self.whisper_model = whisper_model
  requested = device.lower() if isinstance(device, str) else "auto"
  logger.info(
- "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+ "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
  requested,
  low_memory,
+ whisper_model,
  )

  self._transcriber: Any = None
@@ -62,7 +71,11 @@ class LocalDubbingPipeline:
  """Initialize the transcription model."""
  from videopython.ai.understanding.audio import AudioToText

- self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
+ self._transcriber = AudioToText(
+ model_name=self.whisper_model,
+ device=self.device,
+ enable_diarization=enable_diarization,
+ )

  def _init_translator(self) -> None:
  """Initialize the translation model."""
@@ -94,6 +107,7 @@ class LocalDubbingPipeline:
  max_duration: float = 10.0,
  ) -> dict[str, Any]:
  """Extract voice samples for each speaker from the audio."""
+ from videopython.base.audio import Audio

  voice_samples: dict[str, Audio] = {}

@@ -120,7 +134,11 @@ class LocalDubbingPipeline:
  if best_segment is not None:
  start = best_segment.start
  end = min(best_segment.end, start + max_duration)
- voice_samples[speaker] = audio.slice(start, end)
+ sliced = audio.slice(start, end)
+ # Audio.slice returns a numpy view into the source. Copy so the
+ # short voice sample doesn't keep the full vocals array (~1.3 GB
+ # for 2h sources) alive across translate + TTS.
+ voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)

  return voice_samples

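The copy added above matters because Audio.slice is described here as returning a NumPy view, and a view keeps its whole base array alive. A small, self-contained illustration of that behaviour in plain NumPy (independent of videopython):

    import numpy as np

    full = np.zeros(300_000_000, dtype=np.float32)  # ~1.2 GB, stand-in for a long vocals track
    sample_view = full[:100_000]                    # basic slicing returns a view
    sample_copy = full[:100_000].copy()             # independent ~0.4 MB buffer

    print(sample_view.base is full)   # True: holding the view pins the full ~1.2 GB
    print(sample_copy.base is None)   # True: only the small buffer stays alive
    del full                          # the big buffer is freed only once no view remains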
@@ -144,7 +162,16 @@ class LocalDubbingPipeline:
  transcription: Optional pre-computed Transcription object. When provided,
  the internal Whisper transcription step is skipped (saving time and VRAM).
  Must be a ``videopython.base.text.transcription.Transcription`` instance
- with populated ``segments``.
+ with populated ``segments``. Speaker labels on the supplied transcription
+ drive per-speaker voice cloning. If the supplied transcription has no
+ speakers and ``enable_diarization=True``, pyannote is run standalone on
+ ``source_audio`` and speakers are attached to the supplied words
+ (requires word-level timings).
+ enable_diarization: When True, run speaker diarization to enable per-speaker
+ voice cloning. With ``transcription=None``, runs alongside Whisper. With
+ a supplied ``transcription`` that has no speakers, runs pyannote
+ standalone and overlays speakers onto the supplied words. Ignored when
+ the supplied transcription already has speaker labels.
  """

  def report_progress(stage: str, progress: float) -> None:
@@ -153,6 +180,34 @@ class LocalDubbingPipeline:

  if transcription is not None:
  report_progress("Using provided transcription", 0.05)
+ if transcription.speakers:
+ logger.info(
+ "Using provided transcription: %d segment(s), %d speaker(s)",
+ len(transcription.segments),
+ len(transcription.speakers),
+ )
+ if enable_diarization:
+ logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
+ elif enable_diarization:
+ report_progress("Diarizing supplied transcription", 0.10)
+ if self._transcriber is None or self._transcriber_diarization is not True:
+ self._init_transcriber(enable_diarization=True)
+ self._transcriber_diarization = True
+ transcription = self._transcriber.diarize_transcription(source_audio, transcription)
+ self._maybe_unload("_transcriber")
+ logger.info(
+ "Diarized supplied transcription: %d segment(s), %d speaker(s)",
+ len(transcription.segments),
+ len(transcription.speakers),
+ )
+ else:
+ logger.info(
+ "Using provided transcription: %d segment(s), no speaker labels. "
+ "All segments will share a single voice clone. Pass "
+ "enable_diarization=True to add per-speaker labels, or "
+ "voice_clone=False to use the default TTS voice.",
+ len(transcription.segments),
+ )
  else:
  report_progress("Transcribing audio", 0.05)
@@ -175,6 +230,7 @@ class LocalDubbingPipeline:

  separated_audio: SeparatedAudio | None = None
  vocal_audio = source_audio
+ background_audio: Audio | None = None

  if preserve_background:
  report_progress("Separating audio", 0.15)
@@ -184,12 +240,24 @@ class LocalDubbingPipeline:
  separated_audio = self._separator.separate(source_audio)
  self._maybe_unload("_separator")
  vocal_audio = separated_audio.vocals
+ background_audio = separated_audio.background
+ # In low_memory mode, drop the SeparatedAudio container so vocals
+ # and background can be released as soon as their last local
+ # reference goes (after voice-sample extraction and final overlay
+ # respectively). The result will report separated_audio=None.
+ if self.low_memory:
+ separated_audio = None

  voice_samples: dict[str, Audio] = {}
  if voice_clone:
  report_progress("Extracting voice samples", 0.25)
  voice_samples = self._extract_voice_samples(vocal_audio, transcription)

+ # vocals is no longer needed; voice_samples are independent copies.
+ # In low_memory mode this is the only ref keeping the buffer alive
+ # (separated_audio was dropped above), so dropping the local frees it.
+ del vocal_audio
+
  report_progress("Translating text", 0.35)
  if self._translator is None:
  self._init_translator()
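
The new del statements lean on CPython reference counting: a NumPy buffer is released as soon as its last reference disappears, so dropping the one remaining local is enough and no explicit garbage-collection call is needed. A minimal illustration of that behaviour outside the pipeline:

    import sys
    import numpy as np

    vocals = np.zeros(100_000_000, dtype=np.float32)  # ~400 MB stand-in for the vocals stem
    container = {"vocals": vocals}                     # plays the role of SeparatedAudio

    container = None                    # analogue of `separated_audio = None` in low_memory mode
    print(sys.getrefcount(vocals) - 1)  # 1: only the local name still holds the buffer
    del vocals                          # last reference gone, the ~400 MB is freed immediately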
@@ -237,17 +305,23 @@ class LocalDubbingPipeline:
  assert self._synchronizer is not None

  synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+ del dubbed_segments

  report_progress("Assembling final audio", 0.90)
  total_duration = source_audio.metadata.duration_seconds
  dubbed_speech = self._synchronizer.assemble_with_timing(synchronized_segments, start_times, total_duration)
+ del synchronized_segments

- if separated_audio is not None:
- background_sr = separated_audio.background.metadata.sample_rate
+ if background_audio is not None:
+ background_sr = background_audio.metadata.sample_rate
  if dubbed_speech.metadata.sample_rate != background_sr:
  dubbed_speech = dubbed_speech.resample(background_sr)

- final_audio = separated_audio.background.overlay(dubbed_speech, position=0.0)
+ final_audio = background_audio.overlay(dubbed_speech, position=0.0)
+ # Drop the local; in low_memory this releases the background
+ # buffer (~1.3 GB for 2h sources). In non-low_memory the same
+ # array is still held by separated_audio.background.
+ del background_audio
  else:
  final_audio = dubbed_speech

@@ -294,6 +368,7 @@ class LocalDubbingPipeline:

  separated_audio: SeparatedAudio | None = None
  vocal_audio = source_audio
+ background_audio: Audio | None = None

  if preserve_background:
  report_progress("Separating audio", 0.20)
@@ -303,6 +378,9 @@ class LocalDubbingPipeline:
  separated_audio = self._separator.separate(source_audio)
  self._maybe_unload("_separator")
  vocal_audio = separated_audio.vocals
+ background_audio = separated_audio.background
+ if self.low_memory:
+ separated_audio = None

  report_progress("Extracting voice sample", 0.40)
  voice_sample: Audio | None = None
@@ -314,7 +392,11 @@ class LocalDubbingPipeline:

  if voice_sample is None:
  sample_duration = min(6.0, original_duration)
- voice_sample = vocal_audio.slice(0, sample_duration)
+ sliced = vocal_audio.slice(0, sample_duration)
+ # Copy so the short sample doesn't pin the full vocals array.
+ voice_sample = Audio(sliced.data.copy(), sliced.metadata)
+
+ del vocal_audio

  report_progress("Generating speech", 0.60)
  if self._tts is None or self._tts_language != "en":
@@ -327,24 +409,24 @@ class LocalDubbingPipeline:

  report_progress("Assembling audio", 0.85)

- if separated_audio is not None:
- background_sr = separated_audio.background.metadata.sample_rate
+ if background_audio is not None:
+ background_sr = background_audio.metadata.sample_rate
  if generated_speech.metadata.sample_rate != background_sr:
  generated_speech = generated_speech.resample(background_sr)

- background = separated_audio.background
- if background.metadata.duration_seconds > speech_duration:
- background = background.slice(0, speech_duration)
- elif background.metadata.duration_seconds < speech_duration:
- silence_duration = speech_duration - background.metadata.duration_seconds
+ if background_audio.metadata.duration_seconds > speech_duration:
+ background_audio = background_audio.slice(0, speech_duration)
+ elif background_audio.metadata.duration_seconds < speech_duration:
+ silence_duration = speech_duration - background_audio.metadata.duration_seconds
  silence = Audio.silence(
  duration=silence_duration,
  sample_rate=background_sr,
- channels=background.metadata.channels,
+ channels=background_audio.metadata.channels,
  )
- background = background.concat(silence)
+ background_audio = background_audio.concat(silence)

- final_audio = background.overlay(generated_speech, position=0.0)
+ final_audio = background_audio.overlay(generated_speech, position=0.0)
+ del background_audio
  else:
  final_audio = generated_speech

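Both assembly paths resample the synthetic speech to the background's sample rate before overlaying; mixing tracks with mismatched rates would silently change durations. A quick numeric check of why, with an assumed 24 kHz TTS output against a 44.1 kHz background (the TTS rate is an assumption, not taken from this diff):

    sample_rate_tts, sample_rate_bg = 24_000, 44_100
    seconds = 10
    frames_tts = sample_rate_tts * seconds
    # Interpreted at 44.1 kHz without resampling, those frames would play for only:
    print(frames_tts / sample_rate_bg)  # ~5.44 s instead of 10 s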

src/videopython/ai/understanding/audio.py
@@ -130,6 +130,48 @@ class AudioToText:
  )
  return result

+ def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
+ """Attach speaker labels to a pre-computed transcription using pyannote.
+
+ Useful when callers have a transcription (e.g. pre-computed and edited)
+ but no speakers, and want per-speaker voice cloning in dubbing without
+ re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
+ speakers onto the supplied transcription's words.
+
+ Requires word-level timings: at least one segment must contain more
+ than one word. Transcriptions loaded from SRT (one synthetic word per
+ segment) will not produce useful speakers and are rejected.
+ """
+ import numpy as np
+ import torch
+
+ all_words: list[TranscriptionWord] = list(transcription.words)
+ if not all_words:
+ raise ValueError("Cannot diarize a transcription with no words.")
+
+ if not any(len(seg.words) > 1 for seg in transcription.segments):
+ raise ValueError(
+ "Cannot diarize a transcription without word-level timings. "
+ "Supplied transcription has at most one word per segment "
+ "(e.g. loaded from SRT). Provide a transcription with "
+ "word-level timings, or omit `transcription` to let the "
+ "pipeline transcribe and diarize from scratch."
+ )
+
+ if self._diarization_pipeline is None:
+ self._init_diarization()
+
+ import whisper
+
+ audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+ waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
+ diarization_result = self._diarization_pipeline(
+ {"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
+ )
+
+ all_words = self._assign_speakers_to_words(all_words, diarization_result)
+ return Transcription(words=all_words, language=transcription.language)
+
  def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
  """Transcribe with word timestamps and assign speakers via pyannote."""
  import numpy as np
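
The new method is also usable outside the dubbing pipeline. A sketch under the assumption that `audio` is a videopython Audio object and `transcription` was produced earlier with word-level timings (neither is shown in this diff):

    from videopython.ai.understanding.audio import AudioToText

    transcriber = AudioToText(model_name="small", enable_diarization=True)

    # `audio` and `transcription` are assumed to exist already; the transcription
    # must carry word-level timings (SRT-derived transcriptions are rejected).
    with_speakers = transcriber.diarize_transcription(audio, transcription)
    print(with_speakers.speakers)  # e.g. speaker labels such as SPEAKER_00, SPEAKER_01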

src/videopython/ai/understanding/separation.py
@@ -42,7 +42,15 @@ class AudioSeparator:
  )

  def _separate_local(self, audio: Audio) -> SeparatedAudio:
- """Separate audio using local Demucs model."""
+ """Separate audio using local Demucs model.
+
+ Keeps the input tensor on CPU and passes ``device=self.device`` to
+ ``apply_model`` so per-chunk compute runs on GPU while the full
+ ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+ sources this is the difference between OOM-on-GPU and running cleanly:
+ a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+ comfortable on a 32 GB host.
+ """
  import numpy as np
  import torch
  from demucs.apply import apply_model
@@ -65,61 +73,40 @@ class AudioSeparator:
  audio_data = audio_data.T

  wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
- wav = wav.to(self.device)

  with torch.no_grad():
  sources = apply_model(self._model, wav, device=self.device)

  sources_np = sources[0].cpu().numpy()
+ del sources

  stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+ vocals_idx = stem_names.index("vocals")
+ non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]

- stems: dict[str, Audio] = {}
- for i, name in enumerate(stem_names):
- stem_data = sources_np[i].T
-
- metadata = AudioMetadata(
- sample_rate=target_sr,
- channels=2,
- sample_width=2,
- duration_seconds=stem_data.shape[0] / target_sr,
- frame_count=stem_data.shape[0],
- )
- stems[name] = Audio(stem_data.astype(np.float32), metadata)
-
- vocals = stems["vocals"]
-
- non_vocal_stems = [stems[name] for name in stem_names if name != "vocals"]
- background_data = np.zeros_like(vocals.data)
- for stem in non_vocal_stems:
- background_data += stem.data
+ vocals_data = sources_np[vocals_idx].T
+ background_data = sources_np[non_vocal_indices].sum(axis=0).T
+ del sources_np

  max_val = np.max(np.abs(background_data))
  if max_val > 1.0:
- background_data = background_data / max_val
-
- background = Audio(background_data.astype(np.float32), vocals.metadata)
-
- music_stems = ["drums", "bass", "other"]
- if self.model_name == "htdemucs_6s":
- music_stems.extend(["guitar", "piano"])
-
- music_data = np.zeros_like(vocals.data)
- for name in music_stems:
- if name in stems:
- music_data += stems[name].data
-
- max_val = np.max(np.abs(music_data))
- if max_val > 1.0:
- music_data = music_data / max_val
-
- music = Audio(music_data.astype(np.float32), vocals.metadata)
+ background_data /= max_val
+
+ metadata = AudioMetadata(
+ sample_rate=target_sr,
+ channels=2,
+ sample_width=2,
+ duration_seconds=vocals_data.shape[0] / target_sr,
+ frame_count=vocals_data.shape[0],
+ )
+ vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+ background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)

  return SeparatedAudio(
  vocals=vocals,
  background=background,
  original=audio,
- music=music,
+ music=None,
  effects=None,
  )

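The ~10 GB figure in the new _separate_local docstring follows directly from the output shape kept in CPU RAM. A back-of-the-envelope check, assuming the default 4-stem htdemucs model and Demucs' native 44.1 kHz stereo float32 output:

    # (stems, channels, samples) float32 output for a 2-hour source
    stems, channels, sample_rate, seconds, bytes_per_sample = 4, 2, 44_100, 2 * 3600, 4
    total_bytes = stems * channels * sample_rate * seconds * bytes_per_sample
    print(total_bytes / 1024**3)  # ~9.5 GiB (~10.2 GB): fits a 32 GB host, not an 8 GB GPU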