videopython 0.26.3__tar.gz → 0.26.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {videopython-0.26.3 → videopython-0.26.5}/PKG-INFO +1 -1
  2. {videopython-0.26.3 → videopython-0.26.5}/pyproject.toml +1 -1
  3. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/dubber.py +23 -3
  4. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/pipeline.py +70 -35
  5. videopython-0.26.5/src/videopython/ai/generation/audio.py +156 -0
  6. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/separation.py +27 -40
  7. videopython-0.26.3/src/videopython/ai/generation/audio.py +0 -215
  8. {videopython-0.26.3 → videopython-0.26.5}/.gitignore +0 -0
  9. {videopython-0.26.3 → videopython-0.26.5}/LICENSE +0 -0
  10. {videopython-0.26.3 → videopython-0.26.5}/README.md +0 -0
  11. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/__init__.py +0 -0
  12. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/__init__.py +0 -0
  13. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/_device.py +0 -0
  14. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/__init__.py +0 -0
  15. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/models.py +0 -0
  16. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/remux.py +0 -0
  17. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/timing.py +0 -0
  18. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/__init__.py +0 -0
  19. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/translation.py +0 -0
  21. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/generation/video.py +0 -0
  22. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/registry.py +0 -0
  23. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/__init__.py +0 -0
  24. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/inpainter.py +0 -0
  25. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/models.py +0 -0
  26. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/segmenter.py +0 -0
  27. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/swapping/swapper.py +0 -0
  28. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/transforms.py +0 -0
  29. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/__init__.py +0 -0
  30. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/audio.py +0 -0
  31. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/image.py +0 -0
  32. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/temporal.py +0 -0
  33. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/video_analysis.py +0 -0
  34. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/__init__.py +0 -0
  35. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/__init__.py +0 -0
  36. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/analysis.py +0 -0
  37. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/audio/audio.py +0 -0
  38. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/combine.py +0 -0
  39. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/description.py +0 -0
  40. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/effects.py +0 -0
  41. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/exceptions.py +0 -0
  42. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/progress.py +0 -0
  43. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/registry.py +0 -0
  44. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/scene.py +0 -0
  45. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/streaming.py +0 -0
  46. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/__init__.py +0 -0
  47. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/overlay.py +0 -0
  48. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/text/transcription.py +0 -0
  49. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/transforms.py +0 -0
  50. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/transitions.py +0 -0
  51. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/utils.py +0 -0
  52. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/base/video.py +0 -0
  53. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/__init__.py +0 -0
  54. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/multicam.py +0 -0
  55. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/premiere_xml.py +0 -0
  56. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/editing/video_edit.py +0 -0
  57. {videopython-0.26.3 → videopython-0.26.5}/src/videopython/py.typed +0 -0
{videopython-0.26.3 → videopython-0.26.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.3
+Version: 0.26.5
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
{videopython-0.26.3 → videopython-0.26.5}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.3"
+version = "0.26.5"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/dubber.py
@@ -8,6 +8,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
 
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
+from videopython.ai.dubbing.pipeline import WhisperModel
 
 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -25,19 +26,38 @@ class VideoDubber:
             model is resident at a time. Trades per-run latency (~10-30s of
             extra model loads) for a much lower memory ceiling. Recommended for
             GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+        whisper_model: Whisper model size used for transcription. Larger models
+            give better accuracy at the cost of VRAM and latency. One of
+            ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+            Default ``small``.
     """
 
-    def __init__(self, device: str | None = None, low_memory: bool = False):
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
         self.device = device
         self.low_memory = low_memory
+        self.whisper_model = whisper_model
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
+        logger.info(
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            requested,
+            low_memory,
+            whisper_model,
+        )
 
     def _init_local_pipeline(self) -> None:
         from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
 
-        self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
+        self._local_pipeline = LocalDubbingPipeline(
+            device=self.device,
+            low_memory=self.low_memory,
+            whisper_model=self.whisper_model,
+        )
 
     def dub(
         self,
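A note on usage: with this change the Whisper size becomes selectable at construction time. A minimal sketch, assuming VideoDubber is imported from the module path shown above (the argument values are illustrative):

    from videopython.ai.dubbing.dubber import VideoDubber

    # Larger whisper_model = better transcription accuracy, more VRAM and latency.
    dubber = VideoDubber(device="cuda", low_memory=True, whisper_model="medium")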
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/dubbing/pipeline.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
 from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -11,6 +11,8 @@ from videopython.ai.dubbing.timing import TimingSynchronizer
 if TYPE_CHECKING:
     from videopython.base.audio import Audio
 
+WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+
 logger = logging.getLogger(__name__)
 
 
@@ -23,21 +25,27 @@ class LocalDubbingPipeline:
     with <=12GB VRAM or hosts with <32GB RAM.
     """
 
-    def __init__(self, device: str | None = None, low_memory: bool = False):
+    def __init__(
+        self,
+        device: str | None = None,
+        low_memory: bool = False,
+        whisper_model: WhisperModel = "small",
+    ):
         self.device = device
         self.low_memory = low_memory
+        self.whisper_model = whisper_model
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
            requested,
            low_memory,
+            whisper_model,
         )
 
         self._transcriber: Any = None
         self._transcriber_diarization: bool | None = None
         self._translator: Any = None
         self._tts: Any = None
-        self._tts_voice_clone: bool | None = None
         self._tts_language: str | None = None
         self._separator: Any = None
         self._synchronizer: TimingSynchronizer | None = None
@@ -63,7 +71,11 @@ class LocalDubbingPipeline:
         """Initialize the transcription model."""
         from videopython.ai.understanding.audio import AudioToText
 
-        self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
+        self._transcriber = AudioToText(
+            model_name=self.whisper_model,
+            device=self.device,
+            enable_diarization=enable_diarization,
+        )
 
     def _init_translator(self) -> None:
         """Initialize the translation model."""
@@ -71,18 +83,11 @@ class LocalDubbingPipeline:
 
         self._translator = TextTranslator(device=self.device)
 
-    def _init_tts(self, voice_clone: bool = False, language: str = "en") -> None:
+    def _init_tts(self, language: str = "en") -> None:
         """Initialize the text-to-speech model."""
         from videopython.ai.generation.audio import TextToSpeech
 
-        if voice_clone:
-            self._tts = TextToSpeech(
-                model_size="chatterbox",
-                device=self.device,
-                language=language,
-            )
-        else:
-            self._tts = TextToSpeech(device=self.device, language=language)
+        self._tts = TextToSpeech(device=self.device, language=language)
 
     def _init_separator(self) -> None:
         """Initialize the audio separator."""
@@ -102,6 +107,7 @@ class LocalDubbingPipeline:
         max_duration: float = 10.0,
     ) -> dict[str, Any]:
         """Extract voice samples for each speaker from the audio."""
+        from videopython.base.audio import Audio
 
         voice_samples: dict[str, Audio] = {}
 
@@ -128,7 +134,11 @@ class LocalDubbingPipeline:
             if best_segment is not None:
                 start = best_segment.start
                 end = min(best_segment.end, start + max_duration)
-                voice_samples[speaker] = audio.slice(start, end)
+                sliced = audio.slice(start, end)
+                # Audio.slice returns a numpy view into the source. Copy so the
+                # short voice sample doesn't keep the full vocals array (~1.3 GB
+                # for 2h sources) alive across translate + TTS.
+                voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
 
         return voice_samples
 
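The copy in the hunk above is justified by plain numpy semantics rather than anything videopython-specific. A self-contained sketch of the view-pinning problem the comment describes (sizes are illustrative):

    import numpy as np

    full = np.zeros(340_000_000, dtype=np.float32)  # ~1.3 GB, stands in for the vocals buffer
    sample = full[:240_000]                         # slicing returns a view; no data is copied

    print(sample.base is full)       # True: the tiny view keeps the whole buffer alive
    print(sample.nbytes)             # ~1 MB of logical data ...
    print(sample.base.nbytes)        # ... pinning ~1.3 GB of backing storage

    independent = sample.copy()      # fresh ~1 MB allocation
    print(independent.base is None)  # True: dropping `full` can now free the 1.3 GB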
 
@@ -183,6 +193,7 @@ class LocalDubbingPipeline:
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
+        background_audio: Audio | None = None
 
         if preserve_background:
             report_progress("Separating audio", 0.15)
@@ -192,12 +203,24 @@ class LocalDubbingPipeline:
             separated_audio = self._separator.separate(source_audio)
             self._maybe_unload("_separator")
             vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            # In low_memory mode, drop the SeparatedAudio container so vocals
+            # and background can be released as soon as their last local
+            # reference goes (after voice-sample extraction and final overlay
+            # respectively). The result will report separated_audio=None.
+            if self.low_memory:
+                separated_audio = None
 
         voice_samples: dict[str, Audio] = {}
         if voice_clone:
             report_progress("Extracting voice samples", 0.25)
             voice_samples = self._extract_voice_samples(vocal_audio, transcription)
 
+        # vocals is no longer needed; voice_samples are independent copies.
+        # In low_memory mode this is the only ref keeping the buffer alive
+        # (separated_audio was dropped above), so dropping the local frees it.
+        del vocal_audio
+
         report_progress("Translating text", 0.35)
         if self._translator is None:
             self._init_translator()
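The `separated_audio = None` / `del vocal_audio` pairing leans on CPython's refcounting, which frees a buffer the moment its last reference disappears. A toy sketch of the same reference chain (class and variable names are hypothetical stand-ins, not videopython API):

    import weakref

    import numpy as np

    class Stems:  # stands in for SeparatedAudio
        def __init__(self, vocals):
            self.vocals = vocals

    buf = np.zeros(10_000_000, dtype=np.float32)  # ~40 MB
    alive = weakref.ref(buf)

    stems = Stems(buf)      # container reference (separated_audio)
    vocals = stems.vocals   # local reference (vocal_audio)
    del buf

    stems = None            # drop the container, as in low_memory mode
    print(alive() is None)  # False: the local name still pins the buffer
    del vocals              # del vocal_audio
    print(alive() is None)  # True: last reference gone, memory returned immediately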
@@ -210,9 +233,8 @@ class LocalDubbingPipeline:
         self._maybe_unload("_translator")
 
         report_progress("Generating dubbed speech", 0.50)
-        if self._tts is None or self._tts_voice_clone != voice_clone or self._tts_language != target_lang:
-            self._init_tts(voice_clone=voice_clone, language=target_lang)
-            self._tts_voice_clone = voice_clone
+        if self._tts is None or self._tts_language != target_lang:
+            self._init_tts(language=target_lang)
             self._tts_language = target_lang
 
         dubbed_segments: list[Audio] = []
@@ -246,17 +268,23 @@ class LocalDubbingPipeline:
         assert self._synchronizer is not None
 
         synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        del dubbed_segments
 
         report_progress("Assembling final audio", 0.90)
         total_duration = source_audio.metadata.duration_seconds
         dubbed_speech = self._synchronizer.assemble_with_timing(synchronized_segments, start_times, total_duration)
+        del synchronized_segments
 
-        if separated_audio is not None:
-            background_sr = separated_audio.background.metadata.sample_rate
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
             if dubbed_speech.metadata.sample_rate != background_sr:
                 dubbed_speech = dubbed_speech.resample(background_sr)
 
-            final_audio = separated_audio.background.overlay(dubbed_speech, position=0.0)
+            final_audio = background_audio.overlay(dubbed_speech, position=0.0)
+            # Drop the local; in low_memory this releases the background
+            # buffer (~1.3 GB for 2h sources). In non-low_memory the same
+            # array is still held by separated_audio.background.
+            del background_audio
         else:
             final_audio = dubbed_speech
 
@@ -303,6 +331,7 @@ class LocalDubbingPipeline:
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
+        background_audio: Audio | None = None
 
         if preserve_background:
             report_progress("Separating audio", 0.20)
@@ -312,6 +341,9 @@ class LocalDubbingPipeline:
             separated_audio = self._separator.separate(source_audio)
             self._maybe_unload("_separator")
             vocal_audio = separated_audio.vocals
+            background_audio = separated_audio.background
+            if self.low_memory:
+                separated_audio = None
 
         report_progress("Extracting voice sample", 0.40)
         voice_sample: Audio | None = None
@@ -323,12 +355,15 @@ class LocalDubbingPipeline:
 
         if voice_sample is None:
             sample_duration = min(6.0, original_duration)
-            voice_sample = vocal_audio.slice(0, sample_duration)
+            sliced = vocal_audio.slice(0, sample_duration)
+            # Copy so the short sample doesn't pin the full vocals array.
+            voice_sample = Audio(sliced.data.copy(), sliced.metadata)
+
+        del vocal_audio
 
         report_progress("Generating speech", 0.60)
-        if self._tts is None or self._tts_voice_clone is not True or self._tts_language != "en":
-            self._init_tts(voice_clone=True, language="en")
-            self._tts_voice_clone = True
+        if self._tts is None or self._tts_language != "en":
+            self._init_tts(language="en")
             self._tts_language = "en"
 
         generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
@@ -337,24 +372,24 @@ class LocalDubbingPipeline:
 
         report_progress("Assembling audio", 0.85)
 
-        if separated_audio is not None:
-            background_sr = separated_audio.background.metadata.sample_rate
+        if background_audio is not None:
+            background_sr = background_audio.metadata.sample_rate
             if generated_speech.metadata.sample_rate != background_sr:
                 generated_speech = generated_speech.resample(background_sr)
 
-            background = separated_audio.background
-            if background.metadata.duration_seconds > speech_duration:
-                background = background.slice(0, speech_duration)
-            elif background.metadata.duration_seconds < speech_duration:
-                silence_duration = speech_duration - background.metadata.duration_seconds
+            if background_audio.metadata.duration_seconds > speech_duration:
+                background_audio = background_audio.slice(0, speech_duration)
+            elif background_audio.metadata.duration_seconds < speech_duration:
+                silence_duration = speech_duration - background_audio.metadata.duration_seconds
                 silence = Audio.silence(
                     duration=silence_duration,
                     sample_rate=background_sr,
-                    channels=background.metadata.channels,
+                    channels=background_audio.metadata.channels,
                 )
-                background = background.concat(silence)
+                background_audio = background_audio.concat(silence)
 
-            final_audio = background.overlay(generated_speech, position=0.0)
+            final_audio = background_audio.overlay(generated_speech, position=0.0)
+            del background_audio
         else:
             final_audio = generated_speech
 
videopython-0.26.5/src/videopython/ai/generation/audio.py
@@ -0,0 +1,156 @@
+"""Audio generation using local models."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
+from videopython.base.audio import Audio, AudioMetadata
+
+
+class TextToSpeech:
+    """Generates speech audio from text using Chatterbox Multilingual.
+
+    Backed by Chatterbox Multilingual (Resemble AI). When ``voice_sample`` is
+    provided to ``generate_audio``, the model clones that voice; otherwise it
+    falls back to Chatterbox's built-in default speaker.
+    """
+
+    SAMPLE_RATE: int = 24000
+
+    def __init__(
+        self,
+        voice: Audio | None = None,
+        device: str | None = None,
+        language: str = "en",
+    ):
+        self.voice = voice
+        self.device = device
+        self.language = language
+        self._model: Any = None
+
+    def _init_model(self) -> None:
+        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
+
+        requested_device = self.device
+        device = select_device(self.device, mps_allowed=False)
+
+        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+        self.device = device
+        log_device_initialization(
+            "TextToSpeech",
+            requested_device=requested_device,
+            resolved_device=device,
+        )
+
+    def generate_audio(
+        self,
+        text: str,
+        voice_sample: Audio | None = None,
+    ) -> Audio:
+        """Generate speech audio from text.
+
+        Args:
+            text: Text to synthesize.
+            voice_sample: Optional voice sample to clone. Falls back to the
+                instance's ``voice`` and then to Chatterbox's default speaker.
+        """
+        import tempfile
+        from pathlib import Path
+
+        import numpy as np
+
+        if self._model is None:
+            self._init_model()
+
+        effective_sample = voice_sample or self.voice
+        speaker_wav_path: Path | None = None
+
+        if effective_sample is not None:
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                effective_sample.save(f.name)
+                speaker_wav_path = Path(f.name)
+
+        try:
+            wav = self._model.generate(
+                text=text,
+                language_id=self.language,
+                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
+            )
+
+            audio_data = wav.cpu().float().numpy().squeeze()
+            if audio_data.ndim == 0:
+                audio_data = np.array([audio_data], dtype=np.float32)
+
+            metadata = AudioMetadata(
+                sample_rate=self.SAMPLE_RATE,
+                channels=1,
+                sample_width=2,
+                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
+                frame_count=len(audio_data),
+            )
+            return Audio(audio_data, metadata)
+        finally:
+            if speaker_wav_path is not None:
+                speaker_wav_path.unlink()
+
+    def unload(self) -> None:
+        """Release the TTS model so the next generate_audio() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        release_device_memory(self.device)
+
+
+class TextToMusic:
+    """Generates music from text descriptions using MusicGen."""
+
+    def __init__(self, device: str | None = None):
+        self.device = device
+        self._processor: Any = None
+        self._model: Any = None
+        self._device: str | None = None
+
+    def _init_local(self) -> None:
+        """Initialize local MusicGen model."""
+        import os
+
+        from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+        requested_device = self.device
+        self._device = select_device(self.device, mps_allowed=True)
+
+        model_name = "facebook/musicgen-small"
+        self._processor = AutoProcessor.from_pretrained(model_name)
+        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
+        self._model.to(self._device)
+        self.device = self._device
+        log_device_initialization(
+            "TextToMusic",
+            requested_device=requested_device,
+            resolved_device=self._device,
+        )
+
+    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
+        """Generate music audio from text description."""
+        if self._model is None:
+            self._init_local()
+
+        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
+        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
+        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
+        sampling_rate = self._model.config.audio_encoder.sampling_rate
+
+        audio_data = audio_values[0, 0].cpu().float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=sampling_rate,
+            channels=1,
+            sample_width=2,
+            duration_seconds=len(audio_data) / sampling_rate,
+            frame_count=len(audio_data),
+        )
+        return Audio(audio_data, metadata)
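A minimal usage sketch for the rewritten module; the constructor and method signatures are read directly off the file above, while the input text is illustrative:

    from videopython.ai.generation.audio import TextToMusic, TextToSpeech

    tts = TextToSpeech(language="en")      # no voice/voice_sample: default speaker
    speech = tts.generate_audio("Welcome to videopython.")
    tts.unload()                           # release VRAM between pipeline stages

    music = TextToMusic().generate_audio("calm lo-fi piano", max_new_tokens=256)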
{videopython-0.26.3 → videopython-0.26.5}/src/videopython/ai/understanding/separation.py
@@ -42,7 +42,15 @@ class AudioSeparator:
         )
 
     def _separate_local(self, audio: Audio) -> SeparatedAudio:
-        """Separate audio using local Demucs model."""
+        """Separate audio using local Demucs model.
+
+        Keeps the input tensor on CPU and passes ``device=self.device`` to
+        ``apply_model`` so per-chunk compute runs on GPU while the full
+        ``(stems, channels, samples)`` output is stored in CPU RAM. For long
+        sources this is the difference between OOM-on-GPU and running cleanly:
+        a 2h stereo @ 44.1kHz output is ~10 GB — too big for an 8 GB card but
+        comfortable on a 32 GB host.
+        """
         import numpy as np
         import torch
         from demucs.apply import apply_model
@@ -65,61 +73,40 @@ class AudioSeparator:
             audio_data = audio_data.T
 
         wav = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
-        wav = wav.to(self.device)
 
         with torch.no_grad():
             sources = apply_model(self._model, wav, device=self.device)
 
         sources_np = sources[0].cpu().numpy()
+        del sources
 
         stem_names = self.STEM_NAMES_6S if self.model_name == "htdemucs_6s" else self.STEM_NAMES
+        vocals_idx = stem_names.index("vocals")
+        non_vocal_indices = [i for i in range(len(stem_names)) if i != vocals_idx]
 
-        stems: dict[str, Audio] = {}
-        for i, name in enumerate(stem_names):
-            stem_data = sources_np[i].T
-
-            metadata = AudioMetadata(
-                sample_rate=target_sr,
-                channels=2,
-                sample_width=2,
-                duration_seconds=stem_data.shape[0] / target_sr,
-                frame_count=stem_data.shape[0],
-            )
-            stems[name] = Audio(stem_data.astype(np.float32), metadata)
-
-        vocals = stems["vocals"]
-
-        non_vocal_stems = [stems[name] for name in stem_names if name != "vocals"]
-        background_data = np.zeros_like(vocals.data)
-        for stem in non_vocal_stems:
-            background_data += stem.data
+        vocals_data = sources_np[vocals_idx].T
+        background_data = sources_np[non_vocal_indices].sum(axis=0).T
+        del sources_np
 
         max_val = np.max(np.abs(background_data))
         if max_val > 1.0:
-            background_data = background_data / max_val
-
-        background = Audio(background_data.astype(np.float32), vocals.metadata)
-
-        music_stems = ["drums", "bass", "other"]
-        if self.model_name == "htdemucs_6s":
-            music_stems.extend(["guitar", "piano"])
-
-        music_data = np.zeros_like(vocals.data)
-        for name in music_stems:
-            if name in stems:
-                music_data += stems[name].data
-
-        max_val = np.max(np.abs(music_data))
-        if max_val > 1.0:
-            music_data = music_data / max_val
-
-        music = Audio(music_data.astype(np.float32), vocals.metadata)
+            background_data /= max_val
+
+        metadata = AudioMetadata(
+            sample_rate=target_sr,
+            channels=2,
+            sample_width=2,
+            duration_seconds=vocals_data.shape[0] / target_sr,
+            frame_count=vocals_data.shape[0],
+        )
+        vocals = Audio(np.ascontiguousarray(vocals_data, dtype=np.float32), metadata)
+        background = Audio(np.ascontiguousarray(background_data, dtype=np.float32), metadata)
 
         return SeparatedAudio(
             vocals=vocals,
             background=background,
             original=audio,
-            music=music,
+            music=None,
             effects=None,
         )
 
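The ~10 GB figure in the new docstring is easy to verify with back-of-the-envelope arithmetic (4-stem htdemucs output, stereo float32):

    stems, channels = 4, 2
    seconds = 2 * 60 * 60        # 2h source
    sample_rate = 44_100
    bytes_per_sample = 4         # float32

    total = stems * channels * seconds * sample_rate * bytes_per_sample
    print(f"{total / 2**30:.1f} GiB")  # ~9.5 GiB, i.e. the quoted ~10 GB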
videopython-0.26.3/src/videopython/ai/generation/audio.py
@@ -1,215 +0,0 @@
-"""Audio generation using local models."""
-
-from __future__ import annotations
-
-from typing import Any
-
-from videopython.ai._device import log_device_initialization, release_device_memory, select_device
-from videopython.base.audio import Audio, AudioMetadata
-
-
-class TextToSpeech:
-    """Generates speech audio from text using local models.
-
-    Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
-    (`chatterbox`) for multilingual voice cloning.
-    """
-
-    SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
-
-    CHATTERBOX_SAMPLE_RATE: int = 24000
-
-    def __init__(
-        self,
-        model_size: str = "base",
-        voice: str | None = None,
-        device: str | None = None,
-        language: str = "en",
-    ):
-        if model_size not in self.SUPPORTED_LOCAL_MODELS:
-            raise ValueError(f"model_size must be one of {self.SUPPORTED_LOCAL_MODELS}, got '{model_size}'")
-
-        self.model_size = model_size
-        self.voice = voice
-        self.device = device
-        self.language = language
-        self._model: Any = None
-        self._processor: Any = None
-        self._chatterbox_model: Any = None
-
-    def _init_local(self) -> None:
-        """Initialize local Bark model."""
-        from transformers import AutoModel, AutoProcessor
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        model_name = "suno/bark" if self.model_size == "base" else "suno/bark-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = AutoModel.from_pretrained(model_name).to(device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _init_chatterbox(self) -> None:
-        """Initialize Chatterbox Multilingual model for voice cloning."""
-        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
-
-        requested_device = self.device
-        device = select_device(self.device, mps_allowed=False)
-
-        self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
-        self.device = device
-        log_device_initialization(
-            "TextToSpeech",
-            requested_device=requested_device,
-            resolved_device=device,
-        )
-
-    def _generate_local(self, text: str, voice_preset: str | None) -> Audio:
-        """Generate speech using Bark."""
-        import torch
-
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
-        inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
-
-        with torch.no_grad():
-            speech_values = self._model.generate(**inputs, do_sample=True)
-
-        audio_data = speech_values.cpu().float().numpy().squeeze()
-        sample_rate = self._model.generation_config.sample_rate
-
-        metadata = AudioMetadata(
-            sample_rate=sample_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sample_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)
-
-    def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
-        """Generate speech using Chatterbox Multilingual with voice cloning."""
-        import tempfile
-        from pathlib import Path
-
-        import numpy as np
-
-        if self._chatterbox_model is None:
-            self._init_chatterbox()
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            voice_sample.save(f.name)
-            speaker_wav_path = Path(f.name)
-
-        try:
-            wav = self._chatterbox_model.generate(
-                text=text,
-                language_id=self.language,
-                audio_prompt_path=str(speaker_wav_path),
-            )
-
-            audio_data = wav.cpu().float().numpy().squeeze()
-            if audio_data.ndim == 0:
-                audio_data = np.array([audio_data], dtype=np.float32)
-
-            sample_rate = self.CHATTERBOX_SAMPLE_RATE
-
-            metadata = AudioMetadata(
-                sample_rate=sample_rate,
-                channels=1,
-                sample_width=2,
-                duration_seconds=len(audio_data) / sample_rate,
-                frame_count=len(audio_data),
-            )
-            return Audio(audio_data, metadata)
-        finally:
-            speaker_wav_path.unlink()
-
-    def generate_audio(
-        self,
-        text: str,
-        voice_preset: str | None = None,
-        voice_sample: Audio | None = None,
-    ) -> Audio:
-        """Generate speech audio from text."""
-        effective_voice = voice_preset or self.voice
-
-        if self.model_size == "chatterbox" or voice_sample is not None:
-            if voice_sample is None:
-                raise ValueError(
-                    "voice_sample is required for Chatterbox voice cloning. "
-                    "Provide an Audio sample of the voice to clone."
-                )
-            return self._generate_chatterbox(text, voice_sample)
-
-        return self._generate_local(text, effective_voice)
-
-    def unload(self) -> None:
-        """Release the TTS model(s) so the next generate_audio() re-initializes.
-
-        Used by low-memory dubbing to free VRAM between pipeline stages.
-        """
-        self._model = None
-        self._processor = None
-        self._chatterbox_model = None
-        release_device_memory(self.device)
-
-
-class TextToMusic:
-    """Generates music from text descriptions using MusicGen."""
-
-    def __init__(self, device: str | None = None):
-        self.device = device
-        self._processor: Any = None
-        self._model: Any = None
-        self._device: str | None = None
-
-    def _init_local(self) -> None:
-        """Initialize local MusicGen model."""
-        import os
-
-        from transformers import AutoProcessor, MusicgenForConditionalGeneration
-
-        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
-
-        requested_device = self.device
-        self._device = select_device(self.device, mps_allowed=True)
-
-        model_name = "facebook/musicgen-small"
-        self._processor = AutoProcessor.from_pretrained(model_name)
-        self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-        self._model.to(self._device)
-        self.device = self._device
-        log_device_initialization(
-            "TextToMusic",
-            requested_device=requested_device,
-            resolved_device=self._device,
-        )
-
-    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
-        """Generate music audio from text description."""
-        if self._model is None:
-            self._init_local()
-
-        inputs = self._processor(text=[text], padding=True, return_tensors="pt")
-        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
-        audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
-        sampling_rate = self._model.config.audio_encoder.sampling_rate
-
-        audio_data = audio_values[0, 0].cpu().float().numpy()
-
-        metadata = AudioMetadata(
-            sample_rate=sampling_rate,
-            channels=1,
-            sample_width=2,
-            duration_seconds=len(audio_data) / sampling_rate,
-            frame_count=len(audio_data),
-        )
-        return Audio(audio_data, metadata)
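Taken together with the new module earlier in this diff, the visible migration for callers (both signatures are read directly off the deleted and added files; the text and device values are illustrative):

    from videopython.ai.generation.audio import TextToSpeech

    # 0.26.3: voice cloning meant opting into the chatterbox backend, and
    # voice_sample was then mandatory:
    #   tts = TextToSpeech(model_size="chatterbox", device="cuda", language="de")
    #   audio = tts.generate_audio("Guten Tag.", voice_sample=sample)

    # 0.26.5: Chatterbox is the only backend; model_size and voice_preset are gone
    # and voice_sample is optional (default speaker when omitted):
    tts = TextToSpeech(device="cuda", language="de")
    audio = tts.generate_audio("Guten Tag.")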