videopython 0.26.2__tar.gz → 0.26.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {videopython-0.26.2 → videopython-0.26.4}/PKG-INFO +1 -1
  2. {videopython-0.26.2 → videopython-0.26.4}/pyproject.toml +1 -1
  3. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/dubber.py +82 -2
  4. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/pipeline.py +19 -26
  5. videopython-0.26.4/src/videopython/ai/dubbing/remux.py +73 -0
  6. videopython-0.26.4/src/videopython/ai/generation/audio.py +156 -0
  7. videopython-0.26.2/src/videopython/ai/generation/audio.py +0 -215
  8. {videopython-0.26.2 → videopython-0.26.4}/.gitignore +0 -0
  9. {videopython-0.26.2 → videopython-0.26.4}/LICENSE +0 -0
  10. {videopython-0.26.2 → videopython-0.26.4}/README.md +0 -0
  11. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/__init__.py +0 -0
  12. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/__init__.py +0 -0
  13. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/_device.py +0 -0
  14. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/__init__.py +0 -0
  15. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/models.py +0 -0
  16. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/dubbing/timing.py +0 -0
  17. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/__init__.py +0 -0
  18. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/image.py +0 -0
  19. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/translation.py +0 -0
  20. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/audio.py +0 -0
  30. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/image.py +0 -0
  31. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/separation.py +0 -0
  32. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/understanding/temporal.py +0 -0
  33. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/ai/video_analysis.py +0 -0
  34. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/__init__.py +0 -0
  35. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/audio/__init__.py +0 -0
  36. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/audio/analysis.py +0 -0
  37. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/audio/audio.py +0 -0
  38. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/combine.py +0 -0
  39. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/description.py +0 -0
  40. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/effects.py +0 -0
  41. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/exceptions.py +0 -0
  42. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/progress.py +0 -0
  43. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/registry.py +0 -0
  44. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/scene.py +0 -0
  45. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/streaming.py +0 -0
  46. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/text/__init__.py +0 -0
  47. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/text/overlay.py +0 -0
  48. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/text/transcription.py +0 -0
  49. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/transforms.py +0 -0
  50. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/transitions.py +0 -0
  51. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/utils.py +0 -0
  52. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/base/video.py +0 -0
  53. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/__init__.py +0 -0
  54. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/multicam.py +0 -0
  55. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/premiere_xml.py +0 -0
  56. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/editing/video_edit.py +0 -0
  57. {videopython-0.26.2 → videopython-0.26.4}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.26.2
3
+ Version: 0.26.4
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.26.2"
3
+ version = "0.26.4"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -3,6 +3,8 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import logging
6
+ import tempfile
7
+ from pathlib import Path
6
8
  from typing import TYPE_CHECKING, Any, Callable
7
9
 
8
10
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
@@ -60,7 +62,7 @@ class VideoDubber:
60
62
  self._init_local_pipeline()
61
63
 
62
64
  return self._local_pipeline.process(
63
- video=video,
65
+ source_audio=video.audio,
64
66
  target_lang=target_lang,
65
67
  source_lang=source_lang,
66
68
  preserve_background=preserve_background,
@@ -99,6 +101,84 @@ class VideoDubber:
99
101
  )
100
102
  return video.add_audio(result.dubbed_audio, overlay=False)
101
103
 
104
def dub_file(
    self,
    input_path: str | Path,
    output_path: str | Path,
    target_lang: str,
    source_lang: str | None = None,
    preserve_background: bool = True,
    voice_clone: bool = True,
    enable_diarization: bool = False,
    progress_callback: Callable[[str, float], None] | None = None,
    transcription: Any = None,
) -> DubbingResult:
    """Dub a video file on disk without loading video frames into memory.

    Loads only the audio track of ``input_path`` (``Audio.from_path``), runs
    the dubbing pipeline on that audio, then calls ``replace_audio_stream``
    to mux the dubbed audio together with the source video stream into
    ``output_path``. No ``Video`` object is created by this method.

    Use this instead of ``dub_and_replace`` when the source video is long
    or high-resolution and you don't need frame-level access in Python.

    Args:
        input_path: Path to the source video file. It is never modified.
        output_path: Path to write the dubbed video. Overwritten if it exists.
            Must not point at the same file as ``input_path``.
        target_lang: Target language code (e.g. ``"es"``, ``"fr"``).
        source_lang: Source language code, or ``None`` to auto-detect.
        preserve_background: Preserve background music/effects via source separation.
        voice_clone: Clone the source speaker's voice for the dubbed track.
        enable_diarization: Enable speaker diarization for per-speaker voice cloning.
        progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
        transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.

    Returns:
        ``DubbingResult`` as produced by the local pipeline. The output video
        is written to ``output_path``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``output_path`` resolves to the same file as ``input_path``.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    # Validate the arguments before any heavy import or model work so that a
    # bad call fails immediately instead of after the whole pipeline has run.
    if not input_path.exists():
        raise FileNotFoundError(f"Input video not found: {input_path}")
    if output_path.resolve() == input_path.resolve():
        # The remux step reads the source while writing the output with
        # ffmpeg's overwrite flag; pointing both at one file would clobber
        # the source mid-read.
        raise ValueError(f"output_path must differ from input_path: {input_path}")

    from videopython.ai.dubbing.remux import replace_audio_stream
    from videopython.base.audio import Audio

    logger.info("dub_file: loading audio from %s", input_path)
    source_audio = Audio.from_path(input_path)

    if self._local_pipeline is None:
        self._init_local_pipeline()

    result = self._local_pipeline.process(
        source_audio=source_audio,
        target_lang=target_lang,
        source_lang=source_lang,
        preserve_background=preserve_background,
        voice_clone=voice_clone,
        enable_diarization=enable_diarization,
        progress_callback=progress_callback,
        transcription=transcription,
    )

    # Reserve a temp file name, close the handle, then write to it by path;
    # delete=False means we are responsible for removing it ourselves.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        dubbed_audio_path = Path(tmp.name)
    try:
        result.dubbed_audio.save(dubbed_audio_path)
        replace_audio_stream(
            video_path=input_path,
            audio_path=dubbed_audio_path,
            output_path=output_path,
        )
    finally:
        # Always remove the intermediate WAV, even when saving or muxing fails.
        dubbed_audio_path.unlink(missing_ok=True)

    return result
181
+
102
182
  def revoice(
103
183
  self,
104
184
  video: Video,
@@ -111,7 +191,7 @@ class VideoDubber:
111
191
  self._init_local_pipeline()
112
192
 
113
193
  return self._local_pipeline.revoice(
114
- video=video,
194
+ source_audio=video.audio,
115
195
  text=text,
116
196
  preserve_background=preserve_background,
117
197
  progress_callback=progress_callback,
@@ -9,7 +9,7 @@ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, Separate
9
9
  from videopython.ai.dubbing.timing import TimingSynchronizer
10
10
 
11
11
  if TYPE_CHECKING:
12
- from videopython.base.video import Video
12
+ from videopython.base.audio import Audio
13
13
 
14
14
  logger = logging.getLogger(__name__)
15
15
 
@@ -37,7 +37,6 @@ class LocalDubbingPipeline:
37
37
  self._transcriber_diarization: bool | None = None
38
38
  self._translator: Any = None
39
39
  self._tts: Any = None
40
- self._tts_voice_clone: bool | None = None
41
40
  self._tts_language: str | None = None
42
41
  self._separator: Any = None
43
42
  self._synchronizer: TimingSynchronizer | None = None
@@ -71,18 +70,11 @@ class LocalDubbingPipeline:
71
70
 
72
71
  self._translator = TextTranslator(device=self.device)
73
72
 
74
- def _init_tts(self, voice_clone: bool = False, language: str = "en") -> None:
73
+ def _init_tts(self, language: str = "en") -> None:
75
74
  """Initialize the text-to-speech model."""
76
75
  from videopython.ai.generation.audio import TextToSpeech
77
76
 
78
- if voice_clone:
79
- self._tts = TextToSpeech(
80
- model_size="chatterbox",
81
- device=self.device,
82
- language=language,
83
- )
84
- else:
85
- self._tts = TextToSpeech(device=self.device, language=language)
77
+ self._tts = TextToSpeech(device=self.device, language=language)
86
78
 
87
79
  def _init_separator(self) -> None:
88
80
  """Initialize the audio separator."""
@@ -102,7 +94,6 @@ class LocalDubbingPipeline:
102
94
  max_duration: float = 10.0,
103
95
  ) -> dict[str, Any]:
104
96
  """Extract voice samples for each speaker from the audio."""
105
- from videopython.base.audio import Audio
106
97
 
107
98
  voice_samples: dict[str, Audio] = {}
108
99
 
@@ -135,7 +126,7 @@ class LocalDubbingPipeline:
135
126
 
136
127
  def process(
137
128
  self,
138
- video: Video,
129
+ source_audio: Audio,
139
130
  target_lang: str,
140
131
  source_lang: str | None = None,
141
132
  preserve_background: bool = True,
@@ -144,22 +135,22 @@ class LocalDubbingPipeline:
144
135
  progress_callback: Callable[[str, float], None] | None = None,
145
136
  transcription: Any | None = None,
146
137
  ) -> DubbingResult:
147
- """Process a video through the local dubbing pipeline.
138
+ """Run the dubbing pipeline against the given source audio.
148
139
 
149
140
  Args:
141
+ source_audio: Source audio track to dub. Callers with a ``Video``
142
+ object should pass ``video.audio``; callers with only a file path
143
+ can use ``Audio.from_path(path)`` to avoid loading video frames.
150
144
  transcription: Optional pre-computed Transcription object. When provided,
151
145
  the internal Whisper transcription step is skipped (saving time and VRAM).
152
146
  Must be a ``videopython.base.text.transcription.Transcription`` instance
153
147
  with populated ``segments``.
154
148
  """
155
- from videopython.base.audio import Audio
156
149
 
157
150
  def report_progress(stage: str, progress: float) -> None:
158
151
  if progress_callback:
159
152
  progress_callback(stage, progress)
160
153
 
161
- source_audio = video.audio
162
-
163
154
  if transcription is not None:
164
155
  report_progress("Using provided transcription", 0.05)
165
156
  else:
@@ -211,9 +202,8 @@ class LocalDubbingPipeline:
211
202
  self._maybe_unload("_translator")
212
203
 
213
204
  report_progress("Generating dubbed speech", 0.50)
214
- if self._tts is None or self._tts_voice_clone != voice_clone or self._tts_language != target_lang:
215
- self._init_tts(voice_clone=voice_clone, language=target_lang)
216
- self._tts_voice_clone = voice_clone
205
+ if self._tts is None or self._tts_language != target_lang:
206
+ self._init_tts(language=target_lang)
217
207
  self._tts_language = target_lang
218
208
 
219
209
  dubbed_segments: list[Audio] = []
@@ -275,19 +265,23 @@ class LocalDubbingPipeline:
275
265
 
276
266
  def revoice(
277
267
  self,
278
- video: Video,
268
+ source_audio: Audio,
279
269
  text: str,
280
270
  preserve_background: bool = True,
281
271
  progress_callback: Callable[[str, float], None] | None = None,
282
272
  ) -> RevoiceResult:
283
- """Replace speech in a video with new text using voice cloning."""
273
+ """Replace speech in audio with new text using voice cloning.
274
+
275
+ Args:
276
+ source_audio: Source audio track to revoice. Callers with a ``Video``
277
+ object should pass ``video.audio``.
278
+ """
284
279
  from videopython.base.audio import Audio
285
280
 
286
281
  def report_progress(stage: str, progress: float) -> None:
287
282
  if progress_callback:
288
283
  progress_callback(stage, progress)
289
284
 
290
- source_audio = video.audio
291
285
  original_duration = source_audio.metadata.duration_seconds
292
286
 
293
287
  report_progress("Analyzing audio", 0.05)
@@ -323,9 +317,8 @@ class LocalDubbingPipeline:
323
317
  voice_sample = vocal_audio.slice(0, sample_duration)
324
318
 
325
319
  report_progress("Generating speech", 0.60)
326
- if self._tts is None or self._tts_voice_clone is not True or self._tts_language != "en":
327
- self._init_tts(voice_clone=True, language="en")
328
- self._tts_voice_clone = True
320
+ if self._tts is None or self._tts_language != "en":
321
+ self._init_tts(language="en")
329
322
  self._tts_language = "en"
330
323
 
331
324
  generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
@@ -0,0 +1,73 @@
1
+ """ffmpeg helper for replacing a video file's audio track without re-encoding video."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import subprocess
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class RemuxError(RuntimeError):
    """Raised when the ffmpeg audio-replacement step does not succeed."""
14
+
15
+
16
def replace_audio_stream(
    video_path: str | Path,
    audio_path: str | Path,
    output_path: str | Path,
    audio_codec: str = "aac",
    audio_bitrate: str = "192k",
) -> None:
    """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.

    Runs ffmpeg with ``-c:v copy`` (video is stream-copied, not re-encoded)
    and encodes the audio with ``audio_codec`` at ``audio_bitrate``. Only the
    first video stream of ``video_path`` and the first audio stream of
    ``audio_path`` are mapped into the output. ``-shortest`` ends the output
    at the shorter of the two streams, so a slightly longer dubbed track does
    not extend the video.

    Args:
        video_path: Source video file (video stream is copied unchanged).
        audio_path: Audio file to use as the new audio track.
        output_path: Destination file. Overwritten if it exists. Must not be
            the same file as ``video_path`` or ``audio_path``.
        audio_codec: ffmpeg audio codec name. Defaults to ``aac``.
        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).

    Raises:
        FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
        RemuxError: If the output path collides with an input, the ffmpeg
            executable cannot be found, or ffmpeg returns a non-zero exit code.
    """
    video_path = Path(video_path)
    audio_path = Path(audio_path)
    output_path = Path(output_path)

    if not video_path.exists():
        raise FileNotFoundError(f"Video file not found: {video_path}")
    if not audio_path.exists():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")
    if output_path.resolve() in (video_path.resolve(), audio_path.resolve()):
        # "-y" would overwrite an input while ffmpeg is still reading it.
        raise RemuxError(f"Output path must differ from the input files: {output_path}")

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(video_path),
        "-i",
        str(audio_path),
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-c:v",
        "copy",
        "-c:a",
        audio_codec,
        "-b:a",
        audio_bitrate,
        "-shortest",
        str(output_path),
    ]

    logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
    try:
        # stdin is detached so ffmpeg cannot read from (or wait on) the
        # parent process's standard input.
        result = subprocess.run(cmd, capture_output=True, stdin=subprocess.DEVNULL)
    except FileNotFoundError as exc:
        # Raised by subprocess when the "ffmpeg" binary itself is missing;
        # report it as a remux failure rather than as a missing input file.
        raise RemuxError("ffmpeg executable not found; install ffmpeg and ensure it is on PATH") from exc
    if result.returncode != 0:
        raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
@@ -0,0 +1,156 @@
1
+ """Audio generation using local models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from videopython.ai._device import log_device_initialization, release_device_memory, select_device
8
+ from videopython.base.audio import Audio, AudioMetadata
9
+
10
+
11
class TextToSpeech:
    """Generates speech audio from text using Chatterbox Multilingual.

    Backed by Chatterbox Multilingual (Resemble AI). When a voice sample is
    available (passed to ``generate_audio`` or set as ``voice`` on the
    instance) its path is handed to the model as the audio prompt; otherwise
    no prompt is passed and the model uses its own default speaker.

    The model is loaded lazily on the first ``generate_audio`` call.
    """

    # Sample rate written into the metadata of every generated clip.
    SAMPLE_RATE: int = 24000

    def __init__(
        self,
        voice: Audio | None = None,
        device: str | None = None,
        language: str = "en",
    ):
        """Store configuration; no model is loaded here.

        Args:
            voice: Default voice sample used when ``generate_audio`` is called
                without ``voice_sample``.
            device: Requested device, or ``None`` to let ``select_device`` choose.
            language: Language id passed to the model as ``language_id``.
        """
        self.voice = voice
        self.device = device
        self.language = language
        self._model: Any = None

    def _init_model(self) -> None:
        """Load the Chatterbox model and record the device it was resolved to."""
        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]

        requested_device = self.device
        device = select_device(self.device, mps_allowed=False)

        self._model = ChatterboxMultilingualTTS.from_pretrained(device=device)
        # From here on self.device holds the resolved device, not the request.
        self.device = device
        log_device_initialization(
            "TextToSpeech",
            requested_device=requested_device,
            resolved_device=device,
        )

    def generate_audio(
        self,
        text: str,
        voice_sample: Audio | None = None,
    ) -> Audio:
        """Generate speech audio from text.

        Args:
            text: Text to synthesize.
            voice_sample: Optional voice sample to clone. Falls back to the
                instance's ``voice`` and then to Chatterbox's default speaker.

        Returns:
            Mono ``Audio`` tagged with ``SAMPLE_RATE``.
        """
        import tempfile
        from pathlib import Path

        import numpy as np

        if self._model is None:
            self._init_model()

        # Explicit None check: the fallback must not depend on whether an
        # Audio object happens to evaluate as truthy.
        effective_sample = voice_sample if voice_sample is not None else self.voice
        speaker_wav_path: Path | None = None

        try:
            if effective_sample is not None:
                # The model takes the prompt as a file path, so the sample is
                # written to a temp WAV. The path is recorded inside the try
                # block so the file is removed even if saving fails.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    speaker_wav_path = Path(f.name)
                effective_sample.save(str(speaker_wav_path))

            wav = self._model.generate(
                text=text,
                language_id=self.language,
                audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
            )

            audio_data = wav.cpu().float().numpy().squeeze()
            if audio_data.ndim == 0:
                # squeeze() collapses a single-sample result to a scalar;
                # restore a 1-D array so len() below works.
                audio_data = np.array([audio_data], dtype=np.float32)

            metadata = AudioMetadata(
                sample_rate=self.SAMPLE_RATE,
                channels=1,
                sample_width=2,
                duration_seconds=len(audio_data) / self.SAMPLE_RATE,
                frame_count=len(audio_data),
            )
            return Audio(audio_data, metadata)
        finally:
            if speaker_wav_path is not None:
                speaker_wav_path.unlink(missing_ok=True)

    def unload(self) -> None:
        """Release the TTS model so the next generate_audio() re-initializes.

        Used by low-memory dubbing to free VRAM between pipeline stages.
        """
        self._model = None
        release_device_memory(self.device)
104
+
105
+
106
class TextToMusic:
    """Text-to-music generator backed by a locally loaded MusicGen checkpoint."""

    def __init__(self, device: str | None = None):
        """Remember the requested device; the model is loaded on first use."""
        self._device: str | None = None
        self._model: Any = None
        self._processor: Any = None
        self.device = device

    def _init_local(self) -> None:
        """Load the MusicGen processor and model onto the selected device."""
        import os

        from transformers import AutoProcessor, MusicgenForConditionalGeneration

        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

        requested = self.device
        self._device = select_device(requested, mps_allowed=True)

        checkpoint = "facebook/musicgen-small"
        self._processor = AutoProcessor.from_pretrained(checkpoint)
        self._model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
        self._model.to(self._device)
        # Expose the resolved device on the public attribute as well.
        self.device = self._device
        log_device_initialization(
            "TextToMusic",
            requested_device=requested,
            resolved_device=self._device,
        )

    def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
        """Generate a mono music clip for ``text``.

        Args:
            text: Description of the music to generate.
            max_new_tokens: Forwarded to the model's ``generate`` call.
        """
        if self._model is None:
            self._init_local()

        encoded = self._processor(text=[text], padding=True, return_tensors="pt")
        # Move every entry that supports ``.to`` onto the model's device.
        on_device = {}
        for key, value in encoded.items():
            on_device[key] = value.to(self._device) if hasattr(value, "to") else value

        generated = self._model.generate(**on_device, max_new_tokens=max_new_tokens)
        sampling_rate = self._model.config.audio_encoder.sampling_rate

        # Index [0, 0] picks a single 1-D waveform out of the generated output
        # (presumably first batch item, first channel).
        waveform = generated[0, 0].cpu().float().numpy()
        n_samples = len(waveform)

        return Audio(
            waveform,
            AudioMetadata(
                sample_rate=sampling_rate,
                channels=1,
                sample_width=2,
                duration_seconds=n_samples / sampling_rate,
                frame_count=n_samples,
            ),
        )
@@ -1,215 +0,0 @@
1
- """Audio generation using local models."""
2
-
3
- from __future__ import annotations
4
-
5
- from typing import Any
6
-
7
- from videopython.ai._device import log_device_initialization, release_device_memory, select_device
8
- from videopython.base.audio import Audio, AudioMetadata
9
-
10
-
11
- class TextToSpeech:
12
- """Generates speech audio from text using local models.
13
-
14
- Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
15
- (`chatterbox`) for multilingual voice cloning.
16
- """
17
-
18
- SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
19
-
20
- CHATTERBOX_SAMPLE_RATE: int = 24000
21
-
22
- def __init__(
23
- self,
24
- model_size: str = "base",
25
- voice: str | None = None,
26
- device: str | None = None,
27
- language: str = "en",
28
- ):
29
- if model_size not in self.SUPPORTED_LOCAL_MODELS:
30
- raise ValueError(f"model_size must be one of {self.SUPPORTED_LOCAL_MODELS}, got '{model_size}'")
31
-
32
- self.model_size = model_size
33
- self.voice = voice
34
- self.device = device
35
- self.language = language
36
- self._model: Any = None
37
- self._processor: Any = None
38
- self._chatterbox_model: Any = None
39
-
40
- def _init_local(self) -> None:
41
- """Initialize local Bark model."""
42
- from transformers import AutoModel, AutoProcessor
43
-
44
- requested_device = self.device
45
- device = select_device(self.device, mps_allowed=False)
46
-
47
- model_name = "suno/bark" if self.model_size == "base" else "suno/bark-small"
48
- self._processor = AutoProcessor.from_pretrained(model_name)
49
- self._model = AutoModel.from_pretrained(model_name).to(device)
50
- self.device = device
51
- log_device_initialization(
52
- "TextToSpeech",
53
- requested_device=requested_device,
54
- resolved_device=device,
55
- )
56
-
57
- def _init_chatterbox(self) -> None:
58
- """Initialize Chatterbox Multilingual model for voice cloning."""
59
- from chatterbox.mtl_tts import ChatterboxMultilingualTTS # type: ignore[import-untyped]
60
-
61
- requested_device = self.device
62
- device = select_device(self.device, mps_allowed=False)
63
-
64
- self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
65
- self.device = device
66
- log_device_initialization(
67
- "TextToSpeech",
68
- requested_device=requested_device,
69
- resolved_device=device,
70
- )
71
-
72
- def _generate_local(self, text: str, voice_preset: str | None) -> Audio:
73
- """Generate speech using Bark."""
74
- import torch
75
-
76
- if self._model is None:
77
- self._init_local()
78
-
79
- inputs = self._processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
80
- inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
81
-
82
- with torch.no_grad():
83
- speech_values = self._model.generate(**inputs, do_sample=True)
84
-
85
- audio_data = speech_values.cpu().float().numpy().squeeze()
86
- sample_rate = self._model.generation_config.sample_rate
87
-
88
- metadata = AudioMetadata(
89
- sample_rate=sample_rate,
90
- channels=1,
91
- sample_width=2,
92
- duration_seconds=len(audio_data) / sample_rate,
93
- frame_count=len(audio_data),
94
- )
95
- return Audio(audio_data, metadata)
96
-
97
- def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
98
- """Generate speech using Chatterbox Multilingual with voice cloning."""
99
- import tempfile
100
- from pathlib import Path
101
-
102
- import numpy as np
103
-
104
- if self._chatterbox_model is None:
105
- self._init_chatterbox()
106
-
107
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
108
- voice_sample.save(f.name)
109
- speaker_wav_path = Path(f.name)
110
-
111
- try:
112
- wav = self._chatterbox_model.generate(
113
- text=text,
114
- language_id=self.language,
115
- audio_prompt_path=str(speaker_wav_path),
116
- )
117
-
118
- audio_data = wav.cpu().float().numpy().squeeze()
119
- if audio_data.ndim == 0:
120
- audio_data = np.array([audio_data], dtype=np.float32)
121
-
122
- sample_rate = self.CHATTERBOX_SAMPLE_RATE
123
-
124
- metadata = AudioMetadata(
125
- sample_rate=sample_rate,
126
- channels=1,
127
- sample_width=2,
128
- duration_seconds=len(audio_data) / sample_rate,
129
- frame_count=len(audio_data),
130
- )
131
- return Audio(audio_data, metadata)
132
- finally:
133
- speaker_wav_path.unlink()
134
-
135
- def generate_audio(
136
- self,
137
- text: str,
138
- voice_preset: str | None = None,
139
- voice_sample: Audio | None = None,
140
- ) -> Audio:
141
- """Generate speech audio from text."""
142
- effective_voice = voice_preset or self.voice
143
-
144
- if self.model_size == "chatterbox" or voice_sample is not None:
145
- if voice_sample is None:
146
- raise ValueError(
147
- "voice_sample is required for Chatterbox voice cloning. "
148
- "Provide an Audio sample of the voice to clone."
149
- )
150
- return self._generate_chatterbox(text, voice_sample)
151
-
152
- return self._generate_local(text, effective_voice)
153
-
154
- def unload(self) -> None:
155
- """Release the TTS model(s) so the next generate_audio() re-initializes.
156
-
157
- Used by low-memory dubbing to free VRAM between pipeline stages.
158
- """
159
- self._model = None
160
- self._processor = None
161
- self._chatterbox_model = None
162
- release_device_memory(self.device)
163
-
164
-
165
- class TextToMusic:
166
- """Generates music from text descriptions using MusicGen."""
167
-
168
- def __init__(self, device: str | None = None):
169
- self.device = device
170
- self._processor: Any = None
171
- self._model: Any = None
172
- self._device: str | None = None
173
-
174
- def _init_local(self) -> None:
175
- """Initialize local MusicGen model."""
176
- import os
177
-
178
- from transformers import AutoProcessor, MusicgenForConditionalGeneration
179
-
180
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
181
-
182
- requested_device = self.device
183
- self._device = select_device(self.device, mps_allowed=True)
184
-
185
- model_name = "facebook/musicgen-small"
186
- self._processor = AutoProcessor.from_pretrained(model_name)
187
- self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
188
- self._model.to(self._device)
189
- self.device = self._device
190
- log_device_initialization(
191
- "TextToMusic",
192
- requested_device=requested_device,
193
- resolved_device=self._device,
194
- )
195
-
196
- def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
197
- """Generate music audio from text description."""
198
- if self._model is None:
199
- self._init_local()
200
-
201
- inputs = self._processor(text=[text], padding=True, return_tensors="pt")
202
- inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
203
- audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
204
- sampling_rate = self._model.config.audio_encoder.sampling_rate
205
-
206
- audio_data = audio_values[0, 0].cpu().float().numpy()
207
-
208
- metadata = AudioMetadata(
209
- sample_rate=sampling_rate,
210
- channels=1,
211
- sample_width=2,
212
- duration_seconds=len(audio_data) / sampling_rate,
213
- frame_count=len(audio_data),
214
- )
215
- return Audio(audio_data, metadata)
File without changes
File without changes
File without changes