videopython 0.26.5__tar.gz → 0.26.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {videopython-0.26.5 → videopython-0.26.7}/PKG-INFO +1 -1
  2. {videopython-0.26.5 → videopython-0.26.7}/pyproject.toml +1 -1
  3. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/dubber.py +26 -19
  4. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/pipeline.py +74 -19
  5. videopython-0.26.7/src/videopython/ai/dubbing/remux.py +159 -0
  6. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/timing.py +46 -18
  7. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/generation/audio.py +24 -9
  8. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/understanding/audio.py +42 -0
  9. videopython-0.26.5/src/videopython/ai/dubbing/remux.py +0 -73
  10. {videopython-0.26.5 → videopython-0.26.7}/.gitignore +0 -0
  11. {videopython-0.26.5 → videopython-0.26.7}/LICENSE +0 -0
  12. {videopython-0.26.5 → videopython-0.26.7}/README.md +0 -0
  13. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/__init__.py +0 -0
  14. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/__init__.py +0 -0
  15. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/_device.py +0 -0
  16. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/__init__.py +0 -0
  17. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/models.py +0 -0
  18. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/generation/__init__.py +0 -0
  19. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/generation/translation.py +0 -0
  21. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/generation/video.py +0 -0
  22. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/registry.py +0 -0
  23. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/swapping/__init__.py +0 -0
  24. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/swapping/inpainter.py +0 -0
  25. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/swapping/models.py +0 -0
  26. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/swapping/segmenter.py +0 -0
  27. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/swapping/swapper.py +0 -0
  28. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/transforms.py +0 -0
  29. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/understanding/__init__.py +0 -0
  30. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/understanding/image.py +0 -0
  31. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/understanding/separation.py +0 -0
  32. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/understanding/temporal.py +0 -0
  33. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/video_analysis.py +0 -0
  34. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/__init__.py +0 -0
  35. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/audio/__init__.py +0 -0
  36. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/audio/analysis.py +0 -0
  37. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/audio/audio.py +0 -0
  38. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/combine.py +0 -0
  39. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/description.py +0 -0
  40. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/effects.py +0 -0
  41. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/exceptions.py +0 -0
  42. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/progress.py +0 -0
  43. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/registry.py +0 -0
  44. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/scene.py +0 -0
  45. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/streaming.py +0 -0
  46. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/text/__init__.py +0 -0
  47. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/text/overlay.py +0 -0
  48. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/text/transcription.py +0 -0
  49. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/transforms.py +0 -0
  50. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/transitions.py +0 -0
  51. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/utils.py +0 -0
  52. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/base/video.py +0 -0
  53. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/editing/__init__.py +0 -0
  54. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/editing/multicam.py +0 -0
  55. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/editing/premiere_xml.py +0 -0
  56. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/editing/video_edit.py +0 -0
  57. {videopython-0.26.5 → videopython-0.26.7}/src/videopython/py.typed +0 -0
{videopython-0.26.5 → videopython-0.26.7}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videopython
- Version: 0.26.5
+ Version: 0.26.7
  Summary: Minimal video generation and processing library.
  Project-URL: Homepage, https://videopython.com
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
{videopython-0.26.5 → videopython-0.26.7}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "videopython"
- version = "0.26.5"
+ version = "0.26.7"
  description = "Minimal video generation and processing library."
  authors = [
      { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/dubber.py
@@ -3,7 +3,6 @@
  from __future__ import annotations
  
  import logging
- import tempfile
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable
  
@@ -74,9 +73,14 @@ class VideoDubber:
  
          Args:
              enable_diarization: Enable speaker diarization to clone each speaker's
-                 voice separately. Requires additional VRAM for the diarization model.
-             transcription: Optional pre-computed Transcription object. When provided,
-                 the internal Whisper transcription step is skipped.
+                 voice separately. With ``transcription=None``, runs alongside Whisper.
+                 With a supplied ``transcription`` that has no speakers, runs pyannote
+                 standalone and overlays speakers onto the supplied words. Ignored when
+                 the supplied transcription already has speaker labels.
+             transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                 step. Speaker labels on the supplied transcription drive per-speaker
+                 voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                 to add them via pyannote (requires word-level timings).
          """
          if self._local_pipeline is None:
              self._init_local_pipeline()
@@ -106,8 +110,10 @@ class VideoDubber:
          """Dub a video and return a new video with the dubbed audio.
  
          Args:
-             transcription: Optional pre-computed Transcription object. When provided,
-                 the internal Whisper transcription step is skipped.
+             transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                 step. Speaker labels on the supplied transcription drive per-speaker
+                 voice cloning. See ``dub()`` for the interaction with
+                 ``enable_diarization``.
          """
          result = self.dub(
              video=video,
@@ -152,14 +158,18 @@ class VideoDubber:
              preserve_background: Preserve background music/effects via source separation.
              voice_clone: Clone the source speaker's voice for the dubbed track.
              enable_diarization: Enable speaker diarization for per-speaker voice cloning.
+                 See ``dub()`` for the interaction with ``transcription``.
              progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
-             transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.
+             transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                 step. Speaker labels on the supplied transcription drive per-speaker
+                 voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                 to add them via pyannote (requires word-level timings).
  
          Returns:
              ``DubbingResult`` with the dubbed audio, translated segments, and
              source transcription. The output video is written to ``output_path``.
          """
-         from videopython.ai.dubbing.remux import replace_audio_stream
+         from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
          from videopython.base.audio import Audio
  
          input_path = Path(input_path)
@@ -185,17 +195,14 @@ class VideoDubber:
              transcription=transcription,
          )
  
-         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-             dubbed_audio_path = Path(tmp.name)
-         try:
-             result.dubbed_audio.save(dubbed_audio_path)
-             replace_audio_stream(
-                 video_path=input_path,
-                 audio_path=dubbed_audio_path,
-                 output_path=output_path,
-             )
-         finally:
-             dubbed_audio_path.unlink(missing_ok=True)
+         # Stream the dubbed Audio directly into ffmpeg via stdin instead of
+         # going through a temp WAV on disk. For a 2h dub the temp file would
+         # be ~10 GB written-then-read; the streaming path drops both copies.
+         replace_audio_stream_from_audio(
+             video_path=input_path,
+             audio=result.dubbed_audio,
+             output_path=output_path,
+         )
  
          return result
  
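A minimal usage sketch of the documented transcription/diarization interaction. The ``dub(...)`` keywords follow the docstrings in the hunks above; the bare ``VideoDubber()`` construction and both loader names are hypothetical placeholders, not part of this diff:

# Sketch: dub with a pre-computed transcription. `Video.from_path` and
# `Transcription.from_json` are hypothetical loader names for illustration.
from videopython.ai.dubbing.dubber import VideoDubber
from videopython.base.text.transcription import Transcription
from videopython.base.video import Video

video = Video.from_path("talk.mp4")  # hypothetical loader
transcription = Transcription.from_json("talk.json")  # hypothetical; needs word-level timings

dubber = VideoDubber()  # construction details omitted
result = dubber.dub(
    video=video,
    transcription=transcription,  # Whisper step is skipped
    enable_diarization=True,  # pyannote attaches speakers since none are supplied
)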
{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/pipeline.py
@@ -3,6 +3,8 @@
  from __future__ import annotations
  
  import logging
+ import tempfile
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable, Literal
  
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
@@ -162,7 +164,16 @@ class LocalDubbingPipeline:
              transcription: Optional pre-computed Transcription object. When provided,
                  the internal Whisper transcription step is skipped (saving time and VRAM).
                  Must be a ``videopython.base.text.transcription.Transcription`` instance
-                 with populated ``segments``.
+                 with populated ``segments``. Speaker labels on the supplied transcription
+                 drive per-speaker voice cloning. If the supplied transcription has no
+                 speakers and ``enable_diarization=True``, pyannote is run standalone on
+                 ``source_audio`` and speakers are attached to the supplied words
+                 (requires word-level timings).
+             enable_diarization: When True, run speaker diarization to enable per-speaker
+                 voice cloning. With ``transcription=None``, runs alongside Whisper. With
+                 a supplied ``transcription`` that has no speakers, runs pyannote
+                 standalone and overlays speakers onto the supplied words. Ignored when
+                 the supplied transcription already has speaker labels.
          """
  
          def report_progress(stage: str, progress: float) -> None:
@@ -171,6 +182,34 @@ class LocalDubbingPipeline:
  
          if transcription is not None:
              report_progress("Using provided transcription", 0.05)
+             if transcription.speakers:
+                 logger.info(
+                     "Using provided transcription: %d segment(s), %d speaker(s)",
+                     len(transcription.segments),
+                     len(transcription.speakers),
+                 )
+                 if enable_diarization:
+                     logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
+             elif enable_diarization:
+                 report_progress("Diarizing supplied transcription", 0.10)
+                 if self._transcriber is None or self._transcriber_diarization is not True:
+                     self._init_transcriber(enable_diarization=True)
+                     self._transcriber_diarization = True
+                 transcription = self._transcriber.diarize_transcription(source_audio, transcription)
+                 self._maybe_unload("_transcriber")
+                 logger.info(
+                     "Diarized supplied transcription: %d segment(s), %d speaker(s)",
+                     len(transcription.segments),
+                     len(transcription.speakers),
+                 )
+             else:
+                 logger.info(
+                     "Using provided transcription: %d segment(s), no speaker labels. "
+                     "All segments will share a single voice clone. Pass "
+                     "enable_diarization=True to add per-speaker labels, or "
+                     "voice_clone=False to use the default TTS voice.",
+                     len(transcription.segments),
+                 )
          else:
              report_progress("Transcribing audio", 0.05)
              if self._transcriber is None or self._transcriber_diarization != enable_diarization:
@@ -241,24 +280,40 @@
          target_durations: list[float] = []
          start_times: list[float] = []
  
-         for i, segment in enumerate(translated_segments):
-             if segment.duration < 0.1:
-                 continue
-
-             progress = 0.50 + (0.30 * (i / len(translated_segments)))
-             report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
-
-             speaker = segment.speaker or "speaker_0"
-             voice_sample = voice_samples.get(speaker)
-
-             if voice_clone and voice_sample is not None:
-                 dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample=voice_sample)
-             else:
-                 dubbed_audio = self._tts.generate_audio(segment.translated_text)
-
-             dubbed_segments.append(dubbed_audio)
-             target_durations.append(segment.duration)
-             start_times.append(segment.start)
+         # Encode each speaker's voice sample to a temp WAV exactly once and
+         # reuse the path across every segment for that speaker. Without this
+         # cache, TextToSpeech.generate_audio re-encodes the same voice sample
+         # on every call (one temp WAV write + delete per segment), which is
+         # pure overhead for long dubs with many segments per speaker.
+         speaker_wav_paths: dict[str, Path] = {}
+         try:
+             if voice_clone:
+                 for speaker, sample in voice_samples.items():
+                     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                         sample.save(f.name)
+                         speaker_wav_paths[speaker] = Path(f.name)
+
+             for i, segment in enumerate(translated_segments):
+                 if segment.duration < 0.1:
+                     continue
+
+                 progress = 0.50 + (0.30 * (i / len(translated_segments)))
+                 report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
+
+                 speaker = segment.speaker or "speaker_0"
+                 cached_path = speaker_wav_paths.get(speaker) if voice_clone else None
+
+                 if cached_path is not None:
+                     dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
+                 else:
+                     dubbed_audio = self._tts.generate_audio(segment.translated_text)
+
+                 dubbed_segments.append(dubbed_audio)
+                 target_durations.append(segment.duration)
+                 start_times.append(segment.start)
+         finally:
+             for path in speaker_wav_paths.values():
+                 path.unlink(missing_ok=True)
  
          self._maybe_unload("_tts")
  
videopython-0.26.7/src/videopython/ai/dubbing/remux.py (new file)
@@ -0,0 +1,159 @@
+ """ffmpeg helper for replacing a video file's audio track without re-encoding video."""
+
+ from __future__ import annotations
+
+ import io
+ import logging
+ import subprocess
+ import wave
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+
+ if TYPE_CHECKING:
+     from videopython.base.audio import Audio
+
+ logger = logging.getLogger(__name__)
+
+
+ class RemuxError(RuntimeError):
+     """ffmpeg failed while replacing an audio stream."""
+
+
+ def replace_audio_stream(
+     video_path: str | Path,
+     audio_path: str | Path,
+     output_path: str | Path,
+     audio_codec: str = "aac",
+     audio_bitrate: str = "192k",
+ ) -> None:
+     """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
+
+     Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
+     ``-shortest`` trims to the shorter of the two streams so the output duration
+     matches the source video when the dubbed audio is slightly longer.
+
+     Args:
+         video_path: Source video file (video stream is copied unchanged).
+         audio_path: Audio file to use as the new audio track.
+         output_path: Destination file. Overwritten if it exists.
+         audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
+         audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+
+     Raises:
+         FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
+         RemuxError: If ffmpeg returns a non-zero exit code.
+     """
+     video_path = Path(video_path)
+     audio_path = Path(audio_path)
+     output_path = Path(output_path)
+
+     if not video_path.exists():
+         raise FileNotFoundError(f"Video file not found: {video_path}")
+     if not audio_path.exists():
+         raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+     cmd = [
+         "ffmpeg",
+         "-y",
+         "-i",
+         str(video_path),
+         "-i",
+         str(audio_path),
+         "-map",
+         "0:v:0",
+         "-map",
+         "1:a:0",
+         "-c:v",
+         "copy",
+         "-c:a",
+         audio_codec,
+         "-b:a",
+         audio_bitrate,
+         "-shortest",
+         str(output_path),
+     ]
+
+     logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
+     result = subprocess.run(cmd, capture_output=True)
+     if result.returncode != 0:
+         raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
+
+
+ def replace_audio_stream_from_audio(
+     video_path: str | Path,
+     audio: Audio,
+     output_path: str | Path,
+     audio_codec: str = "aac",
+     audio_bitrate: str = "192k",
+ ) -> None:
+     """Like ``replace_audio_stream`` but takes an in-memory ``Audio`` and pipes WAV to ffmpeg.
+
+     Avoids the ``Audio.save -> read-from-disk -> ffmpeg`` round-trip used by
+     the path-based variant: we serialize the WAV in memory and feed it to
+     ffmpeg via stdin. For long dubs this saves a full WAV write+read of the
+     output audio (~10 GB for a 2h source).
+
+     Args:
+         video_path: Source video file (video stream is copied unchanged).
+         audio: ``Audio`` instance to mux in as the new audio track.
+         output_path: Destination file. Overwritten if it exists.
+         audio_codec: ffmpeg audio codec name. Defaults to ``aac``.
+         audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+
+     Raises:
+         FileNotFoundError: If ``video_path`` does not exist.
+         RemuxError: If ffmpeg returns a non-zero exit code.
+     """
+     video_path = Path(video_path)
+     output_path = Path(output_path)
+
+     if not video_path.exists():
+         raise FileNotFoundError(f"Video file not found: {video_path}")
+
+     # Serialize Audio to WAV bytes in memory. Mirrors Audio.save's WAV writer:
+     # int16 samples, header from metadata. We stream these bytes to ffmpeg's
+     # stdin as the second input (the first is the video file on disk).
+     int_data = (audio.data * np.iinfo(np.int16).max).astype(np.int16)
+     wav_io = io.BytesIO()
+     with wave.open(wav_io, "wb") as wav_file:
+         wav_file.setnchannels(audio.metadata.channels)
+         wav_file.setsampwidth(audio.metadata.sample_width)
+         wav_file.setframerate(audio.metadata.sample_rate)
+         wav_file.writeframes(int_data.tobytes())
+     wav_bytes = wav_io.getvalue()
+
+     cmd = [
+         "ffmpeg",
+         "-y",
+         "-i",
+         str(video_path),
+         "-f",
+         "wav",
+         "-i",
+         "-",
+         "-map",
+         "0:v:0",
+         "-map",
+         "1:a:0",
+         "-c:v",
+         "copy",
+         "-c:a",
+         audio_codec,
+         "-b:a",
+         audio_bitrate,
+         "-shortest",
+         str(output_path),
+     ]
+
+     logger.info(
+         "replace_audio_stream_from_audio: %s + <stdin wav %d bytes> -> %s",
+         video_path,
+         len(wav_bytes),
+         output_path,
+     )
+     process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+     _, stderr = process.communicate(wav_bytes)
+     if process.returncode != 0:
+         raise RemuxError(f"ffmpeg failed (exit {process.returncode}): {stderr.decode(errors='replace')}")
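A usage sketch for the new stdin-based entry point. Only ``replace_audio_stream_from_audio`` and its keywords come from this diff; the ``Audio.from_file`` loader name is an assumption:

# Hedged sketch: mux an in-memory dubbed track into a video.
from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
from videopython.base.audio import Audio

dubbed = Audio.from_file("dubbed_track.wav")  # hypothetical loader name
replace_audio_stream_from_audio(
    video_path="source.mp4",  # video stream is copied, not re-encoded
    audio=dubbed,  # piped to ffmpeg stdin as an in-memory WAV
    output_path="dubbed.mp4",  # audio encoded to AAC at the default 192k
)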
{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/timing.py
@@ -4,7 +4,9 @@ from __future__ import annotations
  
  from dataclasses import dataclass
  
- from videopython.base.audio import Audio
+ import numpy as np
+
+ from videopython.base.audio import Audio, AudioMetadata
  
  
  @dataclass
@@ -181,32 +183,58 @@ class TimingSynchronizer:
          if len(audio_segments) != len(start_times):
              raise ValueError(f"Length mismatch: {len(audio_segments)} segments vs {len(start_times)} start times")
  
+         for start_time in start_times:
+             if start_time < 0:
+                 raise ValueError(f"Invalid start time: {start_time}")
+
          if not audio_segments:
              return Audio.create_silent(total_duration, stereo=False)
  
-         # Determine sample rate from first segment
+         # Single-pass assembler: allocate one mono float32 buffer and add each
+         # segment in place at its start sample. The previous implementation
+         # called Audio.overlay() per segment, which allocates np.zeros and
+         # copies the full track on every call — O(N * total_samples) memory
+         # traffic. For long dubs (thousands of segments) this loop dominated
+         # wall time and peak RAM.
          sample_rate = audio_segments[0].metadata.sample_rate
-
-         # Create base silent track
-         output = Audio.create_silent(total_duration, stereo=False, sample_rate=sample_rate)
-
-         # Overlay each segment at its start time
+         base_samples = max(int(total_duration * sample_rate), 0)
+
+         # Pre-normalize each segment to (mono, target sample rate) and compute
+         # placement bounds so the output buffer is sized to fit any segment
+         # that runs past total_duration (mirrors Audio.overlay's extend-on-OOB
+         # behavior so we don't silently truncate speech).
+         normalized: list[tuple[int, np.ndarray]] = []
+         end_sample = base_samples
          for audio, start_time in zip(audio_segments, start_times):
-             if start_time < 0:
-                 raise ValueError(f"Invalid start time: {start_time}")
-
-             # Resample if needed
              if audio.metadata.sample_rate != sample_rate:
                  audio = audio.resample(sample_rate)
-
-             # Convert to mono if needed
              if audio.metadata.channels > 1:
                  audio = audio.to_mono()
-
-             # Overlay at position
-             output = output.overlay(audio, position=start_time)
-
-         return output
+             start_sample = int(np.ceil(start_time * sample_rate))
+             seg_data = audio.data
+             normalized.append((start_sample, seg_data))
+             end_sample = max(end_sample, start_sample + len(seg_data))
+
+         output = np.zeros(end_sample, dtype=np.float32)
+         for start_sample, seg_data in normalized:
+             stop = start_sample + len(seg_data)
+             output[start_sample:stop] += seg_data
+
+         # Single post-mix peak guard, equivalent to Audio.overlay's per-call
+         # rescale collapsed into one pass. For non-overlapping dub segments
+         # this is a no-op; only the rare overlap case touches it.
+         max_amplitude = float(np.max(np.abs(output))) if output.size else 0.0
+         if max_amplitude > 1.0:
+             output /= max_amplitude
+
+         metadata = AudioMetadata(
+             sample_rate=sample_rate,
+             channels=1,
+             sample_width=audio_segments[0].metadata.sample_width,
+             duration_seconds=end_sample / sample_rate,
+             frame_count=end_sample,
+         )
+         return Audio(output, metadata)
  
      def check_overlaps(
          self,
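The single-pass mix is reproducible standalone. A self-contained NumPy sketch of the same sizing, accumulation, and peak-guard logic, with illustrative values:

import numpy as np

sample_rate = 16_000
segments = [
    np.full(8_000, 0.5, dtype=np.float32),  # 0.5 s segment
    np.full(4_000, 0.9, dtype=np.float32),  # 0.25 s segment that overruns the track
]
starts = [0.25, 1.9]  # seconds
total_duration = 2.0  # nominal track length in seconds

# Size the buffer to fit any segment that runs past total_duration
# (extend-on-overrun, mirroring the hunk above).
end_sample = max(int(total_duration * sample_rate), 0)
placed: list[tuple[int, np.ndarray]] = []
for seg, start in zip(segments, starts):
    start_sample = int(np.ceil(start * sample_rate))
    placed.append((start_sample, seg))
    end_sample = max(end_sample, start_sample + len(seg))

output = np.zeros(end_sample, dtype=np.float32)  # one allocation for the whole track
for start_sample, seg in placed:
    output[start_sample : start_sample + len(seg)] += seg  # O(len(seg)) per segment

peak = float(np.max(np.abs(output))) if output.size else 0.0
if peak > 1.0:  # single post-mix peak guard
    output /= peak

assert len(output) == 34_400  # 1.9 s * 16 kHz + 4 000 samples > the 32 000-sample base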
{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/generation/audio.py
@@ -2,11 +2,14 @@
  
  from __future__ import annotations
  
- from typing import Any
+ from typing import TYPE_CHECKING, Any
  
  from videopython.ai._device import log_device_initialization, release_device_memory, select_device
  from videopython.base.audio import Audio, AudioMetadata
  
+ if TYPE_CHECKING:
+     from pathlib import Path
+
  
  class TextToSpeech:
      """Generates speech audio from text using Chatterbox Multilingual.
@@ -47,6 +50,7 @@ class TextToSpeech:
          self,
          text: str,
          voice_sample: Audio | None = None,
+         voice_sample_path: str | Path | None = None,
      ) -> Audio:
          """Generate speech audio from text.
  
@@ -54,6 +58,12 @@ class TextToSpeech:
              text: Text to synthesize.
              voice_sample: Optional voice sample to clone. Falls back to the
                  instance's ``voice`` and then to Chatterbox's default speaker.
+             voice_sample_path: Optional pre-encoded WAV path to use directly as
+                 the speaker prompt. Skips the per-call temp-WAV encode that
+                 ``voice_sample`` would otherwise trigger. When set, takes
+                 precedence over ``voice_sample`` and ``self.voice``. Used by
+                 the dubbing pipeline to encode each speaker's sample once and
+                 reuse it across all of that speaker's segments.
          """
          import tempfile
          from pathlib import Path
@@ -63,13 +73,18 @@ class TextToSpeech:
          if self._model is None:
              self._init_model()
  
-         effective_sample = voice_sample or self.voice
          speaker_wav_path: Path | None = None
-
-         if effective_sample is not None:
-             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                 effective_sample.save(f.name)
-             speaker_wav_path = Path(f.name)
+         cleanup_path = False
+
+         if voice_sample_path is not None:
+             speaker_wav_path = Path(voice_sample_path)
+         else:
+             effective_sample = voice_sample or self.voice
+             if effective_sample is not None:
+                 with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                     effective_sample.save(f.name)
+                 speaker_wav_path = Path(f.name)
+                 cleanup_path = True
  
          try:
              wav = self._model.generate(
@@ -91,8 +106,8 @@ class TextToSpeech:
              )
              return Audio(audio_data, metadata)
          finally:
-             if speaker_wav_path is not None:
-                 speaker_wav_path.unlink()
+             if cleanup_path and speaker_wav_path is not None:
+                 speaker_wav_path.unlink(missing_ok=True)
  
      def unload(self) -> None:
          """Release the TTS model so the next generate_audio() re-initializes.
{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/understanding/audio.py
@@ -130,6 +130,48 @@ class AudioToText:
          )
          return result
  
+     def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
+         """Attach speaker labels to a pre-computed transcription using pyannote.
+
+         Useful when callers have a transcription (e.g. pre-computed and edited)
+         but no speakers, and want per-speaker voice cloning in dubbing without
+         re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
+         speakers onto the supplied transcription's words.
+
+         Requires word-level timings: at least one segment must contain more
+         than one word. Transcriptions loaded from SRT (one synthetic word per
+         segment) will not produce useful speakers and are rejected.
+         """
+         import numpy as np
+         import torch
+
+         all_words: list[TranscriptionWord] = list(transcription.words)
+         if not all_words:
+             raise ValueError("Cannot diarize a transcription with no words.")
+
+         if not any(len(seg.words) > 1 for seg in transcription.segments):
+             raise ValueError(
+                 "Cannot diarize a transcription without word-level timings. "
+                 "Supplied transcription has at most one word per segment "
+                 "(e.g. loaded from SRT). Provide a transcription with "
+                 "word-level timings, or omit `transcription` to let the "
+                 "pipeline transcribe and diarize from scratch."
+             )
+
+         if self._diarization_pipeline is None:
+             self._init_diarization()
+
+         import whisper
+
+         audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+         waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
+         diarization_result = self._diarization_pipeline(
+             {"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
+         )
+
+         all_words = self._assign_speakers_to_words(all_words, diarization_result)
+         return Transcription(words=all_words, language=transcription.language)
+
      def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
          """Transcribe with word timestamps and assign speakers via pyannote."""
          import numpy as np
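A sketch of calling the new method outside the dubbing pipeline, e.g. to label an edited transcript before dubbing. The bare ``AudioToText()`` construction is an assumption; the method and its contract come from the hunk above:

from videopython.ai.understanding.audio import AudioToText
from videopython.base.audio import Audio
from videopython.base.text.transcription import Transcription


def add_speakers(audio: Audio, transcription: Transcription) -> Transcription:
    transcriber = AudioToText()  # assumed default construction
    # Raises ValueError for word-less or SRT-style one-word-per-segment input.
    return transcriber.diarize_transcription(audio, transcription)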
videopython-0.26.5/src/videopython/ai/dubbing/remux.py (deleted)
@@ -1,73 +0,0 @@
- """ffmpeg helper for replacing a video file's audio track without re-encoding video."""
-
- from __future__ import annotations
-
- import logging
- import subprocess
- from pathlib import Path
-
- logger = logging.getLogger(__name__)
-
-
- class RemuxError(RuntimeError):
-     """ffmpeg failed while replacing an audio stream."""
-
-
- def replace_audio_stream(
-     video_path: str | Path,
-     audio_path: str | Path,
-     output_path: str | Path,
-     audio_codec: str = "aac",
-     audio_bitrate: str = "192k",
- ) -> None:
-     """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
-
-     Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
-     ``-shortest`` trims to the shorter of the two streams so the output duration
-     matches the source video when the dubbed audio is slightly longer.
-
-     Args:
-         video_path: Source video file (video stream is copied unchanged).
-         audio_path: Audio file to use as the new audio track.
-         output_path: Destination file. Overwritten if it exists.
-         audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
-         audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
-
-     Raises:
-         FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
-         RemuxError: If ffmpeg returns a non-zero exit code.
-     """
-     video_path = Path(video_path)
-     audio_path = Path(audio_path)
-     output_path = Path(output_path)
-
-     if not video_path.exists():
-         raise FileNotFoundError(f"Video file not found: {video_path}")
-     if not audio_path.exists():
-         raise FileNotFoundError(f"Audio file not found: {audio_path}")
-
-     cmd = [
-         "ffmpeg",
-         "-y",
-         "-i",
-         str(video_path),
-         "-i",
-         str(audio_path),
-         "-map",
-         "0:v:0",
-         "-map",
-         "1:a:0",
-         "-c:v",
-         "copy",
-         "-c:a",
-         audio_codec,
-         "-b:a",
-         audio_bitrate,
-         "-shortest",
-         str(output_path),
-     ]
-
-     logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
-     result = subprocess.run(cmd, capture_output=True)
-     if result.returncode != 0:
-         raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")