videopython 0.26.5__tar.gz → 0.26.6__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {videopython-0.26.5 → videopython-0.26.6}/PKG-INFO +1 -1
- {videopython-0.26.5 → videopython-0.26.6}/pyproject.toml +1 -1
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/dubber.py +17 -6
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/pipeline.py +38 -1
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/audio.py +42 -0
- {videopython-0.26.5 → videopython-0.26.6}/.gitignore +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/LICENSE +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/README.md +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/description.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/video.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.5 → videopython-0.26.6}/src/videopython/py.typed +0 -0
|
@@ -74,9 +74,14 @@ class VideoDubber:
|
|
|
74
74
|
|
|
75
75
|
Args:
|
|
76
76
|
enable_diarization: Enable speaker diarization to clone each speaker's
|
|
77
|
-
voice separately.
|
|
78
|
-
|
|
79
|
-
|
|
77
|
+
voice separately. With ``transcription=None``, runs alongside Whisper.
|
|
78
|
+
With a supplied ``transcription`` that has no speakers, runs pyannote
|
|
79
|
+
standalone and overlays speakers onto the supplied words. Ignored when
|
|
80
|
+
the supplied transcription already has speaker labels.
|
|
81
|
+
transcription: Optional pre-computed ``Transcription`` to skip the Whisper
|
|
82
|
+
step. Speaker labels on the supplied transcription drive per-speaker
|
|
83
|
+
voice cloning. If it has no speakers, pass ``enable_diarization=True``
|
|
84
|
+
to add them via pyannote (requires word-level timings).
|
|
80
85
|
"""
|
|
81
86
|
if self._local_pipeline is None:
|
|
82
87
|
self._init_local_pipeline()
|
|
@@ -106,8 +111,10 @@ class VideoDubber:
|
|
|
106
111
|
"""Dub a video and return a new video with the dubbed audio.
|
|
107
112
|
|
|
108
113
|
Args:
|
|
109
|
-
transcription: Optional pre-computed Transcription
|
|
110
|
-
the
|
|
114
|
+
transcription: Optional pre-computed ``Transcription`` to skip the Whisper
|
|
115
|
+
step. Speaker labels on the supplied transcription drive per-speaker
|
|
116
|
+
voice cloning. See ``dub()`` for the interaction with
|
|
117
|
+
``enable_diarization``.
|
|
111
118
|
"""
|
|
112
119
|
result = self.dub(
|
|
113
120
|
video=video,
|
|
@@ -152,8 +159,12 @@ class VideoDubber:
|
|
|
152
159
|
preserve_background: Preserve background music/effects via source separation.
|
|
153
160
|
voice_clone: Clone the source speaker's voice for the dubbed track.
|
|
154
161
|
enable_diarization: Enable speaker diarization for per-speaker voice cloning.
|
|
162
|
+
See ``dub()`` for the interaction with ``transcription``.
|
|
155
163
|
progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
|
|
156
|
-
transcription: Optional pre-computed ``Transcription`` to skip the Whisper
|
|
164
|
+
transcription: Optional pre-computed ``Transcription`` to skip the Whisper
|
|
165
|
+
step. Speaker labels on the supplied transcription drive per-speaker
|
|
166
|
+
voice cloning. If it has no speakers, pass ``enable_diarization=True``
|
|
167
|
+
to add them via pyannote (requires word-level timings).
|
|
157
168
|
|
|
158
169
|
Returns:
|
|
159
170
|
``DubbingResult`` with the dubbed audio, translated segments, and
|
|
@@ -162,7 +162,16 @@ class LocalDubbingPipeline:
|
|
|
162
162
|
transcription: Optional pre-computed Transcription object. When provided,
|
|
163
163
|
the internal Whisper transcription step is skipped (saving time and VRAM).
|
|
164
164
|
Must be a ``videopython.base.text.transcription.Transcription`` instance
|
|
165
|
-
with populated ``segments``.
|
|
165
|
+
with populated ``segments``. Speaker labels on the supplied transcription
|
|
166
|
+
drive per-speaker voice cloning. If the supplied transcription has no
|
|
167
|
+
speakers and ``enable_diarization=True``, pyannote is run standalone on
|
|
168
|
+
``source_audio`` and speakers are attached to the supplied words
|
|
169
|
+
(requires word-level timings).
|
|
170
|
+
enable_diarization: When True, run speaker diarization to enable per-speaker
|
|
171
|
+
voice cloning. With ``transcription=None``, runs alongside Whisper. With
|
|
172
|
+
a supplied ``transcription`` that has no speakers, runs pyannote
|
|
173
|
+
standalone and overlays speakers onto the supplied words. Ignored when
|
|
174
|
+
the supplied transcription already has speaker labels.
|
|
166
175
|
"""
|
|
167
176
|
|
|
168
177
|
def report_progress(stage: str, progress: float) -> None:
|
|
@@ -171,6 +180,34 @@ class LocalDubbingPipeline:
|
|
|
171
180
|
|
|
172
181
|
if transcription is not None:
|
|
173
182
|
report_progress("Using provided transcription", 0.05)
|
|
183
|
+
if transcription.speakers:
|
|
184
|
+
logger.info(
|
|
185
|
+
"Using provided transcription: %d segment(s), %d speaker(s)",
|
|
186
|
+
len(transcription.segments),
|
|
187
|
+
len(transcription.speakers),
|
|
188
|
+
)
|
|
189
|
+
if enable_diarization:
|
|
190
|
+
logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
|
|
191
|
+
elif enable_diarization:
|
|
192
|
+
report_progress("Diarizing supplied transcription", 0.10)
|
|
193
|
+
if self._transcriber is None or self._transcriber_diarization is not True:
|
|
194
|
+
self._init_transcriber(enable_diarization=True)
|
|
195
|
+
self._transcriber_diarization = True
|
|
196
|
+
transcription = self._transcriber.diarize_transcription(source_audio, transcription)
|
|
197
|
+
self._maybe_unload("_transcriber")
|
|
198
|
+
logger.info(
|
|
199
|
+
"Diarized supplied transcription: %d segment(s), %d speaker(s)",
|
|
200
|
+
len(transcription.segments),
|
|
201
|
+
len(transcription.speakers),
|
|
202
|
+
)
|
|
203
|
+
else:
|
|
204
|
+
logger.info(
|
|
205
|
+
"Using provided transcription: %d segment(s), no speaker labels. "
|
|
206
|
+
"All segments will share a single voice clone. Pass "
|
|
207
|
+
"enable_diarization=True to add per-speaker labels, or "
|
|
208
|
+
"voice_clone=False to use the default TTS voice.",
|
|
209
|
+
len(transcription.segments),
|
|
210
|
+
)
|
|
174
211
|
else:
|
|
175
212
|
report_progress("Transcribing audio", 0.05)
|
|
176
213
|
if self._transcriber is None or self._transcriber_diarization != enable_diarization:
|
|
@@ -130,6 +130,48 @@ class AudioToText:
|
|
|
130
130
|
)
|
|
131
131
|
return result
|
|
132
132
|
|
|
133
|
+
def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
|
|
134
|
+
"""Attach speaker labels to a pre-computed transcription using pyannote.
|
|
135
|
+
|
|
136
|
+
Useful when callers have a transcription (e.g. pre-computed and edited)
|
|
137
|
+
but no speakers, and want per-speaker voice cloning in dubbing without
|
|
138
|
+
re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
|
|
139
|
+
speakers onto the supplied transcription's words.
|
|
140
|
+
|
|
141
|
+
Requires word-level timings: at least one segment must contain more
|
|
142
|
+
than one word. Transcriptions loaded from SRT (one synthetic word per
|
|
143
|
+
segment) will not produce useful speakers and are rejected.
|
|
144
|
+
"""
|
|
145
|
+
import numpy as np
|
|
146
|
+
import torch
|
|
147
|
+
|
|
148
|
+
all_words: list[TranscriptionWord] = list(transcription.words)
|
|
149
|
+
if not all_words:
|
|
150
|
+
raise ValueError("Cannot diarize a transcription with no words.")
|
|
151
|
+
|
|
152
|
+
if not any(len(seg.words) > 1 for seg in transcription.segments):
|
|
153
|
+
raise ValueError(
|
|
154
|
+
"Cannot diarize a transcription without word-level timings. "
|
|
155
|
+
"Supplied transcription has at most one word per segment "
|
|
156
|
+
"(e.g. loaded from SRT). Provide a transcription with "
|
|
157
|
+
"word-level timings, or omit `transcription` to let the "
|
|
158
|
+
"pipeline transcribe and diarize from scratch."
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
if self._diarization_pipeline is None:
|
|
162
|
+
self._init_diarization()
|
|
163
|
+
|
|
164
|
+
import whisper
|
|
165
|
+
|
|
166
|
+
audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
|
|
167
|
+
waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
|
|
168
|
+
diarization_result = self._diarization_pipeline(
|
|
169
|
+
{"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
all_words = self._assign_speakers_to_words(all_words, diarization_result)
|
|
173
|
+
return Transcription(words=all_words, language=transcription.language)
|
|
174
|
+
|
|
133
175
|
def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
|
|
134
176
|
"""Transcribe with word timestamps and assign speakers via pyannote."""
|
|
135
177
|
import numpy as np
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|