videopython 0.26.5__tar.gz → 0.26.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.26.5 → videopython-0.26.6}/PKG-INFO +1 -1
  2. {videopython-0.26.5 → videopython-0.26.6}/pyproject.toml +1 -1
  3. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/dubber.py +17 -6
  4. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/pipeline.py +38 -1
  5. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/audio.py +42 -0
  6. {videopython-0.26.5 → videopython-0.26.6}/.gitignore +0 -0
  7. {videopython-0.26.5 → videopython-0.26.6}/LICENSE +0 -0
  8. {videopython-0.26.5 → videopython-0.26.6}/README.md +0 -0
  9. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/__init__.py +0 -0
  10. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/__init__.py +0 -0
  11. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/_device.py +0 -0
  12. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/__init__.py +0 -0
  13. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/models.py +0 -0
  14. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/remux.py +0 -0
  15. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/dubbing/timing.py +0 -0
  16. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/__init__.py +0 -0
  17. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/audio.py +0 -0
  18. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/image.py +0 -0
  19. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/translation.py +0 -0
  20. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/image.py +0 -0
  30. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/separation.py +0 -0
  31. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/understanding/temporal.py +0 -0
  32. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/ai/video_analysis.py +0 -0
  33. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/description.py +0 -0
  39. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/base/video.py +0 -0
  52. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.26.5 → videopython-0.26.6}/src/videopython/py.typed +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.5
+Version: 0.26.6
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.5"
+version = "0.26.6"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
src/videopython/ai/dubbing/dubber.py
@@ -74,9 +74,14 @@ class VideoDubber:
 
         Args:
             enable_diarization: Enable speaker diarization to clone each speaker's
-                voice separately. Requires additional VRAM for the diarization model.
-            transcription: Optional pre-computed Transcription object. When provided,
-                the internal Whisper transcription step is skipped.
+                voice separately. With ``transcription=None``, runs alongside Whisper.
+                With a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
         """
         if self._local_pipeline is None:
             self._init_local_pipeline()
@@ -106,8 +111,10 @@ class VideoDubber:
         """Dub a video and return a new video with the dubbed audio.
 
         Args:
-            transcription: Optional pre-computed Transcription object. When provided,
-                the internal Whisper transcription step is skipped.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. See ``dub()`` for the interaction with
+                ``enable_diarization``.
         """
         result = self.dub(
             video=video,
@@ -152,8 +159,12 @@ class VideoDubber:
             preserve_background: Preserve background music/effects via source separation.
             voice_clone: Clone the source speaker's voice for the dubbed track.
             enable_diarization: Enable speaker diarization for per-speaker voice cloning.
+                See ``dub()`` for the interaction with ``transcription``.
             progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
-            transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
 
         Returns:
             ``DubbingResult`` with the dubbed audio, translated segments, and
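The docstring changes above describe how a supplied transcription interacts with ``enable_diarization``. A minimal usage sketch of that pattern follows; the loader calls, import paths for ``Video``, and the constructor arguments are illustrative assumptions, and only the ``dub()`` keyword arguments come from the docstrings in this diff.

from videopython.base.text.transcription import Transcription
from videopython.base.video import Video
from videopython.ai.dubbing.dubber import VideoDubber

# Hypothetical loaders and constructor arguments -- not part of this diff.
video = Video.from_path("interview.mp4")
transcription = Transcription.from_json("edited_words.json")  # no speaker labels

dubber = VideoDubber(target_language="de")

# The supplied transcription skips the Whisper step. Because it carries no
# speaker labels, enable_diarization=True makes the pipeline run pyannote
# standalone and overlay speakers onto the supplied words (word-level
# timings required), enabling per-speaker voice cloning.
dubbed = dubber.dub(
    video=video,
    transcription=transcription,
    enable_diarization=True,
    voice_clone=True,
)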
src/videopython/ai/dubbing/pipeline.py
@@ -162,7 +162,16 @@ class LocalDubbingPipeline:
             transcription: Optional pre-computed Transcription object. When provided,
                 the internal Whisper transcription step is skipped (saving time and VRAM).
                 Must be a ``videopython.base.text.transcription.Transcription`` instance
-                with populated ``segments``.
+                with populated ``segments``. Speaker labels on the supplied transcription
+                drive per-speaker voice cloning. If the supplied transcription has no
+                speakers and ``enable_diarization=True``, pyannote is run standalone on
+                ``source_audio`` and speakers are attached to the supplied words
+                (requires word-level timings).
+            enable_diarization: When True, run speaker diarization to enable per-speaker
+                voice cloning. With ``transcription=None``, runs alongside Whisper. With
+                a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
         """
 
         def report_progress(stage: str, progress: float) -> None:
@@ -171,6 +180,34 @@ class LocalDubbingPipeline:
 
         if transcription is not None:
             report_progress("Using provided transcription", 0.05)
+            if transcription.speakers:
+                logger.info(
+                    "Using provided transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+                if enable_diarization:
+                    logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
+            elif enable_diarization:
+                report_progress("Diarizing supplied transcription", 0.10)
+                if self._transcriber is None or self._transcriber_diarization is not True:
+                    self._init_transcriber(enable_diarization=True)
+                    self._transcriber_diarization = True
+                transcription = self._transcriber.diarize_transcription(source_audio, transcription)
+                self._maybe_unload("_transcriber")
+                logger.info(
+                    "Diarized supplied transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+            else:
+                logger.info(
+                    "Using provided transcription: %d segment(s), no speaker labels. "
+                    "All segments will share a single voice clone. Pass "
+                    "enable_diarization=True to add per-speaker labels, or "
+                    "voice_clone=False to use the default TTS voice.",
+                    len(transcription.segments),
+                )
         else:
             report_progress("Transcribing audio", 0.05)
             if self._transcriber is None or self._transcriber_diarization != enable_diarization:
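Taken together, the new branch handles three cases when a transcription is supplied. A condensed, illustrative restatement is below; the helper name and return strings are mine, not part of the package, and only ``transcription.speakers`` and ``enable_diarization`` come from the diff.

def plan_speaker_handling(transcription, enable_diarization: bool) -> str:
    # No transcription supplied: Whisper transcribes, and diarization
    # (if enabled) runs alongside it.
    if transcription is None:
        return "whisper" + (" + pyannote" if enable_diarization else "")
    # Supplied speaker labels always win; enable_diarization is ignored.
    if transcription.speakers:
        return "use supplied speaker labels"
    # No labels but diarization requested: pyannote runs standalone and
    # overlays speakers onto the supplied words (word-level timings needed).
    if enable_diarization:
        return "pyannote standalone over supplied words"
    # No labels, no diarization: all segments share a single cloned voice.
    return "single voice clone for all segments"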
src/videopython/ai/understanding/audio.py
@@ -130,6 +130,48 @@ class AudioToText:
         )
         return result
 
+    def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
+        """Attach speaker labels to a pre-computed transcription using pyannote.
+
+        Useful when callers have a transcription (e.g. pre-computed and edited)
+        but no speakers, and want per-speaker voice cloning in dubbing without
+        re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
+        speakers onto the supplied transcription's words.
+
+        Requires word-level timings: at least one segment must contain more
+        than one word. Transcriptions loaded from SRT (one synthetic word per
+        segment) will not produce useful speakers and are rejected.
+        """
+        import numpy as np
+        import torch
+
+        all_words: list[TranscriptionWord] = list(transcription.words)
+        if not all_words:
+            raise ValueError("Cannot diarize a transcription with no words.")
+
+        if not any(len(seg.words) > 1 for seg in transcription.segments):
+            raise ValueError(
+                "Cannot diarize a transcription without word-level timings. "
+                "Supplied transcription has at most one word per segment "
+                "(e.g. loaded from SRT). Provide a transcription with "
+                "word-level timings, or omit `transcription` to let the "
+                "pipeline transcribe and diarize from scratch."
+            )
+
+        if self._diarization_pipeline is None:
+            self._init_diarization()
+
+        import whisper
+
+        audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+        waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
+        diarization_result = self._diarization_pipeline(
+            {"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
+        )
+
+        all_words = self._assign_speakers_to_words(all_words, diarization_result)
+        return Transcription(words=all_words, language=transcription.language)
+
     def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
         """Transcribe with word timestamps and assign speakers via pyannote."""
         import numpy as np
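A short usage sketch of the new method. The ``diarize_transcription(audio, transcription)`` signature comes from the diff above; the loader calls, the import path for ``Audio``, and the bare ``AudioToText()`` constructor are assumptions for illustration.

from videopython.base.audio import Audio
from videopython.base.text.transcription import Transcription
from videopython.ai.understanding.audio import AudioToText

# Hypothetical loaders; the supplied transcription must carry word-level
# timings (at least one segment with more than one word), otherwise
# diarize_transcription raises ValueError.
audio = Audio.from_path("interview.wav")
transcription = Transcription.from_json("edited_words.json")

transcriber = AudioToText()  # constructor arguments assumed
labeled = transcriber.diarize_transcription(audio, transcription)
print(labeled.speakers)  # e.g. {"SPEAKER_00", "SPEAKER_01"}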