videopython 0.26.10__tar.gz → 0.27.0__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.26.10 → videopython-0.27.0}/PKG-INFO +2 -1
  2. {videopython-0.26.10 → videopython-0.27.0}/pyproject.toml +6 -1
  3. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/audio.py +91 -6
  4. {videopython-0.26.10 → videopython-0.27.0}/.gitignore +0 -0
  5. {videopython-0.26.10 → videopython-0.27.0}/LICENSE +0 -0
  6. {videopython-0.26.10 → videopython-0.27.0}/README.md +0 -0
  7. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/__init__.py +0 -0
  8. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/__init__.py +0 -0
  9. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/_device.py +0 -0
  10. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/__init__.py +0 -0
  11. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/dubber.py +0 -0
  12. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/models.py +0 -0
  13. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
  14. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/remux.py +0 -0
  15. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/timing.py +0 -0
  16. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/__init__.py +0 -0
  17. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/audio.py +0 -0
  18. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/image.py +0 -0
  19. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/translation.py +0 -0
  20. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/image.py +0 -0
  30. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/separation.py +0 -0
  31. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/temporal.py +0 -0
  32. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/video_analysis.py +0 -0
  33. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/description.py +0 -0
  39. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/video.py +0 -0
  52. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.26.10 → videopython-0.27.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.26.10
3
+ Version: 0.27.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -36,6 +36,7 @@ Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
36
36
  Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
37
37
  Requires-Dist: scipy>=1.10.0; extra == 'ai'
38
38
  Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
39
+ Requires-Dist: silero-vad>=5.1; extra == 'ai'
39
40
  Requires-Dist: torch>=2.8.0; extra == 'ai'
40
41
  Requires-Dist: torchaudio>=2.8.0; extra == 'ai'
41
42
  Requires-Dist: transformers>=5.2.0; extra == 'ai'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.26.10"
3
+ version = "0.27.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -62,6 +62,8 @@ ai = [
62
62
  "transformers>=5.2.0",
63
63
  "openai-whisper>=20240930",
64
64
  "pyannote-audio>=4.0.0",
65
+ # Voice activity detection (used by AudioToText to gate Whisper language detection)
66
+ "silero-vad>=5.1",
65
67
  "numba>=0.61.0",
66
68
  "ollama>=0.4.5",
67
69
  "scipy>=1.10.0",
@@ -91,6 +93,8 @@ ai = [
91
93
  "transformers>=5.2.0",
92
94
  "openai-whisper>=20240930",
93
95
  "pyannote-audio>=4.0.0",
96
+ # Voice activity detection (used by AudioToText to gate Whisper language detection)
97
+ "silero-vad>=5.1",
94
98
  "numba>=0.61.0",
95
99
  "ollama>=0.4.5",
96
100
  "scipy>=1.10.0",
@@ -130,6 +134,7 @@ module = [
130
134
  "demucs", "demucs.*",
131
135
  "huggingface_hub", "huggingface_hub.*",
132
136
  "pyannote", "pyannote.*",
137
+ "silero_vad", "silero_vad.*",
133
138
  "cv2", "cv2.*",
134
139
  ]
135
140
  ignore_missing_imports = true
@@ -24,10 +24,12 @@ class AudioToText:
24
24
  self,
25
25
  model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small",
26
26
  enable_diarization: bool = False,
27
+ enable_vad: bool = True,
27
28
  device: str | None = None,
28
29
  ):
29
30
  self.model_name = model_name
30
31
  self.enable_diarization = enable_diarization
32
+ self.enable_vad = enable_vad
31
33
  self.device = select_device(device, mps_allowed=False)
32
34
  log_device_initialization(
33
35
  "AudioToText",
@@ -36,6 +38,7 @@ class AudioToText:
36
38
  )
37
39
  self._model: Any = None
38
40
  self._diarization_pipeline: Any = None
41
+ self._vad_model: Any = None
39
42
 
40
43
  def _init_local(self) -> None:
41
44
  """Initialize local Whisper model."""
@@ -51,13 +54,25 @@ class AudioToText:
51
54
  self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
52
55
  self._diarization_pipeline.to(torch.device(self.device))
53
56
 
57
+ def _init_vad(self) -> None:
58
+ """Initialize Silero VAD model.
59
+
60
+ The model is ~2 MB and CPU-fast (~5-15s for a 90 min movie); we keep
61
+ it on CPU regardless of ``self.device`` since dispatch overhead would
62
+ outweigh inference cost.
63
+ """
64
+ from silero_vad import load_silero_vad
65
+
66
+ self._vad_model = load_silero_vad()
67
+
54
68
  def unload(self) -> None:
55
- """Release the Whisper and diarization models so the next call re-initializes.
69
+ """Release the Whisper, diarization, and VAD models so the next call re-initializes.
56
70
 
57
71
  Used by low-memory dubbing to free VRAM between pipeline stages.
58
72
  """
59
73
  self._model = None
60
74
  self._diarization_pipeline = None
75
+ self._vad_model = None
61
76
  release_device_memory(self.device)
62
77
 
63
78
  def _process_transcription_result(self, transcription_result: dict) -> Transcription:
@@ -172,7 +187,60 @@ class AudioToText:
172
187
  all_words = self._assign_speakers_to_words(all_words, diarization_result)
173
188
  return Transcription(words=all_words, language=transcription.language)
174
189
 
175
- def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
190
+ def _run_vad(self, audio_mono: Audio) -> list[tuple[float, float]]:
191
+ """Return voiced spans in seconds using Silero VAD.
192
+
193
+ Audio must already be mono at ``whisper.audio.SAMPLE_RATE`` (16 kHz),
194
+ which is one of Silero's two supported rates.
195
+ """
196
+ import numpy as np
197
+ import torch
198
+
199
+ if self._vad_model is None:
200
+ self._init_vad()
201
+
202
+ from silero_vad import get_speech_timestamps
203
+
204
+ waveform = torch.from_numpy(audio_mono.data.astype(np.float32))
205
+ timestamps = get_speech_timestamps(
206
+ waveform,
207
+ self._vad_model,
208
+ sampling_rate=audio_mono.metadata.sample_rate,
209
+ return_seconds=True,
210
+ )
211
+ return [(float(ts["start"]), float(ts["end"])) for ts in timestamps]
212
+
213
+ def _detect_language(self, audio_mono: Audio, voiced_spans: list[tuple[float, float]]) -> str:
214
+ """Run Whisper language detection on a 30s window of voiced audio.
215
+
216
+ Whisper's auto-detection only inspects the first 30s of input. When
217
+ the file opens with silence/music/credits, that window contains no
218
+ speech and detection picks the closest-looking thing (typically
219
+ English). Concatenating voiced spans up to 30s and running
220
+ ``model.detect_language()`` on the resulting mel fixes this.
221
+ """
222
+ import numpy as np
223
+ import torch
224
+ import whisper
225
+
226
+ sample_rate = audio_mono.metadata.sample_rate
227
+ chunks: list[np.ndarray] = []
228
+ remaining = whisper.audio.N_SAMPLES
229
+ for start, end in voiced_spans:
230
+ if remaining <= 0:
231
+ break
232
+ chunk = audio_mono.data[int(start * sample_rate) : int(end * sample_rate)][:remaining]
233
+ chunks.append(chunk)
234
+ remaining -= len(chunk)
235
+
236
+ voiced_audio = np.concatenate(chunks).astype(np.float32) if chunks else np.zeros(0, dtype=np.float32)
237
+ padded = whisper.audio.pad_or_trim(torch.from_numpy(voiced_audio))
238
+ mel = whisper.audio.log_mel_spectrogram(padded, n_mels=self._model.dims.n_mels).to(self._model.device)
239
+
240
+ _, probs = self._model.detect_language(mel)
241
+ return max(probs, key=probs.get)
242
+
243
+ def _transcribe_with_diarization(self, audio_mono: Audio, language: str | None) -> Transcription:
176
244
  """Transcribe with word timestamps and assign speakers via pyannote."""
177
245
  import numpy as np
178
246
  import torch
@@ -181,7 +249,7 @@ class AudioToText:
181
249
  self._init_diarization()
182
250
 
183
251
  audio_data = audio_mono.data
184
- transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True)
252
+ transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
185
253
 
186
254
  waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
187
255
  diarization_result = self._diarization_pipeline(
@@ -200,7 +268,17 @@ class AudioToText:
200
268
  return Transcription(words=all_words, language=transcription.language)
201
269
 
202
270
  def _transcribe_local(self, audio: Audio) -> Transcription:
203
- """Transcribe using local Whisper model."""
271
+ """Transcribe using local Whisper model.
272
+
273
+ When ``enable_vad`` is True (default), Silero VAD locates voiced
274
+ regions and a 30s voiced window is used for Whisper language
275
+ detection -- avoiding the well-known failure where Whisper locks
276
+ onto the wrong language because the first 30s of input is silence
277
+ or music. The detected language is then passed into
278
+ ``transcribe()`` so chunked decoding stays consistent. If VAD
279
+ finds no speech, an empty Transcription is returned without
280
+ invoking Whisper.
281
+ """
204
282
  import whisper
205
283
 
206
284
  if self._model is None:
@@ -208,10 +286,17 @@ class AudioToText:
208
286
 
209
287
  audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
210
288
 
289
+ language: str | None = None
290
+ if self.enable_vad:
291
+ voiced_spans = self._run_vad(audio_mono)
292
+ if not voiced_spans:
293
+ return Transcription(segments=[])
294
+ language = self._detect_language(audio_mono, voiced_spans)
295
+
211
296
  if self.enable_diarization:
212
- return self._transcribe_with_diarization(audio_mono)
297
+ return self._transcribe_with_diarization(audio_mono, language)
213
298
 
214
- transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True)
299
+ transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
215
300
  return self._process_transcription_result(transcription_result)
216
301
 
217
302
  def transcribe(self, media: Audio | Video) -> Transcription:
File without changes
File without changes
File without changes