videopython 0.26.10__tar.gz → 0.27.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.10 → videopython-0.27.0}/PKG-INFO +2 -1
- {videopython-0.26.10 → videopython-0.27.0}/pyproject.toml +6 -1
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/audio.py +91 -6
- {videopython-0.26.10 → videopython-0.27.0}/.gitignore +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/LICENSE +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/README.md +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/description.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/base/video.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.10 → videopython-0.27.0}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.26.10
|
|
3
|
+
Version: 0.27.0
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -36,6 +36,7 @@ Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
|
|
|
36
36
|
Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
|
|
37
37
|
Requires-Dist: scipy>=1.10.0; extra == 'ai'
|
|
38
38
|
Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
|
|
39
|
+
Requires-Dist: silero-vad>=5.1; extra == 'ai'
|
|
39
40
|
Requires-Dist: torch>=2.8.0; extra == 'ai'
|
|
40
41
|
Requires-Dist: torchaudio>=2.8.0; extra == 'ai'
|
|
41
42
|
Requires-Dist: transformers>=5.2.0; extra == 'ai'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.26.10"
|
|
3
|
+
version = "0.27.0"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -62,6 +62,8 @@ ai = [
|
|
|
62
62
|
"transformers>=5.2.0",
|
|
63
63
|
"openai-whisper>=20240930",
|
|
64
64
|
"pyannote-audio>=4.0.0",
|
|
65
|
+
# Voice activity detection (used by AudioToText to gate Whisper language detection)
|
|
66
|
+
"silero-vad>=5.1",
|
|
65
67
|
"numba>=0.61.0",
|
|
66
68
|
"ollama>=0.4.5",
|
|
67
69
|
"scipy>=1.10.0",
|
|
@@ -91,6 +93,8 @@ ai = [
|
|
|
91
93
|
"transformers>=5.2.0",
|
|
92
94
|
"openai-whisper>=20240930",
|
|
93
95
|
"pyannote-audio>=4.0.0",
|
|
96
|
+
# Voice activity detection (used by AudioToText to gate Whisper language detection)
|
|
97
|
+
"silero-vad>=5.1",
|
|
94
98
|
"numba>=0.61.0",
|
|
95
99
|
"ollama>=0.4.5",
|
|
96
100
|
"scipy>=1.10.0",
|
|
@@ -130,6 +134,7 @@ module = [
|
|
|
130
134
|
"demucs", "demucs.*",
|
|
131
135
|
"huggingface_hub", "huggingface_hub.*",
|
|
132
136
|
"pyannote", "pyannote.*",
|
|
137
|
+
"silero_vad", "silero_vad.*",
|
|
133
138
|
"cv2", "cv2.*",
|
|
134
139
|
]
|
|
135
140
|
ignore_missing_imports = true
|
|
@@ -24,10 +24,12 @@ class AudioToText:
|
|
|
24
24
|
self,
|
|
25
25
|
model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small",
|
|
26
26
|
enable_diarization: bool = False,
|
|
27
|
+
enable_vad: bool = True,
|
|
27
28
|
device: str | None = None,
|
|
28
29
|
):
|
|
29
30
|
self.model_name = model_name
|
|
30
31
|
self.enable_diarization = enable_diarization
|
|
32
|
+
self.enable_vad = enable_vad
|
|
31
33
|
self.device = select_device(device, mps_allowed=False)
|
|
32
34
|
log_device_initialization(
|
|
33
35
|
"AudioToText",
|
|
@@ -36,6 +38,7 @@ class AudioToText:
|
|
|
36
38
|
)
|
|
37
39
|
self._model: Any = None
|
|
38
40
|
self._diarization_pipeline: Any = None
|
|
41
|
+
self._vad_model: Any = None
|
|
39
42
|
|
|
40
43
|
def _init_local(self) -> None:
|
|
41
44
|
"""Initialize local Whisper model."""
|
|
@@ -51,13 +54,25 @@ class AudioToText:
|
|
|
51
54
|
self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
|
|
52
55
|
self._diarization_pipeline.to(torch.device(self.device))
|
|
53
56
|
|
|
57
|
+
def _init_vad(self) -> None:
|
|
58
|
+
"""Initialize Silero VAD model.
|
|
59
|
+
|
|
60
|
+
The model is ~2 MB and CPU-fast (~5-15s for a 90 min movie); we keep
|
|
61
|
+
it on CPU regardless of ``self.device`` since dispatch overhead would
|
|
62
|
+
outweigh inference cost.
|
|
63
|
+
"""
|
|
64
|
+
from silero_vad import load_silero_vad
|
|
65
|
+
|
|
66
|
+
self._vad_model = load_silero_vad()
|
|
67
|
+
|
|
54
68
|
def unload(self) -> None:
|
|
55
|
-
"""Release the Whisper and diarization models so the next call re-initializes.
|
|
69
|
+
"""Release the Whisper, diarization, and VAD models so the next call re-initializes.
|
|
56
70
|
|
|
57
71
|
Used by low-memory dubbing to free VRAM between pipeline stages.
|
|
58
72
|
"""
|
|
59
73
|
self._model = None
|
|
60
74
|
self._diarization_pipeline = None
|
|
75
|
+
self._vad_model = None
|
|
61
76
|
release_device_memory(self.device)
|
|
62
77
|
|
|
63
78
|
def _process_transcription_result(self, transcription_result: dict) -> Transcription:
|
|
@@ -172,7 +187,60 @@ class AudioToText:
|
|
|
172
187
|
all_words = self._assign_speakers_to_words(all_words, diarization_result)
|
|
173
188
|
return Transcription(words=all_words, language=transcription.language)
|
|
174
189
|
|
|
175
|
-
def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
|
|
190
|
+
def _run_vad(self, audio_mono: Audio) -> list[tuple[float, float]]:
|
|
191
|
+
"""Return voiced spans in seconds using Silero VAD.
|
|
192
|
+
|
|
193
|
+
Audio must already be mono at ``whisper.audio.SAMPLE_RATE`` (16 kHz),
|
|
194
|
+
which is one of Silero's two supported rates.
|
|
195
|
+
"""
|
|
196
|
+
import numpy as np
|
|
197
|
+
import torch
|
|
198
|
+
|
|
199
|
+
if self._vad_model is None:
|
|
200
|
+
self._init_vad()
|
|
201
|
+
|
|
202
|
+
from silero_vad import get_speech_timestamps
|
|
203
|
+
|
|
204
|
+
waveform = torch.from_numpy(audio_mono.data.astype(np.float32))
|
|
205
|
+
timestamps = get_speech_timestamps(
|
|
206
|
+
waveform,
|
|
207
|
+
self._vad_model,
|
|
208
|
+
sampling_rate=audio_mono.metadata.sample_rate,
|
|
209
|
+
return_seconds=True,
|
|
210
|
+
)
|
|
211
|
+
return [(float(ts["start"]), float(ts["end"])) for ts in timestamps]
|
|
212
|
+
|
|
213
|
+
def _detect_language(self, audio_mono: Audio, voiced_spans: list[tuple[float, float]]) -> str:
|
|
214
|
+
"""Run Whisper language detection on a 30s window of voiced audio.
|
|
215
|
+
|
|
216
|
+
Whisper's auto-detection only inspects the first 30s of input. When
|
|
217
|
+
the file opens with silence/music/credits, that window contains no
|
|
218
|
+
speech and detection picks the closest-looking thing (typically
|
|
219
|
+
English). Concatenating voiced spans up to 30s and running
|
|
220
|
+
``model.detect_language()`` on the resulting mel fixes this.
|
|
221
|
+
"""
|
|
222
|
+
import numpy as np
|
|
223
|
+
import torch
|
|
224
|
+
import whisper
|
|
225
|
+
|
|
226
|
+
sample_rate = audio_mono.metadata.sample_rate
|
|
227
|
+
chunks: list[np.ndarray] = []
|
|
228
|
+
remaining = whisper.audio.N_SAMPLES
|
|
229
|
+
for start, end in voiced_spans:
|
|
230
|
+
if remaining <= 0:
|
|
231
|
+
break
|
|
232
|
+
chunk = audio_mono.data[int(start * sample_rate) : int(end * sample_rate)][:remaining]
|
|
233
|
+
chunks.append(chunk)
|
|
234
|
+
remaining -= len(chunk)
|
|
235
|
+
|
|
236
|
+
voiced_audio = np.concatenate(chunks).astype(np.float32) if chunks else np.zeros(0, dtype=np.float32)
|
|
237
|
+
padded = whisper.audio.pad_or_trim(torch.from_numpy(voiced_audio))
|
|
238
|
+
mel = whisper.audio.log_mel_spectrogram(padded, n_mels=self._model.dims.n_mels).to(self._model.device)
|
|
239
|
+
|
|
240
|
+
_, probs = self._model.detect_language(mel)
|
|
241
|
+
return max(probs, key=probs.get)
|
|
242
|
+
|
|
243
|
+
def _transcribe_with_diarization(self, audio_mono: Audio, language: str | None) -> Transcription:
|
|
176
244
|
"""Transcribe with word timestamps and assign speakers via pyannote."""
|
|
177
245
|
import numpy as np
|
|
178
246
|
import torch
|
|
@@ -181,7 +249,7 @@ class AudioToText:
|
|
|
181
249
|
self._init_diarization()
|
|
182
250
|
|
|
183
251
|
audio_data = audio_mono.data
|
|
184
|
-
transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True)
|
|
252
|
+
transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
|
|
185
253
|
|
|
186
254
|
waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
|
|
187
255
|
diarization_result = self._diarization_pipeline(
|
|
@@ -200,7 +268,17 @@ class AudioToText:
|
|
|
200
268
|
return Transcription(words=all_words, language=transcription.language)
|
|
201
269
|
|
|
202
270
|
def _transcribe_local(self, audio: Audio) -> Transcription:
|
|
203
|
-
"""Transcribe using local Whisper model."""
|
|
271
|
+
"""Transcribe using local Whisper model.
|
|
272
|
+
|
|
273
|
+
When ``enable_vad`` is True (default), Silero VAD locates voiced
|
|
274
|
+
regions and a 30s voiced window is used for Whisper language
|
|
275
|
+
detection -- avoiding the well-known failure where Whisper locks
|
|
276
|
+
onto the wrong language because the first 30s of input is silence
|
|
277
|
+
or music. The detected language is then passed into
|
|
278
|
+
``transcribe()`` so chunked decoding stays consistent. If VAD
|
|
279
|
+
finds no speech, an empty Transcription is returned without
|
|
280
|
+
invoking Whisper.
|
|
281
|
+
"""
|
|
204
282
|
import whisper
|
|
205
283
|
|
|
206
284
|
if self._model is None:
|
|
@@ -208,10 +286,17 @@ class AudioToText:
|
|
|
208
286
|
|
|
209
287
|
audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
|
|
210
288
|
|
|
289
|
+
language: str | None = None
|
|
290
|
+
if self.enable_vad:
|
|
291
|
+
voiced_spans = self._run_vad(audio_mono)
|
|
292
|
+
if not voiced_spans:
|
|
293
|
+
return Transcription(segments=[])
|
|
294
|
+
language = self._detect_language(audio_mono, voiced_spans)
|
|
295
|
+
|
|
211
296
|
if self.enable_diarization:
|
|
212
|
-
return self._transcribe_with_diarization(audio_mono)
|
|
297
|
+
return self._transcribe_with_diarization(audio_mono, language)
|
|
213
298
|
|
|
214
|
-
transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True)
|
|
299
|
+
transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
|
|
215
300
|
return self._process_transcription_result(transcription_result)
|
|
216
301
|
|
|
217
302
|
def transcribe(self, media: Audio | Video) -> Transcription:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|