videopython 0.26.9__tar.gz → 0.27.0__tar.gz
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {videopython-0.26.9 → videopython-0.27.0}/PKG-INFO +2 -1
- {videopython-0.26.9 → videopython-0.27.0}/pyproject.toml +6 -1
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/audio.py +91 -6
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/video_analysis.py +123 -78
- {videopython-0.26.9 → videopython-0.27.0}/.gitignore +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/LICENSE +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/README.md +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/description.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/video.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.9 → videopython-0.27.0}/src/videopython/py.typed +0 -0

{videopython-0.26.9 → videopython-0.27.0}/PKG-INFO +2 -1

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.9
+Version: 0.27.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -36,6 +36,7 @@ Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
 Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
 Requires-Dist: scipy>=1.10.0; extra == 'ai'
 Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
+Requires-Dist: silero-vad>=5.1; extra == 'ai'
 Requires-Dist: torch>=2.8.0; extra == 'ai'
 Requires-Dist: torchaudio>=2.8.0; extra == 'ai'
 Requires-Dist: transformers>=5.2.0; extra == 'ai'

{videopython-0.26.9 → videopython-0.27.0}/pyproject.toml +6 -1

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.9"
+version = "0.27.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -62,6 +62,8 @@ ai = [
     "transformers>=5.2.0",
     "openai-whisper>=20240930",
     "pyannote-audio>=4.0.0",
+    # Voice activity detection (used by AudioToText to gate Whisper language detection)
+    "silero-vad>=5.1",
     "numba>=0.61.0",
     "ollama>=0.4.5",
     "scipy>=1.10.0",
@@ -91,6 +93,8 @@ ai = [
     "transformers>=5.2.0",
     "openai-whisper>=20240930",
     "pyannote-audio>=4.0.0",
+    # Voice activity detection (used by AudioToText to gate Whisper language detection)
+    "silero-vad>=5.1",
     "numba>=0.61.0",
     "ollama>=0.4.5",
     "scipy>=1.10.0",
@@ -130,6 +134,7 @@ module = [
     "demucs", "demucs.*",
     "huggingface_hub", "huggingface_hub.*",
     "pyannote", "pyannote.*",
+    "silero_vad", "silero_vad.*",
     "cv2", "cv2.*",
 ]
 ignore_missing_imports = true
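
For orientation, the new dependency's API surface is tiny. A minimal sketch of the two calls the code below relies on (`load_silero_vad` and `get_speech_timestamps`); the zero waveform is a placeholder standing in for real audio:

```python
# Minimal silero-vad sketch; the placeholder waveform stands in for real audio.
import torch
from silero_vad import get_speech_timestamps, load_silero_vad

model = load_silero_vad()  # ~2 MB model, runs comfortably on CPU
waveform = torch.zeros(16000 * 5)  # 5 seconds of silence at 16 kHz
spans = get_speech_timestamps(waveform, model, sampling_rate=16000, return_seconds=True)
print(spans)  # [] for silence; otherwise [{'start': ..., 'end': ...}, ...]
```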

{videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/audio.py +91 -6

@@ -24,10 +24,12 @@ class AudioToText:
         self,
         model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small",
         enable_diarization: bool = False,
+        enable_vad: bool = True,
         device: str | None = None,
     ):
         self.model_name = model_name
         self.enable_diarization = enable_diarization
+        self.enable_vad = enable_vad
         self.device = select_device(device, mps_allowed=False)
         log_device_initialization(
             "AudioToText",
@@ -36,6 +38,7 @@ class AudioToText:
         )
         self._model: Any = None
         self._diarization_pipeline: Any = None
+        self._vad_model: Any = None
 
     def _init_local(self) -> None:
         """Initialize local Whisper model."""
@@ -51,13 +54,25 @@ class AudioToText:
             self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
             self._diarization_pipeline.to(torch.device(self.device))
 
+    def _init_vad(self) -> None:
+        """Initialize Silero VAD model.
+
+        The model is ~2 MB and CPU-fast (~5-15s for a 90 min movie); we keep
+        it on CPU regardless of ``self.device`` since dispatch overhead would
+        outweigh inference cost.
+        """
+        from silero_vad import load_silero_vad
+
+        self._vad_model = load_silero_vad()
+
     def unload(self) -> None:
-        """Release the Whisper and diarization models so the next call re-initializes.
+        """Release the Whisper, diarization, and VAD models so the next call re-initializes.
 
         Used by low-memory dubbing to free VRAM between pipeline stages.
         """
         self._model = None
         self._diarization_pipeline = None
+        self._vad_model = None
         release_device_memory(self.device)
 
     def _process_transcription_result(self, transcription_result: dict) -> Transcription:
@@ -172,7 +187,60 @@ class AudioToText:
             all_words = self._assign_speakers_to_words(all_words, diarization_result)
         return Transcription(words=all_words, language=transcription.language)
 
-    def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
+    def _run_vad(self, audio_mono: Audio) -> list[tuple[float, float]]:
+        """Return voiced spans in seconds using Silero VAD.
+
+        Audio must already be mono at ``whisper.audio.SAMPLE_RATE`` (16 kHz),
+        which is one of Silero's two supported rates.
+        """
+        import numpy as np
+        import torch
+
+        if self._vad_model is None:
+            self._init_vad()
+
+        from silero_vad import get_speech_timestamps
+
+        waveform = torch.from_numpy(audio_mono.data.astype(np.float32))
+        timestamps = get_speech_timestamps(
+            waveform,
+            self._vad_model,
+            sampling_rate=audio_mono.metadata.sample_rate,
+            return_seconds=True,
+        )
+        return [(float(ts["start"]), float(ts["end"])) for ts in timestamps]
+
+    def _detect_language(self, audio_mono: Audio, voiced_spans: list[tuple[float, float]]) -> str:
+        """Run Whisper language detection on a 30s window of voiced audio.
+
+        Whisper's auto-detection only inspects the first 30s of input. When
+        the file opens with silence/music/credits, that window contains no
+        speech and detection picks the closest-looking thing (typically
+        English). Concatenating voiced spans up to 30s and running
+        ``model.detect_language()`` on the resulting mel fixes this.
+        """
+        import numpy as np
+        import torch
+        import whisper
+
+        sample_rate = audio_mono.metadata.sample_rate
+        chunks: list[np.ndarray] = []
+        remaining = whisper.audio.N_SAMPLES
+        for start, end in voiced_spans:
+            if remaining <= 0:
+                break
+            chunk = audio_mono.data[int(start * sample_rate) : int(end * sample_rate)][:remaining]
+            chunks.append(chunk)
+            remaining -= len(chunk)
+
+        voiced_audio = np.concatenate(chunks).astype(np.float32) if chunks else np.zeros(0, dtype=np.float32)
+        padded = whisper.audio.pad_or_trim(torch.from_numpy(voiced_audio))
+        mel = whisper.audio.log_mel_spectrogram(padded, n_mels=self._model.dims.n_mels).to(self._model.device)
+
+        _, probs = self._model.detect_language(mel)
+        return max(probs, key=probs.get)
+
+    def _transcribe_with_diarization(self, audio_mono: Audio, language: str | None) -> Transcription:
         """Transcribe with word timestamps and assign speakers via pyannote."""
         import numpy as np
         import torch
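
The Whisper half of that flow is plain public API. A self-contained sketch of language detection on an arbitrary 16 kHz mono buffer (the zero buffer is a placeholder for real voiced audio):

```python
# Sketch: Whisper language ID on a raw 16 kHz mono buffer.
import numpy as np
import whisper

model = whisper.load_model("small")
audio_16k = np.zeros(whisper.audio.N_SAMPLES, dtype=np.float32)  # placeholder 30 s buffer
mel = whisper.log_mel_spectrogram(
    whisper.pad_or_trim(audio_16k), n_mels=model.dims.n_mels
).to(model.device)
_, probs = model.detect_language(mel)  # dict: language code -> probability
print(max(probs, key=probs.get))
```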

{videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/audio.py (continued)

@@ -181,7 +249,7 @@ class AudioToText:
             self._init_diarization()
 
         audio_data = audio_mono.data
-        transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True)
+        transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
 
         waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
         diarization_result = self._diarization_pipeline(
@@ -200,7 +268,17 @@ class AudioToText:
         return Transcription(words=all_words, language=transcription.language)
 
     def _transcribe_local(self, audio: Audio) -> Transcription:
-        """Transcribe using local Whisper model."""
+        """Transcribe using local Whisper model.
+
+        When ``enable_vad`` is True (default), Silero VAD locates voiced
+        regions and a 30s voiced window is used for Whisper language
+        detection -- avoiding the well-known failure where Whisper locks
+        onto the wrong language because the first 30s of input is silence
+        or music. The detected language is then passed into
+        ``transcribe()`` so chunked decoding stays consistent. If VAD
+        finds no speech, an empty Transcription is returned without
+        invoking Whisper.
+        """
         import whisper
 
         if self._model is None:
@@ -208,10 +286,17 @@ class AudioToText:
 
         audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
 
+        language: str | None = None
+        if self.enable_vad:
+            voiced_spans = self._run_vad(audio_mono)
+            if not voiced_spans:
+                return Transcription(segments=[])
+            language = self._detect_language(audio_mono, voiced_spans)
+
         if self.enable_diarization:
-            return self._transcribe_with_diarization(audio_mono)
+            return self._transcribe_with_diarization(audio_mono, language)
 
-        transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True)
+        transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
         return self._process_transcription_result(transcription_result)
 
     def transcribe(self, media: Audio | Video) -> Transcription:
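
End to end, the new flag is used roughly like this. A usage sketch only; `Video.from_path` is an assumed loader name, not confirmed by this diff:

```python
# Usage sketch of the new enable_vad flag (default True).
from videopython.ai.understanding.audio import AudioToText
from videopython.base.video import Video

transcriber = AudioToText(model_name="small", enable_vad=True)
video = Video.from_path("movie.mp4")  # assumed loader; see src/videopython/base/video.py
transcription = transcriber.transcribe(video)
print(transcription.language)  # detected from voiced audio, not the first 30 s blindly
transcriber.unload()  # frees Whisper, diarization, and VAD models
```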

{videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/video_analysis.py +123 -78

@@ -7,12 +7,14 @@ import math
 import re
 import subprocess
 import time
+from collections.abc import Callable, Iterator
 from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from importlib import metadata as importlib_metadata
 from pathlib import Path
-from typing import Any
+from typing import Any, TypeVar
 
 import numpy as np
 from PIL import Image
@@ -144,17 +146,28 @@ class VideoAnalysisSource:
 
 @dataclass
 class AnalysisRunInfo:
-    """Runtime/provenance metadata for a full analysis run."""
+    """Runtime/provenance metadata for a full analysis run.
+
+    ``stage_durations_seconds`` is populated by the analyzer with per-stage
+    wall-clock times (whisper, scene_detection, scene_analysis, scene_vlm,
+    audio_classification, and -- when both run together --
+    whisper_and_scene_detection_parallel). Consumers can persist or aggregate
+    these to track pipeline performance over time.
+    """
 
     created_at: str
     mode: str
     library_version: str | None = None
+    stage_durations_seconds: dict[str, float] = field(default_factory=dict)
+    total_duration_seconds: float | None = None
 
     def to_dict(self) -> dict[str, Any]:
         return {
             "created_at": self.created_at,
             "mode": self.mode,
             "library_version": self.library_version,
+            "stage_durations_seconds": dict(self.stage_durations_seconds),
+            "total_duration_seconds": self.total_duration_seconds,
         }
 
     @classmethod
@@ -163,6 +176,8 @@ class AnalysisRunInfo:
             created_at=data["created_at"],
             mode=data["mode"],
             library_version=data.get("library_version"),
+            stage_durations_seconds={str(k): float(v) for k, v in data["stage_durations_seconds"].items()},
+            total_duration_seconds=data["total_duration_seconds"],
         )
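
The new fields slot directly into the existing serialization; an illustrative sketch using only the API shown above (field values made up):

```python
# Illustrative use of the new AnalysisRunInfo fields (values made up).
info = AnalysisRunInfo(created_at="2026-01-01T00:00:00Z", mode="full")
info.stage_durations_seconds["whisper"] = 12.3
info.total_duration_seconds = 45.6
print(info.to_dict()["stage_durations_seconds"])  # {'whisper': 12.3}
```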

{videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/video_analysis.py (continued)

@@ -413,17 +428,17 @@ class VideoAnalyzer:
         # which corrupts Whisper's model weights if they're initialized at the
         # same time.
         if run_whisper and run_scene_det:
-            transcription, detected = self._run_whisper_and_scene_detection(source_path=source_path, video=video)
+            transcription, detected = self._run_whisper_and_scene_detection(
+                source_path=source_path, video=video, run_info=run_info
+            )
         else:
             if run_whisper:
-                t0 = time.perf_counter()
-                transcription = self._run_whisper(source_path=source_path, video=video)
-                logger.info("Whisper transcription completed in %.2fs", time.perf_counter() - t0)
+                with _record_stage(run_info, "whisper"):
+                    transcription = self._run_whisper(source_path=source_path, video=video)
 
             if run_scene_det:
-                t0 = time.perf_counter()
-                detected = self._run_scene_detection(source_path=source_path, video=video)
-                logger.info("Scene detection completed in %.2fs", time.perf_counter() - t0)
+                with _record_stage(run_info, "scene_detection"):
+                    detected = self._run_scene_detection(source_path=source_path, video=video)
 
         if run_scene_det:
             self._reset_transnetv2_torch_state()
@@ -442,19 +457,20 @@
         if not scenes:
             scenes = self._default_scene_boundaries(metadata)
 
-        t0 = time.perf_counter()
-        scene_section = self._analyze_scenes(
-            source_path=source_path,
-            video=video,
-            metadata=metadata,
-            scenes=scenes,
-            preloaded_scene_vlm=None,
-        )
-        logger.info("Scene analysis completed in %.2fs", time.perf_counter() - t0)
+        with _record_stage(run_info, "scene_analysis"):
+            scene_section = self._analyze_scenes(
+                source_path=source_path,
+                video=video,
+                metadata=metadata,
+                scenes=scenes,
+                preloaded_scene_vlm=None,
+                run_info=run_info,
+            )
 
         audio_section = AudioAnalysisSection(transcription=transcription) if transcription is not None else None
 
-        logger.info("Total analysis completed in %.2fs", time.perf_counter() - t_analysis_start)
+        run_info.total_duration_seconds = time.perf_counter() - t_analysis_start
+        logger.info("Total analysis completed in %.2fs", run_info.total_duration_seconds)
         return VideoAnalysis(
             source=source,
             config=self.config,
@@ -485,17 +501,23 @@
         return None
 
     def _run_whisper_and_scene_detection(
-        self, *, source_path: Path | None, video: Video | None
+        self, *, source_path: Path | None, video: Video | None, run_info: AnalysisRunInfo
     ) -> tuple[Transcription | None, list[SceneBoundary] | None]:
-        with ThreadPoolExecutor(max_workers=2) as pool:
-            whisper_future = pool.submit(self._run_whisper, source_path=source_path, video=video)
-            scene_future = pool.submit(
-                self._run_scene_detection,
-                source_path=source_path,
-                video=video,
-            )
-            transcription = whisper_future.result()
-            detected = scene_future.result()
+        with _record_stage(run_info, "whisper_and_scene_detection_parallel"):
+            with ThreadPoolExecutor(max_workers=2) as pool:
+                whisper_future = pool.submit(
+                    _run_with_stage, run_info, "whisper", self._run_whisper, source_path=source_path, video=video
+                )
+                scene_future = pool.submit(
+                    _run_with_stage,
+                    run_info,
+                    "scene_detection",
+                    self._run_scene_detection,
+                    source_path=source_path,
+                    video=video,
+                )
+                transcription = whisper_future.result()
+                detected = scene_future.result()
 
         return transcription, detected
@@ -536,6 +558,7 @@ class VideoAnalyzer:
         video: Video | None,
         metadata: VideoMetadata,
         scenes: list[SceneBoundary],
+        run_info: AnalysisRunInfo,
         preloaded_scene_vlm: SceneVLM | None = None,
     ) -> SceneAnalysisSection:
         enabled = self.config.enabled_analyzers
@@ -571,60 +594,61 @@
         # -- Batched SceneVLM: collect all timestamps, extract frames once, run one forward pass --
         captions: list[str | None] = [None] * len(scenes)
         if scene_vlm is not None:
-            try:
-                captions = self._run_scene_vlm_batched(
-                    scene_vlm=scene_vlm,
-                    source_path=source_path,
-                    video=video,
-                    metadata=metadata,
-                    scenes=scenes,
-                )
-            except Exception:
-                logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
-
-        samples: list[SceneAnalysisSample] = []
-        t_audio_total = 0.0
-        for index, scene in enumerate(scenes):
-            sample = SceneAnalysisSample(
-                scene_index=index,
-                start_second=float(scene.start),
-                end_second=float(scene.end),
-                start_frame=int(scene.start_frame),
-                end_frame=int(scene.end_frame),
-                caption=captions[index],
-            )
-
-            if audio_classifier is not None:
-                t0 = time.perf_counter()
-                try:
-                    scene_clip: Video | None = None
-                    if path_audio is None:
-                        try:
-                            scene_clip = self._load_scene_video_clip(
-                                source_path=source_path,
-                                video=video,
-                                start_second=scene.start,
-                                end_second=scene.end,
-                            )
-                        except Exception:
-                            scene_clip = None
-                    sample.audio_classification = self._run_scene_audio_classification(
-                        audio_classifier=audio_classifier,
-                        path_audio=path_audio,
-                        scene_clip=scene_clip,
-                        scene_start=scene.start,
-                        scene_end=scene.end,
-                    )
-                except Exception:
-                    logger.warning(
-                        "AudioClassifier failed for scene %d (%.1f-%.1fs)", index, scene.start, scene.end, exc_info=True
-                    )
-                t_audio_total += time.perf_counter() - t0
-
-            samples.append(sample)
-
-        if audio_classifier is not None:
-            logger.info("Audio classification completed in %.2fs", t_audio_total)
+            with _record_stage(run_info, "scene_vlm"):
+                try:
+                    captions = self._run_scene_vlm_batched(
+                        scene_vlm=scene_vlm,
+                        source_path=source_path,
+                        video=video,
+                        metadata=metadata,
+                        scenes=scenes,
+                    )
+                except Exception:
+                    logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
 
+        samples: list[SceneAnalysisSample] = []
+        audio_ctx = _record_stage(run_info, "audio_classification") if audio_classifier is not None else nullcontext()
+        with audio_ctx:
+            for index, scene in enumerate(scenes):
+                sample = SceneAnalysisSample(
+                    scene_index=index,
+                    start_second=float(scene.start),
+                    end_second=float(scene.end),
+                    start_frame=int(scene.start_frame),
+                    end_frame=int(scene.end_frame),
+                    caption=captions[index],
+                )
 
+                if audio_classifier is not None:
+                    try:
+                        scene_clip: Video | None = None
+                        if path_audio is None:
+                            try:
+                                scene_clip = self._load_scene_video_clip(
+                                    source_path=source_path,
+                                    video=video,
+                                    start_second=scene.start,
+                                    end_second=scene.end,
+                                )
+                            except Exception:
+                                scene_clip = None
+                        sample.audio_classification = self._run_scene_audio_classification(
+                            audio_classifier=audio_classifier,
+                            path_audio=path_audio,
+                            scene_clip=scene_clip,
+                            scene_start=scene.start,
+                            scene_end=scene.end,
+                        )
+                    except Exception:
+                        logger.warning(
+                            "AudioClassifier failed for scene %d (%.1f-%.1fs)",
+                            index,
+                            scene.start,
+                            scene.end,
+                            exc_info=True,
+                        )
 
+                samples.append(sample)
 
         return SceneAnalysisSection(samples=samples)
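
The `audio_ctx = ... if ... else nullcontext()` line above is the standard-library idiom for making a context manager optional without duplicating the loop body; a generic sketch:

```python
# Generic optional-context-manager pattern, as used for audio_ctx above.
from contextlib import nullcontext

def process(items, timer=None):
    ctx = timer if timer is not None else nullcontext()  # no-op when no timer given
    with ctx:
        for item in items:
            ...  # per-item work is identical either way
```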

{videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/video_analysis.py (continued)

@@ -893,6 +917,27 @@ def _utc_now_iso() -> str:
     return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
 
 
+@contextmanager
+def _record_stage(run_info: AnalysisRunInfo, stage: str) -> Iterator[None]:
+    """Time a block, write the elapsed seconds into ``run_info``, and log it."""
+    t0 = time.perf_counter()
+    try:
+        yield
+    finally:
+        elapsed = time.perf_counter() - t0
+        run_info.stage_durations_seconds[stage] = elapsed
+        logger.info("%s completed in %.2fs", stage, elapsed)
+
+
+_T = TypeVar("_T")
+
+
+def _run_with_stage(run_info: AnalysisRunInfo, stage: str, fn: Callable[..., _T], /, **kwargs: Any) -> _T:
+    """Call ``fn(**kwargs)`` inside ``_record_stage``. Use with ``ThreadPoolExecutor.submit``."""
+    with _record_stage(run_info, stage):
+        return fn(**kwargs)
+
+
 def _library_version() -> str | None:
     try:
         return importlib_metadata.version("videopython")
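
A self-contained sketch of the same timing pattern outside the library; `slow_task` is a hypothetical stand-in for stages like `_run_whisper`:

```python
# Standalone sketch of the _record_stage/_run_with_stage pattern.
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager

durations: dict[str, float] = {}

@contextmanager
def record_stage(stage: str):
    t0 = time.perf_counter()
    try:
        yield
    finally:
        # Recorded even if the stage raises, mirroring the finally above.
        durations[stage] = time.perf_counter() - t0

def run_with_stage(stage, fn, /, **kwargs):
    with record_stage(stage):
        return fn(**kwargs)

def slow_task(seconds: float) -> str:  # hypothetical pipeline stage
    time.sleep(seconds)
    return "done"

with ThreadPoolExecutor(max_workers=2) as pool:
    a = pool.submit(run_with_stage, "whisper", slow_task, seconds=0.2)
    b = pool.submit(run_with_stage, "scene_detection", slow_task, seconds=0.1)
    a.result(), b.result()

print(durations)  # each stage timed on its own worker thread
```

Since the two workers write distinct keys, the shared dict needs no locking here; per-stage timing composes cleanly with `ThreadPoolExecutor.submit` exactly as in the diff.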