PyPI - videopython - Versions diffs - 0.27.0__tar.gz → 0.27.2__tar.gz - Mend

videopython 0.27.0.tar.gz → 0.27.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{videopython-0.27.0 → videopython-0.27.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.27.0
+Version: 0.27.2
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.27.0 → videopython-0.27.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.27.0"
+version = "0.27.2"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/dubber.py RENAMED Viewed

@@ -28,18 +28,32 @@ class VideoDubber:
         whisper_model: Whisper model size used for transcription. Larger models
             give better accuracy at the cost of VRAM and latency. One of
             ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
-            Default ``small``.
+            Default ``turbo``.
+        condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
+            ``False`` (Whisper's own default is ``True``). With conditioning on,
+            a single hallucinated filler phrase cascades through the rest of
+            the file. See ``AudioToText`` for the full rationale.
+        no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
+            gate; raise to drop more low-confidence windows.
+        logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
+            log-probability gate.
     """
     def __init__(
         self,
         device: str | None = None,
         low_memory: bool = False,
-        whisper_model: WhisperModel = "small",
+        whisper_model: WhisperModel = "turbo",
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.6,
+        logprob_threshold: float | None = -1.0,
     ):
         self.device = device
         self.low_memory = low_memory
         self.whisper_model = whisper_model
+        self.condition_on_previous_text = condition_on_previous_text
+        self.no_speech_threshold = no_speech_threshold
+        self.logprob_threshold = logprob_threshold
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
@@ -56,6 +70,9 @@ class VideoDubber:
             device=self.device,
             low_memory=self.low_memory,
             whisper_model=self.whisper_model,
+            condition_on_previous_text=self.condition_on_previous_text,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
         )
     def dub(

{videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/pipeline.py RENAMED Viewed

@@ -60,11 +60,17 @@ class LocalDubbingPipeline:
         self,
         device: str | None = None,
         low_memory: bool = False,
-        whisper_model: WhisperModel = "small",
+        whisper_model: WhisperModel = "turbo",
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.6,
+        logprob_threshold: float | None = -1.0,
     ):
         self.device = device
         self.low_memory = low_memory
         self.whisper_model = whisper_model
+        self.condition_on_previous_text = condition_on_previous_text
+        self.no_speech_threshold = no_speech_threshold
+        self.logprob_threshold = logprob_threshold
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
             "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
@@ -106,6 +112,9 @@ class LocalDubbingPipeline:
             model_name=self.whisper_model,
             device=self.device,
             enable_diarization=enable_diarization,
+            condition_on_previous_text=self.condition_on_previous_text,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
         )
     def _init_translator(self) -> None:

{videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/audio.py RENAMED Viewed

@@ -15,21 +15,45 @@ class AudioToText:
     """Transcription service for audio and video using local Whisper models.
     Uses openai-whisper for transcription (with word-level timestamps) and
-    pyannote-audio for optional speaker diarization.
+    pyannote-audio for optional speaker diarization. By default, Silero VAD
+    runs before Whisper to gate language detection on a 30s window built from
+    voiced regions only — fixes Whisper's tendency to lock onto the wrong
+    language when the file opens with silence, music, or non-vocal credits.
+    Disable with ``enable_vad=False`` to reproduce pre-0.27 behaviour.
+    Three Whisper decoder kwargs are surfaced for anti-hallucination tuning:
+    - ``condition_on_previous_text`` defaults to ``False`` (Whisper's own
+      default is ``True``). With conditioning on, a single hallucinated filler
+      phrase cascades through the rest of the file because each window's
+      decoder is primed by the previous window's decoded text. Turning it off
+      is the most commonly recommended fix for that failure mode; the cost on
+      clean audio is small (slightly less context for ambiguous homophones
+      across sentence boundaries).
+    - ``no_speech_threshold`` and ``logprob_threshold`` are forwarded with
+      Whisper's documented defaults (``0.6`` and ``-1.0``); raising
+      ``no_speech_threshold`` biases toward dropping low-confidence windows
+      instead of emitting filler.
     """
     PYANNOTE_DIARIZATION_MODEL = "pyannote/speaker-diarization-community-1"
     def __init__(
         self,
-        model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small",
+        model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "turbo",
         enable_diarization: bool = False,
         enable_vad: bool = True,
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.6,
+        logprob_threshold: float | None = -1.0,
         device: str | None = None,
     ):
         self.model_name = model_name
         self.enable_diarization = enable_diarization
         self.enable_vad = enable_vad
+        self.condition_on_previous_text = condition_on_previous_text
+        self.no_speech_threshold = no_speech_threshold
+        self.logprob_threshold = logprob_threshold
         self.device = select_device(device, mps_allowed=False)
         log_device_initialization(
             "AudioToText",
@@ -40,6 +64,16 @@ class AudioToText:
         self._diarization_pipeline: Any = None
         self._vad_model: Any = None
+    def _transcribe_kwargs(self, language: str | None) -> dict[str, Any]:
+        """Kwargs threaded into ``whisper.Whisper.transcribe`` from both call sites."""
+        return {
+            "word_timestamps": True,
+            "language": language,
+            "condition_on_previous_text": self.condition_on_previous_text,
+            "no_speech_threshold": self.no_speech_threshold,
+            "logprob_threshold": self.logprob_threshold,
+        }
     def _init_local(self) -> None:
         """Initialize local Whisper model."""
         import whisper
@@ -249,7 +283,7 @@ class AudioToText:
             self._init_diarization()
         audio_data = audio_mono.data
-        transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
+        transcription_result = self._model.transcribe(audio=audio_data, **self._transcribe_kwargs(language))
         waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
         diarization_result = self._diarization_pipeline(
@@ -296,7 +330,7 @@ class AudioToText:
         if self.enable_diarization:
             return self._transcribe_with_diarization(audio_mono, language)
-        transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
+        transcription_result = self._model.transcribe(audio=audio_mono.data, **self._transcribe_kwargs(language))
         return self._process_transcription_result(transcription_result)
     def transcribe(self, media: Audio | Video) -> Transcription: