PyPI - videopython - Versions diffs - 0.27.1.tar.gz → 0.28.0.tar.gz - Mend

videopython 0.27.1.tar.gz → 0.28.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{videopython-0.27.1 → videopython-0.28.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.27.1
+Version: 0.28.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.27.1 → videopython-0.28.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.27.1"
+version = "0.28.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/__init__.py RENAMED Viewed

@@ -3,6 +3,7 @@
 from videopython.ai.dubbing.dubber import VideoDubber
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
 from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
+from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
 __all__ = [
@@ -13,4 +14,7 @@ __all__ = [
     "SeparatedAudio",
     "LocalDubbingPipeline",
     "TimingSynchronizer",
+    "GarbageTranscriptError",
+    "TranscriptQuality",
+    "assess_transcript",
 ]

{videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/dubber.py RENAMED Viewed

@@ -29,6 +29,21 @@ class VideoDubber:
             give better accuracy at the cost of VRAM and latency. One of
             ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
             Default ``turbo``.
+        condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
+            ``False`` (Whisper's own default is ``True``). With conditioning on,
+            a single hallucinated filler phrase cascades through the rest of
+            the file. See ``AudioToText`` for the full rationale.
+        no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
+            gate; raise to drop more low-confidence windows.
+        logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
+            log-probability gate.
+        strict_quality: When True, the pipeline raises
+            :class:`GarbageTranscriptError` before Demucs/translation/TTS run
+            if the transcript-quality heuristic returns ``"reject"``. When
+            False (default), low-quality transcripts are logged at WARNING
+            but processing continues. Either way the
+            :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
+            inspection.
     """
     def __init__(
@@ -36,10 +51,18 @@ class VideoDubber:
         device: str | None = None,
         low_memory: bool = False,
         whisper_model: WhisperModel = "turbo",
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.6,
+        logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
     ):
         self.device = device
         self.low_memory = low_memory
         self.whisper_model = whisper_model
+        self.condition_on_previous_text = condition_on_previous_text
+        self.no_speech_threshold = no_speech_threshold
+        self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
@@ -56,6 +79,10 @@ class VideoDubber:
             device=self.device,
             low_memory=self.low_memory,
             whisper_model=self.whisper_model,
+            condition_on_previous_text=self.condition_on_previous_text,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
+            strict_quality=self.strict_quality,
         )
     def dub(

{videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/models.py RENAMED Viewed

@@ -3,10 +3,21 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
 from videopython.base.audio import Audio
 from videopython.base.text.transcription import Transcription, TranscriptionSegment
+if TYPE_CHECKING:
+    from videopython.ai.dubbing.quality import TranscriptQuality
+    from videopython.ai.dubbing.timing import TimingAdjustment
+# Speed factors within this band of 1.0 are treated as a "clean" timing
+# adjustment (no perceptible compression/stretch). Heuristic threshold for
+# the TimingSummary classification only.
+CLEAN_SPEED_TOLERANCE = 0.01
 @dataclass
 class TranslatedSegment:
@@ -73,6 +84,87 @@ class SeparatedAudio:
         return self.music is not None and self.effects is not None
+@dataclass
+class TimingSummary:
+    """Aggregate stats over per-segment timing adjustments.
+    Surfaces how aggressively the timing synchronizer had to compress or
+    truncate dubbed segments to fit the source's spoken regions. High
+    truncation rates indicate translation produced text too long for the
+    source duration.
+    """
+    total_segments: int
+    clean_count: int
+    stretched_count: int
+    truncated_count: int
+    mean_speed_factor: float
+    max_truncation_seconds: float
+    @classmethod
+    def from_adjustments(cls, adjustments: list[TimingAdjustment]) -> TimingSummary:
+        """Aggregate a list of TimingAdjustments into a TimingSummary."""
+        total = len(adjustments)
+        if total == 0:
+            return cls(
+                total_segments=0,
+                clean_count=0,
+                stretched_count=0,
+                truncated_count=0,
+                mean_speed_factor=1.0,
+                max_truncation_seconds=0.0,
+            )
+        clean = 0
+        stretched = 0
+        truncated = 0
+        speed_sum = 0.0
+        max_truncation = 0.0
+        for adj in adjustments:
+            speed_sum += adj.speed_factor
+            if adj.was_truncated:
+                truncated += 1
+                truncation = adj.original_duration - adj.actual_duration
+                if truncation > max_truncation:
+                    max_truncation = truncation
+            elif abs(adj.speed_factor - 1.0) <= CLEAN_SPEED_TOLERANCE:
+                clean += 1
+            else:
+                stretched += 1
+        return cls(
+            total_segments=total,
+            clean_count=clean,
+            stretched_count=stretched,
+            truncated_count=truncated,
+            mean_speed_factor=speed_sum / total,
+            max_truncation_seconds=max_truncation,
+        )
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "total_segments": self.total_segments,
+            "clean_count": self.clean_count,
+            "stretched_count": self.stretched_count,
+            "truncated_count": self.truncated_count,
+            "mean_speed_factor": self.mean_speed_factor,
+            "max_truncation_seconds": self.max_truncation_seconds,
+        }
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
+        """Create TimingSummary from dictionary."""
+        return cls(
+            total_segments=data["total_segments"],
+            clean_count=data["clean_count"],
+            stretched_count=data["stretched_count"],
+            truncated_count=data["truncated_count"],
+            mean_speed_factor=data["mean_speed_factor"],
+            max_truncation_seconds=data["max_truncation_seconds"],
+        )
 @dataclass
 class DubbingResult:
     """Result of a video dubbing operation.
@@ -85,6 +177,9 @@ class DubbingResult:
         target_lang: Target language for dubbing.
         separated_audio: Separated audio components (if preserve_background=True).
         voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
+        timing_summary: Aggregate stats over per-segment timing adjustments.
+        transcript_quality: Heuristic quality assessment of the transcription
+            (None when the pipeline returned early on an empty transcription).
     """
     dubbed_audio: Audio
@@ -94,6 +189,8 @@ class DubbingResult:
     target_lang: str
     separated_audio: SeparatedAudio | None = None
     voice_samples: dict[str, Audio] = field(default_factory=dict)
+    timing_summary: TimingSummary | None = None
+    transcript_quality: TranscriptQuality | None = None
     @property
     def num_segments(self) -> int:

{videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/pipeline.py RENAMED Viewed

@@ -9,7 +9,8 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
 import numpy as np
-from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
+from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
+from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
 if TYPE_CHECKING:
@@ -46,6 +47,14 @@ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
 logger = logging.getLogger(__name__)
+# Voice-sample quality gating thresholds. Tuned conservatively to favor
+# accepting real-world dialogue over rejecting it; failures fall back to
+# the longest segment with a WARNING log so we can re-tune from production
+# data instead of guessing.
+PEAK_CLIP_THRESHOLD = 0.99
+MIN_VOCAL_BG_RMS_RATIO = 1.5
+VOICE_SAMPLE_TARGET_DURATION = 6.0
 class LocalDubbingPipeline:
     """Local pipeline for video dubbing.
@@ -61,10 +70,18 @@ class LocalDubbingPipeline:
         device: str | None = None,
         low_memory: bool = False,
         whisper_model: WhisperModel = "turbo",
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.6,
+        logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
     ):
         self.device = device
         self.low_memory = low_memory
         self.whisper_model = whisper_model
+        self.condition_on_previous_text = condition_on_previous_text
+        self.no_speech_threshold = no_speech_threshold
+        self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
             "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
@@ -106,6 +123,9 @@ class LocalDubbingPipeline:
             model_name=self.whisper_model,
             device=self.device,
             enable_diarization=enable_diarization,
+            condition_on_previous_text=self.condition_on_previous_text,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
         )
     def _init_translator(self) -> None:
@@ -132,12 +152,25 @@ class LocalDubbingPipeline:
     def _extract_voice_samples(
         self,
-        audio: Any,
+        vocal_audio: Any,
+        background_audio: Any | None,
         transcription: Any,
         min_duration: float = 3.0,
         max_duration: float = 10.0,
     ) -> dict[str, Any]:
-        """Extract voice samples for each speaker from the audio."""
+        """Extract a per-speaker voice sample with quality gating.
+        Picks the highest-scored segment per speaker after rejecting clipped
+        slices (peak >= ``PEAK_CLIP_THRESHOLD``) and slices where Demucs left
+        the background louder than the vocals
+        (``vocal_rms / bg_rms < MIN_VOCAL_BG_RMS_RATIO``). When the
+        background track isn't available (e.g. ``revoice`` after
+        ``low_memory`` dropped it), the RMS check is skipped silently.
+        Falls back to the longest available segment with a WARNING log when
+        every candidate is rejected, so the dub continues with the best
+        sample we have rather than silently dropping the speaker.
+        """
         from videopython.base.audio import Audio
         voice_samples: dict[str, Audio] = {}
@@ -150,29 +183,106 @@ class LocalDubbingPipeline:
             segments_by_speaker[speaker].append(segment)
         for speaker, segments in segments_by_speaker.items():
-            target_duration = 6.0
-            best_segment = None
-            best_diff = float("inf")
-            for segment in segments:
-                duration = segment.end - segment.start
-                if duration >= min_duration:
-                    diff = abs(duration - target_duration)
-                    if diff < best_diff:
-                        best_diff = diff
-                        best_segment = segment
-            if best_segment is not None:
-                start = best_segment.start
-                end = min(best_segment.end, start + max_duration)
-                sliced = audio.slice(start, end)
-                # Audio.slice returns a numpy view into the source. Copy so the
-                # short voice sample doesn't keep the full vocals array (~1.3 GB
-                # for 2h sources) alive across translate + TTS.
-                voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
+            chosen, fallback_reason = self._pick_voice_segment(
+                speaker, segments, vocal_audio, background_audio, min_duration
+            )
+            if chosen is None:
+                logger.warning("No usable voice-sample segment for speaker %r (no candidates)", speaker)
+                continue
+            if fallback_reason is not None:
+                logger.warning(
+                    "Voice-sample quality fallback for speaker %r (%d candidates): %s — using longest segment",
+                    speaker,
+                    len(segments),
+                    fallback_reason,
+                )
+            start = chosen.start
+            end = min(chosen.end, start + max_duration)
+            sliced = vocal_audio.slice(start, end)
+            # Audio.slice returns a numpy view into the source. Copy so the
+            # short voice sample doesn't keep the full vocals array (~1.3 GB
+            # for 2h sources) alive across translate + TTS.
+            voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
         return voice_samples
+    def _pick_voice_segment(
+        self,
+        speaker: str,
+        segments: list[Any],
+        vocal_audio: Any,
+        background_audio: Any | None,
+        min_duration: float,
+    ) -> tuple[Any | None, str | None]:
+        """Score eligible segments and pick the best one for ``speaker``.
+        Returns ``(segment, fallback_reason)``. ``fallback_reason`` is None
+        when scoring picked a segment cleanly; non-None when every candidate
+        was rejected and the longest segment was used instead.
+        """
+        if not segments:
+            return None, None
+        eligible = [s for s in segments if (s.end - s.start) >= min_duration]
+        rejection_reasons: list[str] = []
+        scored: list[tuple[float, Any]] = []
+        for segment in eligible:
+            score, reason = self._score_voice_segment(segment, vocal_audio, background_audio)
+            if score is None:
+                rejection_reasons.append(reason or "rejected")
+            else:
+                scored.append((score, segment))
+        if scored:
+            scored.sort(key=lambda item: item[0], reverse=True)
+            return scored[0][1], None
+        # All eligible segments rejected (or none met the min duration).
+        # Fall back to the longest segment overall so the speaker still
+        # gets a clone reference.
+        longest = max(segments, key=lambda s: s.end - s.start)
+        if eligible:
+            reason = ", ".join(sorted(set(rejection_reasons)))
+        else:
+            reason = f"no segment >= {min_duration:.1f}s"
+        return longest, reason
+    def _score_voice_segment(
+        self,
+        segment: Any,
+        vocal_audio: Any,
+        background_audio: Any | None,
+    ) -> tuple[float | None, str | None]:
+        """Return ``(score, reason)`` for a candidate segment.
+        ``score`` is ``None`` when the segment is rejected; ``reason`` carries
+        the rejection cause so the fallback logger can summarize.
+        """
+        vocal_slice = vocal_audio.slice(segment.start, segment.end)
+        if vocal_slice.data.size == 0:
+            return None, "empty slice"
+        peak = float(np.max(np.abs(vocal_slice.data)))
+        if peak >= PEAK_CLIP_THRESHOLD:
+            return None, "clipped"
+        vocal_rms = float(np.sqrt(np.mean(vocal_slice.data**2)))
+        if background_audio is not None:
+            bg_slice = background_audio.slice(segment.start, segment.end)
+            if bg_slice.data.size > 0:
+                bg_rms = float(np.sqrt(np.mean(bg_slice.data**2)))
+                if bg_rms > 0 and (vocal_rms / bg_rms) < MIN_VOCAL_BG_RMS_RATIO:
+                    return None, "background-dominated"
+        duration = segment.end - segment.start
+        duration_penalty = abs(duration - VOICE_SAMPLE_TARGET_DURATION)
+        return vocal_rms - 0.05 * duration_penalty, None
     def process(
         self,
         source_audio: Audio,
@@ -257,6 +367,23 @@ class LocalDubbingPipeline:
                 target_lang=target_lang,
             )
+        # Cheap heuristic gate before the expensive Demucs/translation/TTS
+        # stages. Lets strict_quality callers refuse-and-refund without
+        # running the rest of the pipeline; non-strict runs continue but
+        # surface the assessment on DubbingResult.
+        transcript_quality = assess_transcript(transcription, source_audio.metadata.duration_seconds)
+        if transcript_quality.recommendation == "reject" and self.strict_quality:
+            raise GarbageTranscriptError(
+                f"Refusing to dub: {', '.join(transcript_quality.flags)}",
+                transcript_quality,
+            )
+        if transcript_quality.recommendation in ("warn", "reject"):
+            logger.warning(
+                "Transcript quality flags raised: %s (recommendation=%s)",
+                ", ".join(transcript_quality.flags),
+                transcript_quality.recommendation,
+            )
         detected_lang = source_lang or transcription.language or "en"
         separated_audio: SeparatedAudio | None = None
@@ -294,7 +421,7 @@ class LocalDubbingPipeline:
         voice_samples: dict[str, Audio] = {}
         if voice_clone:
             report_progress("Extracting voice samples", 0.25)
-            voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+            voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
         # vocals is no longer needed; voice_samples are independent copies.
         # In low_memory mode this is the only ref keeping the buffer alive
@@ -305,10 +432,19 @@ class LocalDubbingPipeline:
         if self._translator is None:
             self._init_translator()
+        # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
+        # MarianMT runs sequentially over 8-segment batches; on a 15-min
+        # source that's minutes of silent dwell on 0.35 without per-batch
+        # ticks. Map the [0,1] translation fraction onto that 15% window.
+        def _on_translation_progress(fraction: float) -> None:
+            clamped = max(0.0, min(1.0, fraction))
+            report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
         translated_segments = self._translator.translate_segments(
             segments=transcription.segments,
             target_lang=target_lang,
             source_lang=detected_lang,
+            progress_callback=_on_translation_progress,
         )
         self._maybe_unload("_translator")
@@ -384,7 +520,8 @@ class LocalDubbingPipeline:
             self._init_synchronizer()
         assert self._synchronizer is not None
-        synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        synchronized_segments, adjustments = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        timing_summary = TimingSummary.from_adjustments(adjustments)
         del dubbed_segments
         report_progress("Assembling final audio", 0.90)
@@ -420,6 +557,8 @@ class LocalDubbingPipeline:
             target_lang=target_lang,
             separated_audio=separated_audio,
             voice_samples=voice_samples,
+            timing_summary=timing_summary,
+            transcript_quality=transcript_quality,
         )
     def revoice(
@@ -477,7 +616,10 @@ class LocalDubbingPipeline:
         voice_sample: Audio | None = None
         if transcription.segments:
-            voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+            # revoice doesn't track the background after the low_memory drop,
+            # so quality gating degrades to "no RMS check" here. Clipping is
+            # still rejected.
+            voice_samples = self._extract_voice_samples(vocal_audio, None, transcription)
             if voice_samples:
                 voice_sample = next(iter(voice_samples.values()))

videopython-0.28.0/src/videopython/ai/dubbing/quality.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""Cheap heuristics over a Whisper transcription to flag degenerate output.
+Surfaces three failure modes seen in production where Demucs/translation/TTS
+would otherwise spend minutes producing a useless dub:
+- Dominant-phrase cascade — one phrase repeats across most segments. The
+  classic Whisper failure on ambient music / outro screens
+  ("Thank you for watching").
+- Low decoder confidence — median per-segment ``avg_logprob`` is poor.
+- Silent input misread as speech — total speech duration is tiny relative
+  to the clip's wall-clock duration (only meaningful on long inputs).
+Each check raises a flag; a recommendation is derived from how many fired.
+Threshold constants live at module scope so production data can re-tune them
+without touching code structure.
+"""
+from __future__ import annotations
+import re
+import statistics
+from collections import Counter
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Literal
+if TYPE_CHECKING:
+    from videopython.base.text.transcription import Transcription
+# Tuned conservatively to favor "warn" over "reject"; first-week production
+# data may move them.
+DOMINANT_PHRASE_FRACTION_THRESHOLD = 0.70
+LOW_LOGPROB_MEDIAN_THRESHOLD = -1.5
+LOW_SPEECH_FRACTION_THRESHOLD = 0.05
+SHORT_CLIP_SECONDS = 30.0  # below this, speech-fraction is too unstable to trust
+Recommendation = Literal["ok", "warn", "reject"]
+_PUNCT_RE = re.compile(r"[^\w\s]+", re.UNICODE)
+_WHITESPACE_RE = re.compile(r"\s+")
+def _normalize_phrase(text: str) -> str:
+    """Lowercase, strip punctuation, collapse whitespace."""
+    cleaned = _PUNCT_RE.sub(" ", text.lower())
+    return _WHITESPACE_RE.sub(" ", cleaned).strip()
+@dataclass
+class TranscriptQuality:
+    """Quality assessment of a Whisper transcription.
+    Attributes:
+        recommendation: ``"ok"`` (continue), ``"warn"`` (continue, log), or
+            ``"reject"`` (caller should refuse to dub if strict_quality).
+        dominant_phrase: The repeating phrase that triggered the dominance
+            flag, or None when the flag didn't fire.
+        dominant_phrase_fraction: Character-count share of the most common
+            normalized segment phrase. 0.0 when no segments.
+        median_avg_logprob: Median of ``avg_logprob`` across segments that
+            carry it; None when no segment had a logprob (e.g. SRT-loaded).
+        speech_fraction: Sum of segment durations divided by the audio's
+            wall-clock duration.
+        flags: Human-readable list of which checks fired.
+    """
+    recommendation: Recommendation
+    dominant_phrase: str | None
+    dominant_phrase_fraction: float
+    median_avg_logprob: float | None
+    speech_fraction: float
+    flags: list[str] = field(default_factory=list)
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "recommendation": self.recommendation,
+            "dominant_phrase": self.dominant_phrase,
+            "dominant_phrase_fraction": self.dominant_phrase_fraction,
+            "median_avg_logprob": self.median_avg_logprob,
+            "speech_fraction": self.speech_fraction,
+            "flags": list(self.flags),
+        }
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> TranscriptQuality:
+        return cls(
+            recommendation=data["recommendation"],
+            dominant_phrase=data.get("dominant_phrase"),
+            dominant_phrase_fraction=data.get("dominant_phrase_fraction", 0.0),
+            median_avg_logprob=data.get("median_avg_logprob"),
+            speech_fraction=data.get("speech_fraction", 0.0),
+            flags=list(data.get("flags", [])),
+        )
+class GarbageTranscriptError(RuntimeError):
+    """Raised by the dubbing pipeline when ``strict_quality=True`` and the
+    transcript heuristic returns ``recommendation="reject"``.
+    The triggering :class:`TranscriptQuality` is attached as ``quality`` so
+    callers can introspect the flags without re-running the pipeline.
+    """
+    def __init__(self, message: str, quality: TranscriptQuality):
+        super().__init__(message)
+        self.quality = quality
+def assess_transcript(
+    transcription: Transcription,
+    audio_duration_seconds: float,
+) -> TranscriptQuality:
+    """Run the three quality checks and return a recommendation.
+    See module docstring for what each check looks for.
+    """
+    segments = list(transcription.segments)
+    # Dominant-phrase share by character count.
+    dominant_phrase: str | None = None
+    dominant_fraction = 0.0
+    if segments:
+        normalized = [_normalize_phrase(s.text) for s in segments]
+        char_counts: Counter[str] = Counter()
+        total_chars = 0
+        for phrase in normalized:
+            if not phrase:
+                continue
+            n = len(phrase)
+            char_counts[phrase] += n
+            total_chars += n
+        if total_chars > 0 and char_counts:
+            most_common_phrase, most_common_chars = char_counts.most_common(1)[0]
+            dominant_fraction = most_common_chars / total_chars
+            dominant_phrase = most_common_phrase
+    # Median avg_logprob across segments that carry it.
+    logprobs = [s.avg_logprob for s in segments if s.avg_logprob is not None]
+    median_logprob = statistics.median(logprobs) if logprobs else None
+    # Speech fraction = sum of segment durations / audio duration.
+    speech_seconds = sum(max(0.0, s.end - s.start) for s in segments)
+    speech_fraction = speech_seconds / audio_duration_seconds if audio_duration_seconds > 0 else 0.0
+    flags: list[str] = []
+    dominance_flag = dominant_fraction >= DOMINANT_PHRASE_FRACTION_THRESHOLD
+    if dominance_flag:
+        flags.append(f"dominant phrase {dominant_fraction:.0%}: {dominant_phrase!r}")
+    logprob_flag = median_logprob is not None and median_logprob < LOW_LOGPROB_MEDIAN_THRESHOLD
+    if logprob_flag:
+        flags.append(f"median avg_logprob {median_logprob:.2f} below {LOW_LOGPROB_MEDIAN_THRESHOLD}")
+    # Speech-fraction is unstable on short clips; skip it there.
+    speech_flag = audio_duration_seconds > SHORT_CLIP_SECONDS and speech_fraction < LOW_SPEECH_FRACTION_THRESHOLD
+    if speech_flag:
+        flags.append(f"speech fraction {speech_fraction:.1%} below {LOW_SPEECH_FRACTION_THRESHOLD:.0%}")
+    # Reject only when dominance + at least one other flag fires; legitimate
+    # repetitive content (chants, lyric clips) should warn, not reject.
+    recommendation: Recommendation
+    if dominance_flag and (logprob_flag or speech_flag):
+        recommendation = "reject"
+    elif flags:
+        recommendation = "warn"
+    else:
+        recommendation = "ok"
+    return TranscriptQuality(
+        recommendation=recommendation,
+        dominant_phrase=dominant_phrase if dominance_flag else None,
+        dominant_phrase_fraction=dominant_fraction,
+        median_avg_logprob=median_logprob,
+        speech_fraction=speech_fraction,
+        flags=flags,
+    )

{videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/generation/translation.py RENAMED Viewed

@@ -2,7 +2,7 @@
 from __future__ import annotations
-from typing import Any
+from typing import Any, Callable
 from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.ai.dubbing.models import TranslatedSegment
@@ -135,8 +135,15 @@ class TextTranslator:
         texts: list[str],
         target_lang: str,
         source_lang: str | None = None,
+        progress_callback: Callable[[float], None] | None = None,
     ) -> list[str]:
-        """Translate multiple texts to target language."""
+        """Translate multiple texts to target language.
+        ``progress_callback`` is called once per batch with a fraction in
+        ``[0, 1]`` representing translation-stage progress. It does not fire
+        on the empty-input or same-language shortcuts (those are O(0) work
+        and the caller frames its own progress events around the call).
+        """
         import torch
         if not texts:
@@ -150,8 +157,9 @@ class TextTranslator:
         translated: list[str] = []
         batch_size = 8
+        total = len(texts)
-        for i in range(0, len(texts), batch_size):
+        for i in range(0, total, batch_size):
             batch = texts[i : i + batch_size]
             inputs = self._tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -162,6 +170,9 @@ class TextTranslator:
             for output in outputs:
                 translated.append(self._tokenizer.decode(output, skip_special_tokens=True))
+            if progress_callback is not None:
+                progress_callback(min(1.0, (i + len(batch)) / total))
         return translated
     def translate_segments(
@@ -169,6 +180,7 @@ class TextTranslator:
         segments: list[TranscriptionSegment],
         target_lang: str,
         source_lang: str | None = None,
+        progress_callback: Callable[[float], None] | None = None,
     ) -> list[TranslatedSegment]:
         """Translate transcription segments while preserving timing/speaker info.
@@ -177,12 +189,18 @@ class TextTranslator:
         ``translated_text=""`` instead. This avoids MarianMT hallucinating
         full sentences from " .", "...", or single-token Whisper segments,
         which would otherwise be TTS'd into the dubbed track.
+        ``progress_callback`` is forwarded to :meth:`translate_batch` so
+        callers can render translation-stage progress without knowing the
+        batch size.
         """
         effective_source = source_lang or "en"
         translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
         translatable_texts = [segments[i].text for i in translatable_indices]
-        translated_texts = self.translate_batch(translatable_texts, target_lang, source_lang)
+        translated_texts = self.translate_batch(
+            translatable_texts, target_lang, source_lang, progress_callback=progress_callback
+        )
         translation_map: dict[int, str] = dict(zip(translatable_indices, translated_texts))

{videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/understanding/audio.py RENAMED Viewed

@@ -20,6 +20,20 @@ class AudioToText:
     voiced regions only — fixes Whisper's tendency to lock onto the wrong
     language when the file opens with silence, music, or non-vocal credits.
     Disable with ``enable_vad=False`` to reproduce pre-0.27 behaviour.
+    Three Whisper decoder kwargs are surfaced for anti-hallucination tuning:
+    - ``condition_on_previous_text`` defaults to ``False`` (Whisper's own
+      default is ``True``). With conditioning on, a single hallucinated filler
+      phrase cascades through the rest of the file because each window's
+      decoder is primed by the previous window's decoded text. Turning it off
+      is the most commonly recommended fix for that failure mode; the cost on
+      clean audio is small (slightly less context for ambiguous homophones
+      across sentence boundaries).
+    - ``no_speech_threshold`` and ``logprob_threshold`` are forwarded with
+      Whisper's documented defaults (``0.6`` and ``-1.0``); raising
+      ``no_speech_threshold`` biases toward dropping low-confidence windows
+      instead of emitting filler.
     """
     PYANNOTE_DIARIZATION_MODEL = "pyannote/speaker-diarization-community-1"
@@ -29,11 +43,17 @@ class AudioToText:
         model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "turbo",
         enable_diarization: bool = False,
         enable_vad: bool = True,
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.6,
+        logprob_threshold: float | None = -1.0,
         device: str | None = None,
     ):
         self.model_name = model_name
         self.enable_diarization = enable_diarization
         self.enable_vad = enable_vad
+        self.condition_on_previous_text = condition_on_previous_text
+        self.no_speech_threshold = no_speech_threshold
+        self.logprob_threshold = logprob_threshold
         self.device = select_device(device, mps_allowed=False)
         log_device_initialization(
             "AudioToText",
@@ -44,6 +64,16 @@ class AudioToText:
         self._diarization_pipeline: Any = None
         self._vad_model: Any = None
+    def _transcribe_kwargs(self, language: str | None) -> dict[str, Any]:
+        """Kwargs threaded into ``whisper.Whisper.transcribe`` from both call sites."""
+        return {
+            "word_timestamps": True,
+            "language": language,
+            "condition_on_previous_text": self.condition_on_previous_text,
+            "no_speech_threshold": self.no_speech_threshold,
+            "logprob_threshold": self.logprob_threshold,
+        }
     def _init_local(self) -> None:
         """Initialize local Whisper model."""
         import whisper
@@ -92,6 +122,9 @@ class AudioToText:
                 end=segment["end"],
                 text=segment["text"],
                 words=transcription_words,
+                avg_logprob=segment.get("avg_logprob"),
+                no_speech_prob=segment.get("no_speech_prob"),
+                compression_ratio=segment.get("compression_ratio"),
             )
             transcription_segments.append(transcription_segment)
@@ -253,7 +286,7 @@ class AudioToText:
             self._init_diarization()
         audio_data = audio_mono.data
-        transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
+        transcription_result = self._model.transcribe(audio=audio_data, **self._transcribe_kwargs(language))
         waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
         diarization_result = self._diarization_pipeline(
@@ -300,7 +333,7 @@ class AudioToText:
         if self.enable_diarization:
             return self._transcribe_with_diarization(audio_mono, language)
-        transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
+        transcription_result = self._model.transcribe(audio=audio_mono.data, **self._transcribe_kwargs(language))
         return self._process_transcription_result(transcription_result)
     def transcribe(self, media: Audio | Video) -> Transcription:

{videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/text/transcription.py RENAMED Viewed

@@ -40,6 +40,9 @@ class TranscriptionSegment:
     text: str
     words: list[TranscriptionWord]
     speaker: str | None = None
+    avg_logprob: float | None = None
+    no_speech_prob: float | None = None
+    compression_ratio: float | None = None
     def to_dict(self) -> dict:
         """Convert to dictionary for JSON serialization."""
@@ -49,6 +52,9 @@ class TranscriptionSegment:
             "text": self.text,
             "words": [w.to_dict() for w in self.words],
             "speaker": self.speaker,
+            "avg_logprob": self.avg_logprob,
+            "no_speech_prob": self.no_speech_prob,
+            "compression_ratio": self.compression_ratio,
         }
     @classmethod
@@ -60,6 +66,9 @@ class TranscriptionSegment:
             text=data["text"],
             words=[TranscriptionWord.from_dict(w) for w in data["words"]],
             speaker=data.get("speaker"),
+            avg_logprob=data.get("avg_logprob"),
+            no_speech_prob=data.get("no_speech_prob"),
+            compression_ratio=data.get("compression_ratio"),
         )