videopython 0.27.2__tar.gz → 0.28.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.27.2 → videopython-0.28.1}/PKG-INFO +2 -1
- {videopython-0.27.2 → videopython-0.28.1}/pyproject.toml +6 -1
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/__init__.py +6 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/dubber.py +22 -2
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/models.py +103 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/pipeline.py +235 -32
- videopython-0.28.1/src/videopython/ai/dubbing/quality.py +178 -0
- videopython-0.28.1/src/videopython/ai/generation/qwen3.py +394 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/translation.py +130 -8
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/audio.py +43 -1
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/transcription.py +9 -0
- {videopython-0.27.2 → videopython-0.28.1}/.gitignore +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/LICENSE +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/README.md +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/_device.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/registry.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/combine.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/description.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/effects.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/progress.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/registry.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/scene.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/streaming.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/transforms.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/transitions.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/utils.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/video.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/py.typed +0 -0
PKG-INFO:

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.27.2
+Version: 0.28.1
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -29,6 +29,7 @@ Requires-Dist: demucs>=4.0.0; extra == 'ai'
 Requires-Dist: diffusers>=0.30.0; extra == 'ai'
 Requires-Dist: easyocr>=1.7.0; extra == 'ai'
 Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
+Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
 Requires-Dist: numba>=0.61.0; extra == 'ai'
 Requires-Dist: ollama>=0.4.5; extra == 'ai'
 Requires-Dist: openai-whisper>=20240930; extra == 'ai'

pyproject.toml:

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.27.2"
+version = "0.28.1"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -80,6 +80,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]

 # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -111,6 +113,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]

 [project.urls]
@@ -136,6 +140,7 @@ module = [
     "pyannote", "pyannote.*",
     "silero_vad", "silero_vad.*",
     "cv2", "cv2.*",
+    "llama_cpp", "llama_cpp.*",
 ]
 ignore_missing_imports = true

src/videopython/ai/dubbing/__init__.py:

@@ -3,7 +3,9 @@
 from videopython.ai.dubbing.dubber import VideoDubber
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
 from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
+from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.translation import UnsupportedLanguageError

 __all__ = [
     "VideoDubber",
@@ -13,4 +15,8 @@ __all__ = [
     "SeparatedAudio",
     "LocalDubbingPipeline",
     "TimingSynchronizer",
+    "GarbageTranscriptError",
+    "TranscriptQuality",
+    "assess_transcript",
+    "UnsupportedLanguageError",
 ]
src/videopython/ai/dubbing/dubber.py:

@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable

 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
-from videopython.ai.dubbing.pipeline import WhisperModel
+from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel

 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -37,6 +37,19 @@ class VideoDubber:
             gate; raise to drop more low-confidence windows.
         logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
             log-probability gate.
+        strict_quality: When True, the pipeline raises
+            :class:`GarbageTranscriptError` before Demucs/translation/TTS run
+            if the transcript-quality heuristic returns ``"reject"``. When
+            False (default), low-quality transcripts are logged at WARNING
+            but processing continues. Either way the
+            :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
+            inspection.
+        translator: Translation backend to use. ``"auto"`` (default)
+            picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
+            ``"qwen3"`` force the named backend regardless of device.
+            See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
+            for tradeoffs (Qwen3 is slower on CPU but produces
+            context-aware, length-budgeted output).
     """

     def __init__(
@@ -47,6 +60,8 @@
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -54,13 +69,16 @@
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
+        self.translator = translator
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )

     def _init_local_pipeline(self) -> None:
@@ -73,6 +91,8 @@
             condition_on_previous_text=self.condition_on_previous_text,
             no_speech_threshold=self.no_speech_threshold,
             logprob_threshold=self.logprob_threshold,
+            strict_quality=self.strict_quality,
+            translator=self.translator,
         )

     def dub(
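For orientation before the models/pipeline hunks below, a minimal usage sketch of the two new knobs. The constructor keywords and exported exception types come from this diff; `dub()`'s exact call shape is not shown here, so the `target_lang` keyword is an assumption:

```python
from videopython.ai.dubbing import (
    DubbingResult,
    GarbageTranscriptError,
    UnsupportedLanguageError,
    VideoDubber,
)
from videopython.base.video import Video


def dub_strictly(video: Video) -> DubbingResult | None:
    dubber = VideoDubber(
        strict_quality=True,  # raise instead of warn on a "reject" transcript
        translator="auto",    # new in 0.28.1: qwen3 on GPU, marian on CPU
    )
    try:
        return dubber.dub(video, target_lang="de")  # keyword name assumed
    except GarbageTranscriptError as err:
        print(f"refused before Demucs/translation/TTS ran: {err}")
    except UnsupportedLanguageError:
        print("neither Marian nor Qwen3 covers the requested language pair")
    return None
```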
src/videopython/ai/dubbing/models.py:

@@ -3,10 +3,21 @@
 from __future__ import annotations

 from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any

 from videopython.base.audio import Audio
 from videopython.base.text.transcription import Transcription, TranscriptionSegment

+if TYPE_CHECKING:
+    from videopython.ai.dubbing.quality import TranscriptQuality
+    from videopython.ai.dubbing.timing import TimingAdjustment
+
+
+# Speed factors within this band of 1.0 are treated as a "clean" timing
+# adjustment (no perceptible compression/stretch). Heuristic threshold for
+# the TimingSummary classification only.
+CLEAN_SPEED_TOLERANCE = 0.01
+

 @dataclass
 class TranslatedSegment:
@@ -73,6 +84,87 @@ class SeparatedAudio:
         return self.music is not None and self.effects is not None


+@dataclass
+class TimingSummary:
+    """Aggregate stats over per-segment timing adjustments.
+
+    Surfaces how aggressively the timing synchronizer had to compress or
+    truncate dubbed segments to fit the source's spoken regions. High
+    truncation rates indicate translation produced text too long for the
+    source duration.
+    """
+
+    total_segments: int
+    clean_count: int
+    stretched_count: int
+    truncated_count: int
+    mean_speed_factor: float
+    max_truncation_seconds: float
+
+    @classmethod
+    def from_adjustments(cls, adjustments: list[TimingAdjustment]) -> TimingSummary:
+        """Aggregate a list of TimingAdjustments into a TimingSummary."""
+        total = len(adjustments)
+        if total == 0:
+            return cls(
+                total_segments=0,
+                clean_count=0,
+                stretched_count=0,
+                truncated_count=0,
+                mean_speed_factor=1.0,
+                max_truncation_seconds=0.0,
+            )
+
+        clean = 0
+        stretched = 0
+        truncated = 0
+        speed_sum = 0.0
+        max_truncation = 0.0
+        for adj in adjustments:
+            speed_sum += adj.speed_factor
+            if adj.was_truncated:
+                truncated += 1
+                truncation = adj.original_duration - adj.actual_duration
+                if truncation > max_truncation:
+                    max_truncation = truncation
+            elif abs(adj.speed_factor - 1.0) <= CLEAN_SPEED_TOLERANCE:
+                clean += 1
+            else:
+                stretched += 1
+
+        return cls(
+            total_segments=total,
+            clean_count=clean,
+            stretched_count=stretched,
+            truncated_count=truncated,
+            mean_speed_factor=speed_sum / total,
+            max_truncation_seconds=max_truncation,
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "total_segments": self.total_segments,
+            "clean_count": self.clean_count,
+            "stretched_count": self.stretched_count,
+            "truncated_count": self.truncated_count,
+            "mean_speed_factor": self.mean_speed_factor,
+            "max_truncation_seconds": self.max_truncation_seconds,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
+        """Create TimingSummary from dictionary."""
+        return cls(
+            total_segments=data["total_segments"],
+            clean_count=data["clean_count"],
+            stretched_count=data["stretched_count"],
+            truncated_count=data["truncated_count"],
+            mean_speed_factor=data["mean_speed_factor"],
+            max_truncation_seconds=data["max_truncation_seconds"],
+        )
+
+
 @dataclass
 class DubbingResult:
     """Result of a video dubbing operation.
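To make the classification buckets concrete, a small worked example. `Adj` is a duck-typed stand-in for `TimingAdjustment` (defined in `timing.py`, which is unchanged and not shown in this diff), carrying only the four fields `from_adjustments` reads; running it requires videopython 0.28.1:

```python
from dataclasses import dataclass

from videopython.ai.dubbing.models import TimingSummary


@dataclass
class Adj:  # stand-in for videopython.ai.dubbing.timing.TimingAdjustment
    speed_factor: float
    was_truncated: bool
    original_duration: float
    actual_duration: float


summary = TimingSummary.from_adjustments([
    Adj(1.0, False, 2.0, 2.0),  # clean: |speed - 1.0| <= CLEAN_SPEED_TOLERANCE
    Adj(1.2, False, 3.0, 3.0),  # stretched: 20% compression, no truncation
    Adj(1.3, True, 4.0, 3.1),   # truncated: was_truncated wins over the speed check
])
assert (summary.clean_count, summary.stretched_count, summary.truncated_count) == (1, 1, 1)
assert summary.max_truncation_seconds == 4.0 - 3.1  # 0.9s of speech cut
print(summary.to_dict())
```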
src/videopython/ai/dubbing/models.py (continued):

@@ -85,6 +177,14 @@ class DubbingResult:
         target_lang: Target language for dubbing.
         separated_audio: Separated audio components (if preserve_background=True).
         voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
+        timing_summary: Aggregate stats over per-segment timing adjustments.
+        transcript_quality: Heuristic quality assessment of the transcription
+            (None when the pipeline returned early on an empty transcription).
+        translation_failures: Indices of segments where translation failed
+            entirely. Used by Qwen3Translator when both the primary call and
+            the per-segment Marian fallback fail; those segments are dubbed
+            with empty text. Empty list under MarianTranslator (Marian has
+            no failure mode that drops segments).
     """

     dubbed_audio: Audio
@@ -94,6 +194,9 @@
     target_lang: str
     separated_audio: SeparatedAudio | None = None
     voice_samples: dict[str, Audio] = field(default_factory=dict)
+    timing_summary: TimingSummary | None = None
+    transcript_quality: TranscriptQuality | None = None
+    translation_failures: list[int] = field(default_factory=list)

     @property
     def num_segments(self) -> int:
src/videopython/ai/dubbing/pipeline.py:

@@ -9,13 +9,24 @@ from typing import TYPE_CHECKING, Any, Callable, Literal

 import numpy as np

-from videopython.ai.
+from videopython.ai._device import select_device
+from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
+from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.qwen3 import Qwen3Translator
+from videopython.ai.generation.translation import (
+    MarianTranslator,
+    TranslationBackend,
+    UnsupportedLanguageError,
+)

 if TYPE_CHECKING:
     from videopython.base.audio import Audio


+TranslatorChoice = Literal["auto", "marian", "qwen3"]
+
+
 def _peak_match(target: Audio, reference: Audio) -> Audio:
     """Scale ``target`` so its peak amplitude matches ``reference``.

@@ -46,6 +57,14 @@ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]

 logger = logging.getLogger(__name__)

+# Voice-sample quality gating thresholds. Tuned conservatively to favor
+# accepting real-world dialogue over rejecting it; failures fall back to
+# the longest segment with a WARNING log so we can re-tune from production
+# data instead of guessing.
+PEAK_CLIP_THRESHOLD = 0.99
+MIN_VOCAL_BG_RMS_RATIO = 1.5
+VOICE_SAMPLE_TARGET_DURATION = 6.0
+

 class LocalDubbingPipeline:
     """Local pipeline for video dubbing.

@@ -64,6 +83,8 @@
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -71,12 +92,15 @@
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
+        self.translator = translator
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )

         self._transcriber: Any = None
@@ -117,11 +141,64 @@
             logprob_threshold=self.logprob_threshold,
         )

-    def _init_translator(self) -> None:
-        """Initialize the translation
-
+    def _init_translator(self, source_lang: str, target_lang: str) -> None:
+        """Initialize the translation backend.
+
+        Resolves the configured ``self.translator`` choice into a concrete
+        backend. ``"auto"`` uses :meth:`_resolve_translator_auto`; explicit
+        choices instantiate the named backend directly. Re-initialization
+        is a no-op when ``self._translator`` is already a matching instance
+        for the same language pair (handled at call sites via the existing
+        ``self._translator is None`` gate).
+        """
+        if self.translator == "marian":
+            self._translator = MarianTranslator(device=self.device)
+        elif self.translator == "qwen3":
+            self._translator = Qwen3Translator(device=self.device)
+        else:  # "auto"
+            self._translator = self._resolve_translator_auto(source_lang, target_lang)
+
+    def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
+        """Pick a backend based on language coverage AND device.
+
+        Qwen3-4B Q4_K_M on CPU is roughly 10-15x slower than MarianMT (M2.1
+        spike on dreams_15min.mp4). The resolver picks Marian on CPU
+        whenever it covers the language pair and only escalates to Qwen
+        when a GPU is available or Marian doesn't cover the pair.
+        """
+        device = select_device(self.device, mps_allowed=True)
+        has_gpu = device in ("cuda", "mps")
+
+        # 1. GPU + Qwen covers the pair → Qwen wins (best quality).
+        if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
+            logger.info(
+                "translator: auto-selected qwen3 (device=%s, supports %s->%s)",
+                device,
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)
+
+        # 2. Marian covers the pair → Marian (fast).
+        if MarianTranslator.has_model_for(source_lang, target_lang):
+            if has_gpu:
+                reason = f"Qwen does not cover {source_lang}->{target_lang}"
+            else:
+                reason = f"device={device} (Qwen would be ~10-15x slower; pass translator='qwen3' to override)"
+            logger.info("translator: auto-selected marian (%s)", reason)
+            return MarianTranslator(device=self.device)
+
+        # 3. CPU + only Qwen covers it: warn loudly and use Qwen anyway.
+        if Qwen3Translator.supports(source_lang, target_lang):
+            logger.warning(
+                "translator: auto-selected qwen3 on CPU (%s->%s not in Marian); "
+                "translation will be slow (~10-15x MarianMT). Consider GPU.",
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)

-
+        raise UnsupportedLanguageError(source_lang, target_lang)

     def _init_tts(self, language: str = "en") -> None:
         """Initialize the text-to-speech model."""
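Read as a decision table, the resolver's precedence is: GPU with Qwen coverage, then Marian coverage, then CPU Qwen as a last resort, else error. A stand-alone paraphrase for skimming, not the pipeline code itself; `has_gpu`, `marian_covers`, and `qwen_covers` stand in for `select_device`, `MarianTranslator.has_model_for`, and `Qwen3Translator.supports`:

```python
def pick_backend(has_gpu: bool, marian_covers: bool, qwen_covers: bool) -> str:
    """Paraphrase of _resolve_translator_auto's branch order."""
    if has_gpu and qwen_covers:
        return "qwen3"       # 1. best quality; a GPU makes it affordable
    if marian_covers:
        return "marian"      # 2. fast path on CPU, or Qwen lacks the pair
    if qwen_covers:
        return "qwen3-slow"  # 3. only remaining option; logged as a WARNING
    # The real code raises UnsupportedLanguageError(source_lang, target_lang).
    raise LookupError("no backend covers this language pair")


assert pick_backend(has_gpu=True, marian_covers=True, qwen_covers=True) == "qwen3"
assert pick_backend(has_gpu=False, marian_covers=True, qwen_covers=True) == "marian"
assert pick_backend(has_gpu=False, marian_covers=False, qwen_covers=True) == "qwen3-slow"
```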
src/videopython/ai/dubbing/pipeline.py (continued):

@@ -141,12 +218,25 @@

     def _extract_voice_samples(
         self,
-
+        vocal_audio: Any,
+        background_audio: Any | None,
         transcription: Any,
         min_duration: float = 3.0,
         max_duration: float = 10.0,
     ) -> dict[str, Any]:
-        """Extract
+        """Extract a per-speaker voice sample with quality gating.
+
+        Picks the highest-scored segment per speaker after rejecting clipped
+        slices (peak >= ``PEAK_CLIP_THRESHOLD``) and slices where Demucs left
+        the background louder than the vocals
+        (``vocal_rms / bg_rms < MIN_VOCAL_BG_RMS_RATIO``). When the
+        background track isn't available (e.g. ``revoice`` after
+        ``low_memory`` dropped it), the RMS check is skipped silently.
+
+        Falls back to the longest available segment with a WARNING log when
+        every candidate is rejected, so the dub continues with the best
+        sample we have rather than silently dropping the speaker.
+        """
         from videopython.base.audio import Audio

         voice_samples: dict[str, Audio] = {}
@@ -159,29 +249,106 @@
             segments_by_speaker[speaker].append(segment)

         for speaker, segments in segments_by_speaker.items():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            chosen, fallback_reason = self._pick_voice_segment(
+                speaker, segments, vocal_audio, background_audio, min_duration
+            )
+
+            if chosen is None:
+                logger.warning("No usable voice-sample segment for speaker %r (no candidates)", speaker)
+                continue
+
+            if fallback_reason is not None:
+                logger.warning(
+                    "Voice-sample quality fallback for speaker %r (%d candidates): %s — using longest segment",
+                    speaker,
+                    len(segments),
+                    fallback_reason,
+                )
+
+            start = chosen.start
+            end = min(chosen.end, start + max_duration)
+            sliced = vocal_audio.slice(start, end)
+            # Audio.slice returns a numpy view into the source. Copy so the
+            # short voice sample doesn't keep the full vocals array (~1.3 GB
+            # for 2h sources) alive across translate + TTS.
+            voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)

         return voice_samples

+    def _pick_voice_segment(
+        self,
+        speaker: str,
+        segments: list[Any],
+        vocal_audio: Any,
+        background_audio: Any | None,
+        min_duration: float,
+    ) -> tuple[Any | None, str | None]:
+        """Score eligible segments and pick the best one for ``speaker``.
+
+        Returns ``(segment, fallback_reason)``. ``fallback_reason`` is None
+        when scoring picked a segment cleanly; non-None when every candidate
+        was rejected and the longest segment was used instead.
+        """
+        if not segments:
+            return None, None
+
+        eligible = [s for s in segments if (s.end - s.start) >= min_duration]
+
+        rejection_reasons: list[str] = []
+        scored: list[tuple[float, Any]] = []
+        for segment in eligible:
+            score, reason = self._score_voice_segment(segment, vocal_audio, background_audio)
+            if score is None:
+                rejection_reasons.append(reason or "rejected")
+            else:
+                scored.append((score, segment))
+
+        if scored:
+            scored.sort(key=lambda item: item[0], reverse=True)
+            return scored[0][1], None
+
+        # All eligible segments rejected (or none met the min duration).
+        # Fall back to the longest segment overall so the speaker still
+        # gets a clone reference.
+        longest = max(segments, key=lambda s: s.end - s.start)
+        if eligible:
+            reason = ", ".join(sorted(set(rejection_reasons)))
+        else:
+            reason = f"no segment >= {min_duration:.1f}s"
+        return longest, reason
+
+    def _score_voice_segment(
+        self,
+        segment: Any,
+        vocal_audio: Any,
+        background_audio: Any | None,
+    ) -> tuple[float | None, str | None]:
+        """Return ``(score, reason)`` for a candidate segment.
+
+        ``score`` is ``None`` when the segment is rejected; ``reason`` carries
+        the rejection cause so the fallback logger can summarize.
+        """
+        vocal_slice = vocal_audio.slice(segment.start, segment.end)
+        if vocal_slice.data.size == 0:
+            return None, "empty slice"
+
+        peak = float(np.max(np.abs(vocal_slice.data)))
+        if peak >= PEAK_CLIP_THRESHOLD:
+            return None, "clipped"
+
+        vocal_rms = float(np.sqrt(np.mean(vocal_slice.data**2)))
+
+        if background_audio is not None:
+            bg_slice = background_audio.slice(segment.start, segment.end)
+            if bg_slice.data.size > 0:
+                bg_rms = float(np.sqrt(np.mean(bg_slice.data**2)))
+                if bg_rms > 0 and (vocal_rms / bg_rms) < MIN_VOCAL_BG_RMS_RATIO:
+                    return None, "background-dominated"
+
+        duration = segment.end - segment.start
+        duration_penalty = abs(duration - VOICE_SAMPLE_TARGET_DURATION)
+        return vocal_rms - 0.05 * duration_penalty, None

     def process(
         self,
         source_audio: Audio,
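The two gates and the score are plain signal math. Here is the arithmetic on synthetic 48 kHz sine slices, using the module constants from this diff; a stand-alone sketch, not an import of the pipeline:

```python
import numpy as np

# Module constants from the diff above, repeated so the sketch runs standalone.
PEAK_CLIP_THRESHOLD = 0.99
MIN_VOCAL_BG_RMS_RATIO = 1.5
VOICE_SAMPLE_TARGET_DURATION = 6.0

sr = 48_000
t = np.arange(sr * 5) / sr                     # a 5-second candidate slice
vocal = 0.5 * np.sin(2 * np.pi * 220 * t)      # "voice": peak 0.5, RMS ~0.354
background = 0.1 * np.sin(2 * np.pi * 80 * t)  # Demucs residue: RMS ~0.071

peak = float(np.max(np.abs(vocal)))
assert peak < PEAK_CLIP_THRESHOLD              # gate 1: reject clipped slices

vocal_rms = float(np.sqrt(np.mean(vocal**2)))
bg_rms = float(np.sqrt(np.mean(background**2)))
assert vocal_rms / bg_rms >= MIN_VOCAL_BG_RMS_RATIO  # gate 2: vocals must dominate

duration = 5.0                                 # seconds, segment.end - segment.start
score = vocal_rms - 0.05 * abs(duration - VOICE_SAMPLE_TARGET_DURATION)
print(round(score, 3))  # ~0.304: louder wins, with a mild pull toward 6s samples
```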
src/videopython/ai/dubbing/pipeline.py (continued):

@@ -266,6 +433,23 @@
             target_lang=target_lang,
         )

+        # Cheap heuristic gate before the expensive Demucs/translation/TTS
+        # stages. Lets strict_quality callers refuse-and-refund without
+        # running the rest of the pipeline; non-strict runs continue but
+        # surface the assessment on DubbingResult.
+        transcript_quality = assess_transcript(transcription, source_audio.metadata.duration_seconds)
+        if transcript_quality.recommendation == "reject" and self.strict_quality:
+            raise GarbageTranscriptError(
+                f"Refusing to dub: {', '.join(transcript_quality.flags)}",
+                transcript_quality,
+            )
+        if transcript_quality.recommendation in ("warn", "reject"):
+            logger.warning(
+                "Transcript quality flags raised: %s (recommendation=%s)",
+                ", ".join(transcript_quality.flags),
+                transcript_quality.recommendation,
+            )
+
         detected_lang = source_lang or transcription.language or "en"

         separated_audio: SeparatedAudio | None = None
@@ -303,7 +487,7 @@
         voice_samples: dict[str, Audio] = {}
         if voice_clone:
             report_progress("Extracting voice samples", 0.25)
-            voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+            voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)

         # vocals is no longer needed; voice_samples are independent copies.
         # In low_memory mode this is the only ref keeping the buffer alive
@@ -312,13 +496,25 @@

         report_progress("Translating text", 0.35)
         if self._translator is None:
-            self._init_translator()
+            self._init_translator(source_lang=detected_lang, target_lang=target_lang)
+
+        # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
+        # MarianMT runs sequentially over 8-segment batches; on a 15-min
+        # source that's minutes of silent dwell on 0.35 without per-batch
+        # ticks. Map the [0,1] translation fraction onto that 15% window.
+        def _on_translation_progress(fraction: float) -> None:
+            clamped = max(0.0, min(1.0, fraction))
+            report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)

         translated_segments = self._translator.translate_segments(
             segments=transcription.segments,
             target_lang=target_lang,
             source_lang=detected_lang,
+            progress_callback=_on_translation_progress,
         )
+        # Capture per-segment failures (always empty for Marian) before
+        # _maybe_unload nukes the backend in low_memory mode.
+        translation_failures = list(self._translator.translation_failures)
         self._maybe_unload("_translator")

         report_progress("Generating dubbed speech", 0.50)
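The mapping in that comment is a clamp followed by an affine transform; a stand-alone sketch of the arithmetic (the helper itself is local to `process` in the real code):

```python
from math import isclose


def overall_progress(translation_fraction: float) -> float:
    """Clamp the stage-local fraction, then map [0, 1] onto [0.35, 0.50]."""
    clamped = max(0.0, min(1.0, translation_fraction))
    return 0.35 + 0.15 * clamped


assert isclose(overall_progress(0.0), 0.35)   # first batch: stage start
assert isclose(overall_progress(0.5), 0.425)  # halfway through the batches
assert isclose(overall_progress(1.0), 0.50)   # hand-off to the TTS stage
assert isclose(overall_progress(7.0), 0.50)   # out-of-range callbacks are clamped
```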
src/videopython/ai/dubbing/pipeline.py (continued):

@@ -393,7 +589,8 @@
         self._init_synchronizer()
         assert self._synchronizer is not None

-        synchronized_segments,
+        synchronized_segments, adjustments = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        timing_summary = TimingSummary.from_adjustments(adjustments)
         del dubbed_segments

         report_progress("Assembling final audio", 0.90)
@@ -429,6 +626,9 @@
             target_lang=target_lang,
             separated_audio=separated_audio,
             voice_samples=voice_samples,
+            timing_summary=timing_summary,
+            transcript_quality=transcript_quality,
+            translation_failures=translation_failures,
         )

     def revoice(
@@ -486,7 +686,10 @@
         voice_sample: Audio | None = None

         if transcription.segments:
-
+            # revoice doesn't track the background after the low_memory drop,
+            # so quality gating degrades to "no RMS check" here. Clipping is
+            # still rejected.
+            voice_samples = self._extract_voice_samples(vocal_audio, None, transcription)
             if voice_samples:
                 voice_sample = next(iter(voice_samples.values()))
