PyPI - videopython - Versions diffs - 0.28.0__tar.gz → 0.28.1__tar.gz - Mend

videopython 0.28.0tar.gz → 0.28.1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

{videopython-0.28.0 → videopython-0.28.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.28.0
+Version: 0.28.1
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -29,6 +29,7 @@ Requires-Dist: demucs>=4.0.0; extra == 'ai'
 Requires-Dist: diffusers>=0.30.0; extra == 'ai'
 Requires-Dist: easyocr>=1.7.0; extra == 'ai'
 Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
+Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
 Requires-Dist: numba>=0.61.0; extra == 'ai'
 Requires-Dist: ollama>=0.4.5; extra == 'ai'
 Requires-Dist: openai-whisper>=20240930; extra == 'ai'

{videopython-0.28.0 → videopython-0.28.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.28.0"
+version = "0.28.1"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -80,6 +80,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]
 # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -111,6 +113,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]
 [project.urls]
@@ -136,6 +140,7 @@ module = [
     "pyannote", "pyannote.*",
     "silero_vad", "silero_vad.*",
     "cv2", "cv2.*",
+    "llama_cpp", "llama_cpp.*",
 ]
 ignore_missing_imports = true

{videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/__init__.py RENAMED Viewed

@@ -5,6 +5,7 @@ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, Separate
 from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
 from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.translation import UnsupportedLanguageError
 __all__ = [
     "VideoDubber",
@@ -17,4 +18,5 @@ __all__ = [
     "GarbageTranscriptError",
     "TranscriptQuality",
     "assess_transcript",
+    "UnsupportedLanguageError",
 ]

{videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/dubber.py RENAMED Viewed

@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
-from videopython.ai.dubbing.pipeline import WhisperModel
+from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -44,6 +44,12 @@ class VideoDubber:
             but processing continues. Either way the
             :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
             inspection.
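+        translator: Translation backend to use. ``"auto"`` (default)
+            prefers Qwen3 on GPU and MarianMT on CPU, falling back to the
+            other backend when the preferred one does not cover the
+            language pair; ``"marian"`` and ``"qwen3"`` force the named
+            backend regardless of device.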
+            See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
+            for tradeoffs (Qwen3 is slower on CPU but produces
+            context-aware, length-budgeted output).
     """
     def __init__(
@@ -55,6 +61,7 @@ class VideoDubber:
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
         strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -63,13 +70,15 @@ class VideoDubber:
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
         self.strict_quality = strict_quality
+        self.translator = translator
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )
     def _init_local_pipeline(self) -> None:
@@ -83,6 +92,7 @@ class VideoDubber:
             no_speech_threshold=self.no_speech_threshold,
             logprob_threshold=self.logprob_threshold,
             strict_quality=self.strict_quality,
+            translator=self.translator,
         )
     def dub(

{videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/models.py RENAMED Viewed

@@ -180,6 +180,11 @@ class DubbingResult:
         timing_summary: Aggregate stats over per-segment timing adjustments.
         transcript_quality: Heuristic quality assessment of the transcription
             (None when the pipeline returned early on an empty transcription).
+        translation_failures: Indices of segments where translation failed
+            entirely. Populated by Qwen3Translator when the primary call, its
+            retry, and the Marian fallback all fail (or the fallback is
+            disabled); those segments are dubbed with empty text. Empty list
+            under MarianTranslator (Marian has no failure mode that drops
+            segments).
     """
     dubbed_audio: Audio
@@ -191,6 +196,7 @@ class DubbingResult:
     voice_samples: dict[str, Audio] = field(default_factory=dict)
     timing_summary: TimingSummary | None = None
     transcript_quality: TranscriptQuality | None = None
+    translation_failures: list[int] = field(default_factory=list)
     @property
     def num_segments(self) -> int:

{videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/pipeline.py RENAMED Viewed

@@ -9,14 +9,24 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
 import numpy as np
+from videopython.ai._device import select_device
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
 from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.qwen3 import Qwen3Translator
+from videopython.ai.generation.translation import (
+    MarianTranslator,
+    TranslationBackend,
+    UnsupportedLanguageError,
+)
 if TYPE_CHECKING:
     from videopython.base.audio import Audio
+TranslatorChoice = Literal["auto", "marian", "qwen3"]
 def _peak_match(target: Audio, reference: Audio) -> Audio:
     """Scale ``target`` so its peak amplitude matches ``reference``.
@@ -74,6 +84,7 @@ class LocalDubbingPipeline:
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
         strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -82,12 +93,14 @@ class LocalDubbingPipeline:
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
         self.strict_quality = strict_quality
+        self.translator = translator
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )
         self._transcriber: Any = None
@@ -128,11 +141,64 @@ class LocalDubbingPipeline:
             logprob_threshold=self.logprob_threshold,
         )
-    def _init_translator(self) -> None:
-        """Initialize the translation model."""
-        from videopython.ai.generation.translation import TextTranslator
+    def _init_translator(self, source_lang: str, target_lang: str) -> None:
+        """Initialize the translation backend.
+        Resolves the configured ``self.translator`` choice into a concrete
+        backend. ``"auto"`` uses :meth:`_resolve_translator_auto`; explicit
+        choices instantiate the named backend directly. Every call builds a
+        fresh backend; reuse is decided by the caller, which invokes this
+        only when ``self._translator is None``. That gate does not compare
+        language pairs, so an already-initialized backend is kept as-is for
+        subsequent calls.
+        """
+        if self.translator == "marian":
+            self._translator = MarianTranslator(device=self.device)
+        elif self.translator == "qwen3":
+            self._translator = Qwen3Translator(device=self.device)
+        else:  # "auto"
+            self._translator = self._resolve_translator_auto(source_lang, target_lang)
+    def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
+        """Pick a backend based on language coverage AND device.
+        Qwen3-4B Q4_K_M on CPU is roughly 10-15x slower than MarianMT (M2.1
+        spike on dreams_15min.mp4). The resolver picks Marian on CPU
+        whenever it covers the language pair and only escalates to Qwen
+        when a GPU is available or Marian doesn't cover the pair.
+        """
+        device = select_device(self.device, mps_allowed=True)
+        has_gpu = device in ("cuda", "mps")
+        # 1. GPU + Qwen covers the pair → Qwen wins (best quality).
+        if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
+            logger.info(
+                "translator: auto-selected qwen3 (device=%s, supports %s->%s)",
+                device,
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)
+        # 2. Marian covers the pair → Marian (fast).
+        if MarianTranslator.has_model_for(source_lang, target_lang):
+            if has_gpu:
+                reason = f"Qwen does not cover {source_lang}->{target_lang}"
+            else:
+                reason = f"device={device} (Qwen would be ~10-15x slower; pass translator='qwen3' to override)"
+            logger.info("translator: auto-selected marian (%s)", reason)
+            return MarianTranslator(device=self.device)
+        # 3. CPU + only Qwen covers it: warn loudly and use Qwen anyway.
+        if Qwen3Translator.supports(source_lang, target_lang):
+            logger.warning(
+                "translator: auto-selected qwen3 on CPU (%s->%s not in Marian); "
+                "translation will be slow (~10-15x MarianMT). Consider GPU.",
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)
-        self._translator = TextTranslator(device=self.device)
+        raise UnsupportedLanguageError(source_lang, target_lang)
     def _init_tts(self, language: str = "en") -> None:
         """Initialize the text-to-speech model."""
@@ -430,7 +496,7 @@ class LocalDubbingPipeline:
         report_progress("Translating text", 0.35)
         if self._translator is None:
-            self._init_translator()
+            self._init_translator(source_lang=detected_lang, target_lang=target_lang)
         # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
         # MarianMT runs sequentially over 8-segment batches; on a 15-min
@@ -446,6 +512,9 @@ class LocalDubbingPipeline:
             source_lang=detected_lang,
             progress_callback=_on_translation_progress,
         )
+        # Capture per-segment failures (always empty for Marian) before
+        # _maybe_unload nukes the backend in low_memory mode.
+        translation_failures = list(self._translator.translation_failures)
         self._maybe_unload("_translator")
         report_progress("Generating dubbed speech", 0.50)
@@ -559,6 +628,7 @@ class LocalDubbingPipeline:
             voice_samples=voice_samples,
             timing_summary=timing_summary,
             transcript_quality=transcript_quality,
+            translation_failures=translation_failures,
         )
     def revoice(

videopython-0.28.1/src/videopython/ai/generation/qwen3.py ADDED Viewed

@@ -0,0 +1,394 @@
+"""Qwen3-Instruct translation backend (M2).
+GGUF inference via ``llama-cpp-python``. One model for now —
+``Qwen3-4B-Instruct-2507`` (Apache-2.0, ~2.4 GB Q4_K_M). The original M2
+plan called for low/medium/high tiers (4B / 8B / 30B-A3B); we deferred
+that complexity until M2.4 eval data shows the larger models actually
+deliver a quality lift worth the VRAM cost.
+Latency note: on CPU the 4B model is roughly 10-15× slower than
+:class:`MarianTranslator` per the M2.1 spike. On GPU it lands within ~2×
+of Marian. Translation quality is decisively higher than Marian on
+context-dependent and idiomatic content. The pipeline's
+:class:`LocalDubbingPipeline` chooses based on ``device`` + the
+``translator`` kwarg.
+"""
+from __future__ import annotations
+import json
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable
+from videopython.ai._device import release_device_memory, select_device
+from videopython.ai.generation.translation import (
+    LANGUAGE_NAMES,
+    MarianTranslator,
+    _is_translatable_text,
+)
+from videopython.base.text.transcription import TranscriptionSegment
+# Imported under TYPE_CHECKING only — qwen3 sits below videopython.ai.dubbing
+# in the import order (pipeline.py imports Qwen3Translator), so a top-level
+# import would create a cycle. At runtime, translate_segments pulls it in
+# via a lazy local import.
+if TYPE_CHECKING:
+    from videopython.ai.dubbing.models import TranslatedSegment
+logger = logging.getLogger(__name__)
+# Default model. Constants are module-level so an eval harness or future
+# tier pick can override at the call site without forking the class.
+DEFAULT_REPO_ID = "unsloth/Qwen3-4B-Instruct-2507-GGUF"
+DEFAULT_FILENAME = "Qwen3-4B-Instruct-2507-Q4_K_M.gguf"
+# Average characters per second of natural speech, used to derive the
+# per-segment ``target_chars`` budget. Rough field measurements; the prompt
+# tells Qwen this is a target ±15%, not a hard cap.
+_SPEECH_CHARS_PER_SEC: dict[str, float] = {
+    "en": 14.0,
+    "es": 14.0,
+    "pt": 13.5,
+    "it": 13.5,
+    "fr": 13.0,
+    "de": 12.0,
+    "pl": 12.5,
+    "nl": 12.5,
+    "ru": 12.0,
+    "uk": 12.0,
+    "cs": 12.0,
+    "sk": 12.0,
+    "ro": 13.0,
+    "hu": 12.0,
+    "fi": 11.0,
+    "sv": 12.5,
+    "da": 13.0,
+    "nb": 13.0,
+    "no": 13.0,
+    "ja": 8.0,
+    "ko": 9.0,
+    "zh": 7.0,
+    "zh-CN": 7.0,
+    "zh-TW": 7.0,
+    "th": 9.0,
+    "vi": 11.0,
+    "ar": 10.0,
+    "he": 10.0,
+    "hi": 11.0,
+    "ta": 10.0,
+    "id": 12.0,
+    "ms": 12.0,
+    "tr": 12.0,
+    "el": 12.0,
+}
+_SPEECH_CHARS_DEFAULT = 12.0
+# A transcription segment's avg_logprob is in [-inf, 0]. Values below this
+# threshold mark a transcription window we don't trust — Qwen gets a
+# ``low_confidence`` hint so it doesn't over-anchor on the source text.
+_LOW_LOGPROB_HINT_THRESHOLD = -1.0
+def _target_chars_for(duration_seconds: float, target_lang: str) -> int:
+    """Character-count budget for a segment of ``duration_seconds`` in ``target_lang``."""
+    rate = _SPEECH_CHARS_PER_SEC.get(target_lang, _SPEECH_CHARS_DEFAULT)
+    return max(1, int(duration_seconds * rate * 1.15))
+def _build_system_prompt(source_lang: str, target_lang: str) -> str:
+    """Stable system + format spec. The few-shot example uses generic
+    phrases (no fixture-specific content) so the prompt generalizes.
+    """
+    src_name = LANGUAGE_NAMES.get(source_lang, source_lang)
+    tgt_name = LANGUAGE_NAMES.get(target_lang, target_lang)
+    return (
+        f"You are a professional dub translator. Translate from {src_name} to {tgt_name}.\n"
+        "Preserve register and proper nouns. Match each segment's syllable count so the\n"
+        "dub fits the original timing — translation is for spoken audio, not subtitles.\n"
+        "Aim for ``target_chars`` characters per segment (±15%).\n"
+        "If a segment is non-speech filler (grunts, laughter, music cues) keep it as filler in\n"
+        "the target language; do not invent content.\n"
+        "If a segment carries ``low_confidence``, the source transcription may be wrong;\n"
+        "translate conservatively rather than committing to a specific phrase.\n"
+        "\n"
+        "Output one JSON object per line, no preamble, no commentary, no markdown:\n"
+        '{"i": <segment_index>, "translated": "<text>"}\n'
+    )
+def _build_user_prompt(segments: list[TranscriptionSegment], target_lang: str) -> str:
+    """Per-call body — the segments to translate."""
+    lines: list[str] = []
+    for idx, seg in enumerate(segments):
+        budget = _target_chars_for(seg.end - seg.start, target_lang)
+        entry: dict[str, Any] = {
+            "i": idx,
+            "text": seg.text,
+            "target_chars": budget,
+        }
+        if seg.avg_logprob is not None and seg.avg_logprob < _LOW_LOGPROB_HINT_THRESHOLD:
+            entry["low_confidence"] = True
+        lines.append(json.dumps(entry, ensure_ascii=False))
+    request_block = "\n".join(lines)
+    return (
+        f"Input segments:\n{request_block}\nTranslations (one JSON object per line, exactly {len(segments)} lines):\n"
+    )
+def _parse_jsonl_response(raw: str) -> dict[int, str]:
+    """Extract ``{i: translated_text}`` from Qwen output. Permissive — tolerates
+    markdown fences and preamble lines that the model occasionally adds."""
+    parsed: dict[int, str] = {}
+    for line in raw.splitlines():
+        line = line.strip()
+        if not line or line.startswith("```"):
+            continue
+        try:
+            obj = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(obj, dict) and "i" in obj and "translated" in obj:
+            try:
+                parsed[int(obj["i"])] = str(obj["translated"])
+            except (TypeError, ValueError):
+                continue
+    return parsed
+class Qwen3Translator:
+    """Qwen3-Instruct translation via llama-cpp-python (GGUF).
+    Args:
+        device: ``"cuda"``, ``"mps"``, ``"cpu"``, or ``None`` for auto.
+        marian_fallback: If True (default), fall back to Marian for any
+            segment that fails Qwen's parse retry. Set False to disable
+            (failures land in ``translation_failures`` instead).
+        repo_id: HuggingFace repo for the GGUF weights. Defaults to
+            ``DEFAULT_REPO_ID``; override for eval harnesses.
+        filename: GGUF filename within ``repo_id``. Defaults to
+            ``DEFAULT_FILENAME``.
+        n_ctx: llama.cpp context window. 8192 is plenty for a 15-min source;
+            raise for very long sources. Hard cap is the model's training
+            context (262K for Qwen3-4B-Instruct-2507).
+        max_tokens: Generation cap per call. 4× the input character count
+            is a safe upper bound for translation output.
+        temperature: Decoding temperature. 0.1 keeps output structurally
+            consistent (high JSON parse rate) without being deterministic.
+    """
+    def __init__(
+        self,
+        device: str | None = None,
+        marian_fallback: bool = True,
+        repo_id: str = DEFAULT_REPO_ID,
+        filename: str = DEFAULT_FILENAME,
+        n_ctx: int = 8192,
+        max_tokens: int = 4096,
+        temperature: float = 0.1,
+    ):
+        self.device = device
+        self.marian_fallback = marian_fallback
+        self.repo_id = repo_id
+        self.filename = filename
+        self.n_ctx = n_ctx
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        # Lazily initialized.
+        self._llm: Any = None
+        self._marian: MarianTranslator | None = None
+        # Tracks which segment indices both Qwen and Marian failed on. The
+        # pipeline reads this to populate DubbingResult.translation_failures.
+        self._failures_last_call: list[int] = []
+    def _init_local(self) -> None:
+        """Download (if needed) and load the GGUF weights."""
+        from huggingface_hub import hf_hub_download
+        from llama_cpp import Llama
+        # Warn about CPU latency at load time (not __init__) — the warning is
+        # about runtime cost, which only applies once the model is actually
+        # loaded. Construction is cheap; tests instantiate Qwen3Translator
+        # without intending to run inference, so __init__ shouldn't shout.
+        resolved = select_device(self.device, mps_allowed=True)
+        if resolved == "cpu":
+            logger.warning(
+                "Qwen3Translator on CPU is ~10-15x slower than MarianTranslator. "
+                "Consider translator='marian' for development or pass device='cuda'/'mps'.",
+            )
+        logger.info("Qwen3Translator: loading %s", self.filename)
+        model_path = Path(hf_hub_download(repo_id=self.repo_id, filename=self.filename))
+        # n_gpu_layers=-1 offloads everything to GPU when one is available;
+        # 0 forces CPU. llama-cpp-python's Metal/CUDA support detects and
+        # uses whatever the build was compiled against.
+        n_gpu_layers = 0 if resolved == "cpu" else -1
+        # n_threads omitted on purpose — llama-cpp-python picks a per-host
+        # default (half the logical CPU count, at least 1). Hardcoding 8
+        # under-utilizes a 16-core box and over-subscribes a 4-core CI.
+        self._llm = Llama(
+            model_path=str(model_path),
+            n_ctx=self.n_ctx,
+            n_gpu_layers=n_gpu_layers,
+            verbose=False,
+        )
+    def _qwen_translate(
+        self, segments: list[TranscriptionSegment], target_lang: str, source_lang: str
+    ) -> dict[int, str]:
+        """One Qwen call to translate all segments. Empty result on parse failure."""
+        if self._llm is None:
+            self._init_local()
+        system = _build_system_prompt(source_lang, target_lang)
+        user = _build_user_prompt(segments, target_lang)
+        prompt = system + user
+        response = self._llm(
+            prompt,
+            max_tokens=self.max_tokens,
+            temperature=self.temperature,
+            stop=None,
+        )
+        raw = response["choices"][0]["text"]
+        return _parse_jsonl_response(raw)
+    def translate_segments(
+        self,
+        segments: list[TranscriptionSegment],
+        target_lang: str,
+        source_lang: str | None = None,
+        progress_callback: Callable[[float], None] | None = None,
+    ) -> list[TranslatedSegment]:
+        """Translate segments via Qwen with parse-retry + optional Marian fallback.
+        The progress_callback fires three times: 0.5 after the first
+        Qwen call, 0.9 after the optional retry (before any Marian
+        fallback runs), 1.0 at the end. M2.1 phase 2 confirmed smaller
+        batches don't help on CPU, so finer-grained progress isn't
+        possible without fake ticks.
+        """
+        effective_source = source_lang or "en"
+        self._failures_last_call = []
+        translatable_indices = [i for i, seg in enumerate(segments) if _is_translatable_text(seg.text)]
+        translatable_segments = [segments[i] for i in translatable_indices]
+        # First attempt.
+        if translatable_segments:
+            qwen_results = self._qwen_translate(translatable_segments, target_lang, effective_source)
+        else:
+            qwen_results = {}
+        if progress_callback is not None:
+            progress_callback(0.5)
+        # Identify segments Qwen failed (unparseable or missing index).
+        # Indices in qwen_results / translatable_segments are 0-based positions
+        # within translatable_segments, NOT positions in the full ``segments``
+        # list. Map back at the end.
+        missing_local_indices = [li for li in range(len(translatable_segments)) if li not in qwen_results]
+        # Retry once on the missing subset only (same prompt as the first attempt).
+        if missing_local_indices:
+            retry_segments = [translatable_segments[li] for li in missing_local_indices]
+            logger.info(
+                "Qwen3Translator: retrying %d/%d segments after first parse",
+                len(retry_segments),
+                len(translatable_segments),
+            )
+            retry_results = self._qwen_translate(retry_segments, target_lang, effective_source)
+            # retry_results uses 0..len(retry_segments)-1 as keys; map back.
+            for retry_local, original_local in enumerate(missing_local_indices):
+                if retry_local in retry_results:
+                    qwen_results[original_local] = retry_results[retry_local]
+        if progress_callback is not None:
+            progress_callback(0.9)
+        # Anything still missing → Marian fallback (or surface as failure).
+        still_missing_local = [li for li in range(len(translatable_segments)) if li not in qwen_results]
+        if still_missing_local and self.marian_fallback:
+            fallback_segments = [translatable_segments[li] for li in still_missing_local]
+            logger.warning(
+                "Qwen3Translator: falling back to Marian for %d segments after retry",
+                len(fallback_segments),
+            )
+            if self._marian is None:
+                self._marian = MarianTranslator(device=self.device)
+            try:
+                fallback_translated = self._marian.translate_segments(
+                    fallback_segments, target_lang=target_lang, source_lang=effective_source
+                )
+                for li, ts in zip(still_missing_local, fallback_translated):
+                    qwen_results[li] = ts.translated_text
+            except Exception as exc:
+                logger.warning("Qwen3Translator: Marian fallback failed (%s)", exc)
+                # Leave them missing; they'll be recorded as failures below.
+        # Whatever's still missing is a hard failure. Record original-segment
+        # indices (positions in the full ``segments`` list) so the caller
+        # can reconcile against translated_segments.
+        for li in range(len(translatable_segments)):
+            if li not in qwen_results:
+                self._failures_last_call.append(translatable_indices[li])
+        # Lazy import to avoid a circular dep through videopython.ai.dubbing
+        # (see TYPE_CHECKING import at the top of the module).
+        from videopython.ai.dubbing.models import TranslatedSegment
+        # Materialize TranslatedSegments parallel to the input list.
+        translated_segments: list[TranslatedSegment] = []
+        local_translation_for_orig: dict[int, str] = {}
+        for li, original_idx in enumerate(translatable_indices):
+            if li in qwen_results:
+                local_translation_for_orig[original_idx] = qwen_results[li]
+        for i, segment in enumerate(segments):
+            translated_text = local_translation_for_orig.get(i, "")
+            translated_segments.append(
+                TranslatedSegment(
+                    original_segment=segment,
+                    translated_text=translated_text,
+                    source_lang=effective_source,
+                    target_lang=target_lang,
+                    speaker=segment.speaker,
+                    start=segment.start,
+                    end=segment.end,
+                )
+            )
+        if progress_callback is not None:
+            progress_callback(1.0)
+        return translated_segments
+    @property
+    def translation_failures(self) -> list[int]:
+        """Indices (in the most recent ``segments`` input) where translation
+        failed entirely. Empty if all segments translated.
+        """
+        return list(self._failures_last_call)
+    def unload(self) -> None:
+        """Release the model so the next call re-initializes. Used by
+        :class:`LocalDubbingPipeline` in ``low_memory`` mode."""
+        self._llm = None
+        if self._marian is not None:
+            self._marian.unload()
+            self._marian = None
+        release_device_memory(self.device)
+    @staticmethod
+    def get_supported_languages() -> dict[str, str]:
+        """Qwen handles all of Marian's language set plus more; we expose the
+        Marian set for now and let M2.4 eval add anything Qwen-only.
+        """
+        return LANGUAGE_NAMES.copy()
+    @classmethod
+    def supports(cls, source_lang: str, target_lang: str) -> bool:
+        """Coverage hint for the M2.3 ``auto`` resolver."""
+        if source_lang == target_lang:
+            return True
+        return source_lang in LANGUAGE_NAMES and target_lang in LANGUAGE_NAMES

{videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/translation.py RENAMED Viewed

@@ -1,13 +1,50 @@
-"""Text translation using local Helsinki-NLP models."""
+"""Text translation backends.
+Two backends share the :class:`TranslationBackend` protocol:
+- :class:`MarianTranslator` (HuggingFace Helsinki-NLP MarianMT) — fast,
+  segment-isolated, available for ~30 language pairs. Default on CPU.
+- :class:`Qwen3Translator` (Qwen3-4B-Instruct-2507 via llama-cpp-python) —
+  slower but produces context-aware, length-budgeted translations. Default
+  on GPU.
+The dubbing pipeline (:mod:`videopython.ai.dubbing.pipeline`) picks a backend
+based on its ``translator`` kwarg (``"auto"`` resolves at runtime).
+"""
 from __future__ import annotations
-from typing import Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Protocol, runtime_checkable
 from videopython.ai._device import log_device_initialization, release_device_memory, select_device
-from videopython.ai.dubbing.models import TranslatedSegment
 from videopython.base.text.transcription import TranscriptionSegment
+# Imported under TYPE_CHECKING to avoid a circular dep through
+# videopython.ai.dubbing (the dubbing pipeline imports both
+# MarianTranslator and Qwen3Translator, which both import
+# TranslatedSegment from dubbing.models). Runtime users do a lazy
+# local import inside translate_segments.
+if TYPE_CHECKING:
+    from videopython.ai.dubbing.models import TranslatedSegment
+class UnsupportedLanguageError(ValueError):
+    """Raised when no available translation backend supports a given
+    ``(source, target)`` language pair.
+    Carries the requested pair so callers can introspect:
+        try:
+            dubber.dub(video, target_lang="xh")
+        except UnsupportedLanguageError as e:
+            print(f"No backend covers {e.source_lang}->{e.target_lang}")
+    """
+    def __init__(self, source_lang: str, target_lang: str, message: str | None = None):
+        self.source_lang = source_lang
+        self.target_lang = target_lang
+        super().__init__(message or f"No translation backend supports {source_lang}->{target_lang}")
 def _is_translatable_text(text: str) -> bool:
     """Return True if text has enough content to be worth translating.
@@ -19,6 +56,36 @@ def _is_translatable_text(text: str) -> bool:
     return sum(1 for c in text if c.isalnum()) >= 2
+@runtime_checkable
+class TranslationBackend(Protocol):
+    """Pipeline-facing translation interface.
+    Both :class:`MarianTranslator` and :class:`Qwen3Translator` satisfy
+    this. The pipeline only depends on these methods.
+    """
+    def translate_segments(
+        self,
+        segments: list[TranscriptionSegment],
+        target_lang: str,
+        source_lang: str | None = None,
+        progress_callback: Callable[[float], None] | None = None,
+    ) -> list[TranslatedSegment]: ...
+    def unload(self) -> None: ...
+    @property
+    def translation_failures(self) -> list[int]:
+        """Indices into the most recent ``segments`` input where the backend
+        could not produce a translation. Empty for backends that never fail
+        per-segment (e.g. MarianTranslator). The dubbing pipeline copies
+        this onto :attr:`DubbingResult.translation_failures`."""
+        ...
+    @staticmethod
+    def get_supported_languages() -> dict[str, str]: ...
 LANGUAGE_NAMES = {
     "en": "English",
     "es": "Spanish",
@@ -56,8 +123,8 @@ LANGUAGE_NAMES = {
 }
-class TextTranslator:
-    """Translates text between languages using local seq2seq models."""
+class MarianTranslator:
+    """Translates text between languages using local Helsinki-NLP MarianMT models."""
     # Languages without a direct opus-mt-{src}-{tgt} model. Maps (source, target)
     # to an alternative HuggingFace model identifier.
@@ -68,6 +135,25 @@ class TextTranslator:
         ("en", "pl"): "Helsinki-NLP/opus-mt-en-zlw",
     }
+    @classmethod
+    def has_model_for(cls, source_lang: str, target_lang: str) -> bool:
+        """Return True if Marian has (or is likely to have) a model for ``(source, target)``.
+        Same-language pairs return True (translation is the identity).
+        Otherwise: True if either an entry in ``_MODEL_OVERRIDES`` exists or
+        both languages are in :data:`LANGUAGE_NAMES`. The latter is a
+        permissive proxy — Marian publishes ``opus-mt-{src}-{tgt}`` for
+        most ISO-639-1 pairs we expose, but not all (e.g. some Asian-to-
+        Asian pairs route through English). Used by the M2.3 ``auto``
+        resolver as a *coverage hint*; the actual existence check happens
+        at first-use download time.
+        """
+        if source_lang == target_lang:
+            return True
+        if (source_lang, target_lang) in cls._MODEL_OVERRIDES:
+            return True
+        return source_lang in LANGUAGE_NAMES and target_lang in LANGUAGE_NAMES
     def __init__(self, model_name: str | None = None, device: str | None = None):
         self.model_name = model_name
         self.device = device
@@ -194,6 +280,10 @@ class TextTranslator:
         callers can render translation-stage progress without knowing the
         batch size.
         """
+        # Lazy import to avoid a circular dep through videopython.ai.dubbing
+        # (see TYPE_CHECKING import at the top of the module).
+        from videopython.ai.dubbing.models import TranslatedSegment
         effective_source = source_lang or "en"
         translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
@@ -230,6 +320,20 @@ class TextTranslator:
         self._current_lang_pair = None
         release_device_memory(self.device)
+    @property
+    def translation_failures(self) -> list[int]:
+        """Marian never fails per-segment (worst case it produces poor
+        output, not no output). Always empty; satisfies the
+        :class:`TranslationBackend` protocol."""
+        return []
     @staticmethod
     def get_supported_languages() -> dict[str, str]:
         return LANGUAGE_NAMES.copy()
+# Back-compat alias. ``TextTranslator`` was the class name through 0.28.0;
+# 0.28.1 renames it to ``MarianTranslator`` to make room for ``Qwen3Translator``
+# behind a shared :class:`TranslationBackend` protocol. The alias will be
+# removed in 0.30.0.
+TextTranslator = MarianTranslator

{videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/audio.py RENAMED Viewed

@@ -11,6 +11,36 @@ from videopython.base.text.transcription import Transcription, TranscriptionSegm
 from videopython.base.video import Video
+def _attach_confidence_by_overlap(
+    target_segments: list[TranscriptionSegment],
+    source_segments: list[TranscriptionSegment],
+) -> None:
+    """Stamp Whisper confidence (avg_logprob, no_speech_prob, compression_ratio)
+    onto ``target_segments`` from the ``source_segments`` they overlap most with.
+    Used to re-attach per-segment confidence after diarization rebuilds segments
+    from words and drops the original Whisper-segment metadata. Whisper's
+    confidence is window-level, not phoneme-level, so overlap-by-time is the
+    right granularity — re-deriving per-word and re-aggregating wouldn't be
+    more accurate.
+    Mutates ``target_segments`` in place. Segments with no overlap to any
+    source segment are left untouched (their confidence stays None).
+    """
+    for tgt in target_segments:
+        best_overlap = 0.0
+        best_src: TranscriptionSegment | None = None
+        for src in source_segments:
+            overlap = max(0.0, min(tgt.end, src.end) - max(tgt.start, src.start))
+            if overlap > best_overlap:
+                best_overlap = overlap
+                best_src = src
+        if best_src is not None:
+            tgt.avg_logprob = best_src.avg_logprob
+            tgt.no_speech_prob = best_src.no_speech_prob
+            tgt.compression_ratio = best_src.compression_ratio
 class AudioToText:
     """Transcription service for audio and video using local Whisper models.
@@ -295,6 +325,13 @@ class AudioToText:
         transcription = self._process_transcription_result(transcription_result)
+        # Capture original Whisper segments before flattening to words. The
+        # diarization rebuild via Transcription(words=...) regroups by speaker,
+        # which loses the per-segment confidence M1.3 plumbed through. We
+        # re-attach by max-overlap match below so M2's confidence-aware
+        # translation prompts have signal on the diarized path too.
+        whisper_segments = transcription.segments
         all_words: list[TranscriptionWord] = []
         for seg in transcription.segments:
             all_words.extend(seg.words)
@@ -302,7 +339,9 @@ class AudioToText:
         if all_words:
             all_words = self._assign_speakers_to_words(all_words, diarization_result)
-        return Transcription(words=all_words, language=transcription.language)
+        rebuilt = Transcription(words=all_words, language=transcription.language)
+        _attach_confidence_by_overlap(rebuilt.segments, whisper_segments)
+        return rebuilt
     def _transcribe_local(self, audio: Audio) -> Transcription:
         """Transcribe using local Whisper model.