PyPI - videopython - Versions diffs - 0.32.0__tar.gz → 0.33.0__tar.gz - Mend

videopython 0.32.0__tar.gz → 0.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

{videopython-0.32.0 → videopython-0.33.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.32.0
+Version: 0.33.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.32.0 → videopython-0.33.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.32.0"
+version = "0.33.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/__init__.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """Local video dubbing functionality."""
+from videopython.ai.dubbing.config import DubbingConfig
 from videopython.ai.dubbing.dubber import VideoDubber
 from videopython.ai.dubbing.models import (
     DubbingResult,
@@ -15,6 +16,7 @@ from videopython.ai.generation.translation import UnsupportedLanguageError
 __all__ = [
     "VideoDubber",
+    "DubbingConfig",
     "DubbingResult",
     "RevoiceResult",
     "TranslatedSegment",

videopython-0.33.0/src/videopython/ai/dubbing/config.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Configuration model for the dubbing pipeline."""
+from __future__ import annotations
+from typing import Literal
+from pydantic import BaseModel, ConfigDict
+TranslatorChoice = Literal["auto", "marian", "qwen3"]
+WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
+class DubbingConfig(BaseModel):
+    """Knobs shared by :class:`VideoDubber` and :class:`LocalDubbingPipeline`.
+    Accepted as either ``config=DubbingConfig(...)`` or flat kwargs on the
+    two constructors; the flat path builds a ``DubbingConfig`` internally.
+    Attributes:
+        device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
+        low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
+            Chatterbox TTS) is unloaded from memory after it runs, so only one
+            model is resident at a time. Trades per-run latency (~10-30s of
+            extra model loads) for a much lower memory ceiling. Recommended
+            for GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+        whisper_model: Whisper model size used for transcription. Larger
+            models give better accuracy at the cost of VRAM and latency. One
+            of ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
+            Default ``turbo``.
+        condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
+            ``False`` (Whisper's own default is ``True``). With conditioning
+            on, a single hallucinated filler phrase cascades through the rest
+            of the file. See ``AudioToText`` for the full rationale.
+        no_speech_threshold: Forwarded to ``AudioToText``. Whisper's
+            no-speech gate; raise to drop more low-confidence windows.
+        logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
+            log-probability gate.
+        vocabulary: Forwarded to ``AudioToText``. Optional list of brand
+            names, product names, or proper nouns to bias Whisper's
+            first-window decoder via ``initial_prompt``. Recovers
+            near-mishears (e.g. Klarna -> "carna") on brand-monitoring
+            inputs without new model deps.
+        strict_quality: When True, the pipeline raises
+            :class:`GarbageTranscriptError` before Demucs/translation/TTS
+            run if the transcript-quality heuristic returns ``"reject"``.
+            When False (default), low-quality transcripts are logged at
+            WARNING but processing continues. Either way the
+            :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
+            inspection.
+        translator: Translation backend to use. ``"auto"`` (default) picks
+            Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and ``"qwen3"`` force
+            the named backend regardless of device. See
+            :class:`videopython.ai.generation.qwen3.Qwen3Translator` for
+            tradeoffs (Qwen3 is slower on CPU but produces context-aware,
+            length-budgeted output).
+    """
+    model_config = ConfigDict(frozen=True)
+    device: str | None = None
+    low_memory: bool = False
+    whisper_model: WhisperModel = "turbo"
+    condition_on_previous_text: bool = False
+    no_speech_threshold: float = 0.6
+    logprob_threshold: float | None = -1.0
+    vocabulary: list[str] | None = None
+    strict_quality: bool = False
+    translator: TranslatorChoice = "auto"
+    def init_log_fields(self) -> dict[str, object]:
+        """Subset of fields surfaced in the init-log line.
+        Hand-picked so log noise stays bounded as the config grows.
+        """
+        return {
+            "device": self.device.lower() if isinstance(self.device, str) else "auto",
+            "low_memory": self.low_memory,
+            "whisper_model": self.whisper_model,
+            "translator": self.translator,
+        }

{videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/dubber.py RENAMED Viewed

@@ -6,8 +6,8 @@ import logging
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
+from videopython.ai.dubbing.config import DubbingConfig
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
-from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -18,90 +18,26 @@ logger = logging.getLogger(__name__)
 class VideoDubber:
     """Dubs videos into different languages using the local pipeline.
-    Args:
-        device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
-        low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
-            Chatterbox TTS) is unloaded from memory after it runs, so only one
-            model is resident at a time. Trades per-run latency (~10-30s of
-            extra model loads) for a much lower memory ceiling. Recommended for
-            GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
-        whisper_model: Whisper model size used for transcription. Larger models
-            give better accuracy at the cost of VRAM and latency. One of
-            ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
-            Default ``turbo``.
-        condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
-            ``False`` (Whisper's own default is ``True``). With conditioning on,
-            a single hallucinated filler phrase cascades through the rest of
-            the file. See ``AudioToText`` for the full rationale.
-        no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
-            gate; raise to drop more low-confidence windows.
-        logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
-            log-probability gate.
-        vocabulary: Forwarded to ``AudioToText``. Optional list of brand
-            names, product names, or proper nouns to bias Whisper's first-
-            window decoder via ``initial_prompt``. Recovers near-mishears
-            (e.g. Klarna → "carna") on brand-monitoring inputs without new
-            model deps.
-        strict_quality: When True, the pipeline raises
-            :class:`GarbageTranscriptError` before Demucs/translation/TTS run
-            if the transcript-quality heuristic returns ``"reject"``. When
-            False (default), low-quality transcripts are logged at WARNING
-            but processing continues. Either way the
-            :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
-            inspection.
-        translator: Translation backend to use. ``"auto"`` (default)
-            picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
-            ``"qwen3"`` force the named backend regardless of device.
-            See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
-            for tradeoffs (Qwen3 is slower on CPU but produces
-            context-aware, length-budgeted output).
+    Accepts either a :class:`DubbingConfig` or the same knobs as flat kwargs
+    (``device``, ``low_memory``, ``whisper_model``, ``translator``, etc.) --
+    the flat path builds a ``DubbingConfig`` internally. See
+    :class:`DubbingConfig` for the full knob list and defaults.
     """
-    def __init__(
-        self,
-        device: str | None = None,
-        low_memory: bool = False,
-        whisper_model: WhisperModel = "turbo",
-        condition_on_previous_text: bool = False,
-        no_speech_threshold: float = 0.6,
-        logprob_threshold: float | None = -1.0,
-        vocabulary: list[str] | None = None,
-        strict_quality: bool = False,
-        translator: TranslatorChoice = "auto",
-    ):
-        self.device = device
-        self.low_memory = low_memory
-        self.whisper_model = whisper_model
-        self.condition_on_previous_text = condition_on_previous_text
-        self.no_speech_threshold = no_speech_threshold
-        self.logprob_threshold = logprob_threshold
-        self.vocabulary = vocabulary
-        self.strict_quality = strict_quality
-        self.translator = translator
+    def __init__(self, config: DubbingConfig | None = None, **kwargs: Any):
+        if config is not None and kwargs:
+            raise TypeError("Pass either `config=` or knob kwargs, not both")
+        self.config = config or DubbingConfig(**kwargs)
         self._local_pipeline: Any = None
-        requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
-            requested,
-            low_memory,
-            whisper_model,
-            translator,
+            "VideoDubber initialized with %s",
+            " ".join(f"{k}={v}" for k, v in self.config.init_log_fields().items()),
         )
     def _init_local_pipeline(self) -> None:
         from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
-        self._local_pipeline = LocalDubbingPipeline(
-            device=self.device,
-            low_memory=self.low_memory,
-            whisper_model=self.whisper_model,
-            condition_on_previous_text=self.condition_on_previous_text,
-            no_speech_threshold=self.no_speech_threshold,
-            logprob_threshold=self.logprob_threshold,
-            vocabulary=self.vocabulary,
-            strict_quality=self.strict_quality,
-            translator=self.translator,
-        )
+        self._local_pipeline = LocalDubbingPipeline(config=self.config)
     def dub(
         self,

videopython-0.33.0/src/videopython/ai/dubbing/expressiveness.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Source-prosody-driven expressiveness knobs for Chatterbox TTS."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import numpy as np
+from videopython.ai.dubbing.models import Expressiveness
+if TYPE_CHECKING:
+    from videopython.audio import Audio
+# Prosody-conditioning thresholds. Source-segment RMS / whole-vocals RMS
+# below CALM lands in the calm bucket; above DRAMATIC in the dramatic
+# bucket; in between gets Chatterbox's defaults. Knob values picked
+# by-ear on cam1_1min.mp4 -- see RELEASE_NOTES 0.29.0.
+CALM_RATIO_THRESHOLD = 0.7
+DRAMATIC_RATIO_THRESHOLD = 1.3
+_CALM = Expressiveness(exaggeration=0.3, cfg_weight=0.7)
+_DRAMATIC = Expressiveness(exaggeration=0.85, cfg_weight=0.35)
+def rms(data: np.ndarray) -> float:
+    """RMS over samples; ``0.0`` for empty input. float64 reduction so a
+    long slice can't overflow the squared accumulator."""
+    if data.size == 0:
+        return 0.0
+    return float(np.sqrt(np.mean(np.square(data, dtype=np.float64))))
+def expressiveness_for(source_slice: Audio, baseline_rms: float) -> Expressiveness:
+    """Map a source vocals slice to a Chatterbox expressiveness profile
+    by RMS ratio. Falls back to the no-knobs default for empty or silent
+    inputs."""
+    if baseline_rms <= 0.0:
+        return Expressiveness()
+    segment_rms = rms(source_slice.data)
+    if segment_rms <= 0.0:
+        return Expressiveness()
+    ratio = segment_rms / baseline_rms
+    if ratio < CALM_RATIO_THRESHOLD:
+        return _CALM
+    if ratio > DRAMATIC_RATIO_THRESHOLD:
+        return _DRAMATIC
+    return Expressiveness()

videopython-0.33.0/src/videopython/ai/dubbing/loudness.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""LUFS / peak loudness matching for dubbed audio."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import numpy as np
+if TYPE_CHECKING:
+    from videopython.audio import Audio
+# BS.1770 integrated-loudness measurement requires at least 400 ms of audio
+# (one gating block). Below this, fall back to peak match -- pyloudnorm
+# returns -inf or warns, neither of which gives a usable gain.
+_LUFS_MIN_DURATION_SECONDS = 0.4
+def peak_match(target: Audio, reference: Audio) -> Audio:
+    """Scale ``target`` so its peak amplitude matches ``reference``.
+    Used as the fallback when LUFS measurement isn't viable (clip < 0.4s
+    or silent input). The new ``Audio`` shares no buffer with ``target``.
+    """
+    from videopython.audio import Audio as _Audio
+    target_peak = float(np.max(np.abs(target.data))) if target.data.size else 0.0
+    reference_peak = float(np.max(np.abs(reference.data))) if reference.data.size else 0.0
+    if target_peak <= 0.0 or reference_peak <= 0.0:
+        return target
+    scale = reference_peak / target_peak
+    if abs(scale - 1.0) < 1e-3:
+        return target
+    return _Audio(target.data * scale, target.metadata)
+def loudness_match(target: Audio, reference: Audio) -> Audio:
+    """Scale ``target`` so its integrated loudness (BS.1770 / LUFS) matches ``reference``.
+    Demucs background normalization and the timing-assembler peak guard
+    each clamp at 1.0 instead of restoring perceived loudness, so a
+    dubbed mix lands perceptually "thinner" than the source even after
+    peak match. LUFS captures the ear-weighted envelope that peak ratio
+    misses on dialogue-heavy material.
+    Falls back to :func:`peak_match` when either clip is shorter than
+    the BS.1770 gating block (400 ms) or when measurement returns -inf
+    (silent or near-silent gated content). After gain is applied, peaks
+    are clamped to 0.99 -- BS.1770 has no peak ceiling and a sufficiently
+    quiet source can demand gain that would otherwise clip.
+    """
+    from videopython.audio import Audio as _Audio
+    target_dur = target.metadata.duration_seconds
+    ref_dur = reference.metadata.duration_seconds
+    if target_dur < _LUFS_MIN_DURATION_SECONDS or ref_dur < _LUFS_MIN_DURATION_SECONDS:
+        return peak_match(target, reference)
+    if not target.data.size or not reference.data.size:
+        return target
+    import pyloudnorm
+    target_lufs = pyloudnorm.Meter(target.metadata.sample_rate).integrated_loudness(target.data)
+    reference_lufs = pyloudnorm.Meter(reference.metadata.sample_rate).integrated_loudness(reference.data)
+    # Either clip's gated content was below -70 LUFS (effectively silent
+    # under BS.1770). Gain would be undefined -- fall back to peak match,
+    # which has its own silent-input no-op.
+    if not np.isfinite(target_lufs) or not np.isfinite(reference_lufs):
+        return peak_match(target, reference)
+    gain_db = reference_lufs - target_lufs
+    if abs(gain_db) < 0.1:
+        return target
+    scale = float(10 ** (gain_db / 20.0))
+    scaled = target.data * scale
+    peak = float(np.max(np.abs(scaled)))
+    if peak > 0.99:
+        scaled = scaled * (0.99 / peak)
+    return _Audio(scaled, target.metadata)

{videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/models.py RENAMED Viewed

@@ -2,14 +2,15 @@
 from __future__ import annotations
-from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Annotated, Any
+from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, model_validator
+from videopython.ai.dubbing.quality import TranscriptQuality
 from videopython.audio import Audio
 from videopython.base.transcription import Transcription, TranscriptionSegment
 if TYPE_CHECKING:
-    from videopython.ai.dubbing.quality import TranscriptQuality
     from videopython.ai.dubbing.timing import TimingAdjustment
@@ -19,11 +20,30 @@ if TYPE_CHECKING:
 CLEAN_SPEED_TOLERANCE = 0.01
-@dataclass(frozen=True)
-class Expressiveness:
+# TranscriptionSegment and Transcription still live in videopython.base as
+# plain dataclasses with hand-rolled to_dict/from_dict. Bridge them at
+# the field boundary so the dubbing cache wire format stays identical.
+def _validate_transcription_segment(value: Any) -> Any:
+    if value is None or isinstance(value, TranscriptionSegment):
+        return value
+    return TranscriptionSegment.from_dict(value)
+def _serialize_with_to_dict(value: Any) -> Any:
+    return value.to_dict() if value is not None else None
+_TranscriptionSegmentField = Annotated[
+    TranscriptionSegment,
+    BeforeValidator(_validate_transcription_segment),
+    PlainSerializer(_serialize_with_to_dict, return_type=dict, when_used="always"),
+]
+class Expressiveness(BaseModel):
     """Chatterbox ``generate()`` knobs derived from source-segment prosody.
-    ``None`` on any field means "let Chatterbox use its own default" —
+    ``None`` on any field means "let Chatterbox use its own default" --
     avoids pinning the dub against future Chatterbox default changes.
     Attributes:
@@ -34,6 +54,8 @@ class Expressiveness:
         temperature: Sampling temperature. Chatterbox default ``0.8``.
     """
+    model_config = ConfigDict(frozen=True)
     exaggeration: float | None = None
     cfg_weight: float | None = None
     temperature: float | None = None
@@ -54,8 +76,7 @@ class Expressiveness:
         }
-@dataclass
-class TranslatedSegment:
+class TranslatedSegment(BaseModel):
     """A segment of translated text with timing information.
     Attributes:
@@ -68,7 +89,9 @@ class TranslatedSegment:
         end: End time in seconds.
     """
-    original_segment: TranscriptionSegment
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    original_segment: _TranscriptionSegmentField
     translated_text: str
     source_lang: str
     target_lang: str
@@ -76,13 +99,17 @@ class TranslatedSegment:
     start: float = 0.0
     end: float = 0.0
-    def __post_init__(self) -> None:
-        """Set timing from original segment if not provided."""
+    @model_validator(mode="after")
+    def _default_timing_from_segment(self) -> TranslatedSegment:
+        # ``start == end == 0.0`` is the dataclass-era sentinel for "use the
+        # original segment's timing." Preserved so legacy callers (and the
+        # dub cache wire format) keep working.
         if self.start == 0.0 and self.end == 0.0:
             self.start = self.original_segment.start
             self.end = self.original_segment.end
         if self.speaker is None:
             self.speaker = self.original_segment.speaker
+        return self
     @property
     def original_text(self) -> str:
@@ -94,34 +121,8 @@ class TranslatedSegment:
         """Duration of the segment in seconds."""
         return self.end - self.start
-    def to_dict(self) -> dict[str, Any]:
-        """Convert to dictionary for JSON serialization (used by the dub cache)."""
-        return {
-            "original_segment": self.original_segment.to_dict(),
-            "translated_text": self.translated_text,
-            "source_lang": self.source_lang,
-            "target_lang": self.target_lang,
-            "speaker": self.speaker,
-            "start": self.start,
-            "end": self.end,
-        }
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> TranslatedSegment:
-        """Reconstruct from a dict produced by :meth:`to_dict`."""
-        return cls(
-            original_segment=TranscriptionSegment.from_dict(data["original_segment"]),
-            translated_text=data["translated_text"],
-            source_lang=data["source_lang"],
-            target_lang=data["target_lang"],
-            speaker=data.get("speaker"),
-            start=data.get("start", 0.0),
-            end=data.get("end", 0.0),
-        )
-@dataclass
-class SeparatedAudio:
+class SeparatedAudio(BaseModel):
     """Audio separated into different components.
     Attributes:
@@ -132,6 +133,8 @@ class SeparatedAudio:
         original: The original unseparated audio.
     """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
     vocals: Audio
     background: Audio
     original: Audio
@@ -144,8 +147,7 @@ class SeparatedAudio:
         return self.music is not None and self.effects is not None
-@dataclass
-class TimingSummary:
+class TimingSummary(BaseModel):
     """Aggregate stats over per-segment timing adjustments.
     Surfaces how aggressively the timing synchronizer had to compress or
@@ -201,32 +203,8 @@ class TimingSummary:
             max_truncation_seconds=max_truncation,
         )
-    def to_dict(self) -> dict[str, Any]:
-        """Convert to dictionary for JSON serialization."""
-        return {
-            "total_segments": self.total_segments,
-            "clean_count": self.clean_count,
-            "stretched_count": self.stretched_count,
-            "truncated_count": self.truncated_count,
-            "mean_speed_factor": self.mean_speed_factor,
-            "max_truncation_seconds": self.max_truncation_seconds,
-        }
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
-        """Create TimingSummary from dictionary."""
-        return cls(
-            total_segments=data["total_segments"],
-            clean_count=data["clean_count"],
-            stretched_count=data["stretched_count"],
-            truncated_count=data["truncated_count"],
-            mean_speed_factor=data["mean_speed_factor"],
-            max_truncation_seconds=data["max_truncation_seconds"],
-        )
-@dataclass
-class DubbingResult:
+class DubbingResult(BaseModel):
     """Result of a video dubbing operation.
     Attributes:
@@ -247,16 +225,18 @@ class DubbingResult:
             no failure mode that drops segments).
     """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
     dubbed_audio: Audio
     translated_segments: list[TranslatedSegment]
     source_transcription: Transcription
     source_lang: str
     target_lang: str
     separated_audio: SeparatedAudio | None = None
-    voice_samples: dict[str, Audio] = field(default_factory=dict)
+    voice_samples: dict[str, Audio] = Field(default_factory=dict)
     timing_summary: TimingSummary | None = None
     transcript_quality: TranscriptQuality | None = None
-    translation_failures: list[int] = field(default_factory=list)
+    translation_failures: list[int] = Field(default_factory=list)
     @property
     def num_segments(self) -> int:
@@ -283,8 +263,7 @@ class DubbingResult:
         return segments_by_speaker
-@dataclass
-class RevoiceResult:
+class RevoiceResult(BaseModel):
     """Result of a voice replacement operation.
     Attributes:
@@ -296,6 +275,8 @@ class RevoiceResult:
         speech_duration: Duration of the generated speech.
     """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
     revoiced_audio: Audio
     text: str
     separated_audio: SeparatedAudio | None = None

videopython 0.32.0__tar.gz → 0.33.0__tar.gz

videopython 0.32.0__tar.gz → 0.33.0__tar.gz