PyPI - videopython - Versions diffs - 0.29.0__tar.gz → 0.30.0__tar.gz - Mend

videopython 0.29.0__tar.gz → 0.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{videopython-0.29.0 → videopython-0.30.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.29.0
+Version: 0.30.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -201,9 +201,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
 | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
 | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
 | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
-| **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
-API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/) | [Object Swapping](https://videopython.com/api/ai/swapping/)
+API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
 ## Examples

{videopython-0.29.0 → videopython-0.30.0}/README.md RENAMED Viewed

@@ -152,9 +152,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
 | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
 | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
 | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
-| **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
-API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/) | [Object Swapping](https://videopython.com/api/ai/swapping/)
+API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
 ## Examples

{videopython-0.29.0 → videopython-0.30.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.29.0"
+version = "0.30.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/__init__.py RENAMED Viewed

@@ -1,7 +1,6 @@
 from videopython.ai import registry as _ai_registry  # noqa: F401
 from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
-from .swapping import ObjectSwapper
 from .transforms import FaceTrackingCrop, SplitScreenComposite
 from .understanding import (
     AudioClassifier,
@@ -28,8 +27,6 @@ __all__ = [
     # Transforms (AI-powered)
     "FaceTrackingCrop",
     "SplitScreenComposite",
-    # Swapping
-    "ObjectSwapper",
     # Video analysis
     "VideoAnalysis",
     "VideoAnalysisConfig",

{videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/__init__.py RENAMED Viewed

@@ -1,6 +1,5 @@
 """Local video dubbing functionality."""
-from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
 from videopython.ai.dubbing.dubber import VideoDubber
 from videopython.ai.dubbing.models import (
     DubbingResult,
@@ -26,7 +25,5 @@ __all__ = [
     "TranscriptQuality",
     "assess_transcript",
     "UnsupportedLanguageError",
-    "DubCache",
-    "dub_cache_clear",
     "Expressiveness",
 ]

{videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/dubber.py RENAMED Viewed

@@ -37,6 +37,11 @@ class VideoDubber:
             gate; raise to drop more low-confidence windows.
         logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
             log-probability gate.
+        vocabulary: Forwarded to ``AudioToText``. Optional list of brand
+            names, product names, or proper nouns to bias Whisper's first-
+            window decoder via ``initial_prompt``. Recovers near-mishears
+            (e.g. Klarna → "carna") on brand-monitoring inputs without new
+            model deps.
         strict_quality: When True, the pipeline raises
             :class:`GarbageTranscriptError` before Demucs/translation/TTS run
             if the transcript-quality heuristic returns ``"reject"``. When
@@ -50,13 +55,6 @@ class VideoDubber:
             See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
             for tradeoffs (Qwen3 is slower on CPU but produces
             context-aware, length-budgeted output).
-        cache_dir: When set, persist transcription, translated segments,
-            and per-segment TTS WAVs under this directory and skip stages
-            whose inputs already match a cache entry. Use to resume crashed
-            long runs or to iterate on dub configuration without paying
-            transcription cost each time. ``None`` (default) disables
-            caching. Cache grows unbounded; clear via
-            :func:`videopython.ai.dubbing.cache.dub_cache_clear`.
     """
     def __init__(
@@ -67,9 +65,9 @@ class VideoDubber:
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        vocabulary: list[str] | None = None,
         strict_quality: bool = False,
         translator: TranslatorChoice = "auto",
-        cache_dir: str | Path | None = None,
     ):
         self.device = device
         self.low_memory = low_memory
@@ -77,18 +75,17 @@ class VideoDubber:
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.vocabulary = vocabulary
         self.strict_quality = strict_quality
         self.translator = translator
-        self.cache_dir = cache_dir
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
             translator,
-            cache_dir,
         )
     def _init_local_pipeline(self) -> None:
@@ -101,9 +98,9 @@ class VideoDubber:
             condition_on_previous_text=self.condition_on_previous_text,
             no_speech_threshold=self.no_speech_threshold,
             logprob_threshold=self.logprob_threshold,
+            vocabulary=self.vocabulary,
             strict_quality=self.strict_quality,
             translator=self.translator,
-            cache_dir=self.cache_dir,
         )
     def dub(

{videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/models.py RENAMED Viewed

@@ -41,8 +41,7 @@ class Expressiveness:
     def as_kwargs(self) -> dict[str, float]:
         """Knobs as a dict, dropping ``None`` entries.
-        Suitable for ``**``-expansion into Chatterbox or
-        :meth:`DubCache.tts_key`.
+        Suitable for ``**``-expansion into Chatterbox.
         """
         return {
             name: value

{videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/pipeline.py RENAMED Viewed

@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
 import numpy as np
 from videopython.ai._device import select_device
-from videopython.ai.dubbing.cache import DubCache
 from videopython.ai.dubbing.models import DubbingResult, Expressiveness, RevoiceResult, SeparatedAudio, TimingSummary
 from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -170,9 +169,9 @@ class LocalDubbingPipeline:
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        vocabulary: list[str] | None = None,
         strict_quality: bool = False,
         translator: TranslatorChoice = "auto",
-        cache_dir: str | Path | None = None,
     ):
         self.device = device
         self.low_memory = low_memory
@@ -180,17 +179,16 @@ class LocalDubbingPipeline:
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.vocabulary = vocabulary
         self.strict_quality = strict_quality
         self.translator = translator
-        self.cache_dir = Path(cache_dir) if cache_dir is not None else None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
             translator,
-            self.cache_dir,
         )
         self._transcriber: Any = None
@@ -200,7 +198,6 @@ class LocalDubbingPipeline:
         self._tts_language: str | None = None
         self._separator: Any = None
         self._synchronizer: TimingSynchronizer | None = None
-        self._cache: DubCache | None = DubCache(self.cache_dir) if self.cache_dir is not None else None
     def _maybe_unload(self, component_name: str) -> None:
         """Unload a stage's model when low_memory mode is enabled.
@@ -219,88 +216,42 @@ class LocalDubbingPipeline:
             logger.info("low_memory: unloading %s", component_name.lstrip("_"))
             unload()
-    def _transcribe_with_cache(
+    def _transcribe(
         self,
         source_audio: Audio,
         enable_diarization: bool,
     ) -> Transcription:
-        """Run transcription with cache-around-the-call.
-        Cache miss: lazy-init the transcriber, transcribe, store the
-        result (including all hashed kwargs in metadata.json so future
-        invalidators have provenance).
-        Cache hit: return the deserialized :class:`Transcription` without
-        touching Whisper/diarization at all.
-        """
-        src_hash, kwargs_hash = self._transcription_cache_keys(source_audio, enable_diarization)
-        if self._cache is not None:
-            cached = self._cache.get_transcription(src_hash, kwargs_hash)
-            if cached is not None:
-                return cached
+        """Lazy-init the transcriber and run it on ``source_audio``."""
         if self._transcriber is None or self._transcriber_diarization != enable_diarization:
             self._init_transcriber(enable_diarization=enable_diarization)
             self._transcriber_diarization = enable_diarization
         transcription = self._transcriber.transcribe(source_audio)
         self._maybe_unload("_transcriber")
-        if self._cache is not None:
-            self._cache.put_transcription(
-                src_hash,
-                kwargs_hash,
-                transcription,
-                hash_inputs={
-                    "whisper_model": self.whisper_model,
-                    "enable_diarization": enable_diarization,
-                    "condition_on_previous_text": self.condition_on_previous_text,
-                    "no_speech_threshold": self.no_speech_threshold,
-                    "logprob_threshold": self.logprob_threshold,
-                },
-            )
         return transcription
     def _tts_segment_audio(
         self,
         segment: TranslatedSegment,
         speaker: str,
-        speaker_bytes: bytes | None,
         target_lang: str,
         voice_clone: bool,
         voice_samples: dict[str, Audio],
         speaker_wav_paths: dict[str, Path],
-        src_hash_for_tts: str,
         expressiveness: Expressiveness = Expressiveness(),
     ) -> Audio | None:
-        """Produce the TTS audio for a single segment, with cache-around-the-call.
+        """Produce the TTS audio for a single segment.
         Returns the synthesized :class:`Audio`, or ``None`` if Chatterbox
-        crashed on the segment (the caller skips it). On cache miss the
-        TTS model is lazy-initialized and the per-speaker temp WAV is
-        materialized before generation; on cache hit none of that runs,
-        so a fully-cached run never loads Chatterbox.
+        crashed on the segment (the caller skips it). The TTS model is
+        lazy-initialized and per-speaker temp WAVs are materialized once
+        across the loop.
         ``expressiveness`` carries the M4 Chatterbox knobs derived from
         the source segment's prosody. Default is the no-knobs profile —
         lets Chatterbox use its own defaults — so callers that don't yet
         derive prosody (e.g. ``revoice``) keep pre-M4 behaviour.
         """
-        from videopython.base.audio import Audio as _Audio
-        tts_cache_key: str | None = None
-        if self._cache is not None:
-            tts_cache_key = DubCache.tts_key(
-                translated_text=segment.translated_text,
-                voice_sample_bytes=speaker_bytes,
-                language=target_lang,
-                **expressiveness.as_kwargs(),
-            )
-            cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
-            if cached_path is not None:
-                return _Audio.from_path(cached_path)
-        # Cache miss: pay for TTS init + voice-sample WAV exactly once
-        # across the loop. Both are wasted work when every segment hits.
         if self._tts is None or self._tts_language != target_lang:
             self._init_tts(language=target_lang)
             self._tts_language = target_lang
@@ -311,7 +262,7 @@ class LocalDubbingPipeline:
         wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
         try:
-            dubbed_audio = self._tts.generate_audio(
+            return self._tts.generate_audio(
                 segment.translated_text,
                 voice_sample_path=wav_path,
                 **expressiveness.as_kwargs(),
@@ -329,39 +280,19 @@ class LocalDubbingPipeline:
             )
             return None
-        if self._cache is not None and tts_cache_key is not None:
-            dubbed_audio.save(self._cache.reserve_tts_path(src_hash_for_tts, tts_cache_key))
-        return dubbed_audio
-    def _translate_with_cache(
+    def _translate(
         self,
         transcription: Transcription,
-        source_audio: Audio,
         source_lang: str,
         target_lang: str,
         report_progress: Callable[[str, float], None],
     ) -> tuple[list[TranslatedSegment], list[int]]:
-        """Run translation with cache-around-the-call.
+        """Translate the transcription's segments into ``target_lang``.
-        Returns ``(translated_segments, translation_failures)``. Only
-        fully-successful translations are cached — partial Qwen failures
-        would otherwise lock in an incomplete dub across runs. The
+        Returns ``(translated_segments, translation_failures)``. The
         progress callback maps the backend's [0, 1] fraction onto the
         pipeline's translation window (0.35 → 0.50).
         """
-        from videopython.ai.dubbing.models import TranslatedSegment
-        cache_key: str | None = None
-        if self._cache is not None:
-            cache_key = DubCache.translation_key(
-                source_lang=source_lang,
-                target_lang=target_lang,
-                translator_class=self._resolved_translator_class_name(source_lang, target_lang),
-            )
-            cached = self._cache.get_translation(DubCache.source_key(source_audio), cache_key)
-            if cached is not None:
-                return [TranslatedSegment.from_dict(d) for d in cached], []
         if self._translator is None:
             self._init_translator(source_lang=source_lang, target_lang=target_lang)
@@ -384,31 +315,8 @@ class LocalDubbingPipeline:
         translation_failures = list(self._translator.translation_failures)
         self._maybe_unload("_translator")
-        if self._cache is not None and cache_key is not None and not translation_failures:
-            self._cache.put_translation(
-                DubCache.source_key(source_audio),
-                cache_key,
-                [s.to_dict() for s in translated_segments],
-            )
         return translated_segments, translation_failures
-    def _transcription_cache_keys(self, source_audio: Audio, enable_diarization: bool = False) -> tuple[str, str]:
-        """Return ``(src_hash, kwargs_hash)`` for the current transcription config.
-        Centralizes the kwarg list so the cache lookup, the put, and any
-        future invalidator agree on what's hashed.
-        """
-        src_hash = DubCache.source_key(source_audio)
-        kwargs_hash = DubCache.transcription_kwargs_hash(
-            whisper_model=self.whisper_model,
-            enable_diarization=enable_diarization,
-            condition_on_previous_text=self.condition_on_previous_text,
-            no_speech_threshold=self.no_speech_threshold,
-            logprob_threshold=self.logprob_threshold,
-        )
-        return src_hash, kwargs_hash
     def _init_transcriber(self, enable_diarization: bool = False) -> None:
         """Initialize the transcription model."""
         from videopython.ai.understanding.audio import AudioToText
@@ -420,6 +328,7 @@ class LocalDubbingPipeline:
             condition_on_previous_text=self.condition_on_previous_text,
             no_speech_threshold=self.no_speech_threshold,
             logprob_threshold=self.logprob_threshold,
+            vocabulary=self.vocabulary,
         )
     def _init_translator(self, source_lang: str, target_lang: str) -> None:
@@ -439,31 +348,6 @@ class LocalDubbingPipeline:
         else:  # "auto"
             self._translator = self._resolve_translator_auto(source_lang, target_lang)
-    def _resolved_translator_class_name(self, source_lang: str, target_lang: str) -> str:
-        """Return the *class name* of the translator that ``_init_translator``
-        would pick — without constructing one.
-        Used by the cache to key translations on the resolved backend rather
-        than the user-supplied ``"auto"``: a CPU run that resolves to Marian
-        must not collide with a GPU run that resolves to Qwen.
-        """
-        if self.translator == "marian":
-            return "MarianTranslator"
-        if self.translator == "qwen3":
-            return "Qwen3Translator"
-        # auto — mirror _resolve_translator_auto's branching, no construction.
-        device = select_device(self.device, mps_allowed=True)
-        has_gpu = device in ("cuda", "mps")
-        if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
-            return "Qwen3Translator"
-        if MarianTranslator.has_model_for(source_lang, target_lang):
-            return "MarianTranslator"
-        if Qwen3Translator.supports(source_lang, target_lang):
-            return "Qwen3Translator"
-        # No backend supports the pair — _init_translator will raise. We
-        # return a sentinel; the cache miss path will pay that cost.
-        return "Unsupported"
     def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
         """Pick a backend based on language coverage AND device.
@@ -723,7 +607,7 @@ class LocalDubbingPipeline:
                 )
         else:
             report_progress("Transcribing audio", 0.05)
-            transcription = self._transcribe_with_cache(source_audio, enable_diarization)
+            transcription = self._transcribe(source_audio, enable_diarization)
         if not transcription.segments:
             return DubbingResult(
@@ -791,8 +675,8 @@ class LocalDubbingPipeline:
             voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
         report_progress("Translating text", 0.35)
-        translated_segments, translation_failures = self._translate_with_cache(
-            transcription, source_audio, detected_lang, target_lang, report_progress
+        translated_segments, translation_failures = self._translate(
+            transcription, detected_lang, target_lang, report_progress
         )
         # Per-segment expressiveness derived from source vocals RMS.
@@ -818,21 +702,12 @@ class LocalDubbingPipeline:
         report_progress("Generating dubbed speech", 0.50)
-        # Per-speaker voice-sample bytes for TTS cache key. Empty when
-        # voice_clone=False — the cache key still differentiates "no voice
-        # sample" from "specific clone" via the None path.
-        voice_sample_bytes: dict[str, bytes] = (
-            {speaker: sample.data.tobytes() for speaker, sample in voice_samples.items()} if voice_clone else {}
-        )
-        src_hash_for_tts = DubCache.source_key(source_audio) if self._cache is not None else ""
         dubbed_segments: list[Audio] = []
         target_durations: list[float] = []
         start_times: list[float] = []
-        # Per-speaker temp WAVs are materialized lazily by _tts_segment_audio
-        # so a fully-cached run never writes one. The dict is loop-scoped
-        # state so the finally block can clean up regardless of cache outcome.
+        # Per-speaker temp WAVs are materialized lazily by _tts_segment_audio.
+        # The dict is loop-scoped state so the finally block can clean up.
         speaker_wav_paths: dict[str, Path] = {}
         try:
             for i, segment in enumerate(translated_segments):
@@ -852,12 +727,10 @@ class LocalDubbingPipeline:
                 dubbed_audio = self._tts_segment_audio(
                     segment=segment,
                     speaker=speaker,
-                    speaker_bytes=voice_sample_bytes.get(speaker),
                     target_lang=target_lang,
                     voice_clone=voice_clone,
                     voice_samples=voice_samples,
                     speaker_wav_paths=speaker_wav_paths,
-                    src_hash_for_tts=src_hash_for_tts,
                     expressiveness=expressiveness_per_segment[i],
                 )
                 if dubbed_audio is None:

{videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/transforms.py RENAMED Viewed

@@ -54,7 +54,6 @@ class FaceTrackingCrop(Transformation):
         vertical_offset: float = -0.1,
         framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = "offset",
         headroom: float = 0.15,
-        lead_room: float = 0.1,
         smoothing: float = 0.8,
         max_speed: float | None = None,
         fallback: Literal["center", "last_position", "full_frame"] = "last_position",
@@ -77,7 +76,6 @@ class FaceTrackingCrop(Transformation):
                 - "thirds": Place face near the upper-third line.
                 - "dynamic": Currently same as "headroom".
             headroom: Headroom amount for framing rules that use it.
-            lead_room: Reserved for future motion/look-direction framing.
             smoothing: Position smoothing factor (0-1, higher = smoother).
             max_speed: Optional max camera movement per frame (normalized).
             fallback: Behavior when no face detected.
@@ -92,7 +90,6 @@ class FaceTrackingCrop(Transformation):
         self.vertical_offset = vertical_offset
         self.framing_rule = framing_rule
         self.headroom = headroom
-        self.lead_room = lead_room
         self.smoothing = smoothing
         self.max_speed = max_speed
         self.fallback = fallback
@@ -238,10 +235,15 @@ class FaceTrackingCrop(Transformation):
         current_position = (0.5, 0.5)
         framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
-        print(
-            "Face tracking crop: "
-            f"{w}x{h} -> {out_w}x{out_h} "
-            f"({self.target_aspect[0]}:{self.target_aspect[1]}, framing={framing_label})"
+        logger.info(
+            "Face tracking crop: %dx%d -> %dx%d (%d:%d, framing=%s)",
+            w,
+            h,
+            out_w,
+            out_h,
+            self.target_aspect[0],
+            self.target_aspect[1],
+            framing_label,
         )
         new_frames = []
@@ -448,7 +450,7 @@ class SplitScreenComposite(Transformation):
             for _ in range(len(cell_rects))
         ]
-        print(f"Creating {self.layout} split screen: {out_w}x{out_h}")
+        logger.info("Creating %s split screen: %dx%d", self.layout, out_w, out_h)
         new_frames = []
         for i in tqdm(range(n_frames), desc="Split screen composite"):

videopython 0.29.0__tar.gz → 0.30.0__tar.gz

videopython 0.29.0__tar.gz → 0.30.0__tar.gz