PyPI - videopython - Versions diffs - 0.33.0__tar.gz → 0.33.2__tar.gz - Mend

videopython 0.33.0__tar.gz → 0.33.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{videopython-0.33.0 → videopython-0.33.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.33.0
+Version: 0.33.2
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.33.0 → videopython-0.33.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.33.0"
+version = "0.33.2"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -137,6 +137,9 @@ Documentation = "https://videopython.com"
 [tool.mypy]
 mypy_path = "src/stubs"
 plugins = ["pydantic.mypy"]
+warn_unused_ignores = true
+warn_redundant_casts = true
+disallow_any_generics = true
 [[tool.mypy.overrides]]
 module = [

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/audio.py RENAMED Viewed

@@ -32,8 +32,8 @@ class TextToSpeech:
         self.language = language
         self._model: Any = None
-    def _init_model(self) -> None:
-        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
+    def _init_local(self) -> None:
+        from chatterbox.mtl_tts import ChatterboxMultilingualTTS
         requested_device = self.device
         device = select_device(self.device, mps_allowed=False)
@@ -83,7 +83,7 @@ class TextToSpeech:
         import numpy as np
         if self._model is None:
-            self._init_model()
+            self._init_local()
         speaker_wav_path: Path | None = None
         cleanup_path = False
@@ -149,7 +149,6 @@ class TextToMusic:
         self.device = device
         self._processor: Any = None
         self._model: Any = None
-        self._device: str | None = None
     def _init_local(self) -> None:
         """Initialize local MusicGen model."""
@@ -160,17 +159,17 @@ class TextToMusic:
         os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
         requested_device = self.device
-        self._device = select_device(self.device, mps_allowed=True)
+        device = select_device(self.device, mps_allowed=True)
         model_name = "facebook/musicgen-small"
         self._processor = AutoProcessor.from_pretrained(model_name)
         self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
-        self._model.to(self._device)
-        self.device = self._device
+        self._model.to(device)
+        self.device = device
         log_device_initialization(
             "TextToMusic",
             requested_device=requested_device,
-            resolved_device=self._device,
+            resolved_device=device,
         )
     def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
@@ -179,7 +178,7 @@ class TextToMusic:
             self._init_local()
         inputs = self._processor(text=[text], padding=True, return_tensors="pt")
-        inputs = {k: v.to(self._device) if hasattr(v, "to") else v for k, v in inputs.items()}
+        inputs = {k: v.to(self.device) if hasattr(v, "to") else v for k, v in inputs.items()}
         audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
         sampling_rate = self._model.config.audio_encoder.sampling_rate
@@ -193,3 +192,9 @@ class TextToMusic:
             frame_count=len(audio_data),
         )
         return Audio(audio_data, metadata)
+    def unload(self) -> None:
+        """Release the MusicGen model so the next generate_audio() re-initializes."""
+        self._model = None
+        self._processor = None
+        release_device_memory(self.device)

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/image.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import Any
 from PIL import Image
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 class TextToImage:
@@ -49,3 +49,8 @@ class TextToImage:
         if self._pipeline is None:
             self._init_local()
         return self._pipeline(prompt=prompt).images[0]
+    def unload(self) -> None:
+        """Release the diffusion pipeline so the next generate_image() re-initializes."""
+        self._pipeline = None
+        release_device_memory(self.device)

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/translation.py RENAMED Viewed

@@ -170,7 +170,7 @@ class MarianTranslator:
         return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
     def _init_local(self, source_lang: str, target_lang: str) -> None:
-        from transformers import MarianMTModel, MarianTokenizer  # type: ignore[attr-defined]
+        from transformers import MarianMTModel, MarianTokenizer
         model_name = self._get_local_model_name(source_lang, target_lang)
@@ -181,7 +181,7 @@ class MarianTranslator:
         self._model = MarianMTModel.from_pretrained(model_name).to(device)
         self.device = device
         log_device_initialization(
-            "TextTranslator",
+            "MarianTranslator",
             requested_device=requested_device,
             resolved_device=device,
         )

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/video.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any
 import numpy as np
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.video import Video
 if TYPE_CHECKING:
@@ -29,22 +29,21 @@ class TextToVideo:
     def __init__(self, device: str | None = None):
         self.device = device
         self._pipeline: Any = None
-        self._device: str | None = None
     def _init_local(self) -> None:
         from diffusers import CogVideoXPipeline
         requested_device = self.device
-        self._device, dtype = _get_torch_device_and_dtype(self.device)
+        device, dtype = _get_torch_device_and_dtype(self.device)
         model_name = "THUDM/CogVideoX1.5-5B"
         self._pipeline = CogVideoXPipeline.from_pretrained(model_name, torch_dtype=dtype)
-        self._pipeline.to(self._device)
-        self.device = self._device
+        self._pipeline.to(device)
+        self.device = device
         log_device_initialization(
             "TextToVideo",
             requested_device=requested_device,
-            resolved_device=self._device,
+            resolved_device=device,
         )
     def generate_video(
@@ -65,11 +64,16 @@ class TextToVideo:
             num_inference_steps=num_steps,
             num_frames=num_frames,
             guidance_scale=guidance_scale,
-            generator=torch.Generator(device=self._device).manual_seed(42),
+            generator=torch.Generator(device=self.device).manual_seed(42),
         ).frames[0]
         video_frames = np.asarray(video_frames, dtype=np.uint8)
         return Video.from_frames(video_frames, fps=16.0)
+    def unload(self) -> None:
+        """Release the diffusion pipeline so the next generate_video() re-initializes."""
+        self._pipeline = None
+        release_device_memory(self.device)
 class ImageToVideo:
     """Generates videos from static images using local video diffusion."""
@@ -77,22 +81,21 @@ class ImageToVideo:
     def __init__(self, device: str | None = None):
         self.device = device
         self._pipeline: Any = None
-        self._device: str | None = None
     def _init_local(self) -> None:
         from diffusers import CogVideoXImageToVideoPipeline
         requested_device = self.device
-        self._device, dtype = _get_torch_device_and_dtype(self.device)
+        device, dtype = _get_torch_device_and_dtype(self.device)
         model_name = "THUDM/CogVideoX1.5-5B-I2V"
         self._pipeline = CogVideoXImageToVideoPipeline.from_pretrained(model_name, torch_dtype=dtype)
-        self._pipeline.to(self._device)
-        self.device = self._device
+        self._pipeline.to(device)
+        self.device = device
         log_device_initialization(
             "ImageToVideo",
             requested_device=requested_device,
-            resolved_device=self._device,
+            resolved_device=device,
         )
     def generate_video(
@@ -115,7 +118,12 @@ class ImageToVideo:
             num_inference_steps=num_steps,
             num_frames=num_frames,
             guidance_scale=guidance_scale,
-            generator=torch.Generator(device=self._device).manual_seed(42),
+            generator=torch.Generator(device=self.device).manual_seed(42),
         ).frames[0]
         video_frames = np.asarray(video_frames, dtype=np.uint8)
         return Video.from_frames(video_frames, fps=16.0)
+    def unload(self) -> None:
+        """Release the diffusion pipeline so the next generate_video() re-initializes."""
+        self._pipeline = None
+        release_device_memory(self.device)

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/audio.py RENAMED Viewed

@@ -188,7 +188,7 @@ class AudioToText:
     def _init_diarization(self) -> None:
         """Initialize pyannote speaker diarization pipeline."""
         import torch
-        from pyannote.audio import Pipeline  # type: ignore[import-untyped]
+        from pyannote.audio import Pipeline
         self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
         self._diarization_pipeline.to(torch.device(self.device))
@@ -214,7 +214,7 @@ class AudioToText:
         self._vad_model = None
         release_device_memory(self.device)
-    def _process_transcription_result(self, transcription_result: dict) -> Transcription:
+    def _process_transcription_result(self, transcription_result: dict[str, Any]) -> Transcription:
         """Process raw transcription result into a Transcription object."""
         transcription_segments = []
         for segment in transcription_result["segments"]:
@@ -520,6 +520,15 @@ class AudioClassifier:
         self._labels = [self._model.config.id2label[i] for i in range(len(self._model.config.id2label))]
+    def unload(self) -> None:
+        """Release the AST model so the next classify() re-initializes.
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        self._processor = None
+        release_device_memory(self.device)
     def _merge_events(self, events: list[AudioEvent], gap_threshold: float = 0.5) -> list[AudioEvent]:
         """Merge consecutive events of the same class."""
         if not events:

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/faces.py RENAMED Viewed

@@ -237,7 +237,7 @@ class FaceTracker:
     def _select_face(
         self,
-        faces: list,
+        faces: list[DetectedFace],
         frame_width: int,
         frame_height: int,
     ) -> tuple[float, float, float, float] | None:
@@ -251,29 +251,24 @@ class FaceTracker:
         Returns:
             Tuple of (center_x, center_y, width, height) in normalized coords, or None.
         """
-        if not faces:
+        faces_with_box = [(f, f.bounding_box) for f in faces if f.bounding_box is not None]
+        if not faces_with_box:
             return None
         if self.selection_strategy == "largest":
-            face = faces[0]
+            _, bbox = faces_with_box[0]
         elif self.selection_strategy == "centered":
             frame_center = (0.5, 0.5)
-            face = min(
-                faces,
-                key=lambda f: (
-                    (f.bounding_box.center[0] - frame_center[0]) ** 2
-                    + (f.bounding_box.center[1] - frame_center[1]) ** 2
-                ),
+            _, bbox = min(
+                faces_with_box,
+                key=lambda fb: ((fb[1].center[0] - frame_center[0]) ** 2 + (fb[1].center[1] - frame_center[1]) ** 2),
             )
         elif self.selection_strategy == "index":
-            if self.face_index < len(faces):
-                face = faces[self.face_index]
-            else:
-                face = faces[0]
+            idx = self.face_index if self.face_index < len(faces_with_box) else 0
+            _, bbox = faces_with_box[idx]
         else:
-            face = faces[0]
+            _, bbox = faces_with_box[0]
-        bbox = face.bounding_box
         return (bbox.center[0], bbox.center[1], bbox.width, bbox.height)
     def detect_and_track(
@@ -407,7 +402,7 @@ class FaceTracker:
         sampled_frames = [frames[i] for i in sample_indices]
-        sampled_detections: list[list] = []
+        sampled_detections: list[list[DetectedFace]] = []
         for batch_start in range(0, len(sampled_frames), self.batch_size):
             batch_end = min(batch_start + self.batch_size, len(sampled_frames))
             batch = sampled_frames[batch_start:batch_end]

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/image.py RENAMED Viewed

@@ -11,7 +11,7 @@ from typing import Any, Literal
 import numpy as np
 from PIL import Image
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.description import SceneDescription
 logger = logging.getLogger(__name__)
@@ -151,7 +151,7 @@ class SceneVLM:
     def _init_local(self) -> None:
         """Initialize local Qwen3.5 model."""
         import torch
-        from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore[attr-defined]
+        from transformers import AutoModelForImageTextToText, AutoProcessor
         t0 = time.perf_counter()
         requested_device = self.device
@@ -190,16 +190,7 @@ class SceneVLM:
         """
         self._model = None
         self._processor = None
-        try:
-            import gc
-            import torch
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-        except ImportError:
-            pass
+        release_device_memory(self.device)
     def _downscale_image(self, img: Image.Image) -> Image.Image:
         """Downscale image to fit within max_image_pixels budget, preserving aspect ratio."""
@@ -284,7 +275,7 @@ class SceneVLM:
     def _generate_from_message_batch(self, messages_batch: list[list[dict[str, Any]]]) -> list[str]:
         """Run batch generation for one or more multimodal chat messages."""
         import torch
-        from qwen_vl_utils import process_vision_info  # type: ignore
+        from qwen_vl_utils import process_vision_info
         if self._model is None:
             self._init_local()

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/temporal.py RENAMED Viewed

@@ -9,7 +9,7 @@ from __future__ import annotations
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.description import SceneBoundary
 if TYPE_CHECKING:
@@ -56,26 +56,32 @@ class SemanticSceneDetector:
         self.threshold = threshold
         self.min_scene_length = min_scene_length
-        self._device: str | None = device
+        self.device: str | None = device
         self._model: Any = None
-    def _load_model(self) -> None:
+    def _init_local(self) -> None:
         """Load the TransNetV2 model with pretrained weights."""
         if self._model is not None:
             return
         from transnetv2_pytorch import TransNetV2
-        requested_device = self._device
-        device = select_device(self._device, mps_allowed=True)
+        requested_device = self.device
+        device = select_device(self.device, mps_allowed=True)
         log_device_initialization(
             "SemanticSceneDetector",
             requested_device=requested_device,
             resolved_device=device,
         )
+        self.device = device
         self._model = TransNetV2(device=device)
         self._model.eval()
+    def unload(self) -> None:
+        """Release the TransNetV2 model so the next call re-initializes."""
+        self._model = None
+        release_device_memory(self.device)
     def detect(self, video: Video) -> list[SceneBoundary]:
         """Detect scenes in a video using ML-based boundary detection.
@@ -114,7 +120,7 @@ class SemanticSceneDetector:
         Returns:
             List of SceneBoundary objects representing detected scenes.
         """
-        self._load_model()
+        self._init_local()
         # Use TransNetV2's detect_scenes which handles everything internally
         raw_scenes = self._model.detect_scenes(str(path), threshold=self.threshold)

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/audio/audio.py RENAMED Viewed

@@ -5,7 +5,7 @@ import subprocess
 import wave
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 import numpy as np
@@ -69,7 +69,7 @@ class Audio:
         return bool(np.all(np.abs(self.data) < 1e-7))
     @staticmethod
-    def _get_ffmpeg_info(file_path: Path) -> dict:
+    def _get_ffmpeg_info(file_path: Path) -> dict[str, Any]:
         """Get audio metadata using ffprobe"""
         try:
             info = _ffmpeg.probe(file_path)
@@ -483,7 +483,7 @@ class Audio:
         if first.metadata.channels == 1:
             output = np.zeros(total_samples, dtype=np.float32)
         else:
-            output = np.zeros((total_samples, 2), dtype=np.float32)  # type: ignore
+            output = np.zeros((total_samples, 2), dtype=np.float32)
         # Copy non-crossfaded portions
         crossfade_start = len(first.data) - crossfade_samples
@@ -761,7 +761,7 @@ class Audio:
         if base.metadata.channels == 1:
             output = np.zeros(total_length, dtype=np.float32)
         else:
-            output = np.zeros((total_length, 2), dtype=np.float32)  # type: ignore
+            output = np.zeros((total_length, 2), dtype=np.float32)
         # Copy base audio
         output[: len(base.data)] = base.data

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/_ffmpeg.py RENAMED Viewed

@@ -13,7 +13,7 @@ import json
 import subprocess
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Iterator, Sequence
+from typing import Any, Iterator, Sequence
 from videopython.base.exceptions import FFmpegProbeError, FFmpegRunError
@@ -44,7 +44,7 @@ def run(cmd: Sequence[str], *, stdin: bytes | None = None) -> bytes:
     return result.stdout
-def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict:
+def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict[str, Any]:
     """Run ffprobe and return the parsed JSON payload.
     Args:
@@ -76,7 +76,7 @@ def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict:
         raise FFmpegProbeError(f"Error parsing ffprobe output: {e}") from e
-def _terminate(proc: subprocess.Popen, *, timeout: float = 5) -> None:
+def _terminate(proc: subprocess.Popen[bytes], *, timeout: float = 5) -> None:
     """Terminate a still-running process, escalating to kill after ``timeout``."""
     if proc.poll() is None:
         proc.terminate()
@@ -88,7 +88,7 @@ def _terminate(proc: subprocess.Popen, *, timeout: float = 5) -> None:
 @contextmanager
-def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subprocess.Popen]:
+def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subprocess.Popen[bytes]]:
     """Context manager wrapping an ffmpeg decode process.
     Yields a Popen with ``stdout=PIPE`` and ``stderr=DEVNULL``. Callers
@@ -116,7 +116,7 @@ def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subproces
 @contextmanager
-def popen_encode(cmd: Sequence[str]) -> Iterator[subprocess.Popen]:
+def popen_encode(cmd: Sequence[str]) -> Iterator[subprocess.Popen[bytes]]:
     """Context manager wrapping an ffmpeg encode process via stdin pipe.
     Yields a Popen with ``stdin=PIPE``, ``stdout=DEVNULL``, and

{videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/_video_io.py RENAMED Viewed

@@ -173,7 +173,7 @@ def decode_video(
         if frames_read == 0:
             raise ValueError("No frames were read from the video")
-        frames = frames[:frames_read]  # type: ignore
+        frames = frames[:frames_read]
         try:
             audio = Audio.from_path(path)

videopython 0.33.0__tar.gz → 0.33.2__tar.gz

videopython 0.33.0__tar.gz → 0.33.2__tar.gz