videopython 0.28.3__tar.gz → 0.29.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.28.3 → videopython-0.29.0}/PKG-INFO +7 -4
- {videopython-0.28.3 → videopython-0.29.0}/README.md +4 -2
- {videopython-0.28.3 → videopython-0.29.0}/pyproject.toml +13 -4
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/__init__.py +3 -5
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/transforms.py +2 -478
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/__init__.py +3 -3
- videopython-0.29.0/src/videopython/ai/understanding/faces.py +592 -0
- videopython-0.29.0/src/videopython/ai/understanding/image.py +397 -0
- videopython-0.29.0/src/videopython/ai/understanding/temporal.py +218 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/video_analysis.py +217 -37
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/__init__.py +4 -2
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/description.py +82 -52
- videopython-0.28.3/src/videopython/ai/understanding/image.py +0 -215
- videopython-0.28.3/src/videopython/ai/understanding/temporal.py +0 -464
- {videopython-0.28.3 → videopython-0.29.0}/.gitignore +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/LICENSE +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/cache.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/registry.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/combine.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/effects.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/progress.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/registry.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/streaming.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/transforms.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/transitions.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/utils.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/video.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/py.typed +0 -0
{videopython-0.28.3 → videopython-0.29.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.28.3
+Version: 0.29.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -27,14 +27,15 @@ Requires-Dist: accelerate>=0.29.2; extra == 'ai'
 Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
 Requires-Dist: demucs>=4.0.0; extra == 'ai'
 Requires-Dist: diffusers>=0.30.0; extra == 'ai'
-Requires-Dist: easyocr>=1.7.0; extra == 'ai'
 Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
+Requires-Dist: imagehash>=4.3; extra == 'ai'
 Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
 Requires-Dist: numba>=0.61.0; extra == 'ai'
 Requires-Dist: ollama>=0.4.5; extra == 'ai'
 Requires-Dist: openai-whisper>=20240930; extra == 'ai'
 Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
 Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
+Requires-Dist: qwen-vl-utils>=0.0.10; extra == 'ai'
 Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
 Requires-Dist: scipy>=1.10.0; extra == 'ai'
 Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
@@ -56,6 +57,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
 
 Full documentation: [videopython.com](https://videopython.com)
 
+> **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
+
 ## Installation
 
 ### 1. Install FFmpeg
@@ -193,10 +196,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
 | Area | Highlights |
 |---|---|
 | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
-| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `
+| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
 | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
 | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
-| **Transforms** | `
+| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
 | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
 | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
 

{videopython-0.28.3 → videopython-0.29.0}/README.md

@@ -8,6 +8,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
 
 Full documentation: [videopython.com](https://videopython.com)
 
+> **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
+
 ## Installation
 
 ### 1. Install FFmpeg
@@ -145,10 +147,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
 | Area | Highlights |
 |---|---|
 | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
-| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `
+| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
 | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
 | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
-| **Transforms** | `
+| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
 | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
 | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
 
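The capability rows above point at the reorganized face-tracking API. Below is a minimal usage sketch based only on what this diff shows (the top-level re-export in `videopython/ai/__init__.py` and the `track_video` example carried over in the relocated `FaceTracker` docstring); the dummy frames array and its shape are placeholders, not library defaults.

```python
# Sketch assuming the 0.29.0 exports shown later in this diff; the zeros array
# stands in for real video frames of shape (N, H, W, 3).
import numpy as np

from videopython.ai import FaceTracker  # still re-exported at package level in 0.29.0

frames = np.zeros((120, 720, 1280, 3), dtype=np.uint8)  # placeholder frames

# From the FaceTracker docstring: detect on every 5th frame (GPU) and interpolate between.
tracker = FaceTracker(backend="gpu", sample_rate=5)
positions = tracker.track_video(frames)  # per-frame (cx, cy, w, h) in normalized coords, or None
```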

{videopython-0.28.3 → videopython-0.29.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.28.3"
+version = "0.29.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -70,7 +70,6 @@ ai = [
     "scikit-learn>=1.3.0",
     # Detection backends
     "ultralytics>=8.0.0",
-    "easyocr>=1.7.0",
     # Audio classification (AST via transformers - no separate dep needed)
     # Scene detection
     "transnetv2-pytorch>=1.0.5",
@@ -84,6 +83,11 @@ ai = [
     "llama-cpp-python>=0.3.0",
     # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
     "pyloudnorm>=0.1.1",
+    # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
+    # for AutoModelForImageTextToText with image/video chat templates.
+    "qwen-vl-utils>=0.0.10",
+    # Perceptual hashing for SceneVLM frame dedup (M5)
+    "imagehash>=4.3",
 ]
 
 # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
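The comments above explain why `imagehash` enters the dependency set: perceptual hashing lets SceneVLM skip near-duplicate frames before sending them to the vision-language model. The snippet below is only an illustrative sketch of that idea using the public `imagehash` API; videopython's actual dedup logic lives in `src/videopython/ai/understanding/image.py` and may differ.

```python
# Illustrative only - not videopython's implementation. Hash each frame with a
# perceptual hash and keep it only if it differs enough from the last kept frame.
import imagehash
import numpy as np
from PIL import Image


def dedup_frames(frames: list[np.ndarray], max_hamming: int = 4) -> list[np.ndarray]:
    kept: list[np.ndarray] = []
    last_hash: imagehash.ImageHash | None = None
    for frame in frames:
        frame_hash = imagehash.phash(Image.fromarray(frame))
        # Subtracting two ImageHash objects yields their Hamming distance.
        if last_hash is None or (frame_hash - last_hash) > max_hamming:
            kept.append(frame)
            last_hash = frame_hash
    return kept
```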
@@ -105,7 +109,6 @@ ai = [
     "scikit-learn>=1.3.0",
     # Detection backends
     "ultralytics>=8.0.0",
-    "easyocr>=1.7.0",
     # Audio classification (AST via transformers - no separate dep needed)
     # Scene detection
     "transnetv2-pytorch>=1.0.5",
@@ -119,6 +122,11 @@ ai = [
     "llama-cpp-python>=0.3.0",
     # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
     "pyloudnorm>=0.1.1",
+    # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
+    # for AutoModelForImageTextToText with image/video chat templates.
+    "qwen-vl-utils>=0.0.10",
+    # Perceptual hashing for SceneVLM frame dedup (M5)
+    "imagehash>=4.3",
 ]
 
 [project.urls]
@@ -135,7 +143,6 @@ module = [
     "diffusers", "diffusers.*",
     "ollama", "ollama.*",
     "ultralytics", "ultralytics.*",
-    "easyocr", "easyocr.*",
     "transformers", "transformers.*",
     "transnetv2_pytorch", "transnetv2_pytorch.*",
     "chatterbox", "chatterbox.*",
@@ -146,6 +153,8 @@ module = [
     "cv2", "cv2.*",
     "llama_cpp", "llama_cpp.*",
     "pyloudnorm", "pyloudnorm.*",
+    "qwen_vl_utils", "qwen_vl_utils.*",
+    "imagehash", "imagehash.*",
 ]
 ignore_missing_imports = true
 

{videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/__init__.py

@@ -2,11 +2,11 @@ from videopython.ai import registry as _ai_registry  # noqa: F401
 
 from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
 from .swapping import ObjectSwapper
-from .transforms import FaceTracker, FaceTrackingCrop, SplitScreenComposite
+from .transforms import FaceTrackingCrop, SplitScreenComposite
 from .understanding import (
-    ActionRecognizer,
     AudioClassifier,
     AudioToText,
+    FaceTracker,
     SceneVLM,
     SemanticSceneDetector,
 )
@@ -22,12 +22,10 @@ __all__ = [
     # Understanding
     "AudioToText",
     "AudioClassifier",
+    "FaceTracker",
     "SceneVLM",
-    # Temporal
-    "ActionRecognizer",
     "SemanticSceneDetector",
     # Transforms (AI-powered)
-    "FaceTracker",
     "FaceTrackingCrop",
     "SplitScreenComposite",
     # Swapping
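Net effect of the two `__init__.py` hunks above: `FaceTracker` moves from the transforms module to the new `understanding.faces` module, while `ActionRecognizer` disappears from the public exports entirely. A quick before/after of the import paths, inferred from these hunks (the package-level alias is unchanged in both versions):

```python
# 0.28.3: FaceTracker lived alongside the AI transforms.
# from videopython.ai.transforms import FaceTracker

# 0.29.0: FaceTracker is grouped with the understanding components.
from videopython.ai.understanding import FaceTracker

# The top-level re-export works in both versions, per the __all__ lists above.
from videopython.ai import FaceTracker  # noqa: F811
```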

{videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/transforms.py

@@ -3,20 +3,16 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Literal
 
 import cv2
 import numpy as np
 from tqdm import tqdm
 
-from videopython.ai._device import select_device
-from videopython.base.description import BoundingBox, DetectedFace
+from videopython.ai.understanding.faces import FaceTracker
 from videopython.base.transforms import Transformation
 from videopython.base.video import Video
 
-if TYPE_CHECKING:
-    pass
-
 logger = logging.getLogger(__name__)
 
 
@@ -25,484 +21,12 @@ def _make_even(value: int) -> int:
     return value - (value % 2)
 
 
-class _FaceDetectionBackend:
-    """Internal YOLOv8-face detector used by AI transforms."""
-
-    def __init__(
-        self,
-        confidence_threshold: float = 0.5,
-        min_face_size: int = 30,
-        backend: Literal["cpu", "gpu", "auto"] = "auto",
-    ):
-        self.confidence_threshold = confidence_threshold
-        self.min_face_size = min_face_size
-        self.backend: Literal["cpu", "gpu", "auto"] = backend
-        self._resolved_device: Literal["cpu", "cuda"] | None = None
-        self._yolo_model: Any = None
-
-    def _resolve_device(self) -> Literal["cpu", "cuda"]:
-        if self._resolved_device is not None:
-            return self._resolved_device
-
-        if self.backend == "cpu":
-            self._resolved_device = "cpu"
-            return self._resolved_device
-
-        if self.backend == "gpu":
-            resolved = select_device(None, mps_allowed=False)
-            if resolved != "cuda":
-                raise ValueError("GPU backend requested but CUDA is not available.")
-            self._resolved_device = "cuda"
-            return self._resolved_device
-
-        resolved_auto = select_device(None, mps_allowed=False)
-        self._resolved_device = "cuda" if resolved_auto == "cuda" else "cpu"
-        return self._resolved_device
-
-    def execution_device(self) -> Literal["cpu", "cuda"]:
-        """Resolved execution device for this backend."""
-        return self._resolve_device()
-
-    def _init_yolo_face(self) -> None:
-        from huggingface_hub import hf_hub_download
-        from ultralytics import YOLO
-
-        model_path = hf_hub_download(
-            repo_id="arnabdhar/YOLOv8-Face-Detection",
-            filename="model.pt",
-        )
-        self._yolo_model = YOLO(model_path)
-
-        device = self._resolve_device()
-        if device == "cuda":
-            self._yolo_model.to("cuda")
-
-    def _faces_from_yolo_result(self, result: Any) -> list[DetectedFace]:
-        detected_faces: list[DetectedFace] = []
-        boxes = result.boxes
-        if boxes is None:
-            return detected_faces
-
-        img_h, img_w = result.orig_shape
-        for i in range(len(boxes)):
-            x1, y1, x2, y2 = boxes.xyxy[i].tolist()
-            conf = float(boxes.conf[i])
-
-            face_w = x2 - x1
-            face_h = y2 - y1
-            if face_w < self.min_face_size or face_h < self.min_face_size:
-                continue
-
-            detected_faces.append(
-                DetectedFace(
-                    bounding_box=BoundingBox(
-                        x=x1 / img_w,
-                        y=y1 / img_h,
-                        width=face_w / img_w,
-                        height=face_h / img_h,
-                    ),
-                    confidence=conf,
-                )
-            )
-        detected_faces.sort(key=lambda f: f.area or 0, reverse=True)
-        return detected_faces
-
-    def detect(self, image: np.ndarray) -> list[DetectedFace]:
-        if self._yolo_model is None:
-            self._init_yolo_face()
-        assert self._yolo_model is not None
-
-        results = self._yolo_model(image, conf=self.confidence_threshold, verbose=False)
-        if not results:
-            return []
-        return self._faces_from_yolo_result(results[0])
-
-    def detect_batch(self, images: list[np.ndarray] | np.ndarray) -> list[list[DetectedFace]]:
-        if isinstance(images, np.ndarray):
-            images = [images[i] for i in range(images.shape[0])] if images.ndim == 4 else [images]
-        if not images:
-            return []
-
-        if self._yolo_model is None:
-            self._init_yolo_face()
-        assert self._yolo_model is not None
-
-        results = self._yolo_model(images, conf=self.confidence_threshold, verbose=False)
-        return [self._faces_from_yolo_result(result) for result in results]
-
-
 __all__ = [
-    "FaceTracker",
     "FaceTrackingCrop",
     "SplitScreenComposite",
 ]
 
 
-class FaceTracker:
-    """Utility for tracking faces across video frames with smoothing.
-
-    Provides frame-by-frame face detection with position smoothing using
-    exponential moving average to prevent jitter in the tracked position.
-
-    Supports GPU acceleration via YOLOv8-face model for significantly faster
-    detection, with optional frame sampling and interpolation for video.
-
-    Example:
-        >>> # Auto backend (default): resolves to GPU when available, else CPU
-        >>> tracker = FaceTracker()
-        >>> for i, frame in enumerate(frames):
-        ...     pos = tracker.detect_and_track(frame, i)
-        >>>
-        >>> # GPU tracking with frame sampling
-        >>> tracker = FaceTracker(backend="gpu", sample_rate=5)
-        >>> positions = tracker.track_video(frames)
-    """
-
-    def __init__(
-        self,
-        selection_strategy: Literal["largest", "centered", "index"] = "largest",
-        face_index: int = 0,
-        smoothing: float = 0.8,
-        detection_interval: int = 3,
-        min_face_size: int = 30,
-        backend: Literal["cpu", "gpu", "auto"] = "auto",
-        sample_rate: int = 1,
-        batch_size: int = 16,
-    ):
-        """Initialize face tracker.
-
-        Args:
-            selection_strategy: How to select which face to track.
-                - "largest": Track the face with the largest bounding box.
-                - "centered": Track the face closest to frame center.
-                - "index": Track the face at a specific index (sorted by area).
-            face_index: Index of face to track when using "index" strategy.
-            smoothing: Exponential moving average factor (0-1). Higher = smoother.
-            detection_interval: Run detection every N frames, interpolate between.
-            min_face_size: Minimum face size in pixels for detection.
-            backend: Detection backend - "cpu", "gpu", or "auto".
-            sample_rate: For GPU backend, detect every Nth frame and interpolate.
-                Only used by track_video(). Default 1 (every frame).
-            batch_size: Batch size for GPU detection. Default 16.
-        """
-        self.selection_strategy = selection_strategy
-        self.face_index = face_index
-        self.smoothing = smoothing
-        self.detection_interval = detection_interval
-        self.min_face_size = min_face_size
-        self.backend: Literal["cpu", "gpu", "auto"] = backend
-        self.sample_rate = sample_rate
-        self.batch_size = batch_size
-
-        self._detector: _FaceDetectionBackend | None = None
-        self._last_position: tuple[float, float] | None = None
-        self._last_size: tuple[float, float] | None = None
-        self._smoothed_position: tuple[float, float] | None = None
-        self._smoothed_size: tuple[float, float] | None = None
-        logger.info("FaceTracker initialized with backend=%s", self.backend)
-
-    def _init_detector(self) -> None:
-        """Initialize face detector lazily."""
-        self._detector = _FaceDetectionBackend(
-            min_face_size=self.min_face_size,
-            backend=self.backend,
-        )
-
-    def _select_face(
-        self,
-        faces: list,
-        frame_width: int,
-        frame_height: int,
-    ) -> tuple[float, float, float, float] | None:
-        """Select a face based on the configured strategy.
-
-        Args:
-            faces: List of DetectedFace objects.
-            frame_width: Width of the frame.
-            frame_height: Height of the frame.
-
-        Returns:
-            Tuple of (center_x, center_y, width, height) in normalized coords, or None.
-        """
-        if not faces:
-            return None
-
-        if self.selection_strategy == "largest":
-            # Faces are already sorted by area (largest first)
-            face = faces[0]
-        elif self.selection_strategy == "centered":
-            # Find face closest to center
-            frame_center = (0.5, 0.5)
-            face = min(
-                faces,
-                key=lambda f: (
-                    (f.bounding_box.center[0] - frame_center[0]) ** 2
-                    + (f.bounding_box.center[1] - frame_center[1]) ** 2
-                ),
-            )
-        elif self.selection_strategy == "index":
-            if self.face_index < len(faces):
-                face = faces[self.face_index]
-            else:
-                face = faces[0]  # Fall back to largest
-        else:
-            face = faces[0]
-
-        bbox = face.bounding_box
-        return (bbox.center[0], bbox.center[1], bbox.width, bbox.height)
-
-    def detect_and_track(
-        self,
-        frame: np.ndarray,
-        frame_index: int,
-    ) -> tuple[float, float, float, float] | None:
-        """Detect face in frame and return smoothed position.
-
-        Args:
-            frame: Video frame as numpy array (H, W, 3).
-            frame_index: Index of current frame.
-
-        Returns:
-            Tuple of (center_x, center_y, width, height) in normalized coords,
-            or None if no face detected and no fallback available.
-        """
-        if self._detector is None:
-            self._init_detector()
-        assert self._detector is not None
-
-        h, w = frame.shape[:2]
-
-        # Only run detection on interval frames
-        should_detect = frame_index % self.detection_interval == 0
-
-        if should_detect:
-            faces = self._detector.detect(frame)
-            face_info = self._select_face(faces, w, h)
-
-            if face_info:
-                cx, cy, fw, fh = face_info
-                self._last_position = (cx, cy)
-                self._last_size = (fw, fh)
-        else:
-            # Use last detected position
-            face_info = None
-            if self._last_position and self._last_size:
-                face_info = (*self._last_position, *self._last_size)
-
-        if face_info:
-            cx, cy, fw, fh = face_info
-
-            # Apply exponential moving average smoothing
-            if self._smoothed_position is None:
-                self._smoothed_position = (cx, cy)
-                self._smoothed_size = (fw, fh)
-            else:
-                alpha = 1 - self.smoothing
-                self._smoothed_position = (
-                    self._smoothed_position[0] * self.smoothing + cx * alpha,
-                    self._smoothed_position[1] * self.smoothing + cy * alpha,
-                )
-                assert self._smoothed_size is not None  # Set alongside _smoothed_position
-                self._smoothed_size = (
-                    self._smoothed_size[0] * self.smoothing + fw * alpha,
-                    self._smoothed_size[1] * self.smoothing + fh * alpha,
-                )
-
-            return (*self._smoothed_position, *self._smoothed_size)
-
-        # Return last smoothed position as fallback
-        if self._smoothed_position and self._smoothed_size:
-            return (*self._smoothed_position, *self._smoothed_size)
-
-        return None
-
-    def reset(self) -> None:
-        """Reset tracker state for a new video."""
-        self._last_position = None
-        self._last_size = None
-        self._smoothed_position = None
-        self._smoothed_size = None
-
-    @staticmethod
-    def _interpolate_bbox(
-        bbox1: tuple[float, float, float, float],
-        bbox2: tuple[float, float, float, float],
-        t: float,
-    ) -> tuple[float, float, float, float]:
-        """Linearly interpolate between two bounding boxes.
-
-        Args:
-            bbox1: First bounding box (cx, cy, w, h).
-            bbox2: Second bounding box (cx, cy, w, h).
-            t: Interpolation factor (0 = bbox1, 1 = bbox2).
-
-        Returns:
-            Interpolated bounding box (cx, cy, w, h).
-        """
-        return (
-            bbox1[0] + (bbox2[0] - bbox1[0]) * t,
-            bbox1[1] + (bbox2[1] - bbox1[1]) * t,
-            bbox1[2] + (bbox2[2] - bbox1[2]) * t,
-            bbox1[3] + (bbox2[3] - bbox1[3]) * t,
-        )
-
-    def track_video(
-        self,
-        frames: np.ndarray,
-    ) -> list[tuple[float, float, float, float] | None]:
-        """Track face through entire video using optimized batch detection.
-
-        This method is optimized for GPU backends with frame sampling and
-        interpolation for smooth tracking with reduced computation.
-
-        Args:
-            frames: Video frames array of shape (N, H, W, 3).
-
-        Returns:
-            List of face positions (cx, cy, w, h) for each frame, or None if
-            no face detected and no fallback available.
-        """
-        if self._detector is None:
-            self._init_detector()
-        assert self._detector is not None
-
-        n_frames = len(frames)
-        if n_frames == 0:
-            return []
-
-        h, w = frames[0].shape[:2]
-
-        execution_device_getter = getattr(self._detector, "execution_device", None)
-        if callable(execution_device_getter):
-            resolved = execution_device_getter()
-            backend_execution_device = resolved if resolved in {"cpu", "cuda"} else None
-        else:
-            backend_execution_device = None
-        if backend_execution_device is None:
-            backend_execution_device = "cuda" if self.backend == "gpu" else "cpu"
-
-        use_sampled_interpolation = self.sample_rate > 1 and backend_execution_device == "cuda"
-
-        # Determine which frames to sample
-        if use_sampled_interpolation:
-            sample_indices = list(range(0, n_frames, self.sample_rate))
-            # Ensure last frame is included
-            if sample_indices[-1] != n_frames - 1:
-                sample_indices.append(n_frames - 1)
-        else:
-            sample_indices = list(range(n_frames))
-
-        # Batch detect on sampled frames
-        sampled_frames = [frames[i] for i in sample_indices]
-
-        # Process in batches
-        sampled_detections: list[list] = []
-        for batch_start in range(0, len(sampled_frames), self.batch_size):
-            batch_end = min(batch_start + self.batch_size, len(sampled_frames))
-            batch = sampled_frames[batch_start:batch_end]
-            batch_results = self._detector.detect_batch(batch)
-            sampled_detections.extend(batch_results)
-
-        # Extract face info from detections
-        sampled_faces: list[tuple[float, float, float, float] | None] = []
-        for faces in sampled_detections:
-            face_info = self._select_face(faces, w, h)
-            sampled_faces.append(face_info)
-
-        # If no sampled interpolation, apply smoothing directly over detections.
-        if not use_sampled_interpolation:
-            self.reset()
-            results: list[tuple[float, float, float, float] | None] = []
-            for i, face_info in enumerate(sampled_faces):
-                if face_info:
-                    cx, cy, fw, fh = face_info
-                    self._last_position = (cx, cy)
-                    self._last_size = (fw, fh)
-
-                    if self._smoothed_position is None:
-                        self._smoothed_position = (cx, cy)
-                        self._smoothed_size = (fw, fh)
-                    else:
-                        alpha = 1 - self.smoothing
-                        self._smoothed_position = (
-                            self._smoothed_position[0] * self.smoothing + cx * alpha,
-                            self._smoothed_position[1] * self.smoothing + cy * alpha,
-                        )
-                        assert self._smoothed_size is not None
-                        self._smoothed_size = (
-                            self._smoothed_size[0] * self.smoothing + fw * alpha,
-                            self._smoothed_size[1] * self.smoothing + fh * alpha,
-                        )
-
-                    results.append((*self._smoothed_position, *self._smoothed_size))
-                elif self._smoothed_position and self._smoothed_size:
-                    results.append((*self._smoothed_position, *self._smoothed_size))
-                else:
-                    results.append(None)
-            return results
-
-        # Interpolate between sampled frames
-        all_positions: list[tuple[float, float, float, float] | None] = [None] * n_frames
-
-        # Fill in sampled positions
-        for idx, sample_idx in enumerate(sample_indices):
-            all_positions[sample_idx] = sampled_faces[idx]
-
-        # Interpolate gaps
-        for i in range(len(sample_indices) - 1):
-            start_idx = sample_indices[i]
-            end_idx = sample_indices[i + 1]
-            start_face = sampled_faces[i]
-            end_face = sampled_faces[i + 1]
-
-            if start_face is None and end_face is None:
-                continue
-            elif start_face is None:
-                # Use end face for all
-                for j in range(start_idx, end_idx):
-                    all_positions[j] = end_face
-            elif end_face is None:
-                # Use start face for all
-                for j in range(start_idx + 1, end_idx + 1):
-                    all_positions[j] = start_face
-            else:
-                # Interpolate
-                gap = end_idx - start_idx
-                for j in range(start_idx + 1, end_idx):
-                    t = (j - start_idx) / gap
-                    all_positions[j] = self._interpolate_bbox(start_face, end_face, t)
-
-        # Apply smoothing to interpolated positions
-        self.reset()
-        results = []
-        for face_info in all_positions:
-            if face_info:
-                cx, cy, fw, fh = face_info
-
-                if self._smoothed_position is None:
-                    self._smoothed_position = (cx, cy)
-                    self._smoothed_size = (fw, fh)
-                else:
-                    alpha = 1 - self.smoothing
-                    self._smoothed_position = (
-                        self._smoothed_position[0] * self.smoothing + cx * alpha,
-                        self._smoothed_position[1] * self.smoothing + cy * alpha,
-                    )
-                    assert self._smoothed_size is not None
-                    self._smoothed_size = (
-                        self._smoothed_size[0] * self.smoothing + fw * alpha,
-                        self._smoothed_size[1] * self.smoothing + fh * alpha,
-                    )
-
-                results.append((*self._smoothed_position, *self._smoothed_size))
-            elif self._smoothed_position and self._smoothed_size:
-                results.append((*self._smoothed_position, *self._smoothed_size))
-            else:
-                results.append(None)
-
-        return results
-
-
 class FaceTrackingCrop(Transformation):
     """Crops video to follow detected faces.
 
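The smoothing applied throughout the removed (now relocated) tracker is a plain exponential moving average over the `(cx, cy, w, h)` box. Below is a standalone restatement of that update rule with illustrative names; with the default `smoothing=0.8`, each new detection only moves the tracked box 20% of the way toward the raw detection, which is what suppresses jitter.

```python
# Restates the EMA update used in FaceTracker.detect_and_track / track_video above.
Box = tuple[float, float, float, float]  # (cx, cy, w, h) in normalized coordinates


def ema_update(smoothed: Box, detected: Box, smoothing: float = 0.8) -> Box:
    alpha = 1.0 - smoothing
    return (
        smoothed[0] * smoothing + detected[0] * alpha,
        smoothed[1] * smoothing + detected[1] * alpha,
        smoothed[2] * smoothing + detected[2] * alpha,
        smoothed[3] * smoothing + detected[3] * alpha,
    )


# A face center jumping from x=0.40 to x=0.50 only moves to x=0.42 in a single step.
print(ema_update((0.40, 0.50, 0.20, 0.30), (0.50, 0.50, 0.20, 0.30)))
```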

{videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/__init__.py

@@ -1,12 +1,12 @@
 from .audio import AudioClassifier, AudioToText
+from .faces import FaceTracker
 from .image import SceneVLM
-from .temporal import ActionRecognizer, SemanticSceneDetector
+from .temporal import SemanticSceneDetector
 
 __all__ = [
     "AudioToText",
     "AudioClassifier",
+    "FaceTracker",
     "SceneVLM",
-    # Temporal
-    "ActionRecognizer",
     "SemanticSceneDetector",
 ]