PyPI - videopython - Versions diffs - 0.36.1__tar.gz → 0.37.0__tar.gz - Mend

videopython 0.36.1tar.gz → 0.37.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

{videopython-0.36.1 → videopython-0.37.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.36.1
+Version: 0.37.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -118,7 +118,7 @@ See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/)
 - **`videopython.base`** — `Video`, `VideoMetadata`, `FrameIterator`, `ImageText`, `Transcription`, and shared result types (`BoundingBox`, `FaceTrack`, `SceneBoundary`, ...). No AI dependencies.
 - **`videopython.audio`** — `Audio` with overlay, concat, normalize, time-stretch, silence detection, segment classification.
 - **`videopython.editing`** — `Operation`/`Effect` foundation, `VideoEdit` plan runner with JSON Schema + streaming execution. Transforms (cut, resize, crop, fps, speed, reverse, freeze, silence removal) and effects (blur, zoom, color grading, vignette, Ken Burns, fade, overlays, animated subtitles).
-- **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `SemanticSceneDetector`), `FaceTrackingCrop` transform, and the full-pipeline `VideoAnalyzer`.
+- **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `ObjectDetector`, `SemanticSceneDetector`), the `FaceTrackingCrop` transform, the `ObjectDetectionOverlay` effect (per-frame bounding boxes + labels), and the full-pipeline `VideoAnalyzer`.
 - **`videopython.ai.dubbing`** — `VideoDubber` for voice-cloned revoicing with timing sync.
 ## Examples

{videopython-0.36.1 → videopython-0.37.0}/README.md RENAMED Viewed

@@ -69,7 +69,7 @@ See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/)
 - **`videopython.base`** — `Video`, `VideoMetadata`, `FrameIterator`, `ImageText`, `Transcription`, and shared result types (`BoundingBox`, `FaceTrack`, `SceneBoundary`, ...). No AI dependencies.
 - **`videopython.audio`** — `Audio` with overlay, concat, normalize, time-stretch, silence detection, segment classification.
 - **`videopython.editing`** — `Operation`/`Effect` foundation, `VideoEdit` plan runner with JSON Schema + streaming execution. Transforms (cut, resize, crop, fps, speed, reverse, freeze, silence removal) and effects (blur, zoom, color grading, vignette, Ken Burns, fade, overlays, animated subtitles).
-- **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `SemanticSceneDetector`), `FaceTrackingCrop` transform, and the full-pipeline `VideoAnalyzer`.
+- **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `ObjectDetector`, `SemanticSceneDetector`), the `FaceTrackingCrop` transform, the `ObjectDetectionOverlay` effect (per-frame bounding boxes + labels), and the full-pipeline `VideoAnalyzer`.
 - **`videopython.ai.dubbing`** — `VideoDubber` for voice-cloned revoicing with timing sync.
 ## Examples

{videopython-0.36.1 → videopython-0.37.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.36.1"
+version = "0.37.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.36.1 → videopython-0.37.0}/src/videopython/ai/__init__.py RENAMED Viewed

@@ -1,9 +1,11 @@
+from .effects import ObjectDetectionOverlay
 from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
 from .transforms import FaceTrackingCrop
 from .understanding import (
     AudioClassifier,
     AudioToText,
     FaceTracker,
+    ObjectDetector,
     SceneVLM,
     SemanticSceneDetector,
 )
@@ -20,10 +22,13 @@ __all__ = [
     "AudioToText",
     "AudioClassifier",
     "FaceTracker",
+    "ObjectDetector",
     "SceneVLM",
     "SemanticSceneDetector",
     # Transforms (AI-powered)
     "FaceTrackingCrop",
+    # Effects (AI-powered)
+    "ObjectDetectionOverlay",
     # Video analysis
     "VideoAnalysis",
     "VideoAnalysisConfig",

videopython-0.37.0/src/videopython/ai/effects.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""AI-powered video effects that require object detection.
+Effects here are real :class:`~videopython.editing.operation.Effect` subclasses
+(shape-preserving, streamable) that physically live in ``videopython.ai`` so the
+``videopython.editing`` layer keeps no AI dependency -- the same dependency
+direction in which ``FaceTrackingCrop`` imports ``Operation``. The pixel work
+is delegated to the AI-free renderer in :mod:`videopython.base.draw_detections`.
+"""
+from __future__ import annotations
+from typing import ClassVar, Literal
+import numpy as np
+from pydantic import Field, PrivateAttr
+from videopython.ai.understanding.objects import ObjectDetector
+from videopython.base.description import DetectedObject
+from videopython.base.draw_detections import DetectionStyle, draw_detections
+from videopython.editing.operation import Effect
+__all__ = ["ObjectDetectionOverlay"]
+class ObjectDetectionOverlay(Effect):
+    """Detect objects per frame and overlay labelled bounding boxes.
+    Runs a YOLOv8-COCO detector and composites tidy, colour-coded boxes with
+    class labels (and optional confidence) onto every frame in the window.
+    Detection runs on a ``detection_interval`` cadence in the streaming path and
+    boxes are held between detections, so the cost is *compute*-bound, not
+    *memory*-bound: ``"streamable"`` here means bounded memory, not bounded
+    compute. On long clips, cap cost with ``window`` (limit the time range),
+    a larger ``detection_interval``, a ``class_filter``, and/or the smaller
+    ``model_size``. Because only ``streaming_init`` and ``process_frame`` are
+    overridden, the base ``Effect._apply`` replays the identical contract for
+    in-memory execution, so eager and streaming results cannot drift.
+    """
+    # Discriminator value identifying this operation in serialized edit plans.
+    op: Literal["object_detection_overlay"] = "object_detection_overlay"
+    streamable: ClassVar[bool] = True
+    # --- Public, schema-visible fields -------------------------------------
+    confidence_threshold: float = Field(0.5, ge=0, le=1, description="Minimum detection confidence to draw a box, 0-1.")
+    class_filter: list[str] | None = Field(
+        None,
+        description='Only draw these COCO class names, e.g. ["person", "car", "dog"]. Null draws all classes.',
+    )
+    show_confidence: bool = Field(True, description="Append the detection confidence as a percentage to each label.")
+    box_color: tuple[int, int, int] | None = Field(
+        None,
+        description="Fixed box color as [R, G, B] (0-255) for every box, or null for distinct per-class colors.",
+    )
+    line_thickness: float = Field(
+        0.003, gt=0, description="Box stroke width as a fraction of the frame's longer side (~3px at 1080p)."
+    )
+    label_font_size: float = Field(
+        0.022, gt=0, description="Label text height as a fraction of the frame's longer side (~24px at 1080p)."
+    )
+    detection_interval: int = Field(
+        2,
+        ge=1,
+        description="Run detection every Nth frame and reuse the last result in between. Higher is faster.",
+    )
+    model_size: Literal["n", "s", "m"] = Field(
+        "n",
+        description=(
+            "YOLOv8 model size: 'n' (nano, fastest), 's' (small), 'm' (medium, most accurate). "
+            "Larger detects better but is slower."
+        ),
+    )
+    backend: Literal["cpu", "gpu", "auto"] = Field(
+        "auto",
+        description="Detection device: 'cpu', 'gpu', or 'auto'.",
+        json_schema_extra={"llm_hidden": True},
+    )
+    # --- Private runtime state (not part of the model's fields) -------------
+    # Detector instance, created on first use by ``_init_detector``.
+    _detector: ObjectDetector | None = PrivateAttr(default=None)
+    # Detections from the most recent detection frame, redrawn on the frames in between.
+    _last: list[DetectedObject] = PrivateAttr(default_factory=list)
+    def _style(self) -> DetectionStyle:
+        """Build the renderer style from this effect's fields."""
+        return DetectionStyle(
+            box_color=self.box_color,
+            line_thickness=self.line_thickness,
+            show_confidence=self.show_confidence,
+            label_font_size=self.label_font_size,
+            # Same threshold that is handed to the detector in ``_init_detector``.
+            min_confidence=self.confidence_threshold,
+        )
+    def _init_detector(self) -> None:
+        """Build the detector lazily. Single patch point for tests."""
+        # Only the first call constructs a detector; later calls are no-ops.
+        if self._detector is None:
+            self._detector = ObjectDetector(
+                model_name=f"yolov8{self.model_size}.pt",
+                confidence_threshold=self.confidence_threshold,
+                # ``None`` (or an empty list) becomes an empty tuple.
+                class_filter=tuple(self.class_filter or ()),
+                backend=self.backend,
+            )
+    def streaming_init(self, total_frames: int, fps: float, width: int, height: int) -> None:
+        """Reset the held detections and make sure a detector exists.
+        The ``total_frames``, ``fps``, ``width`` and ``height`` arguments are
+        accepted but not used by this effect.
+        """
+        self._last = []
+        self._init_detector()
+    def process_frame(self, frame: np.ndarray, frame_index: int) -> np.ndarray:
+        """Return ``frame`` with the current detections drawn on it.
+        The detector runs when ``frame_index`` is a multiple of
+        ``detection_interval``; on every other frame the most recent
+        detections are redrawn.
+        """
+        # Covers callers that reach ``process_frame`` without ``streaming_init``.
+        if self._detector is None:
+            self._init_detector()
+        assert self._detector is not None
+        # frame_index is 0-based within the effect's window, so frame 0 always
+        # detects; intermediate frames reuse the last result.
+        if frame_index % self.detection_interval == 0:
+            self._last = self._detector.detect(frame)
+        return draw_detections(frame, self._last, self._style())

{videopython-0.36.1 → videopython-0.37.0}/src/videopython/ai/understanding/__init__.py RENAMED Viewed

@@ -1,12 +1,14 @@
 from .audio import AudioClassifier, AudioToText
 from .faces import FaceTracker
 from .image import SceneVLM
+from .objects import ObjectDetector
 from .temporal import SemanticSceneDetector
 __all__ = [
     "AudioToText",
     "AudioClassifier",
     "FaceTracker",
+    "ObjectDetector",
     "SceneVLM",
     "SemanticSceneDetector",
 ]

videopython-0.37.0/src/videopython/ai/understanding/objects.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""General object detection for the understanding layer.
+``ObjectDetector`` is the object-detection counterpart to the face detector in
+``faces.py``: a lazy YOLOv8-COCO wrapper returning
+:class:`~videopython.base.description.DetectedObject` with normalized bounding
+boxes. It mirrors ``_FaceDetector`` (lazy init, device selection, ``detect`` /
+``detect_batch``) so the two share one mental model. Consumed by
+``videopython.ai.effects.ObjectDetectionOverlay``; usable directly for any
+per-frame object analysis.
+"""
+from __future__ import annotations
+import logging
+from typing import Any, Literal
+import numpy as np
+from videopython.ai._device import select_device
+from videopython.base.description import BoundingBox, DetectedObject
+logger = logging.getLogger(__name__)
+__all__ = ["ObjectDetector"]
+class ObjectDetector:
+    """Lazy YOLOv8-COCO object detector returning normalized detections.
+    The Ultralytics weights (default ``yolov8n.pt``) auto-download on first
+    real use; class names come from the loaded model. Detection is gated by
+    ``confidence_threshold`` and optionally restricted to ``class_filter``.
+    """
+    DEFAULT_CONFIDENCE_THRESHOLD = 0.5
+    def __init__(
+        self,
+        model_name: str = "yolov8n.pt",
+        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
+        class_filter: tuple[str, ...] = (),
+        backend: Literal["cpu", "gpu", "auto"] = "auto",
+    ):
+        """Initialize the detector.
+        No model is loaded here; loading happens on the first ``detect`` /
+        ``detect_batch`` call.
+        Args:
+            model_name: Ultralytics COCO model id or path (e.g. ``yolov8n.pt``,
+                ``yolov8s.pt``, ``yolov8m.pt``). Downloaded on first use.
+            confidence_threshold: Minimum detection confidence in ``[0, 1]``.
+            class_filter: If non-empty, only these COCO class names are kept.
+            backend: Detection device - ``"cpu"``, ``"gpu"``, or ``"auto"``.
+        """
+        self.model_name = model_name
+        self.confidence_threshold = confidence_threshold
+        self.class_filter = class_filter
+        self.backend: Literal["cpu", "gpu", "auto"] = backend
+        self._resolved_device: Literal["cpu", "cuda"] | None = None
+        self._yolo_model: Any = None
+        self._class_names: dict[int, str] = {}
+        logger.info("ObjectDetector initialized with model=%s backend=%s", model_name, backend)
+    def _resolve_device(self) -> Literal["cpu", "cuda"]:
+        """Resolve ``backend`` to a concrete device and cache the answer.
+        Raises:
+            ValueError: If ``backend == "gpu"`` but CUDA is not available.
+        """
+        if self._resolved_device is not None:
+            return self._resolved_device
+        device: Literal["cpu", "cuda"]
+        if self.backend == "cpu":
+            device = "cpu"
+        else:
+            # MPS is never selected; anything that is not CUDA falls back to CPU.
+            available = select_device(None, mps_allowed=False)
+            if self.backend == "gpu" and available != "cuda":
+                raise ValueError("GPU backend requested but CUDA is not available.")
+            device = "cuda" if available == "cuda" else "cpu"
+        self._resolved_device = device
+        return device
+    def execution_device(self) -> Literal["cpu", "cuda"]:
+        """Resolved execution device for this detector."""
+        return self._resolve_device()
+    def _init_yolo(self) -> None:
+        """Load the YOLO model and place it on the resolved device.
+        The device is validated *before* any state is stored on ``self``. If
+        ``backend="gpu"`` cannot be honoured, the ``ValueError`` is raised on
+        every call instead of leaving a half-initialized CPU model behind that
+        later calls would silently use.
+        """
+        from ultralytics import YOLO
+        device = self._resolve_device()
+        model = YOLO(self.model_name)
+        if device == "cuda":
+            model.to("cuda")
+        # Publish only once the model is fully set up.
+        self._class_names = dict(model.names)
+        self._yolo_model = model
+    def _ensure_model(self) -> Any:
+        """Return the YOLO model, loading it on first use."""
+        if self._yolo_model is None:
+            self._init_yolo()
+        assert self._yolo_model is not None
+        return self._yolo_model
+    def _objects_from_yolo_result(self, result: Any) -> list[DetectedObject]:
+        """Convert one YOLO result into detections sorted by confidence.
+        Pixel ``xyxy`` corners are divided by the result's ``orig_shape`` to
+        produce a normalized ``(x, y, width, height)`` box. Labels missing from
+        the model's class-name map fall back to the numeric class id as text.
+        """
+        boxes = result.boxes
+        if boxes is None:
+            return []
+        img_h, img_w = result.orig_shape
+        # One bulk conversion per tensor instead of indexing element by
+        # element inside the loop.
+        class_ids = boxes.cls.tolist()
+        confidences = boxes.conf.tolist()
+        corners = boxes.xyxy.tolist()
+        detected: list[DetectedObject] = []
+        for raw_class_id, confidence, (x1, y1, x2, y2) in zip(class_ids, confidences, corners):
+            class_id = int(raw_class_id)
+            label = self._class_names.get(class_id, str(class_id))
+            if self.class_filter and label not in self.class_filter:
+                continue
+            detected.append(
+                DetectedObject(
+                    label=label,
+                    confidence=float(confidence),
+                    bounding_box=BoundingBox(
+                        x=x1 / img_w,
+                        y=y1 / img_h,
+                        width=(x2 - x1) / img_w,
+                        height=(y2 - y1) / img_h,
+                    ),
+                )
+            )
+        detected.sort(key=lambda d: d.confidence, reverse=True)
+        return detected
+    def detect(self, image: np.ndarray) -> list[DetectedObject]:
+        """Detect objects in a single ``(H, W, 3)`` frame."""
+        model = self._ensure_model()
+        results = model(image, conf=self.confidence_threshold, verbose=False)
+        if not results:
+            return []
+        return self._objects_from_yolo_result(results[0])
+    def detect_batch(self, images: list[np.ndarray] | np.ndarray) -> list[list[DetectedObject]]:
+        """Detect objects in a batch of frames (list or stacked ``(N, H, W, 3)``).
+        An array that is not 4-dimensional is treated as one single frame. An
+        empty batch returns ``[]`` without loading the model.
+        """
+        if isinstance(images, np.ndarray):
+            # Iterating a 4-D array yields its frames along the first axis.
+            images = list(images) if images.ndim == 4 else [images]
+        if not images:
+            return []
+        model = self._ensure_model()
+        results = model(images, conf=self.confidence_threshold, verbose=False)
+        return [self._objects_from_yolo_result(result) for result in results]

{videopython-0.36.1 → videopython-0.37.0}/src/videopython/base/__init__.py RENAMED Viewed

@@ -10,6 +10,7 @@ from .description import (
     SceneBoundary,
     SceneDescription,
 )
+from .draw_detections import DetectionStyle, class_color, draw_detections
 from .exceptions import (
     AudioError,
     AudioLoadError,
@@ -44,6 +45,10 @@ __all__ = [
     "ImageText",
     "TextAlign",
     "AnchorPoint",
+    # Detection overlay renderer (AI-free)
+    "draw_detections",
+    "DetectionStyle",
+    "class_color",
     # Transcription data classes
     "Transcription",
     "TranscriptionSegment",

videopython-0.37.0/src/videopython/base/draw_detections.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""Pure, AI-free renderer for object-detection overlays.
+Draws labelled bounding boxes onto a frame from a list of
+:class:`~videopython.base.description.DetectedObject`. This module has **no AI
+dependencies** -- it is the single source of truth for how detections look, so
+it can be unit-tested with synthetic detections and reused by any detector. The
+AI side (``videopython.ai``) only produces the ``DetectedObject`` list and calls
+:func:`draw_detections`.
+Visuals: a resolution-scaled box stroke plus a label chip filled in the box's
+own colour (so chip and box read as one unit) with anti-aliased text. Colours
+are deterministic per class via :func:`class_color`, so the same class is the
+same colour in every frame and across runs.
+"""
+from __future__ import annotations
+import colorsys
+import hashlib
+from dataclasses import dataclass
+import numpy as np
+from PIL import Image, ImageDraw
+from videopython.base.description import DetectedObject
+from videopython.base.fonts import load_font
+__all__ = ["DetectionStyle", "class_color", "draw_detections"]
+# Hand-picked Material-palette hues for common COCO classes so busy scenes read
+# clearly. Any class not listed gets a deterministic colour from ``class_color``.
+_RESERVED_COLORS: dict[str, tuple[int, int, int]] = {
+    "person": (76, 175, 80),  # green
+    "bicycle": (0, 188, 212),  # cyan
+    "car": (33, 150, 243),  # blue
+    "motorcycle": (156, 39, 176),  # purple
+    "bus": (255, 193, 7),  # amber
+    "truck": (255, 87, 34),  # deep orange
+    "cat": (233, 30, 99),  # pink
+    "dog": (255, 152, 0),  # orange
+}
+def class_color(label: str) -> tuple[int, int, int]:
+    """Deterministic RGB colour for a class label.
+    Common COCO classes get a reserved Material hue; everything else maps
+    ``md5(label) -> HSV hue`` at fixed saturation/value. ``md5`` (not the
+    salted built-in ``hash``) is used so colours are stable across processes
+    and test runs.
+    Args:
+        label: Class name, matched case-sensitively against the reserved table.
+    Returns:
+        An ``(R, G, B)`` tuple with each channel in ``0-255``.
+    """
+    reserved = _RESERVED_COLORS.get(label)
+    if reserved is not None:
+        return reserved
+    # The hash only picks a hue, so mark it as non-security use; otherwise
+    # FIPS-restricted Python builds refuse to construct md5 at all.
+    digest = hashlib.md5(label.encode("utf-8"), usedforsecurity=False).digest()
+    hue = (int.from_bytes(digest, "big") % 360) / 360.0
+    r, g, b = colorsys.hsv_to_rgb(hue, 0.7, 0.95)
+    return int(r * 255), int(g * 255), int(b * 255)
+# Frozen: instances are immutable, so one shared instance can safely serve as
+# the default ``style`` argument of ``draw_detections``.
+@dataclass(frozen=True)
+class DetectionStyle:
+    """Styling for :func:`draw_detections`.
+    Lengths expressed as a fraction of the frame's longer side are
+    resolution-independent: the same style reads consistently at 1080p and 4k.
+    Every field has a default, so ``DetectionStyle()`` is a complete style.
+    """
+    box_color: tuple[int, int, int] | None = None
+    """Fixed ``(R, G, B)`` for every box, or ``None`` for per-class colours."""
+    line_thickness: float = 0.003
+    """Box stroke width as a fraction of ``max(height, width)`` (~3px at 1080p)."""
+    show_confidence: bool = True
+    """Append the confidence as a whole-number percent to each label."""
+    label_font_size: float = 0.022
+    """Label text height as a fraction of ``max(height, width)`` (~24px at 1080p)."""
+    label_text_color: tuple[int, int, int] = (255, 255, 255)
+    """Colour of the label text drawn on the chip."""
+    label_bg_alpha: int = 200
+    """Opacity (0-255) of the label chip background."""
+    min_confidence: float = 0.0
+    """Detections below this confidence are skipped."""
+    font: str | None = None
+    """Bundled font name or path; ``None`` uses the default font."""
+def draw_detections(
+    frame: np.ndarray,
+    detections: list[DetectedObject],
+    style: DetectionStyle = DetectionStyle(),
+) -> np.ndarray:
+    """Composite ``detections`` onto ``frame`` as labelled boxes.
+    Shape-preserving: the result is the same ``(H, W, 3)`` ``uint8`` array
+    shape. When there is nothing to draw (empty ``detections``, or every entry
+    lacks a box or falls below ``min_confidence``) the *input array itself* is
+    returned, not a copy. Boxes are clamped to the frame, so off-frame
+    coordinates clip cleanly instead of raising; boxes with a negative width
+    or height are drawn with their corners reordered. Label chips flip inside
+    the box when they would overflow the top edge and clamp horizontally so
+    they never leave the frame.
+    Args:
+        frame: Source frame as ``(H, W, 3)`` ``uint8`` (RGB). Never modified.
+        detections: Objects to draw; each uses its normalized ``bounding_box``.
+        style: Visual styling (colours, stroke width, label options).
+    Returns:
+        A new ``(H, W, 3)`` ``uint8`` frame with the overlays composited on,
+        or ``frame`` itself when nothing was drawn.
+    """
+    # Filter first, so a frame with nothing to draw skips the font load and
+    # the full-frame canvas allocation.
+    visible = [
+        det
+        for det in detections
+        if not (det.bounding_box is None or det.confidence < style.min_confidence)
+    ]
+    if not visible:
+        return frame
+    h, w = frame.shape[:2]
+    scale = max(h, w)
+    thickness = max(1, round(style.line_thickness * scale))
+    font_px = max(8, round(style.label_font_size * scale))
+    font = load_font(style.font, font_px)
+    pad = max(2, thickness)
+    # Draw on a transparent layer so the chip alpha blends with the frame.
+    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(canvas)
+    for det in visible:
+        box = det.bounding_box
+        color = style.box_color or class_color(det.label)
+        left = max(0, min(w - 1, int(box.x * w)))
+        top = max(0, min(h - 1, int(box.y * h)))
+        right = max(0, min(w - 1, int((box.x + box.width) * w)))
+        bottom = max(0, min(h - 1, int((box.y + box.height) * h)))
+        # Order the corners: ImageDraw.rectangle rejects x1 < x0 or y1 < y0.
+        x0, x1 = min(left, right), max(left, right)
+        y0, y1 = min(top, bottom), max(top, bottom)
+        draw.rectangle((x0, y0, x1, y1), outline=(*color, 255), width=thickness)
+        text = det.label.title()
+        if style.show_confidence:
+            text = f"{text} {det.confidence * 100:.0f}%"
+        tb = draw.textbbox((0, 0), text, font=font)
+        text_w, text_h = tb[2] - tb[0], tb[3] - tb[1]
+        chip_w, chip_h = text_w + 2 * pad, text_h + 2 * pad
+        # Flip the chip inside the box when it would overflow the top edge,
+        # and clamp horizontally so it never leaves the frame.
+        chip_y = y0 - chip_h if y0 - chip_h >= 0 else y0
+        chip_x = max(0, min(x0, w - chip_w))
+        draw.rectangle(
+            (chip_x, chip_y, chip_x + chip_w, chip_y + chip_h),
+            fill=(*color, style.label_bg_alpha),
+        )
+        # Subtract the text bbox origin so the glyphs sit exactly ``pad``
+        # inside the chip regardless of the font's own bearing.
+        draw.text(
+            (chip_x + pad - tb[0], chip_y + pad - tb[1]),
+            text,
+            font=font,
+            fill=(*style.label_text_color, 255),
+        )
+    out = Image.fromarray(frame).convert("RGBA")
+    out.alpha_composite(canvas)
+    return np.array(out.convert("RGB"), dtype=np.uint8)