videopython 0.36.1__tar.gz → 0.38.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. {videopython-0.36.1 → videopython-0.38.0}/PKG-INFO +6 -4
  2. {videopython-0.36.1 → videopython-0.38.0}/README.md +5 -3
  3. {videopython-0.36.1 → videopython-0.38.0}/pyproject.toml +1 -1
  4. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/__init__.py +5 -0
  5. videopython-0.38.0/src/videopython/ai/effects.py +112 -0
  6. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/understanding/__init__.py +2 -0
  7. videopython-0.38.0/src/videopython/ai/understanding/objects.py +145 -0
  8. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/__init__.py +14 -0
  9. videopython-0.38.0/src/videopython/base/draw_detections.py +164 -0
  10. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/exceptions.py +39 -0
  11. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/editing/effects.py +6 -1
  12. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/editing/operation.py +114 -11
  13. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/editing/transforms.py +43 -4
  14. videopython-0.38.0/src/videopython/editing/video_edit.py +1353 -0
  15. videopython-0.36.1/src/videopython/editing/video_edit.py +0 -857
  16. {videopython-0.36.1 → videopython-0.38.0}/.gitignore +0 -0
  17. {videopython-0.36.1 → videopython-0.38.0}/LICENSE +0 -0
  18. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/__init__.py +0 -0
  19. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/_device.py +0 -0
  20. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/__init__.py +0 -0
  21. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/config.py +0 -0
  22. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/dubber.py +0 -0
  23. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/expressiveness.py +0 -0
  24. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/loudness.py +0 -0
  25. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/models.py +0 -0
  26. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
  27. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/quality.py +0 -0
  28. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/remux.py +0 -0
  29. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/timing.py +0 -0
  30. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/dubbing/voice_sample.py +0 -0
  31. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/generation/__init__.py +0 -0
  32. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/generation/audio.py +0 -0
  33. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/generation/image.py +0 -0
  34. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/generation/qwen3.py +0 -0
  35. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/generation/translation.py +0 -0
  36. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/generation/video.py +0 -0
  37. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/transforms.py +0 -0
  38. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/understanding/audio.py +0 -0
  39. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/understanding/faces.py +0 -0
  40. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/understanding/image.py +0 -0
  41. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/understanding/separation.py +0 -0
  42. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/understanding/temporal.py +0 -0
  43. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/video_analysis/__init__.py +0 -0
  44. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/video_analysis/analyzer.py +0 -0
  45. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/video_analysis/models.py +0 -0
  46. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/video_analysis/sampling.py +0 -0
  47. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/ai/video_analysis/stages.py +0 -0
  48. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/audio/__init__.py +0 -0
  49. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/audio/analysis.py +0 -0
  50. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/audio/audio.py +0 -0
  51. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/_dimensions.py +0 -0
  52. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/_ffmpeg.py +0 -0
  53. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/_video_io.py +0 -0
  54. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/description.py +0 -0
  55. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/Anton-OFL.txt +0 -0
  56. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/Anton-Regular.ttf +0 -0
  57. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/BebasNeue-OFL.txt +0 -0
  58. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/BebasNeue-Regular.ttf +0 -0
  59. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
  60. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
  61. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/Lato-Bold.ttf +0 -0
  62. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/Lato-OFL.txt +0 -0
  63. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/Poppins-Bold.ttf +0 -0
  64. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/Poppins-OFL.txt +0 -0
  65. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/fonts/__init__.py +0 -0
  66. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/image_text.py +0 -0
  67. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/transcription.py +0 -0
  68. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/base/video.py +0 -0
  69. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/editing/__init__.py +0 -0
  70. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/editing/_easing.py +0 -0
  71. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/editing/streaming.py +0 -0
  72. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/editing/transcription_overlay.py +0 -0
  73. {videopython-0.36.1 → videopython-0.38.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.36.1
3
+ Version: 0.38.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -109,16 +109,18 @@ video.add_audio(audio).save("ai_video.mp4")
109
109
 
110
110
  ## LLM & AI Agent Integration
111
111
 
112
- Every operation is a Pydantic model whose fields ARE the JSON wire format. `VideoEdit.json_schema()` returns a JSON Schema with a discriminated union over every LLM-exposed `Operation` (server-only ops like `image_overlay` are excluded by default) — pass it straight to Anthropic tool use, OpenAI function calling, or any structured-output API. Then `edit.validate()` dry-runs the plan via metadata before any frames are loaded, raising a typed `PlanValidationError` (with structured `.errors`) that can be fed back to the LLM and retried cheaply.
112
+ Every operation is a Pydantic model whose fields ARE the JSON wire format. `VideoEdit.json_schema()` returns a JSON Schema with a discriminated union over every LLM-exposed `Operation` (server-only ops like `image_overlay` are excluded by default) — pass it straight to Anthropic tool use, OpenAI function calling, or any structured-output API. Pass `strict=True` for a provider strict-mode grammar that prevents simple bound violations at decode time.
113
113
 
114
- See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/) for end-to-end examples, validation error loops, and operation discovery patterns.
114
+ The plan parses permissively (shape only) and owns numeric bounds at validation, so a refine loop converges fast: `edit.check(meta)` collects **every** structured `PlanError` in one pass, `edit.repair(meta)` auto-clamps the mechanical violations (window/timestamp overruns, negatives) with a reported changelog, and `edit.normalize_dimensions(meta, target)` makes heterogeneous segments concat-compatible by construction. `edit.validate()` still raises a typed `PlanValidationError` (a `ValueError` with structured `.errors`) for the single-error path.
115
+
116
+ See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/) for end-to-end examples, the collect/repair/normalize refine loop, and operation discovery patterns.
115
117
 
116
118
  ## Features
117
119
 
118
120
  - **`videopython.base`** — `Video`, `VideoMetadata`, `FrameIterator`, `ImageText`, `Transcription`, and shared result types (`BoundingBox`, `FaceTrack`, `SceneBoundary`, ...). No AI dependencies.
119
121
  - **`videopython.audio`** — `Audio` with overlay, concat, normalize, time-stretch, silence detection, segment classification.
120
122
  - **`videopython.editing`** — `Operation`/`Effect` foundation, `VideoEdit` plan runner with JSON Schema + streaming execution. Transforms (cut, resize, crop, fps, speed, reverse, freeze, silence removal) and effects (blur, zoom, color grading, vignette, Ken Burns, fade, overlays, animated subtitles).
121
- - **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `SemanticSceneDetector`), `FaceTrackingCrop` transform, and the full-pipeline `VideoAnalyzer`.
123
+ - **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `ObjectDetector`, `SemanticSceneDetector`), the `FaceTrackingCrop` transform, the `ObjectDetectionOverlay` effect (per-frame bounding boxes + labels), and the full-pipeline `VideoAnalyzer`.
122
124
  - **`videopython.ai.dubbing`** — `VideoDubber` for voice-cloned revoicing with timing sync.
123
125
 
124
126
  ## Examples
@@ -60,16 +60,18 @@ video.add_audio(audio).save("ai_video.mp4")
60
60
 
61
61
  ## LLM & AI Agent Integration
62
62
 
63
- Every operation is a Pydantic model whose fields ARE the JSON wire format. `VideoEdit.json_schema()` returns a JSON Schema with a discriminated union over every LLM-exposed `Operation` (server-only ops like `image_overlay` are excluded by default) — pass it straight to Anthropic tool use, OpenAI function calling, or any structured-output API. Then `edit.validate()` dry-runs the plan via metadata before any frames are loaded, raising a typed `PlanValidationError` (with structured `.errors`) that can be fed back to the LLM and retried cheaply.
63
+ Every operation is a Pydantic model whose fields ARE the JSON wire format. `VideoEdit.json_schema()` returns a JSON Schema with a discriminated union over every LLM-exposed `Operation` (server-only ops like `image_overlay` are excluded by default) — pass it straight to Anthropic tool use, OpenAI function calling, or any structured-output API. Pass `strict=True` for a provider strict-mode grammar that prevents simple bound violations at decode time.
64
64
 
65
- See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/) for end-to-end examples, validation error loops, and operation discovery patterns.
65
+ The plan parses permissively (shape only) and owns numeric bounds at validation, so a refine loop converges fast: `edit.check(meta)` collects **every** structured `PlanError` in one pass, `edit.repair(meta)` auto-clamps the mechanical violations (window/timestamp overruns, negatives) with a reported changelog, and `edit.normalize_dimensions(meta, target)` makes heterogeneous segments concat-compatible by construction. `edit.validate()` still raises a typed `PlanValidationError` (a `ValueError` with structured `.errors`) for the single-error path.
66
+
67
+ See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/) for end-to-end examples, the collect/repair/normalize refine loop, and operation discovery patterns.
66
68
 
67
69
  ## Features
68
70
 
69
71
  - **`videopython.base`** — `Video`, `VideoMetadata`, `FrameIterator`, `ImageText`, `Transcription`, and shared result types (`BoundingBox`, `FaceTrack`, `SceneBoundary`, ...). No AI dependencies.
70
72
  - **`videopython.audio`** — `Audio` with overlay, concat, normalize, time-stretch, silence detection, segment classification.
71
73
  - **`videopython.editing`** — `Operation`/`Effect` foundation, `VideoEdit` plan runner with JSON Schema + streaming execution. Transforms (cut, resize, crop, fps, speed, reverse, freeze, silence removal) and effects (blur, zoom, color grading, vignette, Ken Burns, fade, overlays, animated subtitles).
72
- - **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `SemanticSceneDetector`), `FaceTrackingCrop` transform, and the full-pipeline `VideoAnalyzer`.
74
+ - **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `ObjectDetector`, `SemanticSceneDetector`), the `FaceTrackingCrop` transform, the `ObjectDetectionOverlay` effect (per-frame bounding boxes + labels), and the full-pipeline `VideoAnalyzer`.
73
75
  - **`videopython.ai.dubbing`** — `VideoDubber` for voice-cloned revoicing with timing sync.
74
76
 
75
77
  ## Examples
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.36.1"
3
+ version = "0.38.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -1,9 +1,11 @@
1
+ from .effects import ObjectDetectionOverlay
1
2
  from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
2
3
  from .transforms import FaceTrackingCrop
3
4
  from .understanding import (
4
5
  AudioClassifier,
5
6
  AudioToText,
6
7
  FaceTracker,
8
+ ObjectDetector,
7
9
  SceneVLM,
8
10
  SemanticSceneDetector,
9
11
  )
@@ -20,10 +22,13 @@ __all__ = [
20
22
  "AudioToText",
21
23
  "AudioClassifier",
22
24
  "FaceTracker",
25
+ "ObjectDetector",
23
26
  "SceneVLM",
24
27
  "SemanticSceneDetector",
25
28
  # Transforms (AI-powered)
26
29
  "FaceTrackingCrop",
30
+ # Effects (AI-powered)
31
+ "ObjectDetectionOverlay",
27
32
  # Video analysis
28
33
  "VideoAnalysis",
29
34
  "VideoAnalysisConfig",
@@ -0,0 +1,112 @@
1
+ """AI-powered video effects that require object detection.
2
+
3
+ Effects here are real :class:`~videopython.editing.operation.Effect` subclasses
4
+ (shape-preserving, streamable) that physically live in ``videopython.ai`` so the
5
+ ``videopython.editing`` layer keeps no AI dependency -- the same direction
6
+ ``FaceTrackingCrop`` imports ``Operation``. The pixel work is delegated to the
7
+ AI-free renderer in :mod:`videopython.base.draw_detections`.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import ClassVar, Literal
13
+
14
+ import numpy as np
15
+ from pydantic import Field, PrivateAttr
16
+
17
+ from videopython.ai.understanding.objects import ObjectDetector
18
+ from videopython.base.description import DetectedObject
19
+ from videopython.base.draw_detections import DetectionStyle, draw_detections
20
+ from videopython.editing.operation import Effect
21
+
22
+ __all__ = ["ObjectDetectionOverlay"]
23
+
24
+
25
class ObjectDetectionOverlay(Effect):
    """Detect objects per frame and overlay labelled bounding boxes.

    Runs a YOLOv8-COCO detector and composites tidy, colour-coded boxes with
    class labels (and optional confidence) onto every frame in the window.

    Detection runs on a ``detection_interval`` cadence in the streaming path and
    boxes are held between detections, so the cost is *compute*-bound, not
    *memory*-bound: ``"streamable"`` here means bounded memory, not bounded
    compute. On long clips, cap cost with ``window`` (limit the time range),
    a larger ``detection_interval``, a ``class_filter``, and/or the smaller
    ``model_size``. Because only ``streaming_init`` and ``process_frame`` are
    overridden, the base ``Effect._apply`` replays the identical contract for
    in-memory execution, so eager and streaming results cannot drift.
    """

    # NOTE(review): the class docstring and the Field descriptions below are
    # presumably surfaced in the generated JSON schema -- confirm before
    # rewording them, since that would change what an LLM consumer sees.

    # Discriminator value identifying this operation in a serialized plan.
    op: Literal["object_detection_overlay"] = "object_detection_overlay"
    streamable: ClassVar[bool] = True

    # --- Public, serialized configuration -------------------------------
    confidence_threshold: float = Field(0.5, ge=0, le=1, description="Minimum detection confidence to draw a box, 0-1.")
    class_filter: list[str] | None = Field(
        None,
        description='Only draw these COCO class names, e.g. ["person", "car", "dog"]. Null draws all classes.',
    )
    show_confidence: bool = Field(True, description="Append the detection confidence as a percentage to each label.")
    box_color: tuple[int, int, int] | None = Field(
        None,
        description="Fixed box color as [R, G, B] (0-255) for every box, or null for distinct per-class colors.",
    )
    line_thickness: float = Field(
        0.003, gt=0, description="Box stroke width as a fraction of the frame's longer side (~3px at 1080p)."
    )
    label_font_size: float = Field(
        0.022, gt=0, description="Label text height as a fraction of the frame's longer side (~24px at 1080p)."
    )
    detection_interval: int = Field(
        2,
        ge=1,
        description="Run detection every Nth frame and reuse the last result in between. Higher is faster.",
    )
    model_size: Literal["n", "s", "m"] = Field(
        "n",
        description=(
            "YOLOv8 model size: 'n' (nano, fastest), 's' (small), 'm' (medium, most accurate). "
            "Larger detects better but is slower."
        ),
    )
    backend: Literal["cpu", "gpu", "auto"] = Field(
        "auto",
        description="Detection device: 'cpu', 'gpu', or 'auto'.",
        json_schema_extra={"llm_hidden": True},
    )

    # --- Private runtime state (not model fields) -------------------------
    # Detector instance, created on first use by ``_init_detector``.
    _detector: ObjectDetector | None = PrivateAttr(default=None)
    # Detections from the most recent detection frame, redrawn on the frames
    # in between.
    _last: list[DetectedObject] = PrivateAttr(default_factory=list)

    def _style(self) -> DetectionStyle:
        """Translate this effect's fields into a renderer ``DetectionStyle``.

        ``confidence_threshold`` is forwarded as ``min_confidence``, so the
        renderer applies the same cut-off the detector is configured with.
        """
        return DetectionStyle(
            box_color=self.box_color,
            line_thickness=self.line_thickness,
            show_confidence=self.show_confidence,
            label_font_size=self.label_font_size,
            min_confidence=self.confidence_threshold,
        )

    def _init_detector(self) -> None:
        """Build the detector lazily. Single patch point for tests.

        Does nothing when a detector already exists. The model file name is
        derived from ``model_size`` (``yolov8{n,s,m}.pt``) and a ``None``
        ``class_filter`` is passed on as an empty tuple.
        """
        if self._detector is None:
            self._detector = ObjectDetector(
                model_name=f"yolov8{self.model_size}.pt",
                confidence_threshold=self.confidence_threshold,
                class_filter=tuple(self.class_filter or ()),
                backend=self.backend,
            )

    def streaming_init(self, total_frames: int, fps: float, width: int, height: int) -> None:
        """Reset per-run state before frames are processed.

        Clears the held detections and makes sure a detector exists. The
        ``total_frames``, ``fps``, ``width`` and ``height`` arguments are not
        used by this effect.
        """
        self._last = []
        self._init_detector()

    def process_frame(self, frame: np.ndarray, frame_index: int) -> np.ndarray:
        """Draw the current detections onto ``frame`` and return the result.

        Args:
            frame: Frame to annotate.
            frame_index: Index of ``frame``; detection runs when it is a
                multiple of ``detection_interval``.

        Returns:
            The output of ``draw_detections`` for ``frame`` and the held
            detections.
        """
        # Covers callers that reach process_frame without streaming_init.
        if self._detector is None:
            self._init_detector()
        assert self._detector is not None
        # frame_index is 0-based within the effect's window, so frame 0 always
        # detects; intermediate frames reuse the last result.
        if frame_index % self.detection_interval == 0:
            self._last = self._detector.detect(frame)
        return draw_detections(frame, self._last, self._style())
@@ -1,12 +1,14 @@
1
1
  from .audio import AudioClassifier, AudioToText
2
2
  from .faces import FaceTracker
3
3
  from .image import SceneVLM
4
+ from .objects import ObjectDetector
4
5
  from .temporal import SemanticSceneDetector
5
6
 
6
7
  __all__ = [
7
8
  "AudioToText",
8
9
  "AudioClassifier",
9
10
  "FaceTracker",
11
+ "ObjectDetector",
10
12
  "SceneVLM",
11
13
  "SemanticSceneDetector",
12
14
  ]
@@ -0,0 +1,145 @@
1
+ """General object detection for the understanding layer.
2
+
3
+ ``ObjectDetector`` is the object-detection counterpart to the face detector in
4
+ ``faces.py``: a lazy YOLOv8-COCO wrapper returning
5
+ :class:`~videopython.base.description.DetectedObject` with normalized bounding
6
+ boxes. It mirrors ``_FaceDetector`` (lazy init, device selection, ``detect`` /
7
+ ``detect_batch``) so the two share one mental model. Consumed by
8
+ ``videopython.ai.effects.ObjectDetectionOverlay``; usable directly for any
9
+ per-frame object analysis.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ from typing import Any, Literal
16
+
17
+ import numpy as np
18
+
19
+ from videopython.ai._device import select_device
20
+ from videopython.base.description import BoundingBox, DetectedObject
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ __all__ = ["ObjectDetector"]
25
+
26
+
27
class ObjectDetector:
    """Lazy YOLOv8-COCO object detector returning normalized detections.

    The Ultralytics weights (default ``yolov8n.pt``) auto-download on first
    real use; class names come from the loaded model. Detection is gated by
    ``confidence_threshold`` and optionally restricted to ``class_filter``.

    Nothing heavy happens in the constructor: the model is loaded by the first
    ``detect`` / ``detect_batch`` call and the device is resolved once and
    cached.
    """

    DEFAULT_CONFIDENCE_THRESHOLD = 0.5

    def __init__(
        self,
        model_name: str = "yolov8n.pt",
        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
        class_filter: tuple[str, ...] = (),
        backend: Literal["cpu", "gpu", "auto"] = "auto",
    ):
        """Initialize the detector.

        Args:
            model_name: Ultralytics COCO model id or path (e.g. ``yolov8n.pt``,
                ``yolov8s.pt``, ``yolov8m.pt``). Downloaded on first use.
            confidence_threshold: Minimum detection confidence in ``[0, 1]``.
            class_filter: If non-empty, only these COCO class names are kept.
            backend: Detection device - ``"cpu"``, ``"gpu"``, or ``"auto"``.
        """
        self.model_name = model_name
        self.confidence_threshold = confidence_threshold
        self.class_filter = class_filter
        self.backend: Literal["cpu", "gpu", "auto"] = backend
        self._resolved_device: Literal["cpu", "cuda"] | None = None
        self._yolo_model: Any = None
        self._class_names: dict[int, str] = {}
        logger.info("ObjectDetector initialized with model=%s backend=%s", model_name, backend)

    def _resolve_device(self) -> Literal["cpu", "cuda"]:
        """Resolve ``backend`` to a concrete device and cache the answer.

        ``"cpu"`` never probes for hardware. ``"gpu"`` requires CUDA and
        ``"auto"`` falls back to CPU when CUDA is not selected.

        Raises:
            ValueError: If ``backend`` is ``"gpu"`` but CUDA is not available.
        """
        if self._resolved_device is not None:
            return self._resolved_device

        if self.backend == "cpu":
            self._resolved_device = "cpu"
            return self._resolved_device

        # "gpu" and "auto" share the same probe; they only differ in what
        # happens when CUDA is not selected.
        cuda_selected = select_device(None, mps_allowed=False) == "cuda"
        if self.backend == "gpu" and not cuda_selected:
            raise ValueError("GPU backend requested but CUDA is not available.")
        self._resolved_device = "cuda" if cuda_selected else "cpu"
        return self._resolved_device

    def execution_device(self) -> Literal["cpu", "cuda"]:
        """Resolved execution device for this detector."""
        return self._resolve_device()

    def _init_yolo(self) -> None:
        """Load the YOLO model, record its class names and place it on the device."""
        # Imported here so that constructing a detector does not require
        # ultralytics to be importable.
        from ultralytics import YOLO

        self._yolo_model = YOLO(self.model_name)
        self._class_names = dict(self._yolo_model.names)

        if self._resolve_device() == "cuda":
            self._yolo_model.to("cuda")

    def _objects_from_yolo_result(self, result: Any) -> list[DetectedObject]:
        """Convert one YOLO result into detections sorted by descending confidence.

        Pixel ``xyxy`` corners are normalized by ``result.orig_shape`` into a
        top-left ``x``/``y`` plus ``width``/``height``. Detections whose label
        is not in a non-empty ``class_filter`` are dropped. A class id missing
        from the model's name table falls back to the id rendered as a string.
        """
        detected: list[DetectedObject] = []
        boxes = result.boxes
        if boxes is None:
            return detected

        img_h, img_w = result.orig_shape
        # Convert each column once instead of indexing the arrays element by
        # element inside the loop.
        class_ids = boxes.cls.tolist()
        confidences = boxes.conf.tolist()
        corners = boxes.xyxy.tolist()

        for class_id, confidence, (x1, y1, x2, y2) in zip(class_ids, confidences, corners):
            class_index = int(class_id)
            label = self._class_names.get(class_index, str(class_index))
            if self.class_filter and label not in self.class_filter:
                continue

            detected.append(
                DetectedObject(
                    label=label,
                    confidence=float(confidence),
                    bounding_box=BoundingBox(
                        x=x1 / img_w,
                        y=y1 / img_h,
                        width=(x2 - x1) / img_w,
                        height=(y2 - y1) / img_h,
                    ),
                )
            )
        detected.sort(key=lambda d: d.confidence, reverse=True)
        return detected

    def detect(self, image: np.ndarray) -> list[DetectedObject]:
        """Detect objects in a single ``(H, W, 3)`` frame."""
        if self._yolo_model is None:
            self._init_yolo()
        assert self._yolo_model is not None

        results = self._yolo_model(image, conf=self.confidence_threshold, verbose=False)
        if not results:
            return []
        return self._objects_from_yolo_result(results[0])

    def detect_batch(self, images: list[np.ndarray] | np.ndarray) -> list[list[DetectedObject]]:
        """Detect objects in a batch of frames (list or stacked ``(N, H, W, 3)``).

        An array that is not 4-dimensional is treated as a single frame. An
        empty batch returns ``[]`` without loading the model.
        """
        if isinstance(images, np.ndarray):
            # Iterating a stacked array yields its frames along the first axis.
            images = list(images) if images.ndim == 4 else [images]
        if not images:
            return []

        if self._yolo_model is None:
            self._init_yolo()
        assert self._yolo_model is not None

        results = self._yolo_model(images, conf=self.confidence_threshold, verbose=False)
        return [self._objects_from_yolo_result(result) for result in results]
@@ -10,10 +10,15 @@ from .description import (
10
10
  SceneBoundary,
11
11
  SceneDescription,
12
12
  )
13
+ from .draw_detections import DetectionStyle, class_color, draw_detections
13
14
  from .exceptions import (
14
15
  AudioError,
15
16
  AudioLoadError,
16
17
  OutOfBoundsError,
18
+ PlanError,
19
+ PlanErrorCode,
20
+ PlanRepair,
21
+ PlanValidationError,
17
22
  TextRenderError,
18
23
  TransformError,
19
24
  VideoError,
@@ -40,10 +45,19 @@ __all__ = [
40
45
  "TransformError",
41
46
  "TextRenderError",
42
47
  "OutOfBoundsError",
48
+ # Structured plan validation / repair
49
+ "PlanError",
50
+ "PlanErrorCode",
51
+ "PlanValidationError",
52
+ "PlanRepair",
43
53
  # Text rendering primitives
44
54
  "ImageText",
45
55
  "TextAlign",
46
56
  "AnchorPoint",
57
+ # Detection overlay renderer (AI-free)
58
+ "draw_detections",
59
+ "DetectionStyle",
60
+ "class_color",
47
61
  # Transcription data classes
48
62
  "Transcription",
49
63
  "TranscriptionSegment",
@@ -0,0 +1,164 @@
1
+ """Pure, AI-free renderer for object-detection overlays.
2
+
3
+ Draws labelled bounding boxes onto a frame from a list of
4
+ :class:`~videopython.base.description.DetectedObject`. This module has **no AI
5
+ dependencies** -- it is the single source of truth for how detections look, so
6
+ it can be unit-tested with synthetic detections and reused by any detector. The
7
+ AI side (``videopython.ai``) only produces the ``DetectedObject`` list and calls
8
+ :func:`draw_detections`.
9
+
10
+ Visuals: a resolution-scaled box stroke plus a label chip filled in the box's
11
+ own colour (so chip and box read as one unit) with anti-aliased text. Colours
12
+ are deterministic per class via :func:`class_color`, so the same class is the
13
+ same colour in every frame and across runs.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import colorsys
19
+ import hashlib
20
+ from dataclasses import dataclass
21
+
22
+ import numpy as np
23
+ from PIL import Image, ImageDraw
24
+
25
+ from videopython.base.description import DetectedObject
26
+ from videopython.base.fonts import load_font
27
+
28
+ __all__ = ["DetectionStyle", "class_color", "draw_detections"]
29
+
30
# Hand-picked Material-palette hues for common COCO classes so busy scenes read
# clearly. Any class not listed gets a deterministic colour from ``class_color``.
_RESERVED_COLORS: dict[str, tuple[int, int, int]] = {
    "person": (76, 175, 80),  # green
    "bicycle": (0, 188, 212),  # cyan
    "car": (33, 150, 243),  # blue
    "motorcycle": (156, 39, 176),  # purple
    "bus": (255, 193, 7),  # amber
    "truck": (255, 87, 34),  # deep orange
    "cat": (233, 30, 99),  # pink
    "dog": (255, 152, 0),  # orange
}


def class_color(label: str) -> tuple[int, int, int]:
    """Deterministic RGB colour for a class label.

    Common COCO classes get a reserved Material hue; everything else maps
    ``md5(label) -> HSV hue`` at fixed saturation/value. ``md5`` (not the
    salted built-in ``hash``) is used so colours are stable across processes
    and test runs.

    Args:
        label: Class name, matched exactly (case-sensitive) against the
            reserved table.

    Returns:
        An ``(R, G, B)`` tuple of ints in ``0-255``.
    """
    reserved = _RESERVED_COLORS.get(label)
    if reserved is not None:
        return reserved
    # The digest only picks a hue, so mark it as non-security use; otherwise
    # md5 is refused on FIPS-restricted Python builds. The digest is unchanged.
    digest = int(hashlib.md5(label.encode("utf-8"), usedforsecurity=False).hexdigest(), 16)
    hue = (digest % 360) / 360.0
    r, g, b = colorsys.hsv_to_rgb(hue, 0.7, 0.95)
    return int(r * 255), int(g * 255), int(b * 255)
59
+
60
+
61
@dataclass(frozen=True)
class DetectionStyle:
    """Immutable visual settings consumed by :func:`draw_detections`.

    Sizes are given as a fraction of the frame's longer side rather than in
    pixels, which keeps one style looking the same at 1080p and at 4K.

    Attributes:
        box_color: Fixed ``(R, G, B)`` for every box, or ``None`` for
            per-class colours.
        line_thickness: Box stroke width as a fraction of
            ``max(height, width)`` (~3px at 1080p).
        show_confidence: Append the confidence as a whole-number percent to
            each label.
        label_font_size: Label text height as a fraction of
            ``max(height, width)`` (~24px at 1080p).
        label_text_color: Colour of the label text drawn on the chip.
        label_bg_alpha: Opacity (0-255) of the label chip background.
        min_confidence: Detections below this confidence are skipped.
        font: Bundled font name or path; ``None`` uses the default font.
    """

    # Box appearance.
    box_color: tuple[int, int, int] | None = None
    line_thickness: float = 0.003

    # Label appearance.
    show_confidence: bool = True
    label_font_size: float = 0.022
    label_text_color: tuple[int, int, int] = (255, 255, 255)
    label_bg_alpha: int = 200

    # Filtering and font selection.
    min_confidence: float = 0.0
    font: str | None = None
85
+
86
+
87
def _fraction_to_pixel(fraction: float, extent: int) -> int:
    """Scale a normalized coordinate by ``extent`` and clamp it to ``[0, extent - 1]``."""
    return max(0, min(extent - 1, int(fraction * extent)))


def draw_detections(
    frame: np.ndarray,
    detections: list[DetectedObject],
    style: DetectionStyle = DetectionStyle(),
) -> np.ndarray:
    """Return ``frame`` with ``detections`` drawn as labelled boxes.

    Shape-preserving: the result is the same ``(H, W, 3)`` ``uint8`` array. An
    empty ``detections`` list (or one filtered out by ``min_confidence``) is a
    no-op that returns ``frame`` unchanged. Boxes are clamped to the frame, so
    off-frame coordinates clip cleanly instead of raising; a box with a
    negative ``width`` or ``height`` is drawn with its corners reordered.
    Label chips flip inside the box when they would overflow the top edge and
    clamp horizontally so they never leave the frame.

    Args:
        frame: Source frame as ``(H, W, 3)`` ``uint8`` (RGB).
        detections: Objects to draw; each uses its normalized ``bounding_box``.
            Entries without a bounding box are skipped.
        style: Visual styling (colours, stroke width, label options).

    Returns:
        A new ``(H, W, 3)`` ``uint8`` frame with the overlays composited on.
        When nothing is drawn, the input ``frame`` object itself is returned
        (not a copy), so callers must not assume the result is independent.
    """
    if not detections:
        return frame

    # Filter first so a frame with nothing to draw never pays for the font
    # load or the full-frame RGBA canvas.
    drawable = []
    for det in detections:
        box = det.bounding_box
        if box is None or det.confidence < style.min_confidence:
            continue
        drawable.append((det, box))
    if not drawable:
        return frame

    h, w = frame.shape[:2]
    scale = max(h, w)
    thickness = max(1, round(style.line_thickness * scale))
    font_px = max(8, round(style.label_font_size * scale))
    font = load_font(style.font, font_px)
    pad = max(2, thickness)

    # Overlays are drawn on a transparent layer and composited once at the
    # end, which is what lets the label chip be semi-transparent.
    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(canvas)

    for det, box in drawable:
        color = style.box_color or class_color(det.label)

        # Order the corners: Pillow rejects a rectangle whose second corner is
        # above or left of the first.
        x0, x1 = sorted((_fraction_to_pixel(box.x, w), _fraction_to_pixel(box.x + box.width, w)))
        y0, y1 = sorted((_fraction_to_pixel(box.y, h), _fraction_to_pixel(box.y + box.height, h)))
        draw.rectangle((x0, y0, x1, y1), outline=(*color, 255), width=thickness)

        text = det.label.title()
        if style.show_confidence:
            text = f"{text} {det.confidence * 100:.0f}%"

        tb = draw.textbbox((0, 0), text, font=font)
        text_w, text_h = tb[2] - tb[0], tb[3] - tb[1]
        chip_w, chip_h = text_w + 2 * pad, text_h + 2 * pad

        # Flip the chip inside the box when it would overflow the top edge,
        # and clamp horizontally so it never leaves the frame.
        chip_y = y0 - chip_h if y0 - chip_h >= 0 else y0
        chip_x = max(0, min(x0, w - chip_w))
        draw.rectangle(
            (chip_x, chip_y, chip_x + chip_w, chip_y + chip_h),
            fill=(*color, style.label_bg_alpha),
        )
        # Subtract the text bbox origin so the glyphs sit exactly ``pad``
        # inside the chip regardless of the font's own bearing.
        draw.text(
            (chip_x + pad - tb[0], chip_y + pad - tb[1]),
            text,
            font=font,
            fill=(*style.label_text_color, 255),
        )

    out = Image.fromarray(frame).convert("RGBA")
    out.alpha_composite(canvas)
    return np.array(out.convert("RGB"), dtype=np.uint8)
@@ -85,12 +85,26 @@ class PlanErrorCode(str, Enum):
85
85
  instead of substring-matching the human message text.
86
86
  """
87
87
 
88
+ # Segment range vs source / shape.
88
89
  SEGMENT_END_EXCEEDS_SOURCE = "segment_end_exceeds_source"
90
+ SEGMENT_NEGATIVE = "segment_negative"
91
+ SEGMENT_RANGE = "segment_range"
92
+ # Effect windows.
89
93
  EFFECT_WINDOW_EXCEEDS_DURATION = "effect_window_exceeds_duration"
94
+ WINDOW_NEGATIVE = "window_negative"
95
+ WINDOW_ORDER = "window_order"
96
+ # Operation-level, metadata-relative checks.
90
97
  CUT_EXCEEDS_DURATION = "cut_exceeds_duration"
98
+ OP_TIMESTAMP_OUT_OF_RANGE = "op_timestamp_out_of_range"
99
+ CROP_EXCEEDS_SOURCE = "crop_exceeds_source"
100
+ DEGENERATE_DURATION = "degenerate_duration"
101
+ SOURCE_UNREADABLE = "source_unreadable"
102
+ OP_PREDICTION_FAILED = "op_prediction_failed"
103
+ # Assembly / structural.
91
104
  UNKNOWN_OP = "unknown_op"
92
105
  CONCAT_MISMATCH = "concat_mismatch"
93
106
  SUBTITLE_UNFITTABLE = "subtitle_unfittable"
107
+ POST_OP_REQUIRES_CONTEXT = "post_op_requires_context"
94
108
 
95
109
 
96
110
  @dataclass
@@ -110,12 +124,37 @@ class PlanError:
110
124
  predicted_duration: float | None = None
111
125
 
112
126
 
127
+ @dataclass
128
+ class PlanRepair:
129
+ """A single change a repair/normalize pass made to a plan.
130
+
131
+ One entry of the structured changelog returned by :meth:`VideoEdit.repair`
132
+ and :meth:`VideoEdit.normalize_dimensions`. ``location`` is a path into the
133
+ plan (e.g. ``'segments[0].operations[1]'``); ``field`` is the changed field
134
+ (``'window.stop'``, ``'timestamp'``, ``'dimensions'``, ...). ``old`` and
135
+ ``new`` carry the before/after values -- a ``float`` for numeric clamps, a
136
+ ``str`` for composite values like ``'768x432'`` (the annotations also allow
137
+ ``None``). ``code`` is the :class:`PlanErrorCode` of the violation that was
138
+ repaired, so a consumer can key "we trimmed your effect to fit" wording on it.
139
+ """
140
+
141
+ location: str  # path into the plan, e.g. 'segments[0].operations[1]'
142
+ field: str  # name of the changed field, e.g. 'window.stop'
143
+ old: float | str | None  # value before the repair
144
+ new: float | str | None  # value after the repair
145
+ code: PlanErrorCode  # code of the violation that was repaired
146
+
147
+
113
148
  class PlanValidationError(ValueError):
114
149
  """Typed plan-validation failure carrying structured :class:`PlanError`s.
115
150
 
116
151
  Subclasses ``ValueError`` so ``str(e)`` stays byte-identical to the bare
117
152
  ``ValueError`` prose emitted before this type existed -- existing
118
153
  ``pytest.raises(match=...)`` and consumer substring fallbacks keep working.
154
+
155
+ ``str(e)`` is the first error's human message; ``.errors`` carries every
156
+ structured :class:`PlanError`. The non-raising :meth:`VideoEdit.check`
157
+ returns the same ``PlanError`` list directly.
119
158
  """
120
159
 
121
160
  def __init__(self, message: str, errors: list[PlanError]):
@@ -29,6 +29,7 @@ from pydantic import Field, PrivateAttr, model_validator
29
29
  from tqdm import tqdm
30
30
 
31
31
  from videopython.base.description import BoundingBox
32
+ from videopython.base.exceptions import PlanError, PlanErrorCode, PlanValidationError
32
33
  from videopython.base.fonts import load_font
33
34
  from videopython.editing._easing import ease, ease_out
34
35
  from videopython.editing.operation import Effect
@@ -860,7 +861,11 @@ class ImageOverlay(_AnchoredOverlay):
860
861
  with Image.open(self.source) as im:
861
862
  im.verify()
862
863
  except (OSError, ValueError) as exc:
863
- raise ValueError(f"image_overlay source {str(self.source)!r} is not a readable image: {exc}") from exc
864
+ message = f"image_overlay source {str(self.source)!r} is not a readable image: {exc}"
865
+ raise PlanValidationError(
866
+ message,
867
+ [PlanError(code=PlanErrorCode.SOURCE_UNREADABLE, op=self.op, field="source")],
868
+ ) from exc
864
869
  return meta
865
870
 
866
871
  def _rasterize_svg(self, target_w: int) -> np.ndarray: