videopython 0.28.3__tar.gz → 0.29.0__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (62)
  1. {videopython-0.28.3 → videopython-0.29.0}/PKG-INFO +7 -4
  2. {videopython-0.28.3 → videopython-0.29.0}/README.md +4 -2
  3. {videopython-0.28.3 → videopython-0.29.0}/pyproject.toml +13 -4
  4. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/__init__.py +3 -5
  5. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/transforms.py +2 -478
  6. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/__init__.py +3 -3
  7. videopython-0.29.0/src/videopython/ai/understanding/faces.py +592 -0
  8. videopython-0.29.0/src/videopython/ai/understanding/image.py +397 -0
  9. videopython-0.29.0/src/videopython/ai/understanding/temporal.py +218 -0
  10. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/video_analysis.py +217 -37
  11. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/__init__.py +4 -2
  12. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/description.py +82 -52
  13. videopython-0.28.3/src/videopython/ai/understanding/image.py +0 -215
  14. videopython-0.28.3/src/videopython/ai/understanding/temporal.py +0 -464
  15. {videopython-0.28.3 → videopython-0.29.0}/.gitignore +0 -0
  16. {videopython-0.28.3 → videopython-0.29.0}/LICENSE +0 -0
  17. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/__init__.py +0 -0
  18. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/_device.py +0 -0
  19. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/__init__.py +0 -0
  20. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/cache.py +0 -0
  21. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/dubber.py +0 -0
  22. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/models.py +0 -0
  23. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
  24. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/quality.py +0 -0
  25. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/remux.py +0 -0
  26. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/timing.py +0 -0
  27. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/__init__.py +0 -0
  28. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/audio.py +0 -0
  29. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/image.py +0 -0
  30. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/qwen3.py +0 -0
  31. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/translation.py +0 -0
  32. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/video.py +0 -0
  33. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/registry.py +0 -0
  34. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/__init__.py +0 -0
  35. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/inpainter.py +0 -0
  36. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/models.py +0 -0
  37. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/segmenter.py +0 -0
  38. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/swapper.py +0 -0
  39. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/audio.py +0 -0
  40. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/separation.py +0 -0
  41. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/__init__.py +0 -0
  42. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/analysis.py +0 -0
  43. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/audio.py +0 -0
  44. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/combine.py +0 -0
  45. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/effects.py +0 -0
  46. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/exceptions.py +0 -0
  47. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/progress.py +0 -0
  48. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/registry.py +0 -0
  49. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/scene.py +0 -0
  50. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/streaming.py +0 -0
  51. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/__init__.py +0 -0
  52. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/overlay.py +0 -0
  53. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/transcription.py +0 -0
  54. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/transforms.py +0 -0
  55. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/transitions.py +0 -0
  56. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/utils.py +0 -0
  57. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/video.py +0 -0
  58. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/__init__.py +0 -0
  59. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/multicam.py +0 -0
  60. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/premiere_xml.py +0 -0
  61. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/video_edit.py +0 -0
  62. {videopython-0.28.3 → videopython-0.29.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videopython
- Version: 0.28.3
+ Version: 0.29.0
  Summary: Minimal video generation and processing library.
  Project-URL: Homepage, https://videopython.com
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -27,14 +27,15 @@ Requires-Dist: accelerate>=0.29.2; extra == 'ai'
  Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
  Requires-Dist: demucs>=4.0.0; extra == 'ai'
  Requires-Dist: diffusers>=0.30.0; extra == 'ai'
- Requires-Dist: easyocr>=1.7.0; extra == 'ai'
  Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
+ Requires-Dist: imagehash>=4.3; extra == 'ai'
  Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
  Requires-Dist: numba>=0.61.0; extra == 'ai'
  Requires-Dist: ollama>=0.4.5; extra == 'ai'
  Requires-Dist: openai-whisper>=20240930; extra == 'ai'
  Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
  Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
+ Requires-Dist: qwen-vl-utils>=0.0.10; extra == 'ai'
  Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
  Requires-Dist: scipy>=1.10.0; extra == 'ai'
  Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
@@ -56,6 +57,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,

  Full documentation: [videopython.com](https://videopython.com)

+ > **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
+
  ## Installation

  ### 1. Install FFmpeg
@@ -193,10 +196,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
  | Area | Highlights |
  |---|---|
  | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
- | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `ActionRecognizer` |
+ | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
- | **Transforms** | `FaceTracker`, `FaceTrackingCrop`, `SplitScreenComposite` |
+ | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
  | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |

@@ -8,6 +8,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,

  Full documentation: [videopython.com](https://videopython.com)

+ > **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
+
  ## Installation

  ### 1. Install FFmpeg
@@ -145,10 +147,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
  | Area | Highlights |
  |---|---|
  | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
- | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `ActionRecognizer` |
+ | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
- | **Transforms** | `FaceTracker`, `FaceTrackingCrop`, `SplitScreenComposite` |
+ | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
  | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |

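To make the `FaceTracker` relocation in the tables above concrete, here is a minimal usage sketch. It is based on the docstring shipped with the 0.28.3 implementation (visible in the `transforms.py` diff further down) and assumes the constructor and `track_video` signature survived the move to `videopython.ai.understanding` unchanged; the zero-filled array is a stand-in for real decoded frames.

```python
import numpy as np

from videopython.ai.understanding import FaceTracker

# Stand-in for decoded video frames with shape (N, H, W, 3).
frames = np.zeros((120, 720, 1280, 3), dtype=np.uint8)

# GPU tracking with frame sampling: detect on every 5th frame, interpolate between.
tracker = FaceTracker(backend="gpu", sample_rate=5)
positions = tracker.track_video(frames)  # one (cx, cy, w, h) tuple or None per frame
```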
@@ -1,6 +1,6 @@
  [project]
  name = "videopython"
- version = "0.28.3"
+ version = "0.29.0"
  description = "Minimal video generation and processing library."
  authors = [
      { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -70,7 +70,6 @@ ai = [
      "scikit-learn>=1.3.0",
      # Detection backends
      "ultralytics>=8.0.0",
-     "easyocr>=1.7.0",
      # Audio classification (AST via transformers - no separate dep needed)
      # Scene detection
      "transnetv2-pytorch>=1.0.5",
@@ -84,6 +83,11 @@ ai = [
      "llama-cpp-python>=0.3.0",
      # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
      "pyloudnorm>=0.1.1",
+     # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
+     # for AutoModelForImageTextToText with image/video chat templates.
+     "qwen-vl-utils>=0.0.10",
+     # Perceptual hashing for SceneVLM frame dedup (M5)
+     "imagehash>=4.3",
  ]

  # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -105,7 +109,6 @@ ai = [
      "scikit-learn>=1.3.0",
      # Detection backends
      "ultralytics>=8.0.0",
-     "easyocr>=1.7.0",
      # Audio classification (AST via transformers - no separate dep needed)
      # Scene detection
      "transnetv2-pytorch>=1.0.5",
@@ -119,6 +122,11 @@ ai = [
      "llama-cpp-python>=0.3.0",
      # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
      "pyloudnorm>=0.1.1",
+     # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
+     # for AutoModelForImageTextToText with image/video chat templates.
+     "qwen-vl-utils>=0.0.10",
+     # Perceptual hashing for SceneVLM frame dedup (M5)
+     "imagehash>=4.3",
  ]

  [project.urls]
@@ -135,7 +143,6 @@ module = [
      "diffusers", "diffusers.*",
      "ollama", "ollama.*",
      "ultralytics", "ultralytics.*",
-     "easyocr", "easyocr.*",
      "transformers", "transformers.*",
      "transnetv2_pytorch", "transnetv2_pytorch.*",
      "chatterbox", "chatterbox.*",
@@ -146,6 +153,8 @@ module = [
      "cv2", "cv2.*",
      "llama_cpp", "llama_cpp.*",
      "pyloudnorm", "pyloudnorm.*",
+     "qwen_vl_utils", "qwen_vl_utils.*",
+     "imagehash", "imagehash.*",
  ]
  ignore_missing_imports = true

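The pyproject comments above tie `imagehash` to SceneVLM frame dedup. As an illustration only (not videopython's actual implementation), perceptual-hash dedup of near-identical frames typically looks like the sketch below; the `dedup_frames` helper and the 6-bit Hamming threshold are hypothetical.

```python
import imagehash
import numpy as np
from PIL import Image


def dedup_frames(frames: list[np.ndarray], max_distance: int = 6) -> list[np.ndarray]:
    """Keep a frame only if its perceptual hash differs enough from the last kept one."""
    kept: list[np.ndarray] = []
    last_hash = None
    for frame in frames:  # each frame: (H, W, 3) uint8 array
        h = imagehash.phash(Image.fromarray(frame))
        # Subtracting two ImageHash objects yields their Hamming distance in bits.
        if last_hash is None or h - last_hash > max_distance:
            kept.append(frame)
            last_hash = h
    return kept
```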
@@ -2,11 +2,11 @@ from videopython.ai import registry as _ai_registry # noqa: F401

  from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
  from .swapping import ObjectSwapper
- from .transforms import FaceTracker, FaceTrackingCrop, SplitScreenComposite
+ from .transforms import FaceTrackingCrop, SplitScreenComposite
  from .understanding import (
-     ActionRecognizer,
      AudioClassifier,
      AudioToText,
+     FaceTracker,
      SceneVLM,
      SemanticSceneDetector,
  )
@@ -22,12 +22,10 @@ __all__ = [
      # Understanding
      "AudioToText",
      "AudioClassifier",
+     "FaceTracker",
      "SceneVLM",
-     # Temporal
-     "ActionRecognizer",
      "SemanticSceneDetector",
      # Transforms (AI-powered)
-     "FaceTracker",
      "FaceTrackingCrop",
      "SplitScreenComposite",
      # Swapping
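The practical upshot of this `__init__.py` change: the top-level re-export keeps working, while imports that reached into `videopython.ai.transforms` for `FaceTracker` must be updated. The paths below are taken directly from this diff:

```python
from videopython.ai import FaceTracker                # works in 0.28.3 and 0.29.0
from videopython.ai.understanding import FaceTracker  # new home in 0.29.0

# Removed in 0.29.0:
# from videopython.ai.transforms import FaceTracker
```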
@@ -3,20 +3,16 @@
  from __future__ import annotations

  import logging
- from typing import TYPE_CHECKING, Any, Literal
+ from typing import Literal

  import cv2
  import numpy as np
  from tqdm import tqdm

- from videopython.ai._device import select_device
- from videopython.base.description import BoundingBox, DetectedFace
+ from videopython.ai.understanding.faces import FaceTracker
  from videopython.base.transforms import Transformation
  from videopython.base.video import Video

- if TYPE_CHECKING:
-     pass
-
  logger = logging.getLogger(__name__)


@@ -25,484 +21,12 @@ def _make_even(value: int) -> int:
      return value - (value % 2)


- class _FaceDetectionBackend:
-     """Internal YOLOv8-face detector used by AI transforms."""
-
-     def __init__(
-         self,
-         confidence_threshold: float = 0.5,
-         min_face_size: int = 30,
-         backend: Literal["cpu", "gpu", "auto"] = "auto",
-     ):
-         self.confidence_threshold = confidence_threshold
-         self.min_face_size = min_face_size
-         self.backend: Literal["cpu", "gpu", "auto"] = backend
-         self._resolved_device: Literal["cpu", "cuda"] | None = None
-         self._yolo_model: Any = None
-
-     def _resolve_device(self) -> Literal["cpu", "cuda"]:
-         if self._resolved_device is not None:
-             return self._resolved_device
-
-         if self.backend == "cpu":
-             self._resolved_device = "cpu"
-             return self._resolved_device
-
-         if self.backend == "gpu":
-             resolved = select_device(None, mps_allowed=False)
-             if resolved != "cuda":
-                 raise ValueError("GPU backend requested but CUDA is not available.")
-             self._resolved_device = "cuda"
-             return self._resolved_device
-
-         resolved_auto = select_device(None, mps_allowed=False)
-         self._resolved_device = "cuda" if resolved_auto == "cuda" else "cpu"
-         return self._resolved_device
-
-     def execution_device(self) -> Literal["cpu", "cuda"]:
-         """Resolved execution device for this backend."""
-         return self._resolve_device()
-
-     def _init_yolo_face(self) -> None:
-         from huggingface_hub import hf_hub_download
-         from ultralytics import YOLO
-
-         model_path = hf_hub_download(
-             repo_id="arnabdhar/YOLOv8-Face-Detection",
-             filename="model.pt",
-         )
-         self._yolo_model = YOLO(model_path)
-
-         device = self._resolve_device()
-         if device == "cuda":
-             self._yolo_model.to("cuda")
-
-     def _faces_from_yolo_result(self, result: Any) -> list[DetectedFace]:
-         detected_faces: list[DetectedFace] = []
-         boxes = result.boxes
-         if boxes is None:
-             return detected_faces
-
-         img_h, img_w = result.orig_shape
-         for i in range(len(boxes)):
-             x1, y1, x2, y2 = boxes.xyxy[i].tolist()
-             conf = float(boxes.conf[i])
-
-             face_w = x2 - x1
-             face_h = y2 - y1
-             if face_w < self.min_face_size or face_h < self.min_face_size:
-                 continue
-
-             detected_faces.append(
-                 DetectedFace(
-                     bounding_box=BoundingBox(
-                         x=x1 / img_w,
-                         y=y1 / img_h,
-                         width=face_w / img_w,
-                         height=face_h / img_h,
-                     ),
-                     confidence=conf,
-                 )
-             )
-         detected_faces.sort(key=lambda f: f.area or 0, reverse=True)
-         return detected_faces
-
-     def detect(self, image: np.ndarray) -> list[DetectedFace]:
-         if self._yolo_model is None:
-             self._init_yolo_face()
-         assert self._yolo_model is not None
-
-         results = self._yolo_model(image, conf=self.confidence_threshold, verbose=False)
-         if not results:
-             return []
-         return self._faces_from_yolo_result(results[0])
-
-     def detect_batch(self, images: list[np.ndarray] | np.ndarray) -> list[list[DetectedFace]]:
-         if isinstance(images, np.ndarray):
-             images = [images[i] for i in range(images.shape[0])] if images.ndim == 4 else [images]
-         if not images:
-             return []
-
-         if self._yolo_model is None:
-             self._init_yolo_face()
-         assert self._yolo_model is not None
-
-         results = self._yolo_model(images, conf=self.confidence_threshold, verbose=False)
-         return [self._faces_from_yolo_result(result) for result in results]
-
-
  __all__ = [
-     "FaceTracker",
      "FaceTrackingCrop",
      "SplitScreenComposite",
  ]


- class FaceTracker:
-     """Utility for tracking faces across video frames with smoothing.
-
-     Provides frame-by-frame face detection with position smoothing using
-     exponential moving average to prevent jitter in the tracked position.
-
-     Supports GPU acceleration via YOLOv8-face model for significantly faster
-     detection, with optional frame sampling and interpolation for video.
-
-     Example:
-         >>> # Auto backend (default): resolves to GPU when available, else CPU
-         >>> tracker = FaceTracker()
-         >>> for i, frame in enumerate(frames):
-         ...     pos = tracker.detect_and_track(frame, i)
-         >>>
-         >>> # GPU tracking with frame sampling
-         >>> tracker = FaceTracker(backend="gpu", sample_rate=5)
-         >>> positions = tracker.track_video(frames)
-     """
-
-     def __init__(
-         self,
-         selection_strategy: Literal["largest", "centered", "index"] = "largest",
-         face_index: int = 0,
-         smoothing: float = 0.8,
-         detection_interval: int = 3,
-         min_face_size: int = 30,
-         backend: Literal["cpu", "gpu", "auto"] = "auto",
-         sample_rate: int = 1,
-         batch_size: int = 16,
-     ):
-         """Initialize face tracker.
-
-         Args:
-             selection_strategy: How to select which face to track.
-                 - "largest": Track the face with the largest bounding box.
-                 - "centered": Track the face closest to frame center.
-                 - "index": Track the face at a specific index (sorted by area).
-             face_index: Index of face to track when using "index" strategy.
-             smoothing: Exponential moving average factor (0-1). Higher = smoother.
-             detection_interval: Run detection every N frames, interpolate between.
-             min_face_size: Minimum face size in pixels for detection.
-             backend: Detection backend - "cpu", "gpu", or "auto".
-             sample_rate: For GPU backend, detect every Nth frame and interpolate.
-                 Only used by track_video(). Default 1 (every frame).
-             batch_size: Batch size for GPU detection. Default 16.
-         """
-         self.selection_strategy = selection_strategy
-         self.face_index = face_index
-         self.smoothing = smoothing
-         self.detection_interval = detection_interval
-         self.min_face_size = min_face_size
-         self.backend: Literal["cpu", "gpu", "auto"] = backend
-         self.sample_rate = sample_rate
-         self.batch_size = batch_size
-
-         self._detector: _FaceDetectionBackend | None = None
-         self._last_position: tuple[float, float] | None = None
-         self._last_size: tuple[float, float] | None = None
-         self._smoothed_position: tuple[float, float] | None = None
-         self._smoothed_size: tuple[float, float] | None = None
-         logger.info("FaceTracker initialized with backend=%s", self.backend)
-
-     def _init_detector(self) -> None:
-         """Initialize face detector lazily."""
-         self._detector = _FaceDetectionBackend(
-             min_face_size=self.min_face_size,
-             backend=self.backend,
-         )
-
-     def _select_face(
-         self,
-         faces: list,
-         frame_width: int,
-         frame_height: int,
-     ) -> tuple[float, float, float, float] | None:
-         """Select a face based on the configured strategy.
-
-         Args:
-             faces: List of DetectedFace objects.
-             frame_width: Width of the frame.
-             frame_height: Height of the frame.
-
-         Returns:
-             Tuple of (center_x, center_y, width, height) in normalized coords, or None.
-         """
-         if not faces:
-             return None
-
-         if self.selection_strategy == "largest":
-             # Faces are already sorted by area (largest first)
-             face = faces[0]
-         elif self.selection_strategy == "centered":
-             # Find face closest to center
-             frame_center = (0.5, 0.5)
-             face = min(
-                 faces,
-                 key=lambda f: (
-                     (f.bounding_box.center[0] - frame_center[0]) ** 2
-                     + (f.bounding_box.center[1] - frame_center[1]) ** 2
-                 ),
-             )
-         elif self.selection_strategy == "index":
-             if self.face_index < len(faces):
-                 face = faces[self.face_index]
-             else:
-                 face = faces[0]  # Fall back to largest
-         else:
-             face = faces[0]
-
-         bbox = face.bounding_box
-         return (bbox.center[0], bbox.center[1], bbox.width, bbox.height)
-
-     def detect_and_track(
-         self,
-         frame: np.ndarray,
-         frame_index: int,
-     ) -> tuple[float, float, float, float] | None:
-         """Detect face in frame and return smoothed position.
-
-         Args:
-             frame: Video frame as numpy array (H, W, 3).
-             frame_index: Index of current frame.
-
-         Returns:
-             Tuple of (center_x, center_y, width, height) in normalized coords,
-             or None if no face detected and no fallback available.
-         """
-         if self._detector is None:
-             self._init_detector()
-         assert self._detector is not None
-
-         h, w = frame.shape[:2]
-
-         # Only run detection on interval frames
-         should_detect = frame_index % self.detection_interval == 0
-
-         if should_detect:
-             faces = self._detector.detect(frame)
-             face_info = self._select_face(faces, w, h)
-
-             if face_info:
-                 cx, cy, fw, fh = face_info
-                 self._last_position = (cx, cy)
-                 self._last_size = (fw, fh)
-         else:
-             # Use last detected position
-             face_info = None
-             if self._last_position and self._last_size:
-                 face_info = (*self._last_position, *self._last_size)
-
-         if face_info:
-             cx, cy, fw, fh = face_info
-
-             # Apply exponential moving average smoothing
-             if self._smoothed_position is None:
-                 self._smoothed_position = (cx, cy)
-                 self._smoothed_size = (fw, fh)
-             else:
-                 alpha = 1 - self.smoothing
-                 self._smoothed_position = (
-                     self._smoothed_position[0] * self.smoothing + cx * alpha,
-                     self._smoothed_position[1] * self.smoothing + cy * alpha,
-                 )
-                 assert self._smoothed_size is not None  # Set alongside _smoothed_position
-                 self._smoothed_size = (
-                     self._smoothed_size[0] * self.smoothing + fw * alpha,
-                     self._smoothed_size[1] * self.smoothing + fh * alpha,
-                 )
-
-             return (*self._smoothed_position, *self._smoothed_size)
-
-         # Return last smoothed position as fallback
-         if self._smoothed_position and self._smoothed_size:
-             return (*self._smoothed_position, *self._smoothed_size)
-
-         return None
-
-     def reset(self) -> None:
-         """Reset tracker state for a new video."""
-         self._last_position = None
-         self._last_size = None
-         self._smoothed_position = None
-         self._smoothed_size = None
-
-     @staticmethod
-     def _interpolate_bbox(
-         bbox1: tuple[float, float, float, float],
-         bbox2: tuple[float, float, float, float],
-         t: float,
-     ) -> tuple[float, float, float, float]:
-         """Linearly interpolate between two bounding boxes.
-
-         Args:
-             bbox1: First bounding box (cx, cy, w, h).
-             bbox2: Second bounding box (cx, cy, w, h).
-             t: Interpolation factor (0 = bbox1, 1 = bbox2).
-
-         Returns:
-             Interpolated bounding box (cx, cy, w, h).
-         """
-         return (
-             bbox1[0] + (bbox2[0] - bbox1[0]) * t,
-             bbox1[1] + (bbox2[1] - bbox1[1]) * t,
-             bbox1[2] + (bbox2[2] - bbox1[2]) * t,
-             bbox1[3] + (bbox2[3] - bbox1[3]) * t,
-         )
-
-     def track_video(
-         self,
-         frames: np.ndarray,
-     ) -> list[tuple[float, float, float, float] | None]:
-         """Track face through entire video using optimized batch detection.
-
-         This method is optimized for GPU backends with frame sampling and
-         interpolation for smooth tracking with reduced computation.
-
-         Args:
-             frames: Video frames array of shape (N, H, W, 3).
-
-         Returns:
-             List of face positions (cx, cy, w, h) for each frame, or None if
-             no face detected and no fallback available.
-         """
-         if self._detector is None:
-             self._init_detector()
-         assert self._detector is not None
-
-         n_frames = len(frames)
-         if n_frames == 0:
-             return []
-
-         h, w = frames[0].shape[:2]
-
-         execution_device_getter = getattr(self._detector, "execution_device", None)
-         if callable(execution_device_getter):
-             resolved = execution_device_getter()
-             backend_execution_device = resolved if resolved in {"cpu", "cuda"} else None
-         else:
-             backend_execution_device = None
-         if backend_execution_device is None:
-             backend_execution_device = "cuda" if self.backend == "gpu" else "cpu"
-
-         use_sampled_interpolation = self.sample_rate > 1 and backend_execution_device == "cuda"
-
-         # Determine which frames to sample
-         if use_sampled_interpolation:
-             sample_indices = list(range(0, n_frames, self.sample_rate))
-             # Ensure last frame is included
-             if sample_indices[-1] != n_frames - 1:
-                 sample_indices.append(n_frames - 1)
-         else:
-             sample_indices = list(range(n_frames))
-
-         # Batch detect on sampled frames
-         sampled_frames = [frames[i] for i in sample_indices]
-
-         # Process in batches
-         sampled_detections: list[list] = []
-         for batch_start in range(0, len(sampled_frames), self.batch_size):
-             batch_end = min(batch_start + self.batch_size, len(sampled_frames))
-             batch = sampled_frames[batch_start:batch_end]
-             batch_results = self._detector.detect_batch(batch)
-             sampled_detections.extend(batch_results)
-
-         # Extract face info from detections
-         sampled_faces: list[tuple[float, float, float, float] | None] = []
-         for faces in sampled_detections:
-             face_info = self._select_face(faces, w, h)
-             sampled_faces.append(face_info)
-
-         # If no sampled interpolation, apply smoothing directly over detections.
-         if not use_sampled_interpolation:
-             self.reset()
-             results: list[tuple[float, float, float, float] | None] = []
-             for i, face_info in enumerate(sampled_faces):
-                 if face_info:
-                     cx, cy, fw, fh = face_info
-                     self._last_position = (cx, cy)
-                     self._last_size = (fw, fh)
-
-                     if self._smoothed_position is None:
-                         self._smoothed_position = (cx, cy)
-                         self._smoothed_size = (fw, fh)
-                     else:
-                         alpha = 1 - self.smoothing
-                         self._smoothed_position = (
-                             self._smoothed_position[0] * self.smoothing + cx * alpha,
-                             self._smoothed_position[1] * self.smoothing + cy * alpha,
-                         )
-                         assert self._smoothed_size is not None
-                         self._smoothed_size = (
-                             self._smoothed_size[0] * self.smoothing + fw * alpha,
-                             self._smoothed_size[1] * self.smoothing + fh * alpha,
-                         )
-
-                     results.append((*self._smoothed_position, *self._smoothed_size))
-                 elif self._smoothed_position and self._smoothed_size:
-                     results.append((*self._smoothed_position, *self._smoothed_size))
-                 else:
-                     results.append(None)
-             return results
-
-         # Interpolate between sampled frames
-         all_positions: list[tuple[float, float, float, float] | None] = [None] * n_frames
-
-         # Fill in sampled positions
-         for idx, sample_idx in enumerate(sample_indices):
-             all_positions[sample_idx] = sampled_faces[idx]
-
-         # Interpolate gaps
-         for i in range(len(sample_indices) - 1):
-             start_idx = sample_indices[i]
-             end_idx = sample_indices[i + 1]
-             start_face = sampled_faces[i]
-             end_face = sampled_faces[i + 1]
-
-             if start_face is None and end_face is None:
-                 continue
-             elif start_face is None:
-                 # Use end face for all
-                 for j in range(start_idx, end_idx):
-                     all_positions[j] = end_face
-             elif end_face is None:
-                 # Use start face for all
-                 for j in range(start_idx + 1, end_idx + 1):
-                     all_positions[j] = start_face
-             else:
-                 # Interpolate
-                 gap = end_idx - start_idx
-                 for j in range(start_idx + 1, end_idx):
-                     t = (j - start_idx) / gap
-                     all_positions[j] = self._interpolate_bbox(start_face, end_face, t)
-
-         # Apply smoothing to interpolated positions
-         self.reset()
-         results = []
-         for face_info in all_positions:
-             if face_info:
-                 cx, cy, fw, fh = face_info
-
-                 if self._smoothed_position is None:
-                     self._smoothed_position = (cx, cy)
-                     self._smoothed_size = (fw, fh)
-                 else:
-                     alpha = 1 - self.smoothing
-                     self._smoothed_position = (
-                         self._smoothed_position[0] * self.smoothing + cx * alpha,
-                         self._smoothed_position[1] * self.smoothing + cy * alpha,
-                     )
-                     assert self._smoothed_size is not None
-                     self._smoothed_size = (
-                         self._smoothed_size[0] * self.smoothing + fw * alpha,
-                         self._smoothed_size[1] * self.smoothing + fh * alpha,
-                     )
-
-                 results.append((*self._smoothed_position, *self._smoothed_size))
-             elif self._smoothed_position and self._smoothed_size:
-                 results.append((*self._smoothed_position, *self._smoothed_size))
-             else:
-                 results.append(None)
-
-         return results
-
-
  class FaceTrackingCrop(Transformation):
      """Crops video to follow detected faces.

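The FaceTracker implementation removed above (now imported from understanding/faces.py) stabilizes positions with a plain exponential moving average: with smoothing factor s (default 0.8), each update computes smoothed_t = s * smoothed_(t-1) + (1 - s) * observed_t. For example, with s = 0.8 a face center that jumps from 0.50 to 0.60 moves the smoothed estimate only to 0.8 * 0.50 + 0.2 * 0.60 = 0.52, which is what damps frame-to-frame jitter. Whether the relocated implementation keeps exactly this scheme is not visible in this diff.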
@@ -1,12 +1,12 @@
  from .audio import AudioClassifier, AudioToText
+ from .faces import FaceTracker
  from .image import SceneVLM
- from .temporal import ActionRecognizer, SemanticSceneDetector
+ from .temporal import SemanticSceneDetector

  __all__ = [
      "AudioToText",
      "AudioClassifier",
+     "FaceTracker",
      "SceneVLM",
-     # Temporal
-     "ActionRecognizer",
      "SemanticSceneDetector",
  ]