videopython 0.28.3__tar.gz → 0.29.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.28.3 → videopython-0.29.0}/PKG-INFO +7 -4
- {videopython-0.28.3 → videopython-0.29.0}/README.md +4 -2
- {videopython-0.28.3 → videopython-0.29.0}/pyproject.toml +13 -4
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/__init__.py +3 -5
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/transforms.py +2 -478
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/__init__.py +3 -3
- videopython-0.29.0/src/videopython/ai/understanding/faces.py +592 -0
- videopython-0.29.0/src/videopython/ai/understanding/image.py +397 -0
- videopython-0.29.0/src/videopython/ai/understanding/temporal.py +218 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/video_analysis.py +217 -37
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/__init__.py +4 -2
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/description.py +82 -52
- videopython-0.28.3/src/videopython/ai/understanding/image.py +0 -215
- videopython-0.28.3/src/videopython/ai/understanding/temporal.py +0 -464
- {videopython-0.28.3 → videopython-0.29.0}/.gitignore +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/LICENSE +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/cache.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/registry.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/combine.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/effects.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/progress.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/registry.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/streaming.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/transforms.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/transitions.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/utils.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/base/video.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.28.3 → videopython-0.29.0}/src/videopython/py.typed +0 -0
{videopython-0.28.3 → videopython-0.29.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.28.3
+Version: 0.29.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -27,14 +27,15 @@ Requires-Dist: accelerate>=0.29.2; extra == 'ai'
 Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
 Requires-Dist: demucs>=4.0.0; extra == 'ai'
 Requires-Dist: diffusers>=0.30.0; extra == 'ai'
-Requires-Dist: easyocr>=1.7.0; extra == 'ai'
 Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
+Requires-Dist: imagehash>=4.3; extra == 'ai'
 Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
 Requires-Dist: numba>=0.61.0; extra == 'ai'
 Requires-Dist: ollama>=0.4.5; extra == 'ai'
 Requires-Dist: openai-whisper>=20240930; extra == 'ai'
 Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
 Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
+Requires-Dist: qwen-vl-utils>=0.0.10; extra == 'ai'
 Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
 Requires-Dist: scipy>=1.10.0; extra == 'ai'
 Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
@@ -56,6 +57,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
 
 Full documentation: [videopython.com](https://videopython.com)
 
+> **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
+
 ## Installation
 
 ### 1. Install FFmpeg
@@ -193,10 +196,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
 | Area | Highlights |
 |---|---|
 | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
-| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `
+| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
 | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
 | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
-| **Transforms** | `
+| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
 | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
 | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
 

{videopython-0.28.3 → videopython-0.29.0}/README.md

@@ -8,6 +8,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
 
 Full documentation: [videopython.com](https://videopython.com)
 
+> **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
+
 ## Installation
 
 ### 1. Install FFmpeg
@@ -145,10 +147,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
 | Area | Highlights |
 |---|---|
 | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
-| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `
+| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
 | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
 | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
-| **Transforms** | `
+| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
 | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
 | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
 
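The capability rows above point at the reorganized face-tracking API. Below is a minimal usage sketch based only on what this diff shows (the top-level re-export in `videopython/ai/__init__.py` and the `track_video` example carried over in the relocated `FaceTracker` docstring); the dummy frames array and its shape are placeholders, not library defaults.

```python
# Sketch assuming the 0.29.0 exports shown later in this diff; the zeros array
# stands in for real video frames of shape (N, H, W, 3).
import numpy as np

from videopython.ai import FaceTracker  # still re-exported at package level in 0.29.0

frames = np.zeros((120, 720, 1280, 3), dtype=np.uint8)  # placeholder frames

# From the FaceTracker docstring: detect on every 5th frame (GPU) and interpolate between.
tracker = FaceTracker(backend="gpu", sample_rate=5)
positions = tracker.track_video(frames)  # per-frame (cx, cy, w, h) in normalized coords, or None
```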

{videopython-0.28.3 → videopython-0.29.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.28.3"
+version = "0.29.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -70,7 +70,6 @@ ai = [
     "scikit-learn>=1.3.0",
     # Detection backends
     "ultralytics>=8.0.0",
-    "easyocr>=1.7.0",
     # Audio classification (AST via transformers - no separate dep needed)
     # Scene detection
     "transnetv2-pytorch>=1.0.5",
@@ -84,6 +83,11 @@ ai = [
     "llama-cpp-python>=0.3.0",
     # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
     "pyloudnorm>=0.1.1",
+    # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
+    # for AutoModelForImageTextToText with image/video chat templates.
+    "qwen-vl-utils>=0.0.10",
+    # Perceptual hashing for SceneVLM frame dedup (M5)
+    "imagehash>=4.3",
 ]
 
 # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
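The comments above explain why `imagehash` enters the dependency set: perceptual hashing lets SceneVLM skip near-duplicate frames before sending them to the vision-language model. The snippet below is only an illustrative sketch of that idea using the public `imagehash` API; videopython's actual dedup logic lives in `src/videopython/ai/understanding/image.py` and may differ.

```python
# Illustrative only - not videopython's implementation. Hash each frame with a
# perceptual hash and keep it only if it differs enough from the last kept frame.
import imagehash
import numpy as np
from PIL import Image


def dedup_frames(frames: list[np.ndarray], max_hamming: int = 4) -> list[np.ndarray]:
    kept: list[np.ndarray] = []
    last_hash: imagehash.ImageHash | None = None
    for frame in frames:
        frame_hash = imagehash.phash(Image.fromarray(frame))
        # Subtracting two ImageHash objects yields their Hamming distance.
        if last_hash is None or (frame_hash - last_hash) > max_hamming:
            kept.append(frame)
            last_hash = frame_hash
    return kept
```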
@@ -105,7 +109,6 @@ ai = [
     "scikit-learn>=1.3.0",
     # Detection backends
     "ultralytics>=8.0.0",
-    "easyocr>=1.7.0",
     # Audio classification (AST via transformers - no separate dep needed)
     # Scene detection
     "transnetv2-pytorch>=1.0.5",
@@ -119,6 +122,11 @@ ai = [
     "llama-cpp-python>=0.3.0",
     # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
     "pyloudnorm>=0.1.1",
+    # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
+    # for AutoModelForImageTextToText with image/video chat templates.
+    "qwen-vl-utils>=0.0.10",
+    # Perceptual hashing for SceneVLM frame dedup (M5)
+    "imagehash>=4.3",
 ]
 
 [project.urls]
@@ -135,7 +143,6 @@ module = [
     "diffusers", "diffusers.*",
     "ollama", "ollama.*",
     "ultralytics", "ultralytics.*",
-    "easyocr", "easyocr.*",
     "transformers", "transformers.*",
     "transnetv2_pytorch", "transnetv2_pytorch.*",
     "chatterbox", "chatterbox.*",
@@ -146,6 +153,8 @@ module = [
     "cv2", "cv2.*",
     "llama_cpp", "llama_cpp.*",
     "pyloudnorm", "pyloudnorm.*",
+    "qwen_vl_utils", "qwen_vl_utils.*",
+    "imagehash", "imagehash.*",
 ]
 ignore_missing_imports = true
 

{videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/__init__.py

@@ -2,11 +2,11 @@ from videopython.ai import registry as _ai_registry  # noqa: F401
 
 from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
 from .swapping import ObjectSwapper
-from .transforms import FaceTracker, FaceTrackingCrop, SplitScreenComposite
+from .transforms import FaceTrackingCrop, SplitScreenComposite
 from .understanding import (
-    ActionRecognizer,
     AudioClassifier,
     AudioToText,
+    FaceTracker,
     SceneVLM,
     SemanticSceneDetector,
 )
@@ -22,12 +22,10 @@ __all__ = [
     # Understanding
     "AudioToText",
     "AudioClassifier",
+    "FaceTracker",
     "SceneVLM",
-    # Temporal
-    "ActionRecognizer",
     "SemanticSceneDetector",
     # Transforms (AI-powered)
-    "FaceTracker",
     "FaceTrackingCrop",
     "SplitScreenComposite",
     # Swapping
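Net effect of the two `__init__.py` hunks above: `FaceTracker` moves from the transforms module to the new `understanding.faces` module, while `ActionRecognizer` disappears from the public exports entirely. A quick before/after of the import paths, inferred from these hunks (the package-level alias is unchanged in both versions):

```python
# 0.28.3: FaceTracker lived alongside the AI transforms.
# from videopython.ai.transforms import FaceTracker

# 0.29.0: FaceTracker is grouped with the understanding components.
from videopython.ai.understanding import FaceTracker

# The top-level re-export works in both versions, per the __all__ lists above.
from videopython.ai import FaceTracker  # noqa: F811
```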

{videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/transforms.py

@@ -3,20 +3,16 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Literal
 
 import cv2
 import numpy as np
 from tqdm import tqdm
 
-from videopython.ai._device import select_device
-from videopython.base.description import BoundingBox, DetectedFace
+from videopython.ai.understanding.faces import FaceTracker
 from videopython.base.transforms import Transformation
 from videopython.base.video import Video
 
-if TYPE_CHECKING:
-    pass
-
 logger = logging.getLogger(__name__)
 
 
@@ -25,484 +21,12 @@ def _make_even(value: int) -> int:
     return value - (value % 2)
 
 
-class _FaceDetectionBackend:
-    """Internal YOLOv8-face detector used by AI transforms."""
-
-    def __init__(
-        self,
-        confidence_threshold: float = 0.5,
-        min_face_size: int = 30,
-        backend: Literal["cpu", "gpu", "auto"] = "auto",
-    ):
-        self.confidence_threshold = confidence_threshold
-        self.min_face_size = min_face_size
-        self.backend: Literal["cpu", "gpu", "auto"] = backend
-        self._resolved_device: Literal["cpu", "cuda"] | None = None
-        self._yolo_model: Any = None
-
-    def _resolve_device(self) -> Literal["cpu", "cuda"]:
-        if self._resolved_device is not None:
-            return self._resolved_device
-
-        if self.backend == "cpu":
-            self._resolved_device = "cpu"
-            return self._resolved_device
-
-        if self.backend == "gpu":
-            resolved = select_device(None, mps_allowed=False)
-            if resolved != "cuda":
-                raise ValueError("GPU backend requested but CUDA is not available.")
-            self._resolved_device = "cuda"
-            return self._resolved_device
-
-        resolved_auto = select_device(None, mps_allowed=False)
-        self._resolved_device = "cuda" if resolved_auto == "cuda" else "cpu"
-        return self._resolved_device
-
-    def execution_device(self) -> Literal["cpu", "cuda"]:
-        """Resolved execution device for this backend."""
-        return self._resolve_device()
-
-    def _init_yolo_face(self) -> None:
-        from huggingface_hub import hf_hub_download
-        from ultralytics import YOLO
-
-        model_path = hf_hub_download(
-            repo_id="arnabdhar/YOLOv8-Face-Detection",
-            filename="model.pt",
-        )
-        self._yolo_model = YOLO(model_path)
-
-        device = self._resolve_device()
-        if device == "cuda":
-            self._yolo_model.to("cuda")
-
-    def _faces_from_yolo_result(self, result: Any) -> list[DetectedFace]:
-        detected_faces: list[DetectedFace] = []
-        boxes = result.boxes
-        if boxes is None:
-            return detected_faces
-
-        img_h, img_w = result.orig_shape
-        for i in range(len(boxes)):
-            x1, y1, x2, y2 = boxes.xyxy[i].tolist()
-            conf = float(boxes.conf[i])
-
-            face_w = x2 - x1
-            face_h = y2 - y1
-            if face_w < self.min_face_size or face_h < self.min_face_size:
-                continue
-
-            detected_faces.append(
-                DetectedFace(
-                    bounding_box=BoundingBox(
-                        x=x1 / img_w,
-                        y=y1 / img_h,
-                        width=face_w / img_w,
-                        height=face_h / img_h,
-                    ),
-                    confidence=conf,
-                )
-            )
-        detected_faces.sort(key=lambda f: f.area or 0, reverse=True)
-        return detected_faces
-
-    def detect(self, image: np.ndarray) -> list[DetectedFace]:
-        if self._yolo_model is None:
-            self._init_yolo_face()
-        assert self._yolo_model is not None
-
-        results = self._yolo_model(image, conf=self.confidence_threshold, verbose=False)
-        if not results:
-            return []
-        return self._faces_from_yolo_result(results[0])
-
-    def detect_batch(self, images: list[np.ndarray] | np.ndarray) -> list[list[DetectedFace]]:
-        if isinstance(images, np.ndarray):
-            images = [images[i] for i in range(images.shape[0])] if images.ndim == 4 else [images]
-        if not images:
-            return []
-
-        if self._yolo_model is None:
-            self._init_yolo_face()
-        assert self._yolo_model is not None
-
-        results = self._yolo_model(images, conf=self.confidence_threshold, verbose=False)
-        return [self._faces_from_yolo_result(result) for result in results]
-
-
 __all__ = [
-    "FaceTracker",
     "FaceTrackingCrop",
     "SplitScreenComposite",
 ]
 
 
-class FaceTracker:
-    """Utility for tracking faces across video frames with smoothing.
-
-    Provides frame-by-frame face detection with position smoothing using
-    exponential moving average to prevent jitter in the tracked position.
-
-    Supports GPU acceleration via YOLOv8-face model for significantly faster
-    detection, with optional frame sampling and interpolation for video.
-
-    Example:
-        >>> # Auto backend (default): resolves to GPU when available, else CPU
-        >>> tracker = FaceTracker()
-        >>> for i, frame in enumerate(frames):
-        ...     pos = tracker.detect_and_track(frame, i)
-        >>>
-        >>> # GPU tracking with frame sampling
-        >>> tracker = FaceTracker(backend="gpu", sample_rate=5)
-        >>> positions = tracker.track_video(frames)
-    """
-
-    def __init__(
-        self,
-        selection_strategy: Literal["largest", "centered", "index"] = "largest",
-        face_index: int = 0,
-        smoothing: float = 0.8,
-        detection_interval: int = 3,
-        min_face_size: int = 30,
-        backend: Literal["cpu", "gpu", "auto"] = "auto",
-        sample_rate: int = 1,
-        batch_size: int = 16,
-    ):
-        """Initialize face tracker.
-
-        Args:
-            selection_strategy: How to select which face to track.
-                - "largest": Track the face with the largest bounding box.
-                - "centered": Track the face closest to frame center.
-                - "index": Track the face at a specific index (sorted by area).
-            face_index: Index of face to track when using "index" strategy.
-            smoothing: Exponential moving average factor (0-1). Higher = smoother.
-            detection_interval: Run detection every N frames, interpolate between.
-            min_face_size: Minimum face size in pixels for detection.
-            backend: Detection backend - "cpu", "gpu", or "auto".
-            sample_rate: For GPU backend, detect every Nth frame and interpolate.
-                Only used by track_video(). Default 1 (every frame).
-            batch_size: Batch size for GPU detection. Default 16.
-        """
-        self.selection_strategy = selection_strategy
-        self.face_index = face_index
-        self.smoothing = smoothing
-        self.detection_interval = detection_interval
-        self.min_face_size = min_face_size
-        self.backend: Literal["cpu", "gpu", "auto"] = backend
-        self.sample_rate = sample_rate
-        self.batch_size = batch_size
-
-        self._detector: _FaceDetectionBackend | None = None
-        self._last_position: tuple[float, float] | None = None
-        self._last_size: tuple[float, float] | None = None
-        self._smoothed_position: tuple[float, float] | None = None
-        self._smoothed_size: tuple[float, float] | None = None
-        logger.info("FaceTracker initialized with backend=%s", self.backend)
-
-    def _init_detector(self) -> None:
-        """Initialize face detector lazily."""
-        self._detector = _FaceDetectionBackend(
-            min_face_size=self.min_face_size,
-            backend=self.backend,
-        )
-
-    def _select_face(
-        self,
-        faces: list,
-        frame_width: int,
-        frame_height: int,
-    ) -> tuple[float, float, float, float] | None:
-        """Select a face based on the configured strategy.
-
-        Args:
-            faces: List of DetectedFace objects.
-            frame_width: Width of the frame.
-            frame_height: Height of the frame.
-
-        Returns:
-            Tuple of (center_x, center_y, width, height) in normalized coords, or None.
-        """
-        if not faces:
-            return None
-
-        if self.selection_strategy == "largest":
-            # Faces are already sorted by area (largest first)
-            face = faces[0]
-        elif self.selection_strategy == "centered":
-            # Find face closest to center
-            frame_center = (0.5, 0.5)
-            face = min(
-                faces,
-                key=lambda f: (
-                    (f.bounding_box.center[0] - frame_center[0]) ** 2
-                    + (f.bounding_box.center[1] - frame_center[1]) ** 2
-                ),
-            )
-        elif self.selection_strategy == "index":
-            if self.face_index < len(faces):
-                face = faces[self.face_index]
-            else:
-                face = faces[0]  # Fall back to largest
-        else:
-            face = faces[0]
-
-        bbox = face.bounding_box
-        return (bbox.center[0], bbox.center[1], bbox.width, bbox.height)
-
-    def detect_and_track(
-        self,
-        frame: np.ndarray,
-        frame_index: int,
-    ) -> tuple[float, float, float, float] | None:
-        """Detect face in frame and return smoothed position.
-
-        Args:
-            frame: Video frame as numpy array (H, W, 3).
-            frame_index: Index of current frame.
-
-        Returns:
-            Tuple of (center_x, center_y, width, height) in normalized coords,
-            or None if no face detected and no fallback available.
-        """
-        if self._detector is None:
-            self._init_detector()
-        assert self._detector is not None
-
-        h, w = frame.shape[:2]
-
-        # Only run detection on interval frames
-        should_detect = frame_index % self.detection_interval == 0
-
-        if should_detect:
-            faces = self._detector.detect(frame)
-            face_info = self._select_face(faces, w, h)
-
-            if face_info:
-                cx, cy, fw, fh = face_info
-                self._last_position = (cx, cy)
-                self._last_size = (fw, fh)
-        else:
-            # Use last detected position
-            face_info = None
-            if self._last_position and self._last_size:
-                face_info = (*self._last_position, *self._last_size)
-
-        if face_info:
-            cx, cy, fw, fh = face_info
-
-            # Apply exponential moving average smoothing
-            if self._smoothed_position is None:
-                self._smoothed_position = (cx, cy)
-                self._smoothed_size = (fw, fh)
-            else:
-                alpha = 1 - self.smoothing
-                self._smoothed_position = (
-                    self._smoothed_position[0] * self.smoothing + cx * alpha,
-                    self._smoothed_position[1] * self.smoothing + cy * alpha,
-                )
-                assert self._smoothed_size is not None  # Set alongside _smoothed_position
-                self._smoothed_size = (
-                    self._smoothed_size[0] * self.smoothing + fw * alpha,
-                    self._smoothed_size[1] * self.smoothing + fh * alpha,
-                )
-
-            return (*self._smoothed_position, *self._smoothed_size)
-
-        # Return last smoothed position as fallback
-        if self._smoothed_position and self._smoothed_size:
-            return (*self._smoothed_position, *self._smoothed_size)
-
-        return None
-
-    def reset(self) -> None:
-        """Reset tracker state for a new video."""
-        self._last_position = None
-        self._last_size = None
-        self._smoothed_position = None
-        self._smoothed_size = None
-
-    @staticmethod
-    def _interpolate_bbox(
-        bbox1: tuple[float, float, float, float],
-        bbox2: tuple[float, float, float, float],
-        t: float,
-    ) -> tuple[float, float, float, float]:
-        """Linearly interpolate between two bounding boxes.
-
-        Args:
-            bbox1: First bounding box (cx, cy, w, h).
-            bbox2: Second bounding box (cx, cy, w, h).
-            t: Interpolation factor (0 = bbox1, 1 = bbox2).
-
-        Returns:
-            Interpolated bounding box (cx, cy, w, h).
-        """
-        return (
-            bbox1[0] + (bbox2[0] - bbox1[0]) * t,
-            bbox1[1] + (bbox2[1] - bbox1[1]) * t,
-            bbox1[2] + (bbox2[2] - bbox1[2]) * t,
-            bbox1[3] + (bbox2[3] - bbox1[3]) * t,
-        )
-
-    def track_video(
-        self,
-        frames: np.ndarray,
-    ) -> list[tuple[float, float, float, float] | None]:
-        """Track face through entire video using optimized batch detection.
-
-        This method is optimized for GPU backends with frame sampling and
-        interpolation for smooth tracking with reduced computation.
-
-        Args:
-            frames: Video frames array of shape (N, H, W, 3).
-
-        Returns:
-            List of face positions (cx, cy, w, h) for each frame, or None if
-            no face detected and no fallback available.
-        """
-        if self._detector is None:
-            self._init_detector()
-        assert self._detector is not None
-
-        n_frames = len(frames)
-        if n_frames == 0:
-            return []
-
-        h, w = frames[0].shape[:2]
-
-        execution_device_getter = getattr(self._detector, "execution_device", None)
-        if callable(execution_device_getter):
-            resolved = execution_device_getter()
-            backend_execution_device = resolved if resolved in {"cpu", "cuda"} else None
-        else:
-            backend_execution_device = None
-        if backend_execution_device is None:
-            backend_execution_device = "cuda" if self.backend == "gpu" else "cpu"
-
-        use_sampled_interpolation = self.sample_rate > 1 and backend_execution_device == "cuda"
-
-        # Determine which frames to sample
-        if use_sampled_interpolation:
-            sample_indices = list(range(0, n_frames, self.sample_rate))
-            # Ensure last frame is included
-            if sample_indices[-1] != n_frames - 1:
-                sample_indices.append(n_frames - 1)
-        else:
-            sample_indices = list(range(n_frames))
-
-        # Batch detect on sampled frames
-        sampled_frames = [frames[i] for i in sample_indices]
-
-        # Process in batches
-        sampled_detections: list[list] = []
-        for batch_start in range(0, len(sampled_frames), self.batch_size):
-            batch_end = min(batch_start + self.batch_size, len(sampled_frames))
-            batch = sampled_frames[batch_start:batch_end]
-            batch_results = self._detector.detect_batch(batch)
-            sampled_detections.extend(batch_results)
-
-        # Extract face info from detections
-        sampled_faces: list[tuple[float, float, float, float] | None] = []
-        for faces in sampled_detections:
-            face_info = self._select_face(faces, w, h)
-            sampled_faces.append(face_info)
-
-        # If no sampled interpolation, apply smoothing directly over detections.
-        if not use_sampled_interpolation:
-            self.reset()
-            results: list[tuple[float, float, float, float] | None] = []
-            for i, face_info in enumerate(sampled_faces):
-                if face_info:
-                    cx, cy, fw, fh = face_info
-                    self._last_position = (cx, cy)
-                    self._last_size = (fw, fh)
-
-                    if self._smoothed_position is None:
-                        self._smoothed_position = (cx, cy)
-                        self._smoothed_size = (fw, fh)
-                    else:
-                        alpha = 1 - self.smoothing
-                        self._smoothed_position = (
-                            self._smoothed_position[0] * self.smoothing + cx * alpha,
-                            self._smoothed_position[1] * self.smoothing + cy * alpha,
-                        )
-                        assert self._smoothed_size is not None
-                        self._smoothed_size = (
-                            self._smoothed_size[0] * self.smoothing + fw * alpha,
-                            self._smoothed_size[1] * self.smoothing + fh * alpha,
-                        )
-
-                    results.append((*self._smoothed_position, *self._smoothed_size))
-                elif self._smoothed_position and self._smoothed_size:
-                    results.append((*self._smoothed_position, *self._smoothed_size))
-                else:
-                    results.append(None)
-            return results
-
-        # Interpolate between sampled frames
-        all_positions: list[tuple[float, float, float, float] | None] = [None] * n_frames
-
-        # Fill in sampled positions
-        for idx, sample_idx in enumerate(sample_indices):
-            all_positions[sample_idx] = sampled_faces[idx]
-
-        # Interpolate gaps
-        for i in range(len(sample_indices) - 1):
-            start_idx = sample_indices[i]
-            end_idx = sample_indices[i + 1]
-            start_face = sampled_faces[i]
-            end_face = sampled_faces[i + 1]
-
-            if start_face is None and end_face is None:
-                continue
-            elif start_face is None:
-                # Use end face for all
-                for j in range(start_idx, end_idx):
-                    all_positions[j] = end_face
-            elif end_face is None:
-                # Use start face for all
-                for j in range(start_idx + 1, end_idx + 1):
-                    all_positions[j] = start_face
-            else:
-                # Interpolate
-                gap = end_idx - start_idx
-                for j in range(start_idx + 1, end_idx):
-                    t = (j - start_idx) / gap
-                    all_positions[j] = self._interpolate_bbox(start_face, end_face, t)
-
-        # Apply smoothing to interpolated positions
-        self.reset()
-        results = []
-        for face_info in all_positions:
-            if face_info:
-                cx, cy, fw, fh = face_info
-
-                if self._smoothed_position is None:
-                    self._smoothed_position = (cx, cy)
-                    self._smoothed_size = (fw, fh)
-                else:
-                    alpha = 1 - self.smoothing
-                    self._smoothed_position = (
-                        self._smoothed_position[0] * self.smoothing + cx * alpha,
-                        self._smoothed_position[1] * self.smoothing + cy * alpha,
-                    )
-                    assert self._smoothed_size is not None
-                    self._smoothed_size = (
-                        self._smoothed_size[0] * self.smoothing + fw * alpha,
-                        self._smoothed_size[1] * self.smoothing + fh * alpha,
-                    )
-
-                results.append((*self._smoothed_position, *self._smoothed_size))
-            elif self._smoothed_position and self._smoothed_size:
-                results.append((*self._smoothed_position, *self._smoothed_size))
-            else:
-                results.append(None)
-
-        return results
-
-
 class FaceTrackingCrop(Transformation):
     """Crops video to follow detected faces.
 
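The smoothing applied throughout the removed (now relocated) tracker is a plain exponential moving average over the `(cx, cy, w, h)` box. Below is a standalone restatement of that update rule with illustrative names; with the default `smoothing=0.8`, each new detection only moves the tracked box 20% of the way toward the raw detection, which is what suppresses jitter.

```python
# Restates the EMA update used in FaceTracker.detect_and_track / track_video above.
Box = tuple[float, float, float, float]  # (cx, cy, w, h) in normalized coordinates


def ema_update(smoothed: Box, detected: Box, smoothing: float = 0.8) -> Box:
    alpha = 1.0 - smoothing
    return (
        smoothed[0] * smoothing + detected[0] * alpha,
        smoothed[1] * smoothing + detected[1] * alpha,
        smoothed[2] * smoothing + detected[2] * alpha,
        smoothed[3] * smoothing + detected[3] * alpha,
    )


# A face center jumping from x=0.40 to x=0.50 only moves to x=0.42 in a single step.
print(ema_update((0.40, 0.50, 0.20, 0.30), (0.50, 0.50, 0.20, 0.30)))
```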

{videopython-0.28.3 → videopython-0.29.0}/src/videopython/ai/understanding/__init__.py

@@ -1,12 +1,12 @@
 from .audio import AudioClassifier, AudioToText
+from .faces import FaceTracker
 from .image import SceneVLM
-from .temporal import ActionRecognizer, SemanticSceneDetector
+from .temporal import SemanticSceneDetector
 
 __all__ = [
     "AudioToText",
     "AudioClassifier",
+    "FaceTracker",
     "SceneVLM",
-    # Temporal
-    "ActionRecognizer",
     "SemanticSceneDetector",
 ]