media-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/clip.py +79 -0
- cli/faces.py +91 -0
- cli/metadata.py +68 -0
- cli/motion.py +77 -0
- cli/objects.py +94 -0
- cli/ocr.py +93 -0
- cli/scenes.py +57 -0
- cli/telemetry.py +65 -0
- cli/transcript.py +76 -0
- media_engine/__init__.py +7 -0
- media_engine/_version.py +34 -0
- media_engine/app.py +80 -0
- media_engine/batch/__init__.py +56 -0
- media_engine/batch/models.py +99 -0
- media_engine/batch/processor.py +1131 -0
- media_engine/batch/queue.py +232 -0
- media_engine/batch/state.py +30 -0
- media_engine/batch/timing.py +321 -0
- media_engine/cli.py +17 -0
- media_engine/config.py +674 -0
- media_engine/extractors/__init__.py +75 -0
- media_engine/extractors/clip.py +401 -0
- media_engine/extractors/faces.py +459 -0
- media_engine/extractors/frame_buffer.py +351 -0
- media_engine/extractors/frames.py +402 -0
- media_engine/extractors/metadata/__init__.py +127 -0
- media_engine/extractors/metadata/apple.py +169 -0
- media_engine/extractors/metadata/arri.py +118 -0
- media_engine/extractors/metadata/avchd.py +208 -0
- media_engine/extractors/metadata/avchd_gps.py +270 -0
- media_engine/extractors/metadata/base.py +688 -0
- media_engine/extractors/metadata/blackmagic.py +139 -0
- media_engine/extractors/metadata/camera_360.py +276 -0
- media_engine/extractors/metadata/canon.py +290 -0
- media_engine/extractors/metadata/dji.py +371 -0
- media_engine/extractors/metadata/dv.py +121 -0
- media_engine/extractors/metadata/ffmpeg.py +76 -0
- media_engine/extractors/metadata/generic.py +119 -0
- media_engine/extractors/metadata/gopro.py +256 -0
- media_engine/extractors/metadata/red.py +305 -0
- media_engine/extractors/metadata/registry.py +114 -0
- media_engine/extractors/metadata/sony.py +442 -0
- media_engine/extractors/metadata/tesla.py +157 -0
- media_engine/extractors/motion.py +765 -0
- media_engine/extractors/objects.py +245 -0
- media_engine/extractors/objects_qwen.py +754 -0
- media_engine/extractors/ocr.py +268 -0
- media_engine/extractors/scenes.py +82 -0
- media_engine/extractors/shot_type.py +217 -0
- media_engine/extractors/telemetry.py +262 -0
- media_engine/extractors/transcribe.py +579 -0
- media_engine/extractors/translate.py +121 -0
- media_engine/extractors/vad.py +263 -0
- media_engine/main.py +68 -0
- media_engine/py.typed +0 -0
- media_engine/routers/__init__.py +15 -0
- media_engine/routers/batch.py +78 -0
- media_engine/routers/health.py +93 -0
- media_engine/routers/models.py +211 -0
- media_engine/routers/settings.py +87 -0
- media_engine/routers/utils.py +135 -0
- media_engine/schemas.py +581 -0
- media_engine/utils/__init__.py +5 -0
- media_engine/utils/logging.py +54 -0
- media_engine/utils/memory.py +49 -0
- media_engine-0.1.0.dist-info/METADATA +276 -0
- media_engine-0.1.0.dist-info/RECORD +70 -0
- media_engine-0.1.0.dist-info/WHEEL +4 -0
- media_engine-0.1.0.dist-info/entry_points.txt +11 -0
- media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/extractors/motion.py
@@ -0,0 +1,765 @@
"""Camera motion analysis using optical flow."""

import logging
import platform
import subprocess
import time
from dataclasses import dataclass
from enum import StrEnum

import cv2
import numpy as np

from media_engine.extractors.metadata.base import get_video_info

logger = logging.getLogger(__name__)

# Cache for hardware acceleration detection
_hwaccel_cache: str | None = None

# Analysis resolution (scale down for speed)
ANALYSIS_HEIGHT = 720
ANALYSIS_WIDTH = 1280  # 720p aspect ratio

# Motion detection thresholds
MOTION_THRESHOLD = 2.0  # Minimum average flow magnitude for motion
PAN_TILT_THRESHOLD = 0.7  # Ratio of directional vs total flow for pan/tilt
ZOOM_THRESHOLD = 0.3  # Divergence threshold for zoom detection
STATIC_THRESHOLD = 0.5  # Below this = static


class MotionType(StrEnum):
    """Types of camera motion.

    Note: PUSH_IN/PULL_OUT describe the optical flow pattern (radial expansion/contraction).
    This could be optical zoom OR physical camera movement (dolly/travel).
    Frontend can interpret based on metadata (lens type, GPS movement, device type).
    """

    STATIC = "static"
    PAN_LEFT = "pan_left"
    PAN_RIGHT = "pan_right"
    TILT_UP = "tilt_up"
    TILT_DOWN = "tilt_down"
    PUSH_IN = "push_in"  # Radial expansion (zoom in or dolly forward)
    PULL_OUT = "pull_out"  # Radial contraction (zoom out or dolly backward)
    HANDHELD = "handheld"  # Random/shaky movement
    COMPLEX = "complex"  # Multiple motions combined


@dataclass(slots=True)
class MotionSegment:
    """A segment of video with consistent motion.

    Uses slots=True to reduce memory overhead per instance.
    """

    start: float
    end: float
    motion_type: MotionType
    intensity: float  # Average flow magnitude


@dataclass(slots=True)
class MotionAnalysis:
    """Complete motion analysis for a video.

    Uses slots=True to reduce memory overhead per instance.
    """

    duration: float
    fps: float
    primary_motion: MotionType
    segments: list[MotionSegment]
    avg_intensity: float
    is_stable: bool  # True if mostly static/tripod


# Chunk duration in seconds (2 minutes)
CHUNK_DURATION = 120.0


def _detect_hwaccel() -> str | None:
    """Detect available hardware acceleration for video decoding.

    Returns:
        Hardware acceleration method name, or None if not available.
        - "videotoolbox" for macOS (Apple Silicon or Intel with VideoToolbox)
        - "cuda" for NVIDIA GPUs
        - None for software decoding
    """
    global _hwaccel_cache

    if _hwaccel_cache is not None:
        return _hwaccel_cache if _hwaccel_cache != "" else None

    # Check platform
    system = platform.system()

    if system == "Darwin":
        # macOS - check for VideoToolbox support
        try:
            result = subprocess.run(
                ["ffmpeg", "-hwaccels"],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if "videotoolbox" in result.stdout:
                logger.info("Hardware acceleration: VideoToolbox (macOS)")
                _hwaccel_cache = "videotoolbox"
                return "videotoolbox"
        except Exception:
            pass

    elif system == "Linux":
        # Linux - check for CUDA/NVDEC
        try:
            result = subprocess.run(
                ["ffmpeg", "-hwaccels"],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if "cuda" in result.stdout:
                # Verify NVIDIA GPU is present
                nvidia_check = subprocess.run(
                    ["nvidia-smi", "-L"],
                    capture_output=True,
                    timeout=5,
                )
                if nvidia_check.returncode == 0:
                    logger.info("Hardware acceleration: CUDA (NVIDIA)")
                    _hwaccel_cache = "cuda"
                    return "cuda"
        except Exception:
            pass

    logger.info("Hardware acceleration: None (software decoding)")
    _hwaccel_cache = ""
    return None
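
# Editor's note: the sketch below is illustrative and NOT part of the released
# wheel. _detect_hwaccel() memoizes its probe result in _hwaccel_cache, using
# "" as a sentinel for "probed, nothing available", so the ffmpeg/nvidia-smi
# subprocesses run at most once per process. A hypothetical way to force a
# re-probe (e.g. in tests):
#
#     import media_engine.extractors.motion as motion
#     motion._hwaccel_cache = None          # discard the cached probe result
#     assert motion._detect_hwaccel() in ("videotoolbox", "cuda", None)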


def _load_frames_chunk(
    file_path: str,
    start_time: float,
    chunk_duration: float,
    sample_fps: float,
    out_width: int,
    out_height: int,
    hwaccel: str | None = None,
    src_width: int = 0,
    src_height: int = 0,
) -> np.ndarray:
    """Load a chunk of frames into memory using FFmpeg.

    Args:
        file_path: Path to video file
        start_time: Start time in seconds
        chunk_duration: Duration of chunk to load in seconds
        sample_fps: Frames per second to sample
        out_width: Output frame width
        out_height: Output frame height
        hwaccel: Hardware acceleration method (videotoolbox, cuda, or None)
        src_width: Source video width (for aspect ratio calculation with hwaccel)
        src_height: Source video height (for aspect ratio calculation with hwaccel)

    Returns:
        numpy array of shape (num_frames, height, width) with grayscale frames
    """
    # Build command with optional hardware acceleration
    cmd = ["ffmpeg", "-hide_banner"]

    # For hardware acceleration, calculate output height based on source aspect ratio
    actual_out_height = out_height
    if hwaccel and src_width > 0 and src_height > 0:
        # Calculate height maintaining aspect ratio, rounded to even number
        actual_out_height = int(out_width * src_height / src_width)
        actual_out_height = actual_out_height - (actual_out_height % 2)  # Ensure even

    # Use hardware-accelerated decode and scaling if available
    if hwaccel == "videotoolbox":
        # Decode on hardware, scale on GPU, then transfer to CPU
        # p010le is required for VideoToolbox hwdownload (10-bit format)
        cmd.extend(["-hwaccel", "videotoolbox", "-hwaccel_output_format", "videotoolbox_vld"])
        vf_filter = f"scale_vt=w={out_width}:h={actual_out_height},hwdownload,format=p010le,fps={sample_fps}"
    elif hwaccel == "cuda":
        cmd.extend(["-hwaccel", "cuda", "-hwaccel_output_format", "cuda"])
        vf_filter = f"scale_cuda={out_width}:{actual_out_height},hwdownload,format=nv12,fps={sample_fps}"
    else:
        actual_out_height = out_height  # Use the provided value for software
        vf_filter = f"scale={out_width}:{out_height}:force_original_aspect_ratio=decrease,fps={sample_fps}"

    cmd.extend(
        [
            "-ss",
            str(start_time),
            "-t",
            str(chunk_duration),
            "-i",
            file_path,
            "-vf",
            vf_filter,
            "-f",
            "rawvideo",
            "-pix_fmt",
            "gray",
            "-",
        ]
    )

    logger.debug(f"FFmpeg command: {' '.join(cmd)}")

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    frame_size = out_width * actual_out_height
    frames: list[np.ndarray] = []

    while True:
        raw_frame = process.stdout.read(frame_size)  # type: ignore[union-attr]
        if len(raw_frame) != frame_size:
            break
        frame = np.frombuffer(raw_frame, dtype=np.uint8).reshape((actual_out_height, out_width))
        frames.append(frame.copy())

    _, stderr = process.communicate()
    if stderr and logger.isEnabledFor(logging.DEBUG):
        # Log first few lines of stderr for debugging
        stderr_lines = stderr.decode(errors="ignore").strip().split("\n")[:5]
        for line in stderr_lines:
            if line:
                logger.debug(f"FFmpeg: {line}")

    # If hardware acceleration failed (no frames), retry without it
    if not frames and hwaccel:
        # Log the stderr to understand why it failed
        if stderr:
            logger.warning(f"Hardware acceleration ({hwaccel}) failed: {stderr.decode(errors='ignore')[:500]}")
        else:
            logger.warning(f"Hardware acceleration ({hwaccel}) failed, no frames produced")
        return _load_frames_chunk(
            file_path,
            start_time,
            chunk_duration,
            sample_fps,
            out_width,
            out_height,
            hwaccel=None,
            src_width=src_width,
            src_height=src_height,
        )

    if not frames:
        return np.array([], dtype=np.uint8)

    return np.stack(frames)
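
# Editor's note: an illustrative, standalone version of the decode pattern
# above, NOT part of the released wheel. "-f rawvideo -pix_fmt gray" makes
# FFmpeg emit exactly width*height bytes per frame with no container framing,
# so frames can be sliced off the pipe by size alone (the path and dimensions
# below are hypothetical):
#
#     import subprocess
#     import numpy as np
#
#     W, H = 640, 360
#     proc = subprocess.Popen(
#         ["ffmpeg", "-hide_banner", "-i", "clip.mp4",
#          "-vf", f"scale={W}:{H},fps=2",
#          "-f", "rawvideo", "-pix_fmt", "gray", "-"],
#         stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
#     )
#     while (buf := proc.stdout.read(W * H)) and len(buf) == W * H:
#         frame = np.frombuffer(buf, dtype=np.uint8).reshape(H, W)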


def analyze_motion(
    file_path: str,
    sample_fps: float = 5.0,  # Frames per second to analyze
    chunk_duration: float = CHUNK_DURATION,  # Process 2 minutes at a time
) -> MotionAnalysis:
    """Analyze camera motion in a video using optical flow.

    Uses FFmpeg to decode frames at low resolution for efficiency with high-res video.
    Processes in 2-minute chunks to balance memory usage and I/O efficiency.

    Args:
        file_path: Path to video file
        sample_fps: How many frames per second to analyze (default 5)
        chunk_duration: Duration of each processing chunk in seconds (default 120)

    Returns:
        MotionAnalysis with motion type segments
    """
    # Get video info
    fps, duration, width, height = get_video_info(file_path)
    if duration == 0:
        # Fallback to opencv for duration
        cap = cv2.VideoCapture(file_path)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps > 0 else 0
        cap.release()

    total_samples = int(duration * sample_fps)

    # Detect hardware acceleration
    hwaccel = _detect_hwaccel()

    logger.info(f"Analyzing motion: {duration:.1f}s @ {fps:.1f}fps, ~{total_samples} samples" + (f" (hwaccel={hwaccel})" if hwaccel else ""))

    # Calculate actual frame dimensions after scaling
    if width > height:
        out_width = ANALYSIS_WIDTH
        out_height = int(height * ANALYSIS_WIDTH / width)
    else:
        out_height = ANALYSIS_HEIGHT
        out_width = int(width * ANALYSIS_HEIGHT / height)

    # Ensure even dimensions for video
    out_width = out_width - (out_width % 2)
    out_height = out_height - (out_height % 2)

    frame_motions: list[tuple[float, MotionType, float]] = []
    prev_gray: np.ndarray | None = None
    global_frame_idx = 0

    # Timing stats
    total_load_time = 0.0
    total_flow_time = 0.0

    # Process video in chunks
    num_chunks = max(1, int(np.ceil(duration / chunk_duration)))
    logger.debug(f"Processing in {num_chunks} chunk(s) of {chunk_duration}s each")

    for chunk_idx in range(num_chunks):
        chunk_start = chunk_idx * chunk_duration
        actual_chunk_duration = min(chunk_duration, duration - chunk_start)

        if actual_chunk_duration <= 0:
            break

        logger.debug(f"Loading chunk {chunk_idx + 1}/{num_chunks}: {chunk_start:.1f}s - {chunk_start + actual_chunk_duration:.1f}s")

        # Load all frames for this chunk into memory
        load_start = time.perf_counter()
        frames = _load_frames_chunk(
            file_path,
            chunk_start,
            actual_chunk_duration,
            sample_fps,
            out_width,
            out_height,
            hwaccel=hwaccel,
            src_width=width,
            src_height=height,
        )
        total_load_time += time.perf_counter() - load_start

        if frames.size == 0:
            continue

        logger.debug(f"Loaded {len(frames)} frames into memory")

        # Process optical flow for this chunk
        flow_start = time.perf_counter()
        for i in range(len(frames)):
            gray = frames[i]
            timestamp = global_frame_idx / sample_fps

            if prev_gray is not None and prev_gray.shape == gray.shape:
                # Compute optical flow
                flow = cv2.calcOpticalFlowFarneback(
                    prev_gray,
                    gray,
                    None,  # type: ignore[arg-type]
                    pyr_scale=0.5,
                    levels=3,
                    winsize=15,
                    iterations=3,
                    poly_n=5,
                    poly_sigma=1.2,
                    flags=0,
                )

                # Classify motion
                motion_type, intensity = _classify_flow(flow)
                frame_motions.append((timestamp, motion_type, intensity))

            prev_gray = gray.copy()
            global_frame_idx += 1
        total_flow_time += time.perf_counter() - flow_start

    # Log timing breakdown
    logger.info(f"Motion analysis timing: decode={total_load_time:.2f}s, optical_flow={total_flow_time:.2f}s, frames={global_frame_idx}")

    if not frame_motions:
        return MotionAnalysis(
            duration=duration,
            fps=fps,
            primary_motion=MotionType.STATIC,
            segments=[],
            avg_intensity=0.0,
            is_stable=True,
        )

    # Build segments from frame motions
    segments = _build_segments(frame_motions)

    # Determine primary motion (most common or longest)
    primary_motion = _get_primary_motion(segments, duration)

    # Calculate average intensity
    avg_intensity = np.mean([m[2] for m in frame_motions])

    # Determine if video is stable (mostly static or low intensity)
    static_time = sum(s.end - s.start for s in segments if s.motion_type == MotionType.STATIC)
    is_stable = static_time > duration * 0.7 or avg_intensity < MOTION_THRESHOLD

    logger.info(f"Motion analysis: primary={primary_motion}, segments={len(segments)}, stable={is_stable}")

    return MotionAnalysis(
        duration=duration,
        fps=fps,
        primary_motion=primary_motion,
        segments=segments,
        avg_intensity=float(avg_intensity),
        is_stable=bool(is_stable),
    )
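
# Editor's note: minimal usage sketch, NOT part of the released wheel; the
# path is hypothetical. analyze_motion() is the module's entry point, and the
# MotionAnalysis it returns feeds the sampling helpers further below.
#
#     analysis = analyze_motion("/videos/clip.mp4", sample_fps=5.0)
#     print(analysis.primary_motion, f"stable={analysis.is_stable}")
#     for seg in analysis.segments:
#         print(f"{seg.start:7.2f}-{seg.end:7.2f}s  {seg.motion_type.value:10}  "
#               f"intensity={seg.intensity:.2f}")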


def _classify_flow(flow: np.ndarray) -> tuple[MotionType, float]:
    """Classify motion type from optical flow field.

    Args:
        flow: Optical flow array (H, W, 2) with x and y components

    Returns:
        (motion_type, intensity)
    """
    flow_x = flow[:, :, 0]
    flow_y = flow[:, :, 1]

    # Calculate flow statistics
    mean_x = np.mean(flow_x)
    mean_y = np.mean(flow_y)
    magnitude = np.sqrt(flow_x**2 + flow_y**2)
    mean_magnitude = np.mean(magnitude)

    # Check if motion is significant
    if mean_magnitude < STATIC_THRESHOLD:
        return MotionType.STATIC, float(mean_magnitude)

    # Check for zoom (divergence from center)
    h, w = flow.shape[:2]
    center_y, center_x = h // 2, w // 2

    # Create coordinate grids relative to center
    y_coords, x_coords = np.mgrid[0:h, 0:w]
    x_rel = x_coords - center_x
    y_rel = y_coords - center_y

    # Normalize relative positions
    dist_from_center = np.sqrt(x_rel**2 + y_rel**2) + 1e-7
    x_norm = x_rel / dist_from_center
    y_norm = y_rel / dist_from_center

    # Compute divergence (dot product of flow with radial direction)
    divergence = np.mean(flow_x * x_norm + flow_y * y_norm)

    if abs(divergence) > ZOOM_THRESHOLD * mean_magnitude:
        if divergence > 0:
            return MotionType.PUSH_IN, float(mean_magnitude)
        else:
            return MotionType.PULL_OUT, float(mean_magnitude)

    # Check for pan/tilt (consistent directional flow)
    abs_mean_x = abs(mean_x)
    abs_mean_y = abs(mean_y)

    # Strong horizontal motion = pan
    if abs_mean_x > abs_mean_y and abs_mean_x > MOTION_THRESHOLD:
        ratio = abs_mean_x / (mean_magnitude + 1e-7)
        if ratio > PAN_TILT_THRESHOLD:
            if mean_x > 0:
                return MotionType.PAN_LEFT, float(mean_magnitude)  # Flow right = camera pans left
            else:
                return MotionType.PAN_RIGHT, float(mean_magnitude)

    # Strong vertical motion = tilt
    if abs_mean_y > abs_mean_x and abs_mean_y > MOTION_THRESHOLD:
        ratio = abs_mean_y / (mean_magnitude + 1e-7)
        if ratio > PAN_TILT_THRESHOLD:
            if mean_y > 0:
                return MotionType.TILT_UP, float(mean_magnitude)  # Flow down = camera tilts up
            else:
                return MotionType.TILT_DOWN, float(mean_magnitude)

    # Significant motion but not consistent direction = handheld/complex
    if mean_magnitude > MOTION_THRESHOLD:
        return MotionType.HANDHELD, float(mean_magnitude)

    return MotionType.STATIC, float(mean_magnitude)
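
# Editor's note: the demo below is an illustrative self-check added for this
# review, NOT part of the released wheel. It shows how _classify_flow() labels
# two synthetic flow fields: uniform rightward scene flow reads as the camera
# panning left, and a radially expanding field reads as PUSH_IN.
def _demo_classify_flow() -> None:
    h, w = 90, 160
    # Uniform flow: every pixel appears to move 3 px to the right.
    pan = np.zeros((h, w, 2), dtype=np.float32)
    pan[:, :, 0] = 3.0
    assert _classify_flow(pan)[0] == MotionType.PAN_LEFT

    # Radial flow: pixels move away from the frame center (zoom/dolly in).
    y, x = np.mgrid[0:h, 0:w]
    zoom = np.stack([(x - w // 2) * 0.1, (y - h // 2) * 0.1], axis=-1).astype(np.float32)
    assert _classify_flow(zoom)[0] == MotionType.PUSH_IN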


def _build_segments(
    frame_motions: list[tuple[float, MotionType, float]],
    min_segment_duration: float = 0.5,
) -> list[MotionSegment]:
    """Build motion segments from frame-by-frame analysis.

    Merges consecutive frames with the same motion type into segments.
    """
    if not frame_motions:
        return []

    segments: list[MotionSegment] = []
    current_type = frame_motions[0][1]
    current_start = frame_motions[0][0]
    current_intensities: list[float] = [frame_motions[0][2]]

    for timestamp, motion_type, intensity in frame_motions[1:]:
        if motion_type == current_type:
            current_intensities.append(intensity)
        else:
            # End current segment
            segments.append(
                MotionSegment(
                    start=current_start,
                    end=timestamp,
                    motion_type=current_type,
                    intensity=float(np.mean(current_intensities)),
                )
            )
            current_type = motion_type
            current_start = timestamp
            current_intensities = [intensity]

    # Add final segment
    segments.append(
        MotionSegment(
            start=current_start,
            end=frame_motions[-1][0] + 0.2,  # Extend slightly past last frame
            motion_type=current_type,
            intensity=float(np.mean(current_intensities)),
        )
    )

    # Merge short segments
    merged: list[MotionSegment] = []
    for seg in segments:
        if seg.end - seg.start < min_segment_duration and merged:
            # Merge with previous segment
            prev = merged[-1]
            merged[-1] = MotionSegment(
                start=prev.start,
                end=seg.end,
                motion_type=(prev.motion_type if prev.end - prev.start > seg.end - seg.start else seg.motion_type),
                intensity=(prev.intensity + seg.intensity) / 2,
            )
        else:
            merged.append(seg)

    return merged
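
# Editor's note: illustrative trace, NOT part of the released wheel. Per-frame
# labels are first collapsed into runs, then runs shorter than
# min_segment_duration are folded into the previous segment, keeping the
# longer run's label:
#
#     fm = [(0.0, MotionType.STATIC, 0.2), (0.2, MotionType.STATIC, 0.3),
#           (0.4, MotionType.PAN_LEFT, 2.5),   # brief 0.2 s blip
#           (0.6, MotionType.STATIC, 0.2), (0.8, MotionType.STATIC, 0.3)]
#     segs = _build_segments(fm)
#     # -> a single STATIC segment spanning ~0.0-1.0 s; the blip is merged away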


def _get_primary_motion(segments: list[MotionSegment], duration: float) -> MotionType:
    """Determine the primary motion type based on segment durations."""
    if not segments:
        return MotionType.STATIC

    # Sum duration per motion type
    type_durations: dict[MotionType, float] = {}
    for seg in segments:
        seg_duration = seg.end - seg.start
        type_durations[seg.motion_type] = type_durations.get(seg.motion_type, 0) + seg_duration

    # Return type with longest total duration
    return max(type_durations, key=type_durations.get)  # type: ignore


def get_sample_timestamps(
    motion: MotionAnalysis,
    max_samples: int = 5,
) -> list[float]:
    """Get optimal timestamps for frame sampling based on motion analysis.

    Static segments need fewer samples; moving segments need more.

    Args:
        motion: Motion analysis result
        max_samples: Maximum number of samples to return

    Returns:
        List of timestamps to sample
    """
    if not motion.segments:
        if motion.is_stable or motion.primary_motion == MotionType.STATIC:
            # Static video - just sample middle
            return [motion.duration / 2]
        else:
            # No segments - sample evenly
            return [motion.duration * i / (max_samples + 1) for i in range(1, max_samples + 1)]

    # Check if there are any non-static segments
    has_motion_segments = any(seg.motion_type not in (MotionType.STATIC, MotionType.HANDHELD) for seg in motion.segments)

    if not has_motion_segments and motion.is_stable:
        # All static/handheld - just sample middle
        return [motion.duration / 2]

    timestamps: list[float] = []

    for seg in motion.segments:
        seg_duration = seg.end - seg.start

        if seg.motion_type == MotionType.STATIC:
            # Static segment - one sample from middle
            timestamps.append((seg.start + seg.end) / 2)

        elif seg.motion_type in (
            MotionType.PAN_LEFT,
            MotionType.PAN_RIGHT,
            MotionType.TILT_UP,
            MotionType.TILT_DOWN,
        ):
            # Pan/tilt - sample start and end (different content)
            timestamps.append(seg.start + 0.2)
            if seg_duration > 1.0:
                timestamps.append(seg.end - 0.2)

        elif seg.motion_type in (MotionType.PUSH_IN, MotionType.PULL_OUT):
            # Zoom - sample at different zoom levels
            timestamps.append(seg.start + 0.2)
            if seg_duration > 1.0:
                timestamps.append(seg.end - 0.2)

        elif seg.motion_type == MotionType.HANDHELD:
            # Handheld - content is the same, just one sample
            timestamps.append((seg.start + seg.end) / 2)

        else:
            # Complex/unknown - sample middle
            timestamps.append((seg.start + seg.end) / 2)

    # Remove duplicates and sort
    timestamps = sorted(set(timestamps))

    # Limit to max_samples, keeping evenly distributed
    if len(timestamps) > max_samples:
        indices = np.linspace(0, len(timestamps) - 1, max_samples, dtype=int)
        timestamps = [timestamps[i] for i in indices]

    # Ensure timestamps are within bounds
    timestamps = [max(0.1, min(t, motion.duration - 0.1)) for t in timestamps]

    logger.info(f"Smart sampling: {len(timestamps)} frames from {len(motion.segments)} motion segments")

    return timestamps
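
# Editor's note: usage sketch, NOT part of the released wheel. Downstream
# extractors can decode only the segment-aware timestamps instead of a fixed
# grid (the path is hypothetical and the frame-decoding step is elided):
#
#     analysis = analyze_motion("clip.mp4")
#     for ts in get_sample_timestamps(analysis, max_samples=5):
#         ...  # decode and describe one frame at ts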


def get_adaptive_timestamps(
    motion: MotionAnalysis,
    min_fps: float = 0.1,
    max_fps: float = 2.0,
    max_samples: int = 100,
) -> list[float]:
    """Get timestamps with adaptive sampling based on motion intensity.

    SMART OPTIMIZATION: For stable/static footage, returns very few samples
    since the content doesn't change. This dramatically speeds up processing
    for tripod shots, interviews, static drone hovers, etc.

    Stability-based limits:
    - Fully stable (is_stable=True, avg_intensity < 1.0): max 3 samples
    - Mostly stable (is_stable=True): max 5 samples
    - Some motion: uses intensity-based adaptive sampling

    Intensity to FPS mapping (for non-stable segments):
    - 0-0.5: min_fps (static, nothing changing)
    - 0.5-2.0: 0.25 fps (stable, minimal change)
    - 2.0-4.0: 0.5 fps (moderate motion)
    - 4.0-6.0: 1.0 fps (active motion)
    - 6.0+: max_fps (high motion, rapid changes)

    Args:
        motion: Motion analysis result
        min_fps: Minimum sample rate for static content (default 0.1 = 1 per 10s)
        max_fps: Maximum sample rate for high motion (default 2.0)
        max_samples: Maximum total samples to return

    Returns:
        List of timestamps to sample
    """
    # OPTIMIZATION: Very stable footage needs minimal sampling
    if motion.is_stable and motion.avg_intensity < 1.0:
        # Extremely stable (tripod, static drone) - just 3 samples
        if motion.duration < 10:
            timestamps = [motion.duration / 2]
        else:
            # Start, middle, end
            timestamps = [
                motion.duration * 0.15,
                motion.duration * 0.5,
                motion.duration * 0.85,
            ]
        logger.info(f"Stable video optimization: {len(timestamps)} frames only (avg_intensity={motion.avg_intensity:.1f})")
        return timestamps

    if motion.is_stable:
        # Mostly stable - cap at 5 samples spread across duration
        num_samples = min(5, max(1, int(motion.duration / 10)))
        if num_samples == 1:
            timestamps = [motion.duration / 2]
        else:
            step = motion.duration / (num_samples + 1)
            timestamps = [step * (i + 1) for i in range(num_samples)]
        logger.info(f"Stable video: {len(timestamps)} frames (avg_intensity={motion.avg_intensity:.1f})")
        return timestamps

    if not motion.segments:
        # No segments but not stable - sample at a moderate rate
        interval = 2.0  # 0.5 fps
        timestamps = [t for t in _frange(0.1, motion.duration - 0.1, interval)]
        timestamps = timestamps[:max_samples]
        return timestamps

    timestamps: list[float] = []

    for seg in motion.segments:
        intensity = seg.intensity

        # Map intensity to fps
        if intensity < 0.5:
            fps = min_fps
        elif intensity < 2.0:
            fps = 0.25
        elif intensity < 4.0:
            fps = 0.5
        elif intensity < 6.0:
            fps = 1.0
        else:
            fps = max_fps

        # Generate timestamps for this segment
        interval = 1.0 / fps
        t = seg.start + 0.1  # Start slightly after segment boundary
        while t < seg.end - 0.1:
            timestamps.append(t)
            t += interval

        # Always include at least one sample per segment
        if not any(seg.start <= ts <= seg.end for ts in timestamps):
            timestamps.append((seg.start + seg.end) / 2)

    # Remove duplicates and sort
    timestamps = sorted(set(timestamps))

    # Cap at max_samples, keeping even distribution
    if len(timestamps) > max_samples:
        step = len(timestamps) / max_samples
        timestamps = [timestamps[int(i * step)] for i in range(max_samples)]

    # Ensure timestamps are within video bounds
    timestamps = [max(0.1, min(t, motion.duration - 0.1)) for t in timestamps]

    logger.info(f"Adaptive sampling: {len(timestamps)} frames (avg_intensity={motion.avg_intensity:.1f}, stable={motion.is_stable})")

    return timestamps
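
# Editor's note: illustrative example, NOT part of the released wheel, using a
# hand-built analysis. A 60 s clip with one calm and one active segment gets
# ~0.25 fps over the calm span and ~1.0 fps over the active one:
#
#     ma = MotionAnalysis(
#         duration=60.0, fps=30.0, primary_motion=MotionType.PAN_RIGHT,
#         segments=[
#             MotionSegment(0.0, 40.0, MotionType.STATIC, intensity=1.0),
#             MotionSegment(40.0, 60.0, MotionType.PAN_RIGHT, intensity=5.0),
#         ],
#         avg_intensity=2.3, is_stable=False,
#     )
#     ts = get_adaptive_timestamps(ma)
#     # -> 10 samples in 0-40 s (4 s apart) and 20 in 40-60 s (1 s apart)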


def _frange(start: float, stop: float, step: float):
    """Float range generator."""
    while start < stop:
        yield start
        start += step