media-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/clip.py +79 -0
- cli/faces.py +91 -0
- cli/metadata.py +68 -0
- cli/motion.py +77 -0
- cli/objects.py +94 -0
- cli/ocr.py +93 -0
- cli/scenes.py +57 -0
- cli/telemetry.py +65 -0
- cli/transcript.py +76 -0
- media_engine/__init__.py +7 -0
- media_engine/_version.py +34 -0
- media_engine/app.py +80 -0
- media_engine/batch/__init__.py +56 -0
- media_engine/batch/models.py +99 -0
- media_engine/batch/processor.py +1131 -0
- media_engine/batch/queue.py +232 -0
- media_engine/batch/state.py +30 -0
- media_engine/batch/timing.py +321 -0
- media_engine/cli.py +17 -0
- media_engine/config.py +674 -0
- media_engine/extractors/__init__.py +75 -0
- media_engine/extractors/clip.py +401 -0
- media_engine/extractors/faces.py +459 -0
- media_engine/extractors/frame_buffer.py +351 -0
- media_engine/extractors/frames.py +402 -0
- media_engine/extractors/metadata/__init__.py +127 -0
- media_engine/extractors/metadata/apple.py +169 -0
- media_engine/extractors/metadata/arri.py +118 -0
- media_engine/extractors/metadata/avchd.py +208 -0
- media_engine/extractors/metadata/avchd_gps.py +270 -0
- media_engine/extractors/metadata/base.py +688 -0
- media_engine/extractors/metadata/blackmagic.py +139 -0
- media_engine/extractors/metadata/camera_360.py +276 -0
- media_engine/extractors/metadata/canon.py +290 -0
- media_engine/extractors/metadata/dji.py +371 -0
- media_engine/extractors/metadata/dv.py +121 -0
- media_engine/extractors/metadata/ffmpeg.py +76 -0
- media_engine/extractors/metadata/generic.py +119 -0
- media_engine/extractors/metadata/gopro.py +256 -0
- media_engine/extractors/metadata/red.py +305 -0
- media_engine/extractors/metadata/registry.py +114 -0
- media_engine/extractors/metadata/sony.py +442 -0
- media_engine/extractors/metadata/tesla.py +157 -0
- media_engine/extractors/motion.py +765 -0
- media_engine/extractors/objects.py +245 -0
- media_engine/extractors/objects_qwen.py +754 -0
- media_engine/extractors/ocr.py +268 -0
- media_engine/extractors/scenes.py +82 -0
- media_engine/extractors/shot_type.py +217 -0
- media_engine/extractors/telemetry.py +262 -0
- media_engine/extractors/transcribe.py +579 -0
- media_engine/extractors/translate.py +121 -0
- media_engine/extractors/vad.py +263 -0
- media_engine/main.py +68 -0
- media_engine/py.typed +0 -0
- media_engine/routers/__init__.py +15 -0
- media_engine/routers/batch.py +78 -0
- media_engine/routers/health.py +93 -0
- media_engine/routers/models.py +211 -0
- media_engine/routers/settings.py +87 -0
- media_engine/routers/utils.py +135 -0
- media_engine/schemas.py +581 -0
- media_engine/utils/__init__.py +5 -0
- media_engine/utils/logging.py +54 -0
- media_engine/utils/memory.py +49 -0
- media_engine-0.1.0.dist-info/METADATA +276 -0
- media_engine-0.1.0.dist-info/RECORD +70 -0
- media_engine-0.1.0.dist-info/WHEEL +4 -0
- media_engine-0.1.0.dist-info/entry_points.txt +11 -0
- media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1131 @@
"""Batch job processor - main extraction logic."""

from __future__ import annotations

import gc
import logging
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any

from media_engine.batch.models import (
    BatchRequest,
    ExtractorTiming,
    JobProgress,
)
from media_engine.batch.queue import cleanup_expired_batch_jobs, start_next_batch
from media_engine.batch.state import batch_jobs, batch_jobs_lock
from media_engine.batch.timing import (
    EXTRACTOR_ORDER,
    calculate_queue_eta,
    get_enabled_extractors_from_request,
    get_predicted_rate,
    get_resolution_bucket,
    predict_extractor_time,
    record_timing,
)
from media_engine.utils.memory import clear_memory, get_memory_mb

if TYPE_CHECKING:
    pass

logger = logging.getLogger(__name__)


def run_batch_job(batch_id: str, request: BatchRequest) -> None:
    """Run batch extraction - processes all files per extractor stage.

    This is more memory efficient as each model is loaded once,
    processes all files, then is unloaded before the next model.
    """
    from media_engine.config import get_settings
    from media_engine.extractors import (
        FFPROBE_WORKERS,
        SharedFrameBuffer,
        analyze_motion,
        check_faces_are_known,
        decode_frames,
        detect_voice_activity,
        extract_clip,
        extract_faces,
        extract_metadata,
        extract_objects,
        extract_objects_qwen,
        extract_ocr,
        extract_scenes,
        extract_telemetry,
        extract_transcript,
        get_adaptive_timestamps,
        get_extractor_timestamps,
        get_sample_timestamps,
        run_ffprobe_batch,
        unload_clip_model,
        unload_face_model,
        unload_ocr_model,
        unload_qwen_model,
        unload_vad_model,
        unload_whisper_model,
        unload_yolo_model,
    )
    from media_engine.extractors.vad import AudioContent
    from media_engine.schemas import (
        BoundingBox,
        FaceDetection,
        FacesResult,
        MediaType,
        get_media_type,
    )

    settings = get_settings()

    # Resolve models from settings (handles "auto" -> actual model name)
    whisper_model = settings.get_whisper_model()
    qwen_model = settings.get_qwen_model()
    yolo_model = settings.get_yolo_model()
    clip_model = settings.get_clip_model()

    logger.info(
        f"Batch {batch_id} models: whisper={whisper_model}, qwen={qwen_model}, "
        f"yolo={yolo_model}, clip={clip_model}"
    )

    batch_start_time = time.time()
    peak_memory = get_memory_mb()
    stage_start_times: dict[str, float] = {}  # extractor -> start time
    file_resolutions: dict[int, str] = {}  # file_idx -> resolution bucket (for timing predictions)
    file_durations: dict[int, float] = {}  # file_idx -> duration in seconds

    # Get enabled extractors for this batch
    enabled_extractors, enabled_sub_extractors = get_enabled_extractors_from_request(request)

    def calculate_total_eta(current_extractor: str, stage_eta: float) -> float:
        """Calculate total remaining time for the entire batch.

        Args:
            current_extractor: Currently running extractor
            stage_eta: Remaining time for current stage

        Returns:
            Total estimated remaining seconds for the batch
        """
        total_eta = stage_eta if stage_eta else 0.0

        # Get the current extractor's position in the order
        if current_extractor not in EXTRACTOR_ORDER:
            return total_eta

        current_ext_idx = EXTRACTOR_ORDER.index(current_extractor)
        num_files = len(request.files)

        # Add time for remaining extractors (after current one)
        remaining_extractors = EXTRACTOR_ORDER[current_ext_idx + 1 :]
        logger.info(
            f"ETA calc: current={current_extractor}, remaining={remaining_extractors}, "
            f"enabled={enabled_extractors}"
        )

        for ext in remaining_extractors:
            if ext not in enabled_extractors:
                logger.info(f"ETA calc: skipping {ext} (not enabled)")
                continue

            # Sum predicted time across all files
            for file_idx in range(num_files):
                resolution = file_resolutions.get(file_idx, "1080p")
                duration = file_durations.get(file_idx, 60.0)  # Default 1 min
                predicted = predict_extractor_time(
                    ext,
                    resolution,
                    duration,
                    enabled_sub_extractors=enabled_sub_extractors if ext == "visual_processing" else None,
                )
                total_eta += predicted
                logger.info(f"ETA calc: {ext} file={file_idx} res={resolution} dur={duration}s -> +{predicted:.1f}s")

        return round(total_eta, 1)
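
    # Worked example: with stage_eta=12.0s for the current stage and one
    # remaining enabled extractor predicted at 30s per file over 2 files,
    # calculate_total_eta returns 12.0 + 30 + 30 = 72.0 seconds.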

    def update_batch_progress(
        extractor: str,
        message: str,
        current: int | None = None,
        total: int | None = None,
    ) -> None:
        nonlocal peak_memory

        # Track stage start time
        if extractor not in stage_start_times:
            stage_start_times[extractor] = time.time()

        # Calculate ETA
        stage_elapsed: float | None = None
        eta: float | None = None
        if extractor in stage_start_times:
            stage_elapsed = round(time.time() - stage_start_times[extractor], 1)
            # Calculate ETA if we have progress info
            if current is not None and total is not None and current > 0:
                # Always use prediction-based ETA for remaining files + current file.
                # This is more accurate than elapsed-based calculation.
                eta = 0.0

                # Add predicted time for current file (estimate 50% remaining)
                current_file_idx = current - 1
                current_res = file_resolutions.get(current_file_idx, "1080p")
                current_dur = file_durations.get(current_file_idx, 60.0)
                current_predicted = predict_extractor_time(
                    extractor,
                    current_res,
                    current_dur,
                    enabled_sub_extractors=enabled_sub_extractors if extractor == "visual_processing" else None,
                )
                # Estimate we're ~halfway through current file if we have some elapsed time
                if stage_elapsed > 0 and current > 1:
                    # For files after the first, estimate based on avg time per completed file
                    avg_per_file = stage_elapsed / (current - 1)
                    eta += max(0, avg_per_file * 0.5)  # ~50% of avg remaining for current
                else:
                    eta += current_predicted * 0.5  # ~50% of predicted remaining for current

                # Add predicted time for remaining files
                for file_idx in range(current, total):
                    resolution = file_resolutions.get(file_idx, "1080p")
                    duration = file_durations.get(file_idx, 60.0)
                    eta += predict_extractor_time(
                        extractor,
                        resolution,
                        duration,
                        enabled_sub_extractors=enabled_sub_extractors if extractor == "visual_processing" else None,
                    )

                eta = round(eta, 1)
            elif current == 0 and total is not None and total > 0:
                # No progress yet - try to use historical timing for prediction.
                # Use the most common resolution in the batch, or "unknown" if none set.
                common_res = "unknown"
                if file_resolutions:
                    res_counts: dict[str, int] = {}
                    for res in file_resolutions.values():
                        res_counts[res] = res_counts.get(res, 0) + 1
                    common_res = max(res_counts, key=lambda r: res_counts[r])
                predicted = get_predicted_rate(extractor, common_res)
                if predicted is not None:
                    eta = round(predicted * total, 1)

        # Calculate total ETA for entire batch (current stage + remaining stages)
        total_eta = calculate_total_eta(extractor, eta or 0.0)

        # Debug logging for ETA calculation (use INFO level to see it)
        if total_eta and total_eta > 0:
            logger.info(
                f"ETA: {extractor} stage={eta}s, total={total_eta}s, "
                f"subs={enabled_sub_extractors}, files={len(file_durations)}"
            )

        # Calculate queue ETA (for all queued batches)
        queue_eta, queued_count = calculate_queue_eta()

        with batch_jobs_lock:
            if batch_id in batch_jobs:
                batch_jobs[batch_id].current_extractor = extractor
                batch_jobs[batch_id].progress = JobProgress(
                    message=message,
                    current=current,
                    total=total,
                    stage_elapsed_seconds=stage_elapsed,
                    eta_seconds=eta,
                    # Always send total_eta - even if 0, it's valid info
                    total_eta_seconds=total_eta,
                    queue_eta_seconds=queue_eta if queue_eta > 0 else None,
                    queued_batches=queued_count if queued_count > 0 else None,
                )
                # Update memory and elapsed time
                current_mem = get_memory_mb()
                peak_memory = max(peak_memory, current_mem)
                batch_jobs[batch_id].memory_mb = current_mem
                batch_jobs[batch_id].peak_memory_mb = peak_memory
                batch_jobs[batch_id].elapsed_seconds = round(time.time() - batch_start_time, 1)
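
    # Note: every mutation of batch_jobs above and in the helpers below happens
    # under batch_jobs_lock, so API handlers can poll job state concurrently
    # with the worker thread running this function.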

    def update_file_status(
        file_idx: int,
        status: str,
        result_key: str | None = None,
        result: Any = None,
        error: str | None = None,
    ) -> None:
        with batch_jobs_lock:
            if batch_id in batch_jobs and file_idx < len(batch_jobs[batch_id].files):
                batch_jobs[batch_id].files[file_idx].status = status
                if result_key and result is not None:
                    batch_jobs[batch_id].files[file_idx].results[result_key] = result
                if error:
                    batch_jobs[batch_id].files[file_idx].error = error

    def update_extractor_status(file_idx: int, extractor: str, status: str) -> None:
        """Update extractor status for a file.

        Args:
            file_idx: Index of the file in the batch
            extractor: Name of the extractor
            status: One of 'pending', 'active', 'completed', 'failed', 'skipped'
        """
        with batch_jobs_lock:
            if batch_id in batch_jobs and file_idx < len(batch_jobs[batch_id].files):
                batch_jobs[batch_id].files[file_idx].extractor_status[extractor] = status

    def start_extractor_timing(extractor: str) -> datetime:
        """Start timing for an extractor stage."""
        started = datetime.now(timezone.utc)
        # Reset stage start time for ETA calculation
        stage_start_times[extractor] = time.time()
        with batch_jobs_lock:
            if batch_id in batch_jobs:
                batch_jobs[batch_id].extractor_timings.append(ExtractorTiming(extractor=extractor, started_at=started))
        return started

    def end_extractor_timing(extractor: str, files_processed: int) -> None:
        """End timing for an extractor stage."""
        completed = datetime.now(timezone.utc)
        with batch_jobs_lock:
            if batch_id in batch_jobs:
                for timing in batch_jobs[batch_id].extractor_timings:
                    if timing.extractor == extractor and timing.completed_at is None:
                        timing.completed_at = completed
                        timing.duration_seconds = round((completed - timing.started_at).total_seconds(), 2)
                        timing.files_processed = files_processed
                        break

    def update_file_timing(file_idx: int, extractor: str, duration: float, units: float | None = None) -> None:
        """Record per-file timing for an extractor.

        Args:
            file_idx: Index of the file in the batch
            extractor: Name of the extractor
            duration: Wall clock seconds to process
            units: Normalization units for rate calculation:
                - transcript: duration in minutes
                - visual: number of timestamps
                - objects/faces/ocr/clip: number of frames
                - None: store raw seconds (metadata, telemetry, etc.)
        """
        with batch_jobs_lock:
            if batch_id in batch_jobs and file_idx < len(batch_jobs[batch_id].files):
                batch_jobs[batch_id].files[file_idx].timings[extractor] = round(duration, 2)
        # Record to historical timing for future ETA predictions
        resolution = file_resolutions.get(file_idx, "unknown")
        record_timing(extractor, resolution, duration, units)
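
    # Example: a 90s clip transcribed in 30s of wall time records duration=30
    # with units=1.5 (media minutes), i.e. ~20s per media-minute for future
    # ETAs (assuming record_timing normalizes rate as duration / units).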

    try:
        with batch_jobs_lock:
            batch_jobs[batch_id].status = "running"

        files = request.files
        total_files = len(files)

        # Track files that failed metadata extraction - skip them in all subsequent stages.
        # If we can't read the file with ffprobe, there's no point trying other extractors.
        failed_files: set[int] = set()

        # Always get file durations and resolutions for ETA predictions (lightweight ffprobe).
        # This runs even if metadata isn't enabled.
        if not request.enable_metadata:
            from media_engine.extractors.metadata.base import get_video_info

            for i, file_path in enumerate(files):
                try:
                    _fps, duration, width, height = get_video_info(file_path)
                    if duration:
                        file_durations[i] = duration
                        # Determine resolution bucket from dimensions
                        file_resolutions[i] = get_resolution_bucket(width, height)
                        logger.info(f"ETA: file {i} duration={duration}s, res={file_resolutions[i]}")
                except Exception as e:
                    logger.warning(f"Could not get video info for {file_path}: {e}")

        # Stage 1: Metadata (parallel ffprobe for speed)
        if request.enable_metadata:
            start_extractor_timing("metadata")
            update_batch_progress(
                "metadata",
                f"Running ffprobe ({FFPROBE_WORKERS} parallel workers)...",
                0,
                total_files,
            )

            # Run all ffprobe calls in parallel
            probe_results = run_ffprobe_batch(files)

            # Extract metadata from each probe result
            for i, file_path in enumerate(files):
                file_start = time.time()
                update_batch_progress("metadata", f"Processing {Path(file_path).name}", i + 1, total_files)
                update_extractor_status(i, "metadata", "active")
                probe_data = probe_results.get(file_path)

                if isinstance(probe_data, Exception):
                    logger.warning(f"Metadata failed for {file_path}: {probe_data}")
                    logger.warning(f"Skipping all extractors for {file_path} - file unreadable")
                    update_file_status(i, "failed", "metadata", None, str(probe_data))
                    update_extractor_status(i, "metadata", "failed")
                    update_file_timing(i, "metadata", time.time() - file_start)
                    failed_files.add(i)
                    continue

                try:
                    metadata = extract_metadata(file_path, probe_data)
                    update_file_status(i, "running", "metadata", metadata.model_dump())
                    update_extractor_status(i, "metadata", "completed")
                    # Store resolution bucket for timing predictions
                    file_resolutions[i] = get_resolution_bucket(
                        metadata.resolution.width,
                        metadata.resolution.height,
                    )
                    # Store duration for total ETA predictions
                    if metadata.duration is not None:
                        file_durations[i] = metadata.duration
                        logger.info(f"ETA: stored duration {metadata.duration}s for file {i}")
                except Exception as e:
                    logger.warning(f"Metadata failed for {file_path}: {e}")
                    logger.warning(f"Skipping all extractors for {file_path} - file unreadable")
                    update_file_status(i, "failed", "metadata", None, str(e))
                    update_extractor_status(i, "metadata", "failed")
                    failed_files.add(i)
                update_file_timing(i, "metadata", time.time() - file_start)
            end_extractor_timing("metadata", total_files)

        # Stage 2: Telemetry (always runs - lightweight, no models)
        start_extractor_timing("telemetry")
        update_batch_progress("telemetry", "Extracting telemetry...", 0, total_files)
        for i, file_path in enumerate(files):
            file_start = time.time()
            update_batch_progress("telemetry", f"Processing {Path(file_path).name}", i + 1, total_files)
            update_extractor_status(i, "telemetry", "active")
            try:
                telemetry = extract_telemetry(file_path)
                update_file_status(
                    i,
                    "running",
                    "telemetry",
                    telemetry.model_dump() if telemetry else None,
                )
                update_extractor_status(i, "telemetry", "completed")
            except Exception as e:
                logger.warning(f"Telemetry failed for {file_path}: {e}")
                update_extractor_status(i, "telemetry", "failed")
            update_file_timing(i, "telemetry", time.time() - file_start)
        end_extractor_timing("telemetry", total_files)

        # Stage 3: Voice Activity Detection (WebRTC VAD - lightweight).
        # Skip for images and files without audio tracks.
        if request.enable_vad:
            start_extractor_timing("vad")
            update_batch_progress("vad", "Analyzing audio...", 0, total_files)
            vad_ran = False  # Track if we actually ran VAD on any file
            for i, file_path in enumerate(files):
                if i in failed_files:
                    update_extractor_status(i, "vad", "skipped")
                    continue
                file_start = time.time()
                update_extractor_status(i, "vad", "active")

                # Check media type - skip VAD for images
                media_type = get_media_type(file_path)
                if media_type == MediaType.IMAGE:
                    logger.info(f"Skipping VAD for {file_path} - image file")
                    no_audio_result = {
                        "audio_content": str(AudioContent.NO_AUDIO),
                        "speech_ratio": 0.0,
                        "speech_segments": [],
                        "total_duration": 0.0,
                    }
                    update_file_status(i, "running", "vad", no_audio_result)
                    update_extractor_status(i, "vad", "completed")
                    update_file_timing(i, "vad", time.time() - file_start)
                    continue

                # Check if metadata shows no audio track
                has_audio_track = True
                with batch_jobs_lock:
                    file_results = batch_jobs[batch_id].files[i].results
                    if file_results and file_results.get("metadata"):
                        metadata = file_results["metadata"]
                        if metadata.get("audio") is None:
                            has_audio_track = False

                if not has_audio_track:
                    logger.info(f"Skipping VAD for {file_path} - no audio track")
                    no_audio_result = {
                        "audio_content": str(AudioContent.NO_AUDIO),
                        "speech_ratio": 0.0,
                        "speech_segments": [],
                        "total_duration": 0.0,
                    }
                    update_file_status(i, "running", "vad", no_audio_result)
                    update_extractor_status(i, "vad", "completed")
                    update_file_timing(i, "vad", time.time() - file_start)
                    continue

                # Run VAD for files with audio
                update_batch_progress("vad", f"Analyzing {Path(file_path).name}", i + 1, total_files)
                try:
                    vad_result = detect_voice_activity(file_path)
                    update_file_status(i, "running", "vad", vad_result)
                    update_extractor_status(i, "vad", "completed")
                    vad_ran = True
                except Exception as e:
                    logger.warning(f"VAD failed for {file_path}: {e}")
                    update_extractor_status(i, "vad", "failed")
                update_file_timing(i, "vad", time.time() - file_start)

            # Only unload if we actually loaded the model
            if vad_ran:
                update_batch_progress("vad", "Unloading VAD model...", None, None)
                unload_vad_model()
            end_extractor_timing("vad", total_files)

        # Stage 4: Per-file visual processing.
        # Process each file completely before moving to the next (memory efficient).
        # Order: Motion -> Scenes -> Decode frames -> Objects -> Faces -> OCR -> CLIP -> Release buffer
        #
        # This approach:
        # - Decodes frames once per file
        # - Runs all visual extractors on those frames
        # - Releases buffer before processing next file
        # - Keeps only one file's frames in memory at a time

        needs_visual_processing = any(
            [
                request.enable_motion,
                request.enable_scenes,
                request.enable_objects,
                request.enable_faces,
                request.enable_ocr,
                request.enable_clip,
            ]
        )

        # Track motion data for adaptive timestamps
        motion_data: dict[int, Any] = {}
        adaptive_timestamps: dict[int, list[float]] = {}

        # Track person timestamps for smart face detection
        person_timestamps: dict[int, list[float]] = {}

        # Skip motion analysis if timestamps are already provided
        has_precomputed_timestamps = bool(request.visual_timestamps)

        if needs_visual_processing:
            start_extractor_timing("visual_processing")
            update_batch_progress(
                "visual_processing",
                "Processing video frames...",
                0,
                total_files,
            )

            for i, file_path in enumerate(files):
                if i in failed_files:
                    continue

                fname = Path(file_path).name
                media_type = get_media_type(file_path)
                file_start = time.time()

                update_batch_progress(
                    "visual_processing",
                    f"Processing {fname}",
                    i + 1,
                    total_files,
                )

                # --- Motion Analysis ---
                if request.enable_motion or (
                    (request.enable_objects or request.enable_faces or request.enable_clip or request.enable_ocr)
                    and not has_precomputed_timestamps
                    and media_type != MediaType.IMAGE
                ):
                    motion_start = time.time()
                    update_extractor_status(i, "motion", "active")
                    try:
                        if media_type == MediaType.IMAGE:
                            motion_data[i] = None
                            adaptive_timestamps[i] = [0.0]
                            update_extractor_status(i, "motion", "completed")
                        else:
                            motion = analyze_motion(file_path)
                            motion_data[i] = motion
                            adaptive_timestamps[i] = get_adaptive_timestamps(motion)

                            # Always store motion data when computed (needed for Pass 2 timestamps)
                            motion_result = {
                                "duration": motion.duration,
                                "fps": motion.fps,
                                "primary_motion": motion.primary_motion.value,
                                "avg_intensity": float(motion.avg_intensity),
                                "is_stable": bool(motion.is_stable),
                                "segments": [
                                    {
                                        "start": seg.start,
                                        "end": seg.end,
                                        "motion_type": seg.motion_type.value,
                                        "intensity": float(seg.intensity),
                                    }
                                    for seg in motion.segments
                                ],
                            }
                            update_file_status(i, "running", "motion", motion_result)
                            update_extractor_status(i, "motion", "completed")
                            logger.info(
                                f"Motion for {fname}: stable={motion.is_stable}, "
                                f"timestamps={len(adaptive_timestamps[i])}"
                            )
                    except Exception as e:
                        logger.warning(f"Motion analysis failed for {file_path}: {e}")
                        update_extractor_status(i, "motion", "failed")
                        motion_data[i] = None
                        # Fallback: generate uniform timestamps from duration.
                        # This ensures visual extractors still run even if motion fails.
                        file_result = batch_jobs[batch_id].files[i]
                        meta = file_result.results.get("metadata")
                        if meta and meta.get("duration"):
                            duration = meta["duration"]
                            # Generate ~10 uniform timestamps
                            num_samples = min(10, max(3, int(duration / 10)))
                            step = duration / (num_samples + 1)
                            fallback_ts = [step * (j + 1) for j in range(num_samples)]
                            adaptive_timestamps[i] = fallback_ts
                            logger.info(f"Using fallback timestamps for {fname}: {num_samples} uniform samples")
                        else:
                            adaptive_timestamps[i] = []
                    update_file_timing(i, "motion", time.time() - motion_start)

                # --- Scene Detection ---
                if request.enable_scenes and media_type != MediaType.IMAGE:
                    scenes_start = time.time()
                    update_extractor_status(i, "scenes", "active")
                    try:
                        scenes = extract_scenes(file_path)
                        update_file_status(
                            i,
                            "running",
                            "scenes",
                            scenes.model_dump() if scenes else None,
                        )
                        update_extractor_status(i, "scenes", "completed")
                    except Exception as e:
                        logger.warning(f"Scenes failed for {file_path}: {e}")
                        update_extractor_status(i, "scenes", "failed")
                    update_file_timing(i, "scenes", time.time() - scenes_start)

                # --- Decode Frames (for Objects, Faces, OCR, CLIP) ---
                buffer: SharedFrameBuffer | None = None
                visual_extractors_needed = any(
                    [
                        request.enable_objects,
                        request.enable_faces,
                        request.enable_ocr,
                        request.enable_clip,
                    ]
                )

                if visual_extractors_needed:
                    decode_start = time.time()
                    update_extractor_status(i, "frame_decode", "active")
                    motion = motion_data.get(i)
                    timestamps = adaptive_timestamps.get(i, [])

                    # Use precomputed timestamps if provided for this file
                    if has_precomputed_timestamps and request.visual_timestamps:
                        file_timestamps = request.visual_timestamps.get(file_path)
                        if file_timestamps:
                            timestamps = file_timestamps

                    # Apply motion-based filtering for stable footage
                    if motion and motion.is_stable and timestamps:
                        timestamps = get_extractor_timestamps(motion.is_stable, motion.avg_intensity, timestamps)

                    # For images, use timestamp 0
                    if media_type == MediaType.IMAGE:
                        timestamps = [0.0]

                    if timestamps:
                        try:
                            buffer = decode_frames(
                                file_path,
                                timestamps=timestamps,
                                max_dimension=1920,
                            )
                            logger.info(f"Decoded {len(buffer.frames)}/{len(timestamps)} frames for {fname}")
                            update_extractor_status(i, "frame_decode", "completed")
                        except Exception as e:
                            logger.warning(f"Frame decode failed for {file_path}: {e}")
                            update_extractor_status(i, "frame_decode", "failed")
                    else:
                        update_extractor_status(i, "frame_decode", "skipped")
                    # Pass frame count as units for per-frame rate calculation
                    num_frames = len(buffer.frames) if buffer else None
                    update_file_timing(i, "frame_decode", time.time() - decode_start, num_frames)

                # --- Objects (YOLO) ---
                if request.enable_objects and buffer is not None:
                    objects_start = time.time()
                    update_extractor_status(i, "objects", "active")
                    try:
                        objects = extract_objects(
                            file_path,
                            frame_buffer=buffer,
                            model_name=yolo_model,
                        )
                        if objects:
                            update_file_status(i, "running", "objects", {"summary": objects.summary})
                            # Collect person timestamps for smart face sampling
                            person_ts = list(set(d.timestamp for d in objects.detections if d.label == "person"))
                            person_timestamps[i] = sorted(person_ts)
                            if person_ts:
                                logger.info(f"Found {len(person_ts)} person frames in {fname}")
                        else:
                            person_timestamps[i] = []
                        update_extractor_status(i, "objects", "completed")
                    except Exception as e:
                        logger.warning(f"Objects failed for {file_path}: {e}")
                        person_timestamps[i] = []
                        update_extractor_status(i, "objects", "failed")
                    # Use number of frames as units for rate calculation
                    num_frames = len(buffer.frames) if buffer else None
                    update_file_timing(i, "objects", time.time() - objects_start, num_frames)

                # --- Faces ---
                if request.enable_faces:
                    faces_start = time.time()
                    face_frame_count: int | None = None
                    update_extractor_status(i, "faces", "active")
                    try:
                        person_ts = person_timestamps.get(i, [])
                        motion = motion_data.get(i)

                        # Get video duration from motion data or metadata results
                        duration = 0.0
                        if motion is not None:
                            duration = motion.duration
                        else:
                            file_result = batch_jobs[batch_id].files[i]
                            if file_result.results.get("metadata"):
                                duration = file_result.results["metadata"].get("duration", 0.0)

                        # Calculate FPS based on motion intensity
                        intensity = motion.avg_intensity if motion else 0.0
                        if intensity >= 10.0:
                            face_fps = 3.0
                        elif intensity >= 6.0:
                            face_fps = 2.0
                        elif intensity >= 2.0:
                            face_fps = 1.5
                        else:
                            face_fps = 1.0

                        # Adaptive face detection for long videos.
                        # Short videos (<60s): process all at once.
                        # Long videos: use batched approach with early exit when faces stabilize.
                        batch_duration = 30.0  # Process 30s at a time
                        verification_interval = 10.0  # Check every 10s once stable
                        min_consistent_batches = 2  # Need 2 batches of same faces to go sparse

                        faces = None
                        face_frame_count = 0
                        all_detections: list[dict[str, Any]] = []
                        known_embeddings: list[list[float]] = []
                        consistent_batches = 0
                        in_verification_mode = False

                        if duration <= 60.0:
                            # Short video - process all at once
                            num_samples = max(1, int(duration * face_fps))
                            step = duration / (num_samples + 1)
                            face_timestamps = [step * (j + 1) for j in range(num_samples)]

                            # Merge with YOLO person timestamps
                            if person_ts:
                                all_ts = sorted(set(person_ts + face_timestamps))
                                merged_ts: list[float] = []
                                for ts in all_ts:
                                    if not merged_ts or ts - merged_ts[-1] >= 0.3:
                                        merged_ts.append(ts)
                                face_timestamps = merged_ts

                            if face_timestamps:
                                face_buffer = decode_frames(file_path, timestamps=face_timestamps)
                                faces = extract_faces(file_path, frame_buffer=face_buffer)
                                face_frame_count = len(face_buffer.frames)
                                logger.info(
                                    f"Face detection on {face_frame_count} frames for {fname} "
                                    f"(short video, {face_fps} FPS)"
                                )
                        else:
                            # Long video - use adaptive batching
                            current_time = 0.0
                            total_frames = 0

                            while current_time < duration:
                                # Determine batch parameters
                                if in_verification_mode:
                                    # Sparse verification: just check every 10s
                                    batch_end = min(current_time + verification_interval, duration)
                                    batch_timestamps = [current_time + verification_interval / 2]
                                else:
                                    # Normal dense sampling
                                    batch_end = min(current_time + batch_duration, duration)
                                    batch_dur = batch_end - current_time
                                    num_batch_samples = max(1, int(batch_dur * face_fps))
                                    step = batch_dur / (num_batch_samples + 1)
                                    batch_timestamps = [current_time + step * (j + 1) for j in range(num_batch_samples)]

                                # Add YOLO person timestamps in this range
                                batch_person_ts = [ts for ts in person_ts if current_time <= ts < batch_end]
                                if batch_person_ts:
                                    all_ts = sorted(set(batch_person_ts + batch_timestamps))
                                    merged_ts = []
                                    for ts in all_ts:
                                        if not merged_ts or ts - merged_ts[-1] >= 0.3:
                                            merged_ts.append(ts)
                                    batch_timestamps = merged_ts

                                # Process this batch
                                if batch_timestamps:
                                    batch_buffer = decode_frames(file_path, timestamps=batch_timestamps)
                                    batch_faces = extract_faces(file_path, frame_buffer=batch_buffer)
                                    total_frames += len(batch_buffer.frames)

                                    if batch_faces and batch_faces.detections:
                                        # Add detections to our collection
                                        for d in batch_faces.detections:
                                            all_detections.append(
                                                {
                                                    "timestamp": d.timestamp,
                                                    "bbox": d.bbox.model_dump(),
                                                    "confidence": d.confidence,
                                                    "embedding": d.embedding,
                                                    "image_base64": d.image_base64,
                                                    "needs_review": d.needs_review,
                                                    "review_reason": d.review_reason,
                                                }
                                            )

                                    # Check if faces are all known
                                    all_known, new_embs = check_faces_are_known(batch_faces, known_embeddings)

                                    if new_embs:
                                        # New faces found - add to known and reset consistency
                                        known_embeddings.extend(new_embs)
                                        consistent_batches = 0
                                        if in_verification_mode:
                                            logger.info(
                                                f"New face detected at {current_time:.1f}s, "
                                                "exiting verification mode"
                                            )
                                            in_verification_mode = False
                                    elif all_known and known_embeddings:
                                        # All faces are known
                                        consistent_batches += 1
                                        if consistent_batches >= min_consistent_batches and not in_verification_mode:
                                            in_verification_mode = True
                                            logger.info(
                                                f"Faces stable after {current_time:.1f}s, "
                                                "switching to verification mode (every 10s)"
                                            )
                                    elif not known_embeddings:
                                        # No faces in this batch and no known faces yet
                                        consistent_batches += 1
                                        if consistent_batches >= min_consistent_batches:
                                            in_verification_mode = True

                                current_time = batch_end

                            face_frame_count = total_frames

                            # Create result from collected detections
                            if all_detections:
                                # Reconstruct FacesResult from batched detections
                                faces = FacesResult(
                                    count=len(all_detections),
                                    unique_estimate=len(known_embeddings),
                                    detections=[
                                        FaceDetection(
                                            timestamp=d["timestamp"],
                                            bbox=BoundingBox(**d["bbox"]),
                                            confidence=d["confidence"],
                                            embedding=d["embedding"],
                                            image_base64=d["image_base64"],
                                            needs_review=d.get("needs_review", False),
                                            review_reason=d.get("review_reason"),
                                        )
                                        for d in all_detections
                                    ],
                                )

                            mode_info = "verification" if in_verification_mode else "normal"
                            logger.info(
                                f"Face detection on {total_frames} frames for {fname} "
                                f"(adaptive batching, {len(known_embeddings)} unique, "
                                f"ended in {mode_info} mode)"
                            )

                        # Fallback if no duration info
                        if faces is None and buffer is not None:
                            faces = extract_faces(file_path, frame_buffer=buffer)
                            face_frame_count = len(buffer.frames)
                            logger.info(
                                f"Face detection on {len(buffer.frames)} frames for {fname} "
                                "(using shared buffer)"
                            )

                        if faces:
                            faces_data = {
                                "count": faces.count,
                                "unique_estimate": faces.unique_estimate,
                                "detections": [
                                    {
                                        "timestamp": d.timestamp,
                                        "bbox": d.bbox.model_dump(),
                                        "confidence": d.confidence,
                                        "embedding": d.embedding,
                                        "image_base64": d.image_base64,
                                        "needs_review": d.needs_review,
                                        "review_reason": d.review_reason,
                                    }
                                    for d in faces.detections
                                ],
                            }
                            update_file_status(i, "running", "faces", faces_data)
                        else:
                            update_file_status(
                                i,
                                "running",
                                "faces",
                                {"count": 0, "unique_estimate": 0, "detections": []},
                            )
                        update_extractor_status(i, "faces", "completed")
                    except Exception as e:
                        logger.warning(f"Faces failed for {file_path}: {e}")
                        update_extractor_status(i, "faces", "failed")
                    update_file_timing(i, "faces", time.time() - faces_start, face_frame_count)

                # --- OCR ---
                if request.enable_ocr and buffer is not None:
                    ocr_start = time.time()
                    update_extractor_status(i, "ocr", "active")
                    try:
                        ocr = extract_ocr(file_path, frame_buffer=buffer)
                        update_file_status(i, "running", "ocr", ocr.model_dump() if ocr else None)
                        update_extractor_status(i, "ocr", "completed")
                    except Exception as e:
                        logger.warning(f"OCR failed for {file_path}: {e}")
                        update_extractor_status(i, "ocr", "failed")
                    num_frames = len(buffer.frames) if buffer else None
                    update_file_timing(i, "ocr", time.time() - ocr_start, num_frames)

                # --- CLIP ---
                if request.enable_clip and buffer is not None:
                    clip_start = time.time()
                    update_extractor_status(i, "clip", "active")
                    try:
                        clip = extract_clip(
                            file_path,
                            frame_buffer=buffer,
                            model_name=clip_model,
                        )
                        if clip:
                            update_file_status(i, "running", "clip", clip.model_dump())
                        else:
                            update_file_status(i, "running", "clip", None)
                        update_extractor_status(i, "clip", "completed")
                    except Exception as e:
                        logger.warning(f"CLIP failed for {file_path}: {e}")
                        update_extractor_status(i, "clip", "failed")
                    num_frames = len(buffer.frames) if buffer else None
                    update_file_timing(i, "clip", time.time() - clip_start, num_frames)

                # --- Release buffer for this file ---
                if buffer is not None:
                    logger.info(f"Releasing frame buffer for {fname}")
                    del buffer
                    gc.collect()

                # Update peak memory after each file
                peak_memory = max(peak_memory, get_memory_mb())

            # Unload all visual models after processing all files
            update_batch_progress("visual_processing", "Unloading models...", None, None)
            if request.enable_objects:
                unload_yolo_model()
            if request.enable_faces:
                unload_face_model()
            if request.enable_ocr:
                unload_ocr_model()
            if request.enable_clip:
                unload_clip_model()

            end_extractor_timing("visual_processing", total_files)

        # Stage 5: Visual (Qwen VLM - scene descriptions).
        # Separate stage because Qwen is very heavy and has its own frame handling.
        if request.enable_visual:
            start_extractor_timing("visual")
            logger.info("Visual enabled (Qwen VLM)")
            clear_memory()
            update_batch_progress("visual", "Loading Qwen model...", 0, total_files)
            logger.info(f"Qwen batch contexts: {request.contexts}")

            for i, file_path in enumerate(files):
                if i in failed_files:
                    update_extractor_status(i, "visual", "skipped")
                    continue
                file_start = time.time()
                fname = Path(file_path).name
                update_batch_progress("visual", f"Analyzing: {fname}", i + 1, total_files)
                update_extractor_status(i, "visual", "active")
                # Get per-file timestamps if provided (declared before try so it's visible after)
                timestamps: list[float] | None = None
                try:
                    motion = motion_data.get(i)
                    if request.visual_timestamps:
                        timestamps = request.visual_timestamps.get(file_path)
                    if timestamps is None and motion:
                        timestamps = get_sample_timestamps(motion, max_samples=5)

                    file_context = request.contexts.get(file_path) if request.contexts else None
                    logger.info(f"Calling Qwen with context for {fname}: {file_context}, lut_path={request.lut_path}")
                    visual_result = extract_objects_qwen(
                        file_path,
                        timestamps=timestamps,
                        model_name=qwen_model,
                        context=file_context,
                        lut_path=request.lut_path,
                    )
                    visual_data: dict[str, Any] = {"summary": visual_result.summary}
                    if visual_result.descriptions:
                        visual_data["descriptions"] = visual_result.descriptions
                    update_file_status(i, "running", "visual", visual_data)
                    update_extractor_status(i, "visual", "completed")
                except Exception as e:
                    logger.warning(f"Visual failed for {file_path}: {e}", exc_info=True)
                    update_extractor_status(i, "visual", "failed")
                    update_file_status(i, "failed", error=str(e))
                    failed_files.add(i)
                # Use number of timestamps as units for rate calculation
                num_timestamps = len(timestamps) if timestamps else None
                update_file_timing(i, "visual", time.time() - file_start, num_timestamps)

            update_batch_progress("visual", "Unloading Qwen model...", None, None)
            unload_qwen_model()
            end_extractor_timing("visual", total_files)

        # Stage 6: Transcript (Whisper - heavy model).
        # Skip for images and files without audio tracks.
        if request.enable_transcript:
            start_extractor_timing("transcript")
            whisper_ran = False  # Track if we actually ran Whisper

            # Check if any files need transcription before loading model
            files_to_transcribe: list[int] = []
            for i, file_path in enumerate(files):
                if i in failed_files:
                    update_extractor_status(i, "transcript", "skipped")
                    continue
                # Skip images
                media_type = get_media_type(file_path)
                if media_type == MediaType.IMAGE:
                    update_extractor_status(i, "transcript", "skipped")
                    continue
                # Check for audio track
                has_audio = True
                with batch_jobs_lock:
                    file_results = batch_jobs[batch_id].files[i].results
                    if file_results and file_results.get("metadata"):
                        if file_results["metadata"].get("audio") is None:
                            has_audio = False
                if has_audio:
                    files_to_transcribe.append(i)
                else:
                    update_extractor_status(i, "transcript", "skipped")

            if files_to_transcribe:
                # Clear memory before loading heavy model
                logger.info("Clearing memory before Whisper...")
                clear_memory()
                update_batch_progress(
                    "transcript",
                    "Loading Whisper model...",
                    0,
                    len(files_to_transcribe),
                )

                for idx, i in enumerate(files_to_transcribe):
                    file_path = files[i]
                    file_start = time.time()
                    update_batch_progress(
                        "transcript",
                        f"Transcribing {Path(file_path).name}",
                        idx + 1,
                        len(files_to_transcribe),
                    )
                    update_extractor_status(i, "transcript", "active")
                    try:
                        transcript = extract_transcript(
                            file_path,
                            model=whisper_model,
                            language=request.language,
                            fallback_language=settings.fallback_language,
                            language_hints=request.language_hints,
                            context_hint=request.context_hint,
                        )
                        update_file_status(
                            i,
                            "running",
                            "transcript",
                            transcript.model_dump() if transcript else None,
                        )
                        update_extractor_status(i, "transcript", "completed")
                        whisper_ran = True
                    except Exception as e:
                        logger.warning(f"Transcript failed for {file_path}: {e}")
                        update_extractor_status(i, "transcript", "failed")
                        update_file_status(i, "failed", error=str(e))
                        failed_files.add(i)
                    # Get duration in minutes for rate calculation
                    duration_minutes: float | None = None
                    with batch_jobs_lock:
                        file_results = batch_jobs[batch_id].files[i].results
                        if file_results and file_results.get("metadata"):
                            duration_sec = file_results["metadata"].get("duration")
                            if duration_sec:
                                duration_minutes = duration_sec / 60.0
                    update_file_timing(i, "transcript", time.time() - file_start, duration_minutes)

                # Unload Whisper to free memory
                if whisper_ran:
                    update_batch_progress("transcript", "Unloading Whisper model...", None, None)
                    unload_whisper_model()
            else:
                logger.info("Skipping Whisper - no files with audio tracks")

            end_extractor_timing("transcript", total_files)

        # Mark files as completed (skip failed files - they stay "failed")
        with batch_jobs_lock:
            for i in range(len(files)):
                if i in failed_files:
                    # File already marked as failed - don't overwrite
                    error_msg = batch_jobs[batch_id].files[i].error or "unknown error"
                    logger.info(f"Batch {batch_id} file {i} marked failed: {error_msg}")
                    continue
                # Log results before marking complete
                result_keys = list(batch_jobs[batch_id].files[i].results.keys())
                logger.info(f"Batch {batch_id} file {i} results before completion: keys={result_keys}")
                batch_jobs[batch_id].files[i].status = "completed"
            batch_jobs[batch_id].status = "completed"
            batch_jobs[batch_id].current_extractor = None
            batch_jobs[batch_id].progress = None
            batch_jobs[batch_id].completed_at = datetime.now(timezone.utc)
            # Final metrics
            batch_jobs[batch_id].elapsed_seconds = round(time.time() - batch_start_time, 2)
            batch_jobs[batch_id].memory_mb = get_memory_mb()
            batch_jobs[batch_id].peak_memory_mb = max(peak_memory, get_memory_mb())

        # Log timing summary
        logger.info(
            f"Batch {batch_id} completed in {batch_jobs[batch_id].elapsed_seconds}s, "
            f"peak memory: {batch_jobs[batch_id].peak_memory_mb}MB"
        )
        for timing in batch_jobs[batch_id].extractor_timings:
            logger.info(f"  {timing.extractor}: {timing.duration_seconds}s ({timing.files_processed} files)")

    except Exception as e:
        logger.error(f"Batch {batch_id} failed: {e}")
        with batch_jobs_lock:
            if batch_id in batch_jobs:
                batch_jobs[batch_id].status = "failed"
                batch_jobs[batch_id].completed_at = datetime.now(timezone.utc)
                batch_jobs[batch_id].elapsed_seconds = round(time.time() - batch_start_time, 2)
                batch_jobs[batch_id].memory_mb = get_memory_mb()
                batch_jobs[batch_id].peak_memory_mb = peak_memory

    finally:
        # Cleanup old batch jobs to free memory
        cleanup_expired_batch_jobs()

        # Clear memory before starting next batch
        logger.info("Clearing memory after batch completion...")
        clear_memory()

        # Always start the next batch from queue (or set batch_running = False)
        start_next_batch()