media-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/clip.py +79 -0
- cli/faces.py +91 -0
- cli/metadata.py +68 -0
- cli/motion.py +77 -0
- cli/objects.py +94 -0
- cli/ocr.py +93 -0
- cli/scenes.py +57 -0
- cli/telemetry.py +65 -0
- cli/transcript.py +76 -0
- media_engine/__init__.py +7 -0
- media_engine/_version.py +34 -0
- media_engine/app.py +80 -0
- media_engine/batch/__init__.py +56 -0
- media_engine/batch/models.py +99 -0
- media_engine/batch/processor.py +1131 -0
- media_engine/batch/queue.py +232 -0
- media_engine/batch/state.py +30 -0
- media_engine/batch/timing.py +321 -0
- media_engine/cli.py +17 -0
- media_engine/config.py +674 -0
- media_engine/extractors/__init__.py +75 -0
- media_engine/extractors/clip.py +401 -0
- media_engine/extractors/faces.py +459 -0
- media_engine/extractors/frame_buffer.py +351 -0
- media_engine/extractors/frames.py +402 -0
- media_engine/extractors/metadata/__init__.py +127 -0
- media_engine/extractors/metadata/apple.py +169 -0
- media_engine/extractors/metadata/arri.py +118 -0
- media_engine/extractors/metadata/avchd.py +208 -0
- media_engine/extractors/metadata/avchd_gps.py +270 -0
- media_engine/extractors/metadata/base.py +688 -0
- media_engine/extractors/metadata/blackmagic.py +139 -0
- media_engine/extractors/metadata/camera_360.py +276 -0
- media_engine/extractors/metadata/canon.py +290 -0
- media_engine/extractors/metadata/dji.py +371 -0
- media_engine/extractors/metadata/dv.py +121 -0
- media_engine/extractors/metadata/ffmpeg.py +76 -0
- media_engine/extractors/metadata/generic.py +119 -0
- media_engine/extractors/metadata/gopro.py +256 -0
- media_engine/extractors/metadata/red.py +305 -0
- media_engine/extractors/metadata/registry.py +114 -0
- media_engine/extractors/metadata/sony.py +442 -0
- media_engine/extractors/metadata/tesla.py +157 -0
- media_engine/extractors/motion.py +765 -0
- media_engine/extractors/objects.py +245 -0
- media_engine/extractors/objects_qwen.py +754 -0
- media_engine/extractors/ocr.py +268 -0
- media_engine/extractors/scenes.py +82 -0
- media_engine/extractors/shot_type.py +217 -0
- media_engine/extractors/telemetry.py +262 -0
- media_engine/extractors/transcribe.py +579 -0
- media_engine/extractors/translate.py +121 -0
- media_engine/extractors/vad.py +263 -0
- media_engine/main.py +68 -0
- media_engine/py.typed +0 -0
- media_engine/routers/__init__.py +15 -0
- media_engine/routers/batch.py +78 -0
- media_engine/routers/health.py +93 -0
- media_engine/routers/models.py +211 -0
- media_engine/routers/settings.py +87 -0
- media_engine/routers/utils.py +135 -0
- media_engine/schemas.py +581 -0
- media_engine/utils/__init__.py +5 -0
- media_engine/utils/logging.py +54 -0
- media_engine/utils/memory.py +49 -0
- media_engine-0.1.0.dist-info/METADATA +276 -0
- media_engine-0.1.0.dist-info/RECORD +70 -0
- media_engine-0.1.0.dist-info/WHEEL +4 -0
- media_engine-0.1.0.dist-info/entry_points.txt +11 -0
- media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0

media_engine/extractors/objects_qwen.py
@@ -0,0 +1,754 @@
"""Object detection using Qwen2-VL vision-language model."""

import json
import logging
import os
import shutil
import tempfile
from collections.abc import Callable
from pathlib import Path
from typing import Any

import torch

from media_engine.config import (
    DeviceType,
    get_device,
    get_free_memory_gb,
    get_settings,
)
from media_engine.extractors.frames import FrameExtractor
from media_engine.schemas import BoundingBox, ObjectDetection, ObjectsResult

logger = logging.getLogger(__name__)

# Progress callback type: (message, current, total) -> None
ProgressCallback = Callable[[str, int | None, int | None], None]

# Singleton model instances (lazy loaded, stays in memory between calls)
_qwen_model: Any = None
_qwen_processor: Any = None
_qwen_model_name: str | None = None
_qwen_device: str | None = None


def unload_qwen_model() -> None:
    """Unload Qwen model from memory to free GPU/MPS memory."""
    global _qwen_model, _qwen_processor, _qwen_model_name, _qwen_device

    if _qwen_model is not None:
        logger.info("Unloading Qwen model from memory")

        # Move model to CPU first to release MPS memory
        try:
            _qwen_model.to("cpu")
        except Exception:
            pass

        del _qwen_model
        del _qwen_processor
        _qwen_model = None
        _qwen_processor = None
        _qwen_model_name = None
        _qwen_device = None

        import gc

        gc.collect()

        # Free GPU memory with sync
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
        if hasattr(torch, "mps"):
            if hasattr(torch.mps, "synchronize"):
                torch.mps.synchronize()
            if hasattr(torch.mps, "empty_cache"):
                torch.mps.empty_cache()

        gc.collect()


# Known LOG/HDR color transfer characteristics
# These indicate footage that needs color correction to look "normal"
LOG_COLOR_TRANSFERS = {
    # HDR transfer functions
    "arib-std-b67",  # HLG (Hybrid Log-Gamma)
    "smpte2084",  # PQ (Perceptual Quantizer) / HDR10
    "smpte428",  # DCI-P3
    # Manufacturer LOG profiles (as they appear in ffmpeg metadata)
    "log",  # Generic log
    "slog",  # Sony S-Log
    "slog2",  # Sony S-Log2
    "slog3",  # Sony S-Log3
    "vlog",  # Panasonic V-Log
    "clog",  # Canon C-Log
    "clog2",  # Canon C-Log2
    "clog3",  # Canon C-Log3
    "dlog",  # DJI D-Log
    "dlog-m",  # DJI D-Log M
    "hlg",  # HLG
    "n-log",  # Nikon N-Log
    "f-log",  # Fujifilm F-Log
    "f-log2",  # Fujifilm F-Log2
    "blackmagic",  # Blackmagic Film
    "arri",  # ARRI Log C
    "logc",  # ARRI Log C
    "redlogfilm",  # RED Log Film
}


def _is_log_color_space(color_transfer: str | None) -> bool:
    """Check if the color transfer characteristic indicates LOG/HDR footage.

    Args:
        color_transfer: The color transfer characteristic from video metadata
            (e.g., "arib-std-b67", "smpte2084", "bt709")

    Returns:
        True if the footage appears to be in a LOG/flat/HDR color space
        that would benefit from color correction before viewing.
    """
    if not color_transfer:
        return False

    # Normalize to lowercase for comparison
    ct_lower = color_transfer.lower().replace("_", "-").replace(" ", "")

    # Check for exact matches first
    if ct_lower in LOG_COLOR_TRANSFERS:
        return True

    # Check for partial matches (e.g., "s-log3" contains "log")
    log_keywords = ["log", "hlg", "pq", "hdr", "dci-p3"]
    for keyword in log_keywords:
        if keyword in ct_lower:
            return True

    return False


def _get_qwen_model(
    model_name: str,
    progress_callback: ProgressCallback | None = None,
) -> tuple[Any, Any, str]:
    """Get or create the Qwen model and processor (singleton).

    Returns (model, processor, device_str).
    Raises RuntimeError/MemoryError if model cannot be loaded (e.g., OOM).
    Model stays loaded in memory for subsequent calls.
    """
    global _qwen_model, _qwen_processor, _qwen_model_name, _qwen_device

    # Return cached model if same model requested
    if _qwen_model is not None and _qwen_model_name == model_name:
        logger.info(f"Reusing cached Qwen model: {model_name}")
        return _qwen_model, _qwen_processor, _qwen_device  # type: ignore

    # Log memory status (informational only - let PyTorch handle OOM)
    free_memory = get_free_memory_gb()
    model_memory_gb = 15.0 if "7B" in model_name else 5.0
    logger.info(f"Free memory: {free_memory:.1f}GB, model needs: ~{model_memory_gb:.0f}GB")

    # Clear existing GPU memory before loading
    import gc

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
    if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
        try:
            torch.mps.synchronize()
            torch.mps.empty_cache()
        except Exception as e:
            logger.warning(f"Failed to clear MPS cache: {e}")
    gc.collect()

    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration  # type: ignore[import-not-found]

    # Determine device
    device = get_device()
    if device == DeviceType.MPS:
        torch_device = "mps"
        torch_dtype = torch.float16
    elif device == DeviceType.CUDA:
        torch_device = "cuda"
        torch_dtype = torch.float16
    else:
        torch_device = "cpu"
        torch_dtype = torch.float32

    logger.info(f"Loading Qwen2-VL model: {model_name} on {torch_device}")
    if progress_callback:
        progress_callback("Loading Qwen model...", None, None)

    # Disable tqdm progress bars and warnings to avoid BrokenPipeError when running as daemon
    import transformers  # type: ignore[import-not-found]

    transformers.logging.disable_progress_bar()
    transformers.logging.set_verbosity_error()  # Suppress info/warning output
    os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
    os.environ["TRANSFORMERS_VERBOSITY"] = "error"

    # Load model and processor with detailed error handling
    try:
        logger.info("Loading Qwen2VLForConditionalGeneration...")

        # For MPS (Apple Silicon), don't use device_map at all
        # device_map triggers accelerate's meta tensor handling which fails on MPS
        if torch_device == "mps":
            _qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                # No device_map - load directly to CPU without accelerate dispatch
            )
            logger.info("Moving model to MPS...")
            _qwen_model = _qwen_model.to("mps")
        elif torch_device == "cuda":
            # CUDA works fine with device_map
            _qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                device_map="cuda",
            )
        else:
            # CPU - no device_map needed
            _qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
            )

        logger.info("Qwen model loaded, loading processor...")
        _qwen_processor = AutoProcessor.from_pretrained(model_name)
        logger.info("Qwen processor loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load Qwen model: {e}", exc_info=True)
        raise

    _qwen_model_name = model_name
    _qwen_device = torch_device

    return _qwen_model, _qwen_processor, torch_device


def _build_analysis_prompt(context: dict[str, str] | None = None) -> str:
    """Build the analysis prompt, optionally including context."""
    base_prompt = """Look at this image carefully and describe what you see.

List all visible objects and write a brief description of the scene.

You MUST respond with ONLY this exact JSON format:
{"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}

Rules for objects:
- Be specific: "scissors" not "tool", "laptop" not "device"
- Include people as "person" or "man"/"woman"
- Only list clearly visible objects

Rules for description:
- Describe what's happening
- Mention the setting/environment
- Keep it to 1-2 sentences

Respond with JSON only, no other text."""

    if not context:
        return base_prompt

    # Build context section
    context_lines = ["Known context about this video:"]

    # Map context keys to human-readable labels
    labels = {
        "person": "Person identified",
        "location": "Location",
        "nearby_landmarks": "Nearby landmarks/POIs",
        "activity": "Activity",
        "language": "Language spoken",
        "device": "Filmed with",
        "topic": "Topic/Subject",
        "organization": "Organization",
        "event": "Event",
    }

    # Handle log footage note separately (not as a bullet point)
    log_footage_note = context.get("log_footage_note", "")

    for key, value in context.items():
        if value and key != "log_footage_note":
            label = labels.get(key, key.replace("_", " ").title())
            context_lines.append(f"- {label}: {value}")

    context_section = "\n".join(context_lines)

    # Get person name for explicit instruction
    person_name = context.get("person", "")
    person_instruction = ""
    if person_name:
        person_instruction = f"""
IMPORTANT: The person in this video is "{person_name}".
- In objects list: use "{person_name}" instead of "person", "man", or "woman"
- In description: refer to them as "{person_name}", not "a person" or "someone"
"""

    # Get nearby landmarks for naming instruction
    nearby_landmarks = context.get("nearby_landmarks", "")
    landmark_instruction = ""
    if nearby_landmarks:
        landmark_instruction = f"""
IMPORTANT: This location has these nearby landmarks: {nearby_landmarks}
- If you see any of these landmarks, use their PROPER NAME in the description
- Example: say "Alnes fyr lighthouse" not just "a lighthouse"
- Example: say "Eiffel Tower" not just "a tower"
"""

    # Add log footage instruction if applicable
    log_instruction = ""
    if log_footage_note:
        log_instruction = f"""
NOTE: {log_footage_note}
- Focus on describing the content and action, not the color grading
"""

    # Enhanced prompt with context
    return f"""{context_section}
{person_instruction}{landmark_instruction}{log_instruction}
Look at this image carefully and describe what you see.

You MUST respond with ONLY this exact JSON format:
{{"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}}

Rules for objects:
- Be specific: "scissors" not "tool", "laptop" not "device"
- If a person is visible and identified above, use their name ("{person_name}") not "person"
- If a known landmark is visible, use its proper name from the context
- Only list clearly visible objects

Rules for description:
- Use "{person_name}" if they are visible
- Use proper landmark names if visible
- Describe what's happening in the scene
- Keep it to 1-2 sentences

Respond with JSON only, no other text."""


def extract_objects_qwen(
    file_path: str,
    timestamps: list[float] | None = None,
    model_name: str | None = None,
    context: dict[str, str] | None = None,
    progress_callback: ProgressCallback | None = None,
    lut_path: str | None = None,
) -> ObjectsResult:
    """Extract objects using Qwen2-VL vision-language model.

    Much more accurate than YOLO for contextual understanding.

    Args:
        file_path: Path to video file
        timestamps: Specific timestamps to analyze. If None, samples from middle.
        model_name: Qwen model name (default from config)
        context: Optional context from earlier extraction steps, e.g.:
            - "person": Name of identified person
            - "location": Where this was filmed
            - "activity": What's happening (e.g., "tutorial", "interview")
            - "language": Language spoken in the video
            - "device": Camera/device used
            - "topic": Subject matter of the video
        progress_callback: Optional callback for progress updates (message, current, total)
        lut_path: Optional path to a LUT file (.cube) to apply for log footage color correction

    Returns:
        ObjectsResult with detected objects and contextual descriptions
    """
    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]

    logger.info(f"extract_objects_qwen called: file={file_path}, timestamps={timestamps}, context={context}")

    settings = get_settings()
    # Resolve model name (handles "auto")
    model_name = model_name or settings.get_qwen_model()
    logger.info(f"Using Qwen model: {model_name}")

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Video file not found: {file_path}")

    # Create temp directory for frames
    temp_dir = tempfile.mkdtemp(prefix="polybos_qwen_")

    try:
        # Use provided timestamps, or default to middle of video
        if timestamps is None:
            duration = _get_video_duration(file_path)
            timestamps = [duration / 2]
            logger.info(f"No timestamps provided, sampling from middle ({duration/2:.1f}s)")
        else:
            logger.info(f"Analyzing {len(timestamps)} provided timestamps")

        # Check for LOG/HDR color space from metadata
        color_transfer = context.get("color_transfer") if context else None
        is_log_footage = _is_log_color_space(color_transfer)

        # Add context hint for log footage
        if context is None:
            context = {}
        else:
            context = context.copy()  # Don't modify the original

        if lut_path and os.path.exists(lut_path):
            # LUT applied - colors are corrected but may still be slightly off
            context["log_footage_note"] = (
                "This footage was recorded in LOG profile and color-corrected with a LUT. " "Colors shown are the corrected version but may still appear slightly desaturated."
            )
            logger.info("Added log footage context hint (with LUT)")
        elif is_log_footage:
            # LOG detected but no LUT - colors are definitely off
            context["log_footage_note"] = (
                f"This footage appears to be in LOG/flat color profile ({color_transfer}). "
                "Colors are desaturated and not representative of the actual scene. "
                "Focus on describing content and action, not colors."
            )
            logger.info(f"Added log footage context hint (no LUT, color_transfer={color_transfer})")

        # IMPORTANT: Extract frames BEFORE loading the model!
        # ffmpeg can crash (SIGABRT) when forked from a process with MPS/Metal loaded.
        if progress_callback:
            progress_callback("Extracting frames...", None, None)
        frame_paths = _extract_frames_at_timestamps(file_path, temp_dir, timestamps, lut_path=lut_path)
        total_frames = len([p for p in frame_paths if p])

        if total_frames == 0:
            logger.warning(f"No frames could be extracted from {file_path} at timestamps {timestamps}")
            return ObjectsResult(summary={}, detections=[], descriptions=None)

        # Now load the model (after ffmpeg has finished)
        # If this fails due to OOM, the exception propagates up
        try:
            model, processor, torch_device = _get_qwen_model(model_name, progress_callback)
        except (RuntimeError, MemoryError, OSError) as e:
            error_msg = str(e).lower()
            if "out of memory" in error_msg or "cannot allocate" in error_msg:
                logger.error(f"Out of memory loading Qwen model. " f"Close other apps or use a cloud vision API. Error: {e}")
                # Return empty result - frontend can fall back to cloud API if configured
                return ObjectsResult(
                    summary={},
                    detections=[],
                    descriptions=None,
                    error="out_of_memory",
                )
            raise  # Re-raise other errors

        logger.info(f"Processing {total_frames} frames for Qwen analysis")

        all_objects: dict[str, int] = {}
        detections: list[ObjectDetection] = []
        descriptions: list[str] = []
        frame_count = 0

        for frame_path, timestamp in zip(frame_paths, timestamps):
            if not frame_path or not os.path.exists(frame_path):
                logger.warning(f"Skipping missing frame at {timestamp}s: {frame_path}")
                continue

            frame_count += 1
            if progress_callback:
                progress_callback(
                    f"Analyzing frame {frame_count}/{total_frames}...",
                    frame_count,
                    total_frames,
                )

            try:
                # Build the prompt with optional context
                prompt = _build_analysis_prompt(context)

                # Log prompt on first frame for debugging
                if frame_count == 1:
                    logger.info(f"Qwen prompt: {prompt[:500]}")

                # Prepare message for Qwen - ask for both objects and description
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": f"file://{frame_path}"},
                            {"type": "text", "text": prompt},
                        ],
                    }
                ]

                # Process inputs
                text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                )
                inputs = inputs.to(torch_device)

                # Generate response with repetition penalty to prevent loops
                with torch.no_grad():
                    generated_ids = model.generate(
                        **inputs,
                        max_new_tokens=512,
                        do_sample=False,  # Greedy decoding for consistent JSON
                        repetition_penalty=1.2,  # Penalize repetition
                        no_repeat_ngram_size=3,  # Prevent 3-gram repetition
                    )
                generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
                output_text = processor.batch_decode(
                    generated_ids_trimmed,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )[0]

                # Parse response
                logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
                objects, description = _parse_objects_and_description(output_text)
                if not description:
                    logger.warning(f"No description parsed from Qwen output at {timestamp:.1f}s")
                for obj in objects:
                    obj_lower = obj.lower().strip()
                    all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1

                    detections.append(
                        ObjectDetection(
                            timestamp=round(timestamp, 2),
                            label=obj_lower,
                            confidence=0.95,  # VLM confidence is generally high
                            bbox=BoundingBox(x=0, y=0, width=0, height=0),  # No bbox from VLM
                        )
                    )

                if description:
                    descriptions.append(description)
                    logger.info(f"Frame {timestamp:.1f}s description: {description}")

                logger.info(f"Frame {timestamp:.1f}s objects: {objects}")

                # Clear memory after each frame
                del inputs, generated_ids
                if torch_device == "mps":
                    torch.mps.empty_cache()
                elif torch_device == "cuda":
                    torch.cuda.empty_cache()

            except Exception as e:
                logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
                # Try to recover memory
                if torch_device == "mps":
                    torch.mps.empty_cache()
                continue

        # Deduplicate - count unique objects per type
        unique_objects = _deduplicate_objects(all_objects)

        logger.info(f"Qwen detected {len(unique_objects)} unique object types, {len(descriptions)} descriptions")

        return ObjectsResult(
            summary=unique_objects,
            detections=detections,
            descriptions=descriptions if descriptions else None,
        )

    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)


def _get_video_duration(file_path: str) -> float:
    """Get video/image duration in seconds (0 for images)."""
    from media_engine.extractors.frames import get_video_duration

    return get_video_duration(file_path)


def _extract_frames_at_timestamps(
    file_path: str,
    output_dir: str,
    timestamps: list[float],
    max_width: int = 1280,
    lut_path: str | None = None,
) -> list[str]:
    """Extract frames at specific timestamps, resized for VLM inference.

    Uses FrameExtractor which handles both videos (via OpenCV/ffmpeg)
    and images (via direct loading). When a LUT path is provided, uses
    ffmpeg directly to apply the LUT during extraction.

    Args:
        file_path: Path to video/image file
        output_dir: Directory to save extracted frames
        timestamps: List of timestamps to extract (in seconds)
        max_width: Maximum width for scaling (default 1280)
        lut_path: Optional path to a .cube LUT file for color correction
    """
    import subprocess

    import cv2

    frame_paths: list[str] = []

    logger.info(f"Extracting {len(timestamps)} frames from {file_path} at timestamps {timestamps}")

    # If LUT is provided, use ffmpeg directly for extraction with LUT applied
    if lut_path and os.path.exists(lut_path):
        logger.info(f"Applying LUT: {lut_path}")
        for i, ts in enumerate(timestamps):
            output_path = os.path.join(output_dir, f"frame_{i:04d}.jpg")
            try:
                # Build filter chain: LUT + scale
                scale_filter = f"scale={max_width}:{max_width}:force_original_aspect_ratio=decrease"
                lut_filter = f"lut3d='{lut_path}'"
                vf = f"{lut_filter},{scale_filter}"

                cmd = [
                    "ffmpeg",
                    "-y",
                    "-ss",
                    str(ts),
                    "-i",
                    file_path,
                    "-vf",
                    vf,
                    "-frames:v",
                    "1",
                    "-update",
                    "1",
                    "-q:v",
                    "2",
                    output_path,
                ]
                subprocess.run(cmd, capture_output=True, check=True)

                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                    frame_paths.append(output_path)
                    logger.info(f"Extracted frame {i} at {ts:.2f}s with LUT: {output_path}")
                else:
                    logger.warning(f"Frame at {ts:.2f}s: could not extract with LUT")
                    frame_paths.append("")
            except subprocess.CalledProcessError as e:
                logger.warning(f"Frame at {ts:.2f}s: ffmpeg failed: {e}")
                frame_paths.append("")
    else:
        # Standard extraction without LUT
        with FrameExtractor(file_path, max_dimension=max_width) as extractor:
            for i, ts in enumerate(timestamps):
                output_path = os.path.join(output_dir, f"frame_{i:04d}.jpg")
                frame = extractor.get_frame_at(ts)

                if frame is not None:
                    # Save frame as JPEG with moderate quality for VLM
                    cv2.imwrite(output_path, frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
                    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                        frame_paths.append(output_path)
                        logger.info(f"Extracted frame {i} at {ts:.2f}s: {output_path}")
                    else:
                        logger.warning(f"Frame at {ts:.2f}s: could not save to {output_path}")
                        frame_paths.append("")
                else:
                    logger.warning(f"Frame at {ts:.2f}s: extraction failed")
                    frame_paths.append("")

    successful = sum(1 for p in frame_paths if p)
    logger.info(f"Frame extraction complete: {successful}/{len(timestamps)} frames extracted")
    return frame_paths


def _parse_objects_and_description(response: str) -> tuple[list[str], str | None]:
    """Parse objects and description from Qwen response."""
    objects: list[str] = []
    description: str | None = None

    # Try to find and parse JSON
    try:
        # Remove markdown code block markers
        clean_response = response.replace("```json", "").replace("```", "").strip()

        # Try to parse as JSON (could be object or array)
        if "[" in clean_response or "{" in clean_response:
            # Find the JSON portion
            start_bracket = clean_response.find("[")
            start_brace = clean_response.find("{")

            if start_bracket >= 0 and (start_brace < 0 or start_bracket < start_brace):
                # Array format - find matching ]
                json_str = clean_response[start_bracket : clean_response.rindex("]") + 1]
                data = json.loads(json_str)

                # Array of objects - take the first non-empty one
                if isinstance(data, list):
                    for item in data:
                        if isinstance(item, dict):
                            raw_objects = item.get("objects", [])
                            for obj in raw_objects:
                                if isinstance(obj, str) and len(obj) < 100 and obj.strip():
                                    objects.append(obj)
                                elif isinstance(obj, dict):
                                    # Handle nested format: {"name": "person"}
                                    name = obj.get("name", "") or obj.get("label", "")
                                    if isinstance(name, str) and len(name) < 100 and name.strip():
                                        objects.append(name)
                            desc = item.get("description", "")
                            if isinstance(desc, str) and len(desc) > 10 and not description:
                                description = desc.strip()
                return objects, description

            # Single object format
            if start_brace >= 0:
                json_str = clean_response[start_brace : clean_response.rindex("}") + 1]
                data = json.loads(json_str)

                # Extract objects - handle both string and dict formats
                raw_objects = data.get("objects", [])
                for obj in raw_objects:
                    if isinstance(obj, str) and len(obj) < 100 and obj.strip():
                        objects.append(obj)
                    elif isinstance(obj, dict):
                        # Handle nested format: {"name": "person", "position": "..."}
                        name = obj.get("name", "") or obj.get("label", "")
                        if isinstance(name, str) and len(name) < 100 and name.strip():
                            objects.append(name)

                # Extract description
                desc = data.get("description", "")
                if isinstance(desc, str) and len(desc) > 10:
                    description = desc.strip()

                return objects, description
    except (json.JSONDecodeError, ValueError) as e:
        logger.warning(f"Failed to parse JSON from Qwen response: {e}")
        logger.debug(f"Response was: {response[:500]}")

    # Fallback: try to extract objects from plain text
    for line in response.split("\n"):
        line = line.strip().strip("-").strip("*").strip()
        # Skip JSON artifacts and code block markers
        if not line or line.startswith("{") or line.startswith("}"):
            continue
        if line.startswith("```") or line.startswith('"objects"'):
            continue
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1].rstrip(",")

        if len(line) > 50 or "[" in line or ":" in line:
            continue

        parts = [p.strip().strip('"').strip("'") for p in line.split(",")]
        objects.extend([p for p in parts if p and len(p) < 50])

    return objects, description


def _deduplicate_objects(objects: dict[str, int]) -> dict[str, int]:
    """Deduplicate object counts.

    If an object appears in multiple frames, it's likely the same instance.
    Returns count of 1 for each unique object type.
    """
    return {obj: 1 for obj in objects.keys()}
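
For orientation, a minimal usage sketch of the new extract_objects_qwen entry point. This is not part of the wheel; the file path, timestamps, and context values below are illustrative, and only the parameters and return fields visible in the module above are assumed.

# Hypothetical caller: analyze two points of a clip, passing context gathered by
# earlier extractors; without timestamps, the middle of the video is sampled.
from media_engine.extractors.objects_qwen import extract_objects_qwen, unload_qwen_model

result = extract_objects_qwen(
    "example_clip.mp4",                # illustrative path
    timestamps=[5.0, 12.5],
    context={"person": "Jane Doe", "color_transfer": "arib-std-b67"},
    lut_path=None,                     # or a .cube LUT to color-correct LOG footage
)
print(result.summary)       # e.g. {"jane doe": 1, "laptop": 1} (labels are lowercased)
print(result.descriptions)  # one scene description per analyzed frame, or None

unload_qwen_model()  # release the cached model once the batch is finished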