media_engine-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. cli/clip.py +79 -0
  2. cli/faces.py +91 -0
  3. cli/metadata.py +68 -0
  4. cli/motion.py +77 -0
  5. cli/objects.py +94 -0
  6. cli/ocr.py +93 -0
  7. cli/scenes.py +57 -0
  8. cli/telemetry.py +65 -0
  9. cli/transcript.py +76 -0
  10. media_engine/__init__.py +7 -0
  11. media_engine/_version.py +34 -0
  12. media_engine/app.py +80 -0
  13. media_engine/batch/__init__.py +56 -0
  14. media_engine/batch/models.py +99 -0
  15. media_engine/batch/processor.py +1131 -0
  16. media_engine/batch/queue.py +232 -0
  17. media_engine/batch/state.py +30 -0
  18. media_engine/batch/timing.py +321 -0
  19. media_engine/cli.py +17 -0
  20. media_engine/config.py +674 -0
  21. media_engine/extractors/__init__.py +75 -0
  22. media_engine/extractors/clip.py +401 -0
  23. media_engine/extractors/faces.py +459 -0
  24. media_engine/extractors/frame_buffer.py +351 -0
  25. media_engine/extractors/frames.py +402 -0
  26. media_engine/extractors/metadata/__init__.py +127 -0
  27. media_engine/extractors/metadata/apple.py +169 -0
  28. media_engine/extractors/metadata/arri.py +118 -0
  29. media_engine/extractors/metadata/avchd.py +208 -0
  30. media_engine/extractors/metadata/avchd_gps.py +270 -0
  31. media_engine/extractors/metadata/base.py +688 -0
  32. media_engine/extractors/metadata/blackmagic.py +139 -0
  33. media_engine/extractors/metadata/camera_360.py +276 -0
  34. media_engine/extractors/metadata/canon.py +290 -0
  35. media_engine/extractors/metadata/dji.py +371 -0
  36. media_engine/extractors/metadata/dv.py +121 -0
  37. media_engine/extractors/metadata/ffmpeg.py +76 -0
  38. media_engine/extractors/metadata/generic.py +119 -0
  39. media_engine/extractors/metadata/gopro.py +256 -0
  40. media_engine/extractors/metadata/red.py +305 -0
  41. media_engine/extractors/metadata/registry.py +114 -0
  42. media_engine/extractors/metadata/sony.py +442 -0
  43. media_engine/extractors/metadata/tesla.py +157 -0
  44. media_engine/extractors/motion.py +765 -0
  45. media_engine/extractors/objects.py +245 -0
  46. media_engine/extractors/objects_qwen.py +754 -0
  47. media_engine/extractors/ocr.py +268 -0
  48. media_engine/extractors/scenes.py +82 -0
  49. media_engine/extractors/shot_type.py +217 -0
  50. media_engine/extractors/telemetry.py +262 -0
  51. media_engine/extractors/transcribe.py +579 -0
  52. media_engine/extractors/translate.py +121 -0
  53. media_engine/extractors/vad.py +263 -0
  54. media_engine/main.py +68 -0
  55. media_engine/py.typed +0 -0
  56. media_engine/routers/__init__.py +15 -0
  57. media_engine/routers/batch.py +78 -0
  58. media_engine/routers/health.py +93 -0
  59. media_engine/routers/models.py +211 -0
  60. media_engine/routers/settings.py +87 -0
  61. media_engine/routers/utils.py +135 -0
  62. media_engine/schemas.py +581 -0
  63. media_engine/utils/__init__.py +5 -0
  64. media_engine/utils/logging.py +54 -0
  65. media_engine/utils/memory.py +49 -0
  66. media_engine-0.1.0.dist-info/METADATA +276 -0
  67. media_engine-0.1.0.dist-info/RECORD +70 -0
  68. media_engine-0.1.0.dist-info/WHEEL +4 -0
  69. media_engine-0.1.0.dist-info/entry_points.txt +11 -0
  70. media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/extractors/objects_qwen.py
@@ -0,0 +1,754 @@
+ """Object detection using Qwen2-VL vision-language model."""
+
+ import json
+ import logging
+ import os
+ import shutil
+ import tempfile
+ from collections.abc import Callable
+ from pathlib import Path
+ from typing import Any
+
+ import torch
+
+ from media_engine.config import (
+     DeviceType,
+     get_device,
+     get_free_memory_gb,
+     get_settings,
+ )
+ from media_engine.extractors.frames import FrameExtractor
+ from media_engine.schemas import BoundingBox, ObjectDetection, ObjectsResult
+
+ logger = logging.getLogger(__name__)
+
+ # Progress callback type: (message, current, total) -> None
+ ProgressCallback = Callable[[str, int | None, int | None], None]
+
+ # Singleton model instances (lazy loaded, stays in memory between calls)
+ _qwen_model: Any = None
+ _qwen_processor: Any = None
+ _qwen_model_name: str | None = None
+ _qwen_device: str | None = None
+
+
+ def unload_qwen_model() -> None:
+     """Unload Qwen model from memory to free GPU/MPS memory."""
+     global _qwen_model, _qwen_processor, _qwen_model_name, _qwen_device
+
+     if _qwen_model is not None:
+         logger.info("Unloading Qwen model from memory")
+
+         # Move model to CPU first to release MPS memory
+         try:
+             _qwen_model.to("cpu")
+         except Exception:
+             pass
+
+         del _qwen_model
+         del _qwen_processor
+         _qwen_model = None
+         _qwen_processor = None
+         _qwen_model_name = None
+         _qwen_device = None
+
+         import gc
+
+         gc.collect()
+
+         # Free GPU memory with sync
+         if torch.cuda.is_available():
+             torch.cuda.synchronize()
+             torch.cuda.empty_cache()
+         if hasattr(torch, "mps"):
+             if hasattr(torch.mps, "synchronize"):
+                 torch.mps.synchronize()
+             if hasattr(torch.mps, "empty_cache"):
+                 torch.mps.empty_cache()
+
+         gc.collect()
+
+
+ # Known LOG/HDR color transfer characteristics
+ # These indicate footage that needs color correction to look "normal"
+ LOG_COLOR_TRANSFERS = {
+     # HDR transfer functions
+     "arib-std-b67",  # HLG (Hybrid Log-Gamma)
+     "smpte2084",  # PQ (Perceptual Quantizer) / HDR10
+     "smpte428",  # DCI-P3
+     # Manufacturer LOG profiles (as they appear in ffmpeg metadata)
+     "log",  # Generic log
+     "slog",  # Sony S-Log
+     "slog2",  # Sony S-Log2
+     "slog3",  # Sony S-Log3
+     "vlog",  # Panasonic V-Log
+     "clog",  # Canon C-Log
+     "clog2",  # Canon C-Log2
+     "clog3",  # Canon C-Log3
+     "dlog",  # DJI D-Log
+     "dlog-m",  # DJI D-Log M
+     "hlg",  # HLG
+     "n-log",  # Nikon N-Log
+     "f-log",  # Fujifilm F-Log
+     "f-log2",  # Fujifilm F-Log2
+     "blackmagic",  # Blackmagic Film
+     "arri",  # ARRI Log C
+     "logc",  # ARRI Log C
+     "redlogfilm",  # RED Log Film
+ }
+
+
+ def _is_log_color_space(color_transfer: str | None) -> bool:
+     """Check if the color transfer characteristic indicates LOG/HDR footage.
+
+     Args:
+         color_transfer: The color transfer characteristic from video metadata
+             (e.g., "arib-std-b67", "smpte2084", "bt709")
+
+     Returns:
+         True if the footage appears to be in a LOG/flat/HDR color space
+         that would benefit from color correction before viewing.
+     """
+     if not color_transfer:
+         return False
+
+     # Normalize to lowercase for comparison
+     ct_lower = color_transfer.lower().replace("_", "-").replace(" ", "")
+
+     # Check for exact matches first
+     if ct_lower in LOG_COLOR_TRANSFERS:
+         return True
+
+     # Check for partial matches (e.g., "s-log3" contains "log")
+     log_keywords = ["log", "hlg", "pq", "hdr", "dci-p3"]
+     for keyword in log_keywords:
+         if keyword in ct_lower:
+             return True
+
+     return False
+
+
+ def _get_qwen_model(
+     model_name: str,
+     progress_callback: ProgressCallback | None = None,
+ ) -> tuple[Any, Any, str]:
+     """Get or create the Qwen model and processor (singleton).
+
+     Returns (model, processor, device_str).
+     Raises RuntimeError/MemoryError if model cannot be loaded (e.g., OOM).
+     Model stays loaded in memory for subsequent calls.
+     """
+     global _qwen_model, _qwen_processor, _qwen_model_name, _qwen_device
+
+     # Return cached model if same model requested
+     if _qwen_model is not None and _qwen_model_name == model_name:
+         logger.info(f"Reusing cached Qwen model: {model_name}")
+         return _qwen_model, _qwen_processor, _qwen_device  # type: ignore
+
+     # Log memory status (informational only - let PyTorch handle OOM)
+     free_memory = get_free_memory_gb()
+     model_memory_gb = 15.0 if "7B" in model_name else 5.0
+     logger.info(f"Free memory: {free_memory:.1f}GB, model needs: ~{model_memory_gb:.0f}GB")
+
+     # Clear existing GPU memory before loading
+     import gc
+
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.synchronize()
+         torch.cuda.empty_cache()
+     if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
+         try:
+             torch.mps.synchronize()
+             torch.mps.empty_cache()
+         except Exception as e:
+             logger.warning(f"Failed to clear MPS cache: {e}")
+     gc.collect()
+
+     from transformers import AutoProcessor, Qwen2VLForConditionalGeneration  # type: ignore[import-not-found]
+
+     # Determine device
+     device = get_device()
+     if device == DeviceType.MPS:
+         torch_device = "mps"
+         torch_dtype = torch.float16
+     elif device == DeviceType.CUDA:
+         torch_device = "cuda"
+         torch_dtype = torch.float16
+     else:
+         torch_device = "cpu"
+         torch_dtype = torch.float32
+
+     logger.info(f"Loading Qwen2-VL model: {model_name} on {torch_device}")
+     if progress_callback:
+         progress_callback("Loading Qwen model...", None, None)
+
+     # Disable tqdm progress bars and warnings to avoid BrokenPipeError when running as daemon
+     import transformers  # type: ignore[import-not-found]
+
+     transformers.logging.disable_progress_bar()
+     transformers.logging.set_verbosity_error()  # Suppress info/warning output
+     os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
+     os.environ["TRANSFORMERS_VERBOSITY"] = "error"
+
+     # Load model and processor with detailed error handling
+     try:
+         logger.info("Loading Qwen2VLForConditionalGeneration...")
+
+         # For MPS (Apple Silicon), don't use device_map at all
+         # device_map triggers accelerate's meta tensor handling which fails on MPS
+         if torch_device == "mps":
+             _qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                 model_name,
+                 torch_dtype=torch_dtype,
+                 # No device_map - load directly to CPU without accelerate dispatch
+             )
+             logger.info("Moving model to MPS...")
+             _qwen_model = _qwen_model.to("mps")
+         elif torch_device == "cuda":
+             # CUDA works fine with device_map
+             _qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                 model_name,
+                 torch_dtype=torch_dtype,
+                 device_map="cuda",
+             )
+         else:
+             # CPU - no device_map needed
+             _qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                 model_name,
+                 torch_dtype=torch_dtype,
+             )
+
+         logger.info("Qwen model loaded, loading processor...")
+         _qwen_processor = AutoProcessor.from_pretrained(model_name)
+         logger.info("Qwen processor loaded successfully")
+     except Exception as e:
+         logger.error(f"Failed to load Qwen model: {e}", exc_info=True)
+         raise
+
+     _qwen_model_name = model_name
+     _qwen_device = torch_device
+
+     return _qwen_model, _qwen_processor, torch_device
+
+
+ def _build_analysis_prompt(context: dict[str, str] | None = None) -> str:
+     """Build the analysis prompt, optionally including context."""
+     base_prompt = """Look at this image carefully and describe what you see.
+
+ List all visible objects and write a brief description of the scene.
+
+ You MUST respond with ONLY this exact JSON format:
+ {"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}
+
+ Rules for objects:
+ - Be specific: "scissors" not "tool", "laptop" not "device"
+ - Include people as "person" or "man"/"woman"
+ - Only list clearly visible objects
+
+ Rules for description:
+ - Describe what's happening
+ - Mention the setting/environment
+ - Keep it to 1-2 sentences
+
+ Respond with JSON only, no other text."""
+
+     if not context:
+         return base_prompt
+
+     # Build context section
+     context_lines = ["Known context about this video:"]
+
+     # Map context keys to human-readable labels
+     labels = {
+         "person": "Person identified",
+         "location": "Location",
+         "nearby_landmarks": "Nearby landmarks/POIs",
+         "activity": "Activity",
+         "language": "Language spoken",
+         "device": "Filmed with",
+         "topic": "Topic/Subject",
+         "organization": "Organization",
+         "event": "Event",
+     }
+
+     # Handle log footage note separately (not as a bullet point)
+     log_footage_note = context.get("log_footage_note", "")
+
+     for key, value in context.items():
+         if value and key != "log_footage_note":
+             label = labels.get(key, key.replace("_", " ").title())
+             context_lines.append(f"- {label}: {value}")
+
+     context_section = "\n".join(context_lines)
+
+     # Get person name for explicit instruction
+     person_name = context.get("person", "")
+     person_instruction = ""
+     if person_name:
+         person_instruction = f"""
+ IMPORTANT: The person in this video is "{person_name}".
+ - In objects list: use "{person_name}" instead of "person", "man", or "woman"
+ - In description: refer to them as "{person_name}", not "a person" or "someone"
+ """
+
+     # Get nearby landmarks for naming instruction
+     nearby_landmarks = context.get("nearby_landmarks", "")
+     landmark_instruction = ""
+     if nearby_landmarks:
+         landmark_instruction = f"""
+ IMPORTANT: This location has these nearby landmarks: {nearby_landmarks}
+ - If you see any of these landmarks, use their PROPER NAME in the description
+ - Example: say "Alnes fyr lighthouse" not just "a lighthouse"
+ - Example: say "Eiffel Tower" not just "a tower"
+ """
+
+     # Add log footage instruction if applicable
+     log_instruction = ""
+     if log_footage_note:
+         log_instruction = f"""
+ NOTE: {log_footage_note}
+ - Focus on describing the content and action, not the color grading
+ """
+
+     # Enhanced prompt with context
+     return f"""{context_section}
+ {person_instruction}{landmark_instruction}{log_instruction}
+ Look at this image carefully and describe what you see.
+
+ You MUST respond with ONLY this exact JSON format:
+ {{"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}}
+
+ Rules for objects:
+ - Be specific: "scissors" not "tool", "laptop" not "device"
+ - If a person is visible and identified above, use their name ("{person_name}") not "person"
+ - If a known landmark is visible, use its proper name from the context
+ - Only list clearly visible objects
+
+ Rules for description:
+ - Use "{person_name}" if they are visible
+ - Use proper landmark names if visible
+ - Describe what's happening in the scene
+ - Keep it to 1-2 sentences
+
+ Respond with JSON only, no other text."""
+
+
+ def extract_objects_qwen(
+     file_path: str,
+     timestamps: list[float] | None = None,
+     model_name: str | None = None,
+     context: dict[str, str] | None = None,
+     progress_callback: ProgressCallback | None = None,
+     lut_path: str | None = None,
+ ) -> ObjectsResult:
+     """Extract objects using Qwen2-VL vision-language model.
+
+     Much more accurate than YOLO for contextual understanding.
+
+     Args:
+         file_path: Path to video file
+         timestamps: Specific timestamps to analyze. If None, samples from middle.
+         model_name: Qwen model name (default from config)
+         context: Optional context from earlier extraction steps, e.g.:
+             - "person": Name of identified person
+             - "location": Where this was filmed
+             - "activity": What's happening (e.g., "tutorial", "interview")
+             - "language": Language spoken in the video
+             - "device": Camera/device used
+             - "topic": Subject matter of the video
+         progress_callback: Optional callback for progress updates (message, current, total)
+         lut_path: Optional path to a LUT file (.cube) to apply for log footage color correction
+
+     Returns:
+         ObjectsResult with detected objects and contextual descriptions
+     """
+     from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]
+
+     logger.info(f"extract_objects_qwen called: file={file_path}, timestamps={timestamps}, context={context}")
+
+     settings = get_settings()
+     # Resolve model name (handles "auto")
+     model_name = model_name or settings.get_qwen_model()
+     logger.info(f"Using Qwen model: {model_name}")
+
+     path = Path(file_path)
+     if not path.exists():
+         raise FileNotFoundError(f"Video file not found: {file_path}")
+
+     # Create temp directory for frames
+     temp_dir = tempfile.mkdtemp(prefix="polybos_qwen_")
+
+     try:
+         # Use provided timestamps, or default to middle of video
+         if timestamps is None:
+             duration = _get_video_duration(file_path)
+             timestamps = [duration / 2]
+             logger.info(f"No timestamps provided, sampling from middle ({duration/2:.1f}s)")
+         else:
+             logger.info(f"Analyzing {len(timestamps)} provided timestamps")
+
+         # Check for LOG/HDR color space from metadata
+         color_transfer = context.get("color_transfer") if context else None
+         is_log_footage = _is_log_color_space(color_transfer)
+
+         # Add context hint for log footage
+         if context is None:
+             context = {}
+         else:
+             context = context.copy()  # Don't modify the original
+
+         if lut_path and os.path.exists(lut_path):
+             # LUT applied - colors are corrected but may still be slightly off
+             context["log_footage_note"] = (
+                 "This footage was recorded in LOG profile and color-corrected with a LUT. "
+                 "Colors shown are the corrected version but may still appear slightly desaturated."
+             )
+             logger.info("Added log footage context hint (with LUT)")
+         elif is_log_footage:
+             # LOG detected but no LUT - colors are definitely off
+             context["log_footage_note"] = (
+                 f"This footage appears to be in LOG/flat color profile ({color_transfer}). "
+                 "Colors are desaturated and not representative of the actual scene. "
+                 "Focus on describing content and action, not colors."
+             )
+             logger.info(f"Added log footage context hint (no LUT, color_transfer={color_transfer})")
+
+         # IMPORTANT: Extract frames BEFORE loading the model!
+         # ffmpeg can crash (SIGABRT) when forked from a process with MPS/Metal loaded.
+         if progress_callback:
+             progress_callback("Extracting frames...", None, None)
+         frame_paths = _extract_frames_at_timestamps(file_path, temp_dir, timestamps, lut_path=lut_path)
+         total_frames = len([p for p in frame_paths if p])
+
+         if total_frames == 0:
+             logger.warning(f"No frames could be extracted from {file_path} at timestamps {timestamps}")
+             return ObjectsResult(summary={}, detections=[], descriptions=None)
+
+         # Now load the model (after ffmpeg has finished)
+         # If this fails due to OOM, the exception propagates up
+         try:
+             model, processor, torch_device = _get_qwen_model(model_name, progress_callback)
+         except (RuntimeError, MemoryError, OSError) as e:
+             error_msg = str(e).lower()
+             if "out of memory" in error_msg or "cannot allocate" in error_msg:
+                 logger.error(f"Out of memory loading Qwen model. Close other apps or use a cloud vision API. Error: {e}")
+                 # Return empty result - frontend can fall back to cloud API if configured
+                 return ObjectsResult(
+                     summary={},
+                     detections=[],
+                     descriptions=None,
+                     error="out_of_memory",
+                 )
+             raise  # Re-raise other errors
+
+         logger.info(f"Processing {total_frames} frames for Qwen analysis")
+
+         all_objects: dict[str, int] = {}
+         detections: list[ObjectDetection] = []
+         descriptions: list[str] = []
+         frame_count = 0
+
+         for frame_path, timestamp in zip(frame_paths, timestamps):
+             if not frame_path or not os.path.exists(frame_path):
+                 logger.warning(f"Skipping missing frame at {timestamp}s: {frame_path}")
+                 continue
+
+             frame_count += 1
+             if progress_callback:
+                 progress_callback(
+                     f"Analyzing frame {frame_count}/{total_frames}...",
+                     frame_count,
+                     total_frames,
+                 )
+
+             try:
+                 # Build the prompt with optional context
+                 prompt = _build_analysis_prompt(context)
+
+                 # Log prompt on first frame for debugging
+                 if frame_count == 1:
+                     logger.info(f"Qwen prompt: {prompt[:500]}")
+
+                 # Prepare message for Qwen - ask for both objects and description
+                 messages = [
+                     {
+                         "role": "user",
+                         "content": [
+                             {"type": "image", "image": f"file://{frame_path}"},
+                             {"type": "text", "text": prompt},
+                         ],
+                     }
+                 ]
+
+                 # Process inputs
+                 text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                 image_inputs, video_inputs = process_vision_info(messages)
+                 inputs = processor(
+                     text=[text],
+                     images=image_inputs,
+                     videos=video_inputs,
+                     padding=True,
+                     return_tensors="pt",
+                 )
+                 inputs = inputs.to(torch_device)
+
+                 # Generate response with repetition penalty to prevent loops
+                 with torch.no_grad():
+                     generated_ids = model.generate(
+                         **inputs,
+                         max_new_tokens=512,
+                         do_sample=False,  # Greedy decoding for consistent JSON
+                         repetition_penalty=1.2,  # Penalize repetition
+                         no_repeat_ngram_size=3,  # Prevent 3-gram repetition
+                     )
+                 generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+                 output_text = processor.batch_decode(
+                     generated_ids_trimmed,
+                     skip_special_tokens=True,
+                     clean_up_tokenization_spaces=False,
+                 )[0]
+
+                 # Parse response
+                 logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
+                 objects, description = _parse_objects_and_description(output_text)
+                 if not description:
+                     logger.warning(f"No description parsed from Qwen output at {timestamp:.1f}s")
+                 for obj in objects:
+                     obj_lower = obj.lower().strip()
+                     all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+
+                     detections.append(
+                         ObjectDetection(
+                             timestamp=round(timestamp, 2),
+                             label=obj_lower,
+                             confidence=0.95,  # VLM confidence is generally high
+                             bbox=BoundingBox(x=0, y=0, width=0, height=0),  # No bbox from VLM
+                         )
+                     )
+
+                 if description:
+                     descriptions.append(description)
+                     logger.info(f"Frame {timestamp:.1f}s description: {description}")
+
+                 logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
+
+                 # Clear memory after each frame
+                 del inputs, generated_ids
+                 if torch_device == "mps":
+                     torch.mps.empty_cache()
+                 elif torch_device == "cuda":
+                     torch.cuda.empty_cache()
+
+             except Exception as e:
+                 logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
+                 # Try to recover memory
+                 if torch_device == "mps":
+                     torch.mps.empty_cache()
+                 continue
+
+         # Deduplicate - count unique objects per type
+         unique_objects = _deduplicate_objects(all_objects)
+
+         logger.info(f"Qwen detected {len(unique_objects)} unique object types, {len(descriptions)} descriptions")
+
+         return ObjectsResult(
+             summary=unique_objects,
+             detections=detections,
+             descriptions=descriptions if descriptions else None,
+         )
+
+     finally:
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+ def _get_video_duration(file_path: str) -> float:
+     """Get video/image duration in seconds (0 for images)."""
+     from media_engine.extractors.frames import get_video_duration
+
+     return get_video_duration(file_path)
+
+
+ def _extract_frames_at_timestamps(
+     file_path: str,
+     output_dir: str,
+     timestamps: list[float],
+     max_width: int = 1280,
+     lut_path: str | None = None,
+ ) -> list[str]:
+     """Extract frames at specific timestamps, resized for VLM inference.
+
+     Uses FrameExtractor which handles both videos (via OpenCV/ffmpeg)
+     and images (via direct loading). When a LUT path is provided, uses
+     ffmpeg directly to apply the LUT during extraction.
+
+     Args:
+         file_path: Path to video/image file
+         output_dir: Directory to save extracted frames
+         timestamps: List of timestamps to extract (in seconds)
+         max_width: Maximum width for scaling (default 1280)
+         lut_path: Optional path to a .cube LUT file for color correction
+     """
+     import subprocess
+
+     import cv2
+
+     frame_paths: list[str] = []
+
+     logger.info(f"Extracting {len(timestamps)} frames from {file_path} at timestamps {timestamps}")
+
+     # If LUT is provided, use ffmpeg directly for extraction with LUT applied
+     if lut_path and os.path.exists(lut_path):
+         logger.info(f"Applying LUT: {lut_path}")
+         for i, ts in enumerate(timestamps):
+             output_path = os.path.join(output_dir, f"frame_{i:04d}.jpg")
+             try:
+                 # Build filter chain: LUT + scale
+                 scale_filter = f"scale={max_width}:{max_width}:force_original_aspect_ratio=decrease"
+                 lut_filter = f"lut3d='{lut_path}'"
+                 vf = f"{lut_filter},{scale_filter}"
+
+                 cmd = [
+                     "ffmpeg",
+                     "-y",
+                     "-ss",
+                     str(ts),
+                     "-i",
+                     file_path,
+                     "-vf",
+                     vf,
+                     "-frames:v",
+                     "1",
+                     "-update",
+                     "1",
+                     "-q:v",
+                     "2",
+                     output_path,
+                 ]
+                 subprocess.run(cmd, capture_output=True, check=True)
+
+                 if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                     frame_paths.append(output_path)
+                     logger.info(f"Extracted frame {i} at {ts:.2f}s with LUT: {output_path}")
+                 else:
+                     logger.warning(f"Frame at {ts:.2f}s: could not extract with LUT")
+                     frame_paths.append("")
+             except subprocess.CalledProcessError as e:
+                 logger.warning(f"Frame at {ts:.2f}s: ffmpeg failed: {e}")
+                 frame_paths.append("")
+     else:
+         # Standard extraction without LUT
+         with FrameExtractor(file_path, max_dimension=max_width) as extractor:
+             for i, ts in enumerate(timestamps):
+                 output_path = os.path.join(output_dir, f"frame_{i:04d}.jpg")
+                 frame = extractor.get_frame_at(ts)
+
+                 if frame is not None:
+                     # Save frame as JPEG with moderate quality for VLM
+                     cv2.imwrite(output_path, frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
+                     if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                         frame_paths.append(output_path)
+                         logger.info(f"Extracted frame {i} at {ts:.2f}s: {output_path}")
+                     else:
+                         logger.warning(f"Frame at {ts:.2f}s: could not save to {output_path}")
+                         frame_paths.append("")
+                 else:
+                     logger.warning(f"Frame at {ts:.2f}s: extraction failed")
+                     frame_paths.append("")
+
+     successful = sum(1 for p in frame_paths if p)
+     logger.info(f"Frame extraction complete: {successful}/{len(timestamps)} frames extracted")
+     return frame_paths
+
+
+ def _parse_objects_and_description(response: str) -> tuple[list[str], str | None]:
+     """Parse objects and description from Qwen response."""
+     objects: list[str] = []
+     description: str | None = None
+
+     # Try to find and parse JSON
+     try:
+         # Remove markdown code block markers
+         clean_response = response.replace("```json", "").replace("```", "").strip()
+
+         # Try to parse as JSON (could be object or array)
+         if "[" in clean_response or "{" in clean_response:
+             # Find the JSON portion
+             start_bracket = clean_response.find("[")
+             start_brace = clean_response.find("{")
+
+             if start_bracket >= 0 and (start_brace < 0 or start_bracket < start_brace):
+                 # Array format - find matching ]
+                 json_str = clean_response[start_bracket : clean_response.rindex("]") + 1]
+                 data = json.loads(json_str)
+
+                 # Array of objects - take the first non-empty one
+                 if isinstance(data, list):
+                     for item in data:
+                         if isinstance(item, dict):
+                             raw_objects = item.get("objects", [])
+                             for obj in raw_objects:
+                                 if isinstance(obj, str) and len(obj) < 100 and obj.strip():
+                                     objects.append(obj)
+                                 elif isinstance(obj, dict):
+                                     # Handle nested format: {"name": "person"}
+                                     name = obj.get("name", "") or obj.get("label", "")
+                                     if isinstance(name, str) and len(name) < 100 and name.strip():
+                                         objects.append(name)
+                             desc = item.get("description", "")
+                             if isinstance(desc, str) and len(desc) > 10 and not description:
+                                 description = desc.strip()
+                 return objects, description
+
+             # Single object format
+             if start_brace >= 0:
+                 json_str = clean_response[start_brace : clean_response.rindex("}") + 1]
+                 data = json.loads(json_str)
+
+                 # Extract objects - handle both string and dict formats
+                 raw_objects = data.get("objects", [])
+                 for obj in raw_objects:
+                     if isinstance(obj, str) and len(obj) < 100 and obj.strip():
+                         objects.append(obj)
+                     elif isinstance(obj, dict):
+                         # Handle nested format: {"name": "person", "position": "..."}
+                         name = obj.get("name", "") or obj.get("label", "")
+                         if isinstance(name, str) and len(name) < 100 and name.strip():
+                             objects.append(name)
+
+                 # Extract description
+                 desc = data.get("description", "")
+                 if isinstance(desc, str) and len(desc) > 10:
+                     description = desc.strip()
+
+                 return objects, description
+     except (json.JSONDecodeError, ValueError) as e:
+         logger.warning(f"Failed to parse JSON from Qwen response: {e}")
+         logger.debug(f"Response was: {response[:500]}")
+
+     # Fallback: try to extract objects from plain text
+     for line in response.split("\n"):
+         line = line.strip().strip("-").strip("*").strip()
+         # Skip JSON artifacts and code block markers
+         if not line or line.startswith("{") or line.startswith("}"):
+             continue
+         if line.startswith("```") or line.startswith('"objects"'):
+             continue
+         if line.startswith('"') and line.endswith('"'):
+             line = line[1:-1].rstrip(",")
+
+         if len(line) > 50 or "[" in line or ":" in line:
+             continue
+
+         parts = [p.strip().strip('"').strip("'") for p in line.split(",")]
+         objects.extend([p for p in parts if p and len(p) < 50])
+
+     return objects, description
+
+
+ def _deduplicate_objects(objects: dict[str, int]) -> dict[str, int]:
+     """Deduplicate object counts.
+
+     If an object appears in multiple frames, it's likely the same instance.
+     Returns count of 1 for each unique object type.
+     """
+     return {obj: 1 for obj in objects.keys()}
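
For orientation, the module's public surface is extract_objects_qwen (returns an ObjectsResult with an object summary, per-frame detections, and optional scene descriptions) and unload_qwen_model (releases GPU/MPS memory). A minimal usage sketch follows; it assumes the wheel is installed and a Qwen2-VL checkpoint is reachable, and the file name, timestamps, and context values are placeholders rather than anything shipped with the package:

from media_engine.extractors.objects_qwen import extract_objects_qwen, unload_qwen_model


def report(message: str, current: int | None, total: int | None) -> None:
    # Matches the module's ProgressCallback signature: (message, current, total) -> None.
    print(f"{message} ({current}/{total})" if total else message)


# "clip.mp4" and the context values below are illustrative placeholders.
result = extract_objects_qwen(
    "clip.mp4",
    timestamps=[1.0, 30.0],  # omit to sample a single frame from the middle of the video
    context={"person": "Jane Doe", "location": "Oslo, Norway"},
    progress_callback=report,
)
print(result.summary)       # e.g. {"jane doe": 1, "laptop": 1}; labels are lowercased
print(result.descriptions)  # per-frame scene descriptions, or None
unload_qwen_model()         # free GPU/MPS memory once analysis is done

For LOG/HDR footage, a .cube LUT can be passed via lut_path so frames are color-corrected before they are sent to the model.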