media-engine 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- media_engine/_version.py +2 -2
- media_engine/batch/models.py +9 -0
- media_engine/batch/processor.py +14 -12
- media_engine/batch/timing.py +1 -1
- media_engine/config.py +91 -19
- media_engine/extractors/faces.py +1 -1
- media_engine/extractors/frame_buffer.py +1 -1
- media_engine/extractors/frames.py +2 -2
- media_engine/extractors/metadata/sony.py +1 -1
- media_engine/extractors/motion.py +4 -4
- media_engine/extractors/objects.py +1 -1
- media_engine/extractors/objects_qwen.py +845 -147
- media_engine/extractors/ocr.py +1 -1
- media_engine/extractors/transcribe.py +1 -1
- media_engine/extractors/vad.py +1 -1
- media_engine/routers/settings.py +2 -0
- media_engine/schemas.py +2 -0
- {media_engine-0.1.1.dist-info → media_engine-0.2.1.dist-info}/METADATA +1 -1
- {media_engine-0.1.1.dist-info → media_engine-0.2.1.dist-info}/RECORD +22 -22
- {media_engine-0.1.1.dist-info → media_engine-0.2.1.dist-info}/WHEEL +0 -0
- {media_engine-0.1.1.dist-info → media_engine-0.2.1.dist-info}/entry_points.txt +0 -0
- {media_engine-0.1.1.dist-info → media_engine-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,6 +3,7 @@
 import json
 import logging
 import os
+import re
 import shutil
 import tempfile
 from collections.abc import Callable
@@ -13,6 +14,8 @@ import torch
 
 from media_engine.config import (
     DeviceType,
+    QwenStrategy,
+    get_auto_qwen_batch_size,
     get_device,
     get_free_memory_gb,
     get_settings,
@@ -234,24 +237,18 @@ def _get_qwen_model(
 
 def _build_analysis_prompt(context: dict[str, str] | None = None) -> str:
     """Build the analysis prompt, optionally including context."""
-    base_prompt = """
+    base_prompt = """Describe what you see in this image. List main objects and write a short description.
 
-
+JSON format:
+{"objects": ["object1", "object2"], "description": "scene description"}
 
-
-{"objects": ["
-
-Rules for objects:
-- Be specific: "scissors" not "tool", "laptop" not "device"
-- Include people as "person" or "man"/"woman"
-- Only list clearly visible objects
+If the image is unclear, use:
+{"objects": [], "description": "unknown", "error": "reason why"}
 
-
-
-- Mention the setting/environment
-- Keep it to 1-2 sentences
+Example:
+{"objects": ["mountain", "ocean", "lighthouse"], "description": "A lighthouse on a rocky coast with mountains in the background."}
 
-Respond with JSON only
+Respond with JSON only. Describe what you CAN see."""
 
     if not context:
         return base_prompt
@@ -309,11 +306,19 @@ IMPORTANT: This location has these nearby landmarks: {nearby_landmarks}
         log_instruction = f"""
 NOTE: {log_footage_note}
 - Focus on describing the content and action, not the color grading
+"""
+
+    # Add topic/activity instruction if provided
+    topic = context.get("topic", "") or context.get("activity", "")
+    topic_instruction = ""
+    if topic:
+        topic_instruction = f"""
+IMPORTANT: This video shows "{topic}". Use this context to interpret the action.
 """
 
     # Enhanced prompt with context
     return f"""{context_section}
-{person_instruction}{landmark_instruction}{log_instruction}
+{person_instruction}{landmark_instruction}{log_instruction}{topic_instruction}
 Look at this image carefully and describe what you see.
 
 You MUST respond with ONLY this exact JSON format:
@@ -334,6 +339,746 @@ Rules for description:
 Respond with JSON only, no other text."""
 
 
+def _build_context_prompt(
+    context: dict[str, str] | None = None,
+    previous_description: str | None = None,
+) -> str:
+    """Build prompt for CONTEXT strategy - includes previous frame description."""
+    base_prompt = _build_analysis_prompt(context)
+
+    if not previous_description:
+        return base_prompt
+
+    # Insert previous frame context before the analysis request
+    context_insert = f"""
+Previous frame showed: {previous_description}
+
+Describe what's happening NOW and how it relates to the previous frame.
+Focus on: objects visible, actions occurring, any changes from before.
+
+"""
+    # Modify the JSON format to include "change" field
+    modified_prompt = base_prompt.replace(
+        '{"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}',
+        '{"objects": ["item1", "item2"], "description": "What\'s happening now.", "change": "How this differs from the previous frame."}',
+    )
+
+    # Insert context after any existing context section but before "Look at this image"
+    if "Look at this image" in modified_prompt:
+        parts = modified_prompt.split("Look at this image")
+        return parts[0] + context_insert + "Look at this image" + parts[1]
+
+    return context_insert + modified_prompt
+
+
+def _build_batch_prompt(
+    context: dict[str, str] | None = None,
+    num_frames: int = 3,
+) -> str:
+    """Build prompt for BATCH strategy - analyzes multiple frames together."""
+    # Get person name from context for instructions
+    person_name = context.get("person", "") if context else ""
+
+    # Build context section if available
+    context_section = ""
+    topic_hint = ""
+    if context:
+        context_lines = ["Known context about this video:"]
+        labels = {
+            "person": "Person identified",
+            "location": "Location",
+            "nearby_landmarks": "Nearby landmarks/POIs",
+            "activity": "Activity",
+            "topic": "Activity/Subject",
+            "language": "Language spoken",
+            "device": "Filmed with",
+        }
+        for key, value in context.items():
+            if value and key not in ("log_footage_note", "color_transfer"):
+                label = labels.get(key, key.replace("_", " ").title())
+                context_lines.append(f"- {label}: {value}")
+            # Capture topic for special instruction
+            if key in ("topic", "activity") and value:
+                topic_hint = value
+        context_section = "\n".join(context_lines) + "\n\n"
+
+    person_instruction = ""
+    if person_name:
+        person_instruction = f'Use "{person_name}" instead of "person" in objects and description.\n'
+
+    # Add topic instruction if provided
+    topic_instruction = ""
+    if topic_hint:
+        topic_instruction = f'IMPORTANT: This video shows "{topic_hint}". Use this context to interpret what you see.\n'
+
+    return f"""{context_section}These {num_frames} frames are from a video.
+{person_instruction}{topic_instruction}
+Describe what you see. List main objects and write a short description.
+
+JSON format:
+{{"objects": ["object1", "object2"], "action": "what is happening", "description": "scene description"}}
+
+If the image is unclear or you cannot identify content, use:
+{{"objects": [], "action": "unknown", "description": "unknown", "error": "reason why"}}
+
+Example:
+{{"objects": ["bus", "road", "mountain"], "action": "bus driving", "description": "A bus on a coastal road with mountains."}}
+
+Respond with JSON only. Describe what you CAN see, even if partial."""
+
+
+def _build_batch_context_prompt(
+    context: dict[str, str] | None = None,
+    num_frames: int = 3,
+    group_context: str | None = None,
+) -> str:
+    """Build prompt for BATCH_CONTEXT strategy - batch with previous group context."""
+    base_prompt = _build_batch_prompt(context, num_frames)
+
+    if not group_context:
+        return base_prompt
+
+    context_insert = f"""Previous scene: {group_context}
+
+What happens next in these frames? How does it continue from before?
+
+"""
+    # Modify JSON format to include "continues" field
+    modified_prompt = base_prompt.replace(
+        '{"objects": ["item1", "item2"], "action": "The action happening across frames", "description": "Overall scene description"}',
+        '{"objects": ["item1", "item2"], "action": "The action in these frames", "description": "Scene description", "continues": "How this continues from the previous scene"}',
+    )
+
+    # Insert after context section but before "These X frames"
+    if "These " in modified_prompt and " frames are" in modified_prompt:
+        idx = modified_prompt.find("These ")
+        return modified_prompt[:idx] + context_insert + modified_prompt[idx:]
+
+    return context_insert + modified_prompt
+
+
+def _analyze_frames_single(
+    model: Any,
+    processor: Any,
+    torch_device: str,
+    frame_paths: list[str],
+    timestamps: list[float],
+    context: dict[str, str] | None,
+    progress_callback: ProgressCallback | None,
+) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+    """Analyze frames one at a time without temporal context (original behavior)."""
+    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]
+
+    all_objects: dict[str, int] = {}
+    detections: list[ObjectDetection] = []
+    descriptions: list[str] = []
+
+    total_frames = len([p for p in frame_paths if p])
+    frame_count = 0
+
+    for frame_path, timestamp in zip(frame_paths, timestamps):
+        if not frame_path or not os.path.exists(frame_path):
+            continue
+
+        frame_count += 1
+        if progress_callback:
+            progress_callback(
+                f"Analyzing frame {frame_count}/{total_frames}...",
+                frame_count,
+                total_frames,
+            )
+
+        try:
+            prompt = _build_analysis_prompt(context)
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": f"file://{frame_path}"},
+                        {"type": "text", "text": prompt},
+                    ],
+                }
+            ]
+
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+            inputs = inputs.to(torch_device)
+
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    do_sample=False,
+                    repetition_penalty=1.2,
+                    no_repeat_ngram_size=3,
+                )
+            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+            output_text = processor.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )[0]
+
+            logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
+            objects, description = _parse_objects_and_description(output_text)
+
+            for obj in objects:
+                obj_lower = obj.lower().strip()
+                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+                detections.append(
+                    ObjectDetection(
+                        timestamp=round(timestamp, 2),
+                        label=obj_lower,
+                        confidence=0.95,
+                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
+                    )
+                )
+
+            if description:
+                descriptions.append(description)
+                logger.info(f"Frame {timestamp:.1f}s description: {description}")
+
+            logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
+
+            del inputs, generated_ids
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+            elif torch_device == "cuda":
+                torch.cuda.empty_cache()
+
+        except Exception as e:
+            logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+
+    return all_objects, detections, descriptions
+
+
+def _analyze_frames_with_context(
+    model: Any,
+    processor: Any,
+    torch_device: str,
+    frame_paths: list[str],
+    timestamps: list[float],
+    context: dict[str, str] | None,
+    progress_callback: ProgressCallback | None,
+) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+    """Analyze frames sequentially, passing previous description as context."""
+    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]
+
+    all_objects: dict[str, int] = {}
+    detections: list[ObjectDetection] = []
+    descriptions: list[str] = []
+
+    total_frames = len([p for p in frame_paths if p])
+    frame_count = 0
+    previous_description: str | None = None
+
+    for frame_path, timestamp in zip(frame_paths, timestamps):
+        if not frame_path or not os.path.exists(frame_path):
+            continue
+
+        frame_count += 1
+        if progress_callback:
+            progress_callback(
+                f"Analyzing frame {frame_count}/{total_frames} (with context)...",
+                frame_count,
+                total_frames,
+            )
+
+        try:
+            # Build prompt with previous frame's description as context
+            prompt = _build_context_prompt(context, previous_description)
+
+            if frame_count == 1:
+                logger.info(f"Qwen context prompt (first frame): {prompt[:500]}")
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": f"file://{frame_path}"},
+                        {"type": "text", "text": prompt},
+                    ],
+                }
+            ]
+
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+            inputs = inputs.to(torch_device)
+
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    do_sample=False,
+                    repetition_penalty=1.2,
+                    no_repeat_ngram_size=3,
+                )
+            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+            output_text = processor.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )[0]
+
+            logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
+            objects, description = _parse_objects_and_description(output_text)
+
+            for obj in objects:
+                obj_lower = obj.lower().strip()
+                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+                detections.append(
+                    ObjectDetection(
+                        timestamp=round(timestamp, 2),
+                        label=obj_lower,
+                        confidence=0.95,
+                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
+                    )
+                )
+
+            if description:
+                descriptions.append(description)
+                previous_description = description  # Pass to next frame
+                logger.info(f"Frame {timestamp:.1f}s description: {description}")
+
+            logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
+
+            del inputs, generated_ids
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+            elif torch_device == "cuda":
+                torch.cuda.empty_cache()
+
+        except Exception as e:
+            logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+
+    return all_objects, detections, descriptions
+
+
+def _analyze_frames_batch(
+    model: Any,
+    processor: Any,
+    torch_device: str,
+    frame_paths: list[str],
+    timestamps: list[float],
+    context: dict[str, str] | None,
+    progress_callback: ProgressCallback | None,
+    batch_size: int | None = None,
+    overlap: bool = False,
+) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+    """Analyze frames in batches for temporal understanding."""
+    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]
+
+    all_objects: dict[str, int] = {}
+    detections: list[ObjectDetection] = []
+    descriptions: list[str] = []
+
+    # Auto-select batch size based on available memory
+    if batch_size is None:
+        batch_size = get_auto_qwen_batch_size()
+
+    # Filter to valid frames
+    valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
+    if not valid_frames:
+        return all_objects, detections, descriptions
+
+    # Group frames into batches
+    # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
+    # Without overlap: sequential non-overlapping batches (faster)
+    batches: list[list[tuple[str, float]]] = []
+    step = max(1, batch_size - 1) if overlap else batch_size
+    for i in range(0, len(valid_frames), step):
+        batch = valid_frames[i : i + batch_size]
+        if overlap:
+            if len(batch) >= 2:  # Need at least 2 frames for temporal analysis
+                batches.append(batch)
+            elif not batches:  # Edge case: very few frames
+                batches.append(batch)
+        else:
+            batches.append(batch)
+
+    total_batches = len(batches)
+    overlap_str = "overlapping " if overlap else ""
+    logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches (size={batch_size}, step={step})")
+
+    for batch_idx, batch in enumerate(batches):
+        if progress_callback:
+            progress_callback(
+                f"Analyzing batch {batch_idx + 1}/{total_batches}...",
+                batch_idx + 1,
+                total_batches,
+            )
+
+        try:
+            # Build multi-image message
+            prompt = _build_batch_prompt(context, len(batch))
+
+            if batch_idx == 0:
+                logger.info(f"Qwen batch prompt: {prompt[:500]}")
+
+            # Build content with all images in the batch
+            content: list[dict[str, str]] = []
+            for frame_path, _ in batch:
+                content.append({"type": "image", "image": f"file://{frame_path}"})
+            content.append({"type": "text", "text": prompt})
+
+            messages = [{"role": "user", "content": content}]
+
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+            inputs = inputs.to(torch_device)
+
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    do_sample=False,
+                    repetition_penalty=1.2,
+                    no_repeat_ngram_size=3,
+                )
+            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+            output_text = processor.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )[0]
+
+            logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
+            objects, description = _parse_batch_response(output_text)
+
+            # Associate objects with the middle timestamp of the batch
+            batch_timestamps = [t for _, t in batch]
+            middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]
+
+            for obj in objects:
+                obj_lower = obj.lower().strip()
+                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+                detections.append(
+                    ObjectDetection(
+                        timestamp=round(middle_timestamp, 2),
+                        label=obj_lower,
+                        confidence=0.95,
+                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
+                    )
+                )
+
+            if description:
+                descriptions.append(description)
+                logger.info(f"Batch {batch_idx + 1} description: {description}")
+
+            logger.info(f"Batch {batch_idx + 1} objects: {objects}")
+
+            del inputs, generated_ids
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+            elif torch_device == "cuda":
+                torch.cuda.empty_cache()
+
+        except Exception as e:
+            logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+
+    return all_objects, detections, descriptions
+
+
+def _analyze_frames_batch_context(
+    model: Any,
+    processor: Any,
+    torch_device: str,
+    frame_paths: list[str],
+    timestamps: list[float],
+    context: dict[str, str] | None,
+    progress_callback: ProgressCallback | None,
+    batch_size: int | None = None,
+    overlap: bool = False,
+) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+    """Analyze frames in batches with context passed between batches."""
+    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]
+
+    all_objects: dict[str, int] = {}
+    detections: list[ObjectDetection] = []
+    descriptions: list[str] = []
+
+    # Auto-select batch size based on available memory
+    if batch_size is None:
+        batch_size = get_auto_qwen_batch_size()
+
+    # Filter to valid frames
+    valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
+    if not valid_frames:
+        return all_objects, detections, descriptions
+
+    # Group frames into batches
+    # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
+    # Without overlap: sequential non-overlapping batches (faster)
+    batches: list[list[tuple[str, float]]] = []
+    step = max(1, batch_size - 1) if overlap else batch_size
+    for i in range(0, len(valid_frames), step):
+        batch = valid_frames[i : i + batch_size]
+        if overlap:
+            if len(batch) >= 2:  # Need at least 2 frames for temporal analysis
+                batches.append(batch)
+            elif not batches:  # Edge case: very few frames
+                batches.append(batch)
+        else:
+            batches.append(batch)
+
+    total_batches = len(batches)
+    overlap_str = "overlapping " if overlap else ""
+    logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches with context (size={batch_size}, step={step})")
+
+    group_context: str | None = None
+
+    for batch_idx, batch in enumerate(batches):
+        if progress_callback:
+            progress_callback(
+                f"Analyzing batch {batch_idx + 1}/{total_batches} (with context)...",
+                batch_idx + 1,
+                total_batches,
+            )
+
+        try:
+            # Build multi-image message with previous batch context
+            prompt = _build_batch_context_prompt(context, len(batch), group_context)
+
+            if batch_idx == 0:
+                logger.info(f"Qwen batch-context prompt: {prompt[:500]}")
+
+            # Build content with all images in the batch
+            content: list[dict[str, str]] = []
+            for frame_path, ts in batch:
+                # Verify frame exists and log size
+                if os.path.exists(frame_path):
+                    size_kb = os.path.getsize(frame_path) / 1024
+                    logger.info(f"Batch frame {ts:.1f}s: {size_kb:.1f}KB")
+                else:
+                    logger.warning(f"Batch frame missing: {frame_path}")
+                content.append({"type": "image", "image": f"file://{frame_path}"})
+            content.append({"type": "text", "text": prompt})
+            logger.info(f"Batch {batch_idx + 1}: sending {len(batch)} images to Qwen")
+
+            messages = [{"role": "user", "content": content}]
+
+            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+            inputs = inputs.to(torch_device)
+
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    do_sample=False,
+                    repetition_penalty=1.2,
+                    no_repeat_ngram_size=3,
+                )
+            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+            output_text = processor.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )[0]
+
+            logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
+            objects, description = _parse_batch_response(output_text)
+
+            # Use description as context for next batch
+            if description:
+                group_context = description
+
+            # Associate objects with the middle timestamp of the batch
+            batch_timestamps = [t for _, t in batch]
+            middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]
+
+            for obj in objects:
+                obj_lower = obj.lower().strip()
+                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+                detections.append(
+                    ObjectDetection(
+                        timestamp=round(middle_timestamp, 2),
+                        label=obj_lower,
+                        confidence=0.95,
+                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
+                    )
+                )
+
+            if description:
+                descriptions.append(description)
+                logger.info(f"Batch {batch_idx + 1} description: {description}")
+
+            logger.info(f"Batch {batch_idx + 1} objects: {objects}")
+
+            del inputs, generated_ids
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+            elif torch_device == "cuda":
+                torch.cuda.empty_cache()
+
+        except Exception as e:
+            logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
+            if torch_device == "mps":
+                torch.mps.empty_cache()
+
+    return all_objects, detections, descriptions
+
+
+def _fix_malformed_json(text: str) -> str:
+    """Fix common JSON malformations from VLM output."""
+    # Remove markdown code blocks
+    text = text.replace("```json", "").replace("```", "").strip()
+
+    # Remove invalid control characters (keep newlines and tabs for readability)
+    # Control chars are 0x00-0x1F except \t (0x09), \n (0x0A), \r (0x0D)
+    text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
+
+    # Fix escaped quotes before colons: "action\": -> "action":
+    text = text.replace('\\":', '":')
+
+    # Fix markdown bold in JSON keys: "action**: -> "action":
+    # Model sometimes outputs "key**: "value" instead of "key": "value"
+    text = re.sub(r'"\*+:', '":', text)
+    text = re.sub(r"(\w)\*+:", r'\1":', text)  # action**: -> action":
+
+    # Replace single quotes with double quotes for keys and string values
+    # But be careful not to replace apostrophes within words
+    # First, handle keys: 'key': -> "key":
+    text = re.sub(r"'(\w+)'(\s*):", r'"\1"\2:', text)
+
+    # Handle string values: : 'value' -> : "value"
+    # This regex looks for : followed by optional whitespace and a single-quoted string
+    text = re.sub(r":\s*'([^']*)'", r': "\1"', text)
+
+    # Remove trailing commas before ] or }
+    text = re.sub(r",(\s*[\]\}])", r"\1", text)
+
+    return text
+
+
+def _parse_batch_response(response: str) -> tuple[list[str], str | None]:
+    """Parse objects and description from batch analysis response.
+
+    Handles both standard format and batch-specific format with action field.
+    """
+    objects: list[str] = []
+    description: str | None = None
+
+    try:
+        clean_response = _fix_malformed_json(response)
+
+        if "{" in clean_response:
+            start_brace = clean_response.find("{")
+            json_str = clean_response[start_brace : clean_response.rindex("}") + 1]
+            data = json.loads(json_str)
+
+            # Extract objects
+            raw_objects = data.get("objects", [])
+            for obj in raw_objects:
+                if isinstance(obj, str) and len(obj) < 100 and obj.strip():
+                    objects.append(obj)
+                elif isinstance(obj, dict):
+                    name = obj.get("name", "") or obj.get("label", "")
+                    if isinstance(name, str) and len(name) < 100 and name.strip():
+                        objects.append(name)
+
+            # Build description from available fields
+            desc_parts = []
+
+            # Action field (batch-specific)
+            action = data.get("action", "")
+            if isinstance(action, str) and action.strip():
+                desc_parts.append(action.strip())
+
+            # Standard description
+            desc = data.get("description", "")
+            if isinstance(desc, str) and desc.strip():
+                desc_parts.append(desc.strip())
+
+            # Continues field (batch-context specific)
+            continues = data.get("continues", "")
+            if isinstance(continues, str) and continues.strip():
+                desc_parts.append(continues.strip())
+
+            # Change field (context-specific)
+            change = data.get("change", "")
+            if isinstance(change, str) and change.strip():
+                desc_parts.append(f"Change: {change.strip()}")
+
+            # Check for error field (model couldn't fully analyze)
+            error = data.get("error", "")
+            if isinstance(error, str) and error.strip():
+                logger.warning(f"Qwen reported issue: {error}")
+
+            if desc_parts:
+                description = " ".join(desc_parts)
+
+        return objects, description
+
+    except (json.JSONDecodeError, ValueError) as e:
+        logger.warning(f"Failed to parse batch JSON from Qwen response: {e}")
+
+        # Try to extract objects from partial/truncated JSON using regex
+        # Look for "name": "value" patterns in the objects array
+        name_matches = re.findall(r'"name"\s*:\s*"([^"]+)"', response)
+        if name_matches:
+            objects = [n for n in name_matches if len(n) < 100 and n.strip()]
+            logger.info(f"Extracted {len(objects)} objects from partial JSON: {objects}")
+            if objects:
+                return objects, None
+
+        # Look for simple string arrays: ["item1", "item2"]
+        array_match = re.search(r'"objects"\s*:\s*\[([^\]]*)', response)
+        if array_match:
+            items = re.findall(r'"([^"]+)"', array_match.group(1))
+            objects = [i for i in items if len(i) < 100 and i.strip() and i not in ("name", "color", "location")]
+            if objects:
+                logger.info(f"Extracted {len(objects)} objects from array: {objects}")
+
+        # Try to extract description from malformed JSON
+        desc_match = re.search(r'"description["\*]*\s*:\s*"([^"]+)"', response)
+        if desc_match:
+            description = desc_match.group(1).strip()
+            logger.info(f"Extracted description from partial JSON: {description}")
+            return objects, description
+
+        if objects:
+            return objects, None
+
+        # Fallback to standard parser
+        return _parse_objects_and_description(response)
+
+
 def extract_objects_qwen(
     file_path: str,
     timestamps: list[float] | None = None,
@@ -341,6 +1086,8 @@ def extract_objects_qwen(
     context: dict[str, str] | None = None,
     progress_callback: ProgressCallback | None = None,
     lut_path: str | None = None,
+    batch_overlap: bool = False,
+    strategy: str | None = None,
 ) -> ObjectsResult:
     """Extract objects using Qwen2-VL vision-language model.
 
@@ -359,13 +1106,20 @@ def extract_objects_qwen(
             - "topic": Subject matter of the video
         progress_callback: Optional callback for progress updates (message, current, total)
         lut_path: Optional path to a LUT file (.cube) to apply for log footage color correction
+        batch_overlap: If True, batches overlap by 1 frame for visual continuity.
+            Useful for unstable camera or videos with rapid scene changes.
+            Default False for faster processing.
+        strategy: Override Qwen strategy for this file. One of:
+            - "single": No temporal context (fastest)
+            - "context": Pass previous description as text
+            - "batch": Multi-frame batches
+            - "batch_context": Batches with text context between (richest)
+            If None, uses global setting from config.
 
     Returns:
         ObjectsResult with detected objects and contextual descriptions
     """
-
-
-    logger.info(f"extract_objects_qwen called: file={file_path}, timestamps={timestamps}, context={context}")
+    logger.info(f"extract_objects_qwen called: file={file_path}, lut_path={lut_path}, timestamps={timestamps}")
 
     settings = get_settings()
     # Resolve model name (handles "auto")
@@ -384,7 +1138,7 @@ def extract_objects_qwen(
     if timestamps is None:
         duration = _get_video_duration(file_path)
         timestamps = [duration / 2]
-        logger.info(f"No timestamps provided, sampling from middle ({duration/2:.1f}s)")
+        logger.info(f"No timestamps provided, sampling from middle ({duration / 2:.1f}s)")
     else:
         logger.info(f"Analyzing {len(timestamps)} provided timestamps")
 
@@ -398,26 +1152,22 @@ def extract_objects_qwen(
     else:
         context = context.copy()  # Don't modify the original
 
-    if
-
-
-
-
-
-
-
-
-
-            "Colors are desaturated and not representative of the actual scene. "
-            "Focus on describing content and action, not colors."
-        )
-        logger.info(f"Added log footage context hint (no LUT, color_transfer={color_transfer})")
+    # Determine if we need auto-normalization (LOG footage without LUT)
+    has_lut = lut_path and os.path.exists(lut_path)
+    auto_normalize = is_log_footage and not has_lut
+
+    if has_lut:
+        # LUT applied - colors are corrected
+        logger.info(f"LOG footage detected, applying LUT: {lut_path}")
+    elif auto_normalize:
+        # LOG detected, no LUT - will apply auto-normalization
+        logger.info(f"LOG footage detected ({color_transfer}), applying auto-normalization")
 
     # IMPORTANT: Extract frames BEFORE loading the model!
     # ffmpeg can crash (SIGABRT) when forked from a process with MPS/Metal loaded.
     if progress_callback:
         progress_callback("Extracting frames...", None, None)
-    frame_paths = _extract_frames_at_timestamps(file_path, temp_dir, timestamps, lut_path=lut_path)
+    frame_paths = _extract_frames_at_timestamps(file_path, temp_dir, timestamps, lut_path=lut_path, auto_normalize=auto_normalize)
    total_frames = len([p for p in frame_paths if p])
 
     if total_frames == 0:
@@ -431,7 +1181,7 @@ def extract_objects_qwen(
     except (RuntimeError, MemoryError, OSError) as e:
         error_msg = str(e).lower()
         if "out of memory" in error_msg or "cannot allocate" in error_msg:
-            logger.error(f"Out of memory loading Qwen model.
+            logger.error(f"Out of memory loading Qwen model. Close other apps or use a cloud vision API. Error: {e}")
             # Return empty result - frontend can fall back to cloud API if configured
             return ObjectsResult(
                 summary={},
@@ -443,108 +1193,41 @@ def extract_objects_qwen(
 
     logger.info(f"Processing {total_frames} frames for Qwen analysis")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            ]
-
-            # Process inputs
-            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            image_inputs, video_inputs = process_vision_info(messages)
-            inputs = processor(
-                text=[text],
-                images=image_inputs,
-                videos=video_inputs,
-                padding=True,
-                return_tensors="pt",
-            )
-            inputs = inputs.to(torch_device)
-
-            # Generate response with repetition penalty to prevent loops
-            with torch.no_grad():
-                generated_ids = model.generate(
-                    **inputs,
-                    max_new_tokens=512,
-                    do_sample=False,  # Greedy decoding for consistent JSON
-                    repetition_penalty=1.2,  # Penalize repetition
-                    no_repeat_ngram_size=3,  # Prevent 3-gram repetition
-                )
-            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-            output_text = processor.batch_decode(
-                generated_ids_trimmed,
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=False,
-            )[0]
-
-            # Parse response
-            logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
-            objects, description = _parse_objects_and_description(output_text)
-            if not description:
-                logger.warning(f"No description parsed from Qwen output at {timestamp:.1f}s")
-            for obj in objects:
-                obj_lower = obj.lower().strip()
-                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
-
-                detections.append(
-                    ObjectDetection(
-                        timestamp=round(timestamp, 2),
-                        label=obj_lower,
-                        confidence=0.95,  # VLM confidence is generally high
-                        bbox=BoundingBox(x=0, y=0, width=0, height=0),  # No bbox from VLM
-                    )
-                )
-
-            if description:
-                descriptions.append(description)
-                logger.info(f"Frame {timestamp:.1f}s description: {description}")
-
-            logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
-
-            # Clear memory after each frame
-            del inputs, generated_ids
-            if torch_device == "mps":
-                torch.mps.empty_cache()
-            elif torch_device == "cuda":
-                torch.cuda.empty_cache()
-
-        except Exception as e:
-            logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
-            # Try to recover memory
-            if torch_device == "mps":
-                torch.mps.empty_cache()
-            continue
+    # Get strategy for multi-frame analysis (use override if provided)
+    if strategy is not None:
+        resolved_strategy = QwenStrategy(strategy)
+        logger.info(f"Using Qwen strategy override: {resolved_strategy}")
+    else:
+        resolved_strategy = settings.get_qwen_strategy()
+        logger.info(f"Using Qwen strategy from config: {resolved_strategy}")
+
+    # Dispatch to appropriate strategy implementation
+    if resolved_strategy == QwenStrategy.SINGLE:
+        all_objects, detections, descriptions = _analyze_frames_single(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
+    elif resolved_strategy == QwenStrategy.CONTEXT:
+        all_objects, detections, descriptions = _analyze_frames_with_context(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
+    elif resolved_strategy == QwenStrategy.BATCH:
+        all_objects, detections, descriptions = _analyze_frames_batch(
+            model,
+            processor,
+            torch_device,
+            frame_paths,
+            timestamps,
+            context,
+            progress_callback,
+            overlap=batch_overlap,
+        )
+    else:  # BATCH_CONTEXT
+        all_objects, detections, descriptions = _analyze_frames_batch_context(
+            model,
+            processor,
+            torch_device,
+            frame_paths,
+            timestamps,
+            context,
+            progress_callback,
+            overlap=batch_overlap,
+        )
 
     # Deduplicate - count unique objects per type
     unique_objects = _deduplicate_objects(all_objects)
@@ -574,6 +1257,7 @@ def _extract_frames_at_timestamps(
     timestamps: list[float],
     max_width: int = 1280,
     lut_path: str | None = None,
+    auto_normalize: bool = False,
 ) -> list[str]:
     """Extract frames at specific timestamps, resized for VLM inference.
 
@@ -587,6 +1271,8 @@ def _extract_frames_at_timestamps(
         timestamps: List of timestamps to extract (in seconds)
         max_width: Maximum width for scaling (default 1280)
         lut_path: Optional path to a .cube LUT file for color correction
+        auto_normalize: If True and no LUT, apply automatic color normalization
+            for LOG footage (boosts contrast and saturation)
     """
     import subprocess
 
@@ -596,16 +1282,28 @@ def _extract_frames_at_timestamps(
 
     logger.info(f"Extracting {len(timestamps)} frames from {file_path} at timestamps {timestamps}")
 
-    #
-
-
+    # Use ffmpeg with color correction if LUT provided OR auto-normalize requested
+    use_ffmpeg_color = (lut_path and os.path.exists(lut_path)) or auto_normalize
+
+    if use_ffmpeg_color:
+        # Build color correction filter
+        if lut_path and os.path.exists(lut_path):
+            logger.info(f"Applying LUT: {lut_path}")
+            color_filter = f"lut3d='{lut_path}'"
+        else:
+            # Auto-normalize for LOG footage: apply S-curve + saturation boost
+            # This converts flat LOG footage to a more viewable range for VLM analysis
+            # curves: S-curve to add contrast (lift shadows, compress highlights)
+            # eq: boost saturation since LOG footage is very desaturated
+            logger.info("Applying auto-normalization for LOG footage (no LUT configured)")
+            color_filter = "curves=master='0/0 0.15/0.30 0.5/0.5 0.85/0.70 1/1',eq=saturation=1.4:contrast=1.1"
+
     for i, ts in enumerate(timestamps):
         output_path = os.path.join(output_dir, f"frame_{i:04d}.jpg")
         try:
-            # Build filter chain:
+            # Build filter chain: color correction + scale
             scale_filter = f"scale={max_width}:{max_width}:force_original_aspect_ratio=decrease"
-
-            vf = f"{lut_filter},{scale_filter}"
+            vf = f"{color_filter},{scale_filter}"
 
             cmd = [
                 "ffmpeg",
@@ -628,9 +1326,10 @@ def _extract_frames_at_timestamps(
 
             if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                 frame_paths.append(output_path)
-
+                correction_type = "LUT" if (lut_path and os.path.exists(lut_path)) else "auto-normalized"
+                logger.info(f"Extracted frame {i} at {ts:.2f}s ({correction_type}): {output_path}")
             else:
-                logger.warning(f"Frame at {ts:.2f}s: could not extract with
+                logger.warning(f"Frame at {ts:.2f}s: could not extract with color correction")
                 frame_paths.append("")
         except subprocess.CalledProcessError as e:
             logger.warning(f"Frame at {ts:.2f}s: ffmpeg failed: {e}")
@@ -667,8 +1366,7 @@ def _parse_objects_and_description(response: str) -> tuple[list[str], str | None
 
     # Try to find and parse JSON
     try:
-
-        clean_response = response.replace("```json", "").replace("```", "").strip()
+        clean_response = _fix_malformed_json(response)
 
         # Try to parse as JSON (could be object or array)
         if "[" in clean_response or "{" in clean_response:
|