media-engine 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@
  import json
  import logging
  import os
+ import re
  import shutil
  import tempfile
  from collections.abc import Callable
@@ -13,6 +14,8 @@ import torch

  from media_engine.config import (
  DeviceType,
+ QwenStrategy,
+ get_auto_qwen_batch_size,
  get_device,
  get_free_memory_gb,
  get_settings,
@@ -234,24 +237,18 @@ def _get_qwen_model(

  def _build_analysis_prompt(context: dict[str, str] | None = None) -> str:
  """Build the analysis prompt, optionally including context."""
- base_prompt = """Look at this image carefully and describe what you see.
+ base_prompt = """Describe what you see in this image. List main objects and write a short description.

- List all visible objects and write a brief description of the scene.
+ JSON format:
+ {"objects": ["object1", "object2"], "description": "scene description"}

- You MUST respond with ONLY this exact JSON format:
- {"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}
-
- Rules for objects:
- - Be specific: "scissors" not "tool", "laptop" not "device"
- - Include people as "person" or "man"/"woman"
- - Only list clearly visible objects
+ If the image is unclear, use:
+ {"objects": [], "description": "unknown", "error": "reason why"}

- Rules for description:
- - Describe what's happening
- - Mention the setting/environment
- - Keep it to 1-2 sentences
+ Example:
+ {"objects": ["mountain", "ocean", "lighthouse"], "description": "A lighthouse on a rocky coast with mountains in the background."}

- Respond with JSON only, no other text."""
+ Respond with JSON only. Describe what you CAN see."""

  if not context:
  return base_prompt
@@ -309,11 +306,19 @@ IMPORTANT: This location has these nearby landmarks: {nearby_landmarks}
  log_instruction = f"""
  NOTE: {log_footage_note}
  - Focus on describing the content and action, not the color grading
+ """
+
+ # Add topic/activity instruction if provided
+ topic = context.get("topic", "") or context.get("activity", "")
+ topic_instruction = ""
+ if topic:
+ topic_instruction = f"""
+ IMPORTANT: This video shows "{topic}". Use this context to interpret the action.
  """

  # Enhanced prompt with context
  return f"""{context_section}
- {person_instruction}{landmark_instruction}{log_instruction}
+ {person_instruction}{landmark_instruction}{log_instruction}{topic_instruction}
  Look at this image carefully and describe what you see.

  You MUST respond with ONLY this exact JSON format:
@@ -334,6 +339,746 @@ Rules for description:
  Respond with JSON only, no other text."""


+ def _build_context_prompt(
+ context: dict[str, str] | None = None,
+ previous_description: str | None = None,
+ ) -> str:
+ """Build prompt for CONTEXT strategy - includes previous frame description."""
+ base_prompt = _build_analysis_prompt(context)
+
+ if not previous_description:
+ return base_prompt
+
+ # Insert previous frame context before the analysis request
+ context_insert = f"""
+ Previous frame showed: {previous_description}
+
+ Describe what's happening NOW and how it relates to the previous frame.
+ Focus on: objects visible, actions occurring, any changes from before.
+
+ """
+ # Modify the JSON format to include "change" field
+ modified_prompt = base_prompt.replace(
+ '{"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}',
+ '{"objects": ["item1", "item2"], "description": "What\'s happening now.", "change": "How this differs from the previous frame."}',
+ )
+
+ # Insert context after any existing context section but before "Look at this image"
+ if "Look at this image" in modified_prompt:
+ parts = modified_prompt.split("Look at this image")
+ return parts[0] + context_insert + "Look at this image" + parts[1]
+
+ return context_insert + modified_prompt
+
+
+ def _build_batch_prompt(
+ context: dict[str, str] | None = None,
+ num_frames: int = 3,
+ ) -> str:
+ """Build prompt for BATCH strategy - analyzes multiple frames together."""
+ # Get person name from context for instructions
+ person_name = context.get("person", "") if context else ""
+
+ # Build context section if available
+ context_section = ""
+ topic_hint = ""
+ if context:
+ context_lines = ["Known context about this video:"]
+ labels = {
+ "person": "Person identified",
+ "location": "Location",
+ "nearby_landmarks": "Nearby landmarks/POIs",
+ "activity": "Activity",
+ "topic": "Activity/Subject",
+ "language": "Language spoken",
+ "device": "Filmed with",
+ }
+ for key, value in context.items():
+ if value and key not in ("log_footage_note", "color_transfer"):
+ label = labels.get(key, key.replace("_", " ").title())
+ context_lines.append(f"- {label}: {value}")
+ # Capture topic for special instruction
+ if key in ("topic", "activity") and value:
+ topic_hint = value
+ context_section = "\n".join(context_lines) + "\n\n"
+
+ person_instruction = ""
+ if person_name:
+ person_instruction = f'Use "{person_name}" instead of "person" in objects and description.\n'
+
+ # Add topic instruction if provided
+ topic_instruction = ""
+ if topic_hint:
+ topic_instruction = f'IMPORTANT: This video shows "{topic_hint}". Use this context to interpret what you see.\n'
+
+ return f"""{context_section}These {num_frames} frames are from a video.
+ {person_instruction}{topic_instruction}
+ Describe what you see. List main objects and write a short description.
+
+ JSON format:
+ {{"objects": ["object1", "object2"], "action": "what is happening", "description": "scene description"}}
+
+ If the image is unclear or you cannot identify content, use:
+ {{"objects": [], "action": "unknown", "description": "unknown", "error": "reason why"}}
+
+ Example:
+ {{"objects": ["bus", "road", "mountain"], "action": "bus driving", "description": "A bus on a coastal road with mountains."}}
+
+ Respond with JSON only. Describe what you CAN see, even if partial."""
+
+
+ def _build_batch_context_prompt(
+ context: dict[str, str] | None = None,
+ num_frames: int = 3,
+ group_context: str | None = None,
+ ) -> str:
+ """Build prompt for BATCH_CONTEXT strategy - batch with previous group context."""
+ base_prompt = _build_batch_prompt(context, num_frames)
+
+ if not group_context:
+ return base_prompt
+
+ context_insert = f"""Previous scene: {group_context}
+
+ What happens next in these frames? How does it continue from before?
+
+ """
+ # Modify JSON format to include "continues" field
+ modified_prompt = base_prompt.replace(
+ '{"objects": ["item1", "item2"], "action": "The action happening across frames", "description": "Overall scene description"}',
+ '{"objects": ["item1", "item2"], "action": "The action in these frames", "description": "Scene description", "continues": "How this continues from the previous scene"}',
+ )
+
+ # Insert after context section but before "These X frames"
+ if "These " in modified_prompt and " frames are" in modified_prompt:
+ idx = modified_prompt.find("These ")
+ return modified_prompt[:idx] + context_insert + modified_prompt[idx:]
+
+ return context_insert + modified_prompt
+
+
+ def _analyze_frames_single(
+ model: Any,
+ processor: Any,
+ torch_device: str,
+ frame_paths: list[str],
+ timestamps: list[float],
+ context: dict[str, str] | None,
+ progress_callback: ProgressCallback | None,
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+ """Analyze frames one at a time without temporal context (original behavior)."""
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
+
+ all_objects: dict[str, int] = {}
+ detections: list[ObjectDetection] = []
+ descriptions: list[str] = []
+
+ total_frames = len([p for p in frame_paths if p])
+ frame_count = 0
+
+ for frame_path, timestamp in zip(frame_paths, timestamps):
+ if not frame_path or not os.path.exists(frame_path):
+ continue
+
+ frame_count += 1
+ if progress_callback:
+ progress_callback(
+ f"Analyzing frame {frame_count}/{total_frames}...",
+ frame_count,
+ total_frames,
+ )
+
+ try:
+ prompt = _build_analysis_prompt(context)
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "image": f"file://{frame_path}"},
+ {"type": "text", "text": prompt},
+ ],
+ }
+ ]
+
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ image_inputs, video_inputs = process_vision_info(messages)
+ inputs = processor(
+ text=[text],
+ images=image_inputs,
+ videos=video_inputs,
+ padding=True,
+ return_tensors="pt",
+ )
+ inputs = inputs.to(torch_device)
+
+ with torch.no_grad():
+ generated_ids = model.generate(
+ **inputs,
+ max_new_tokens=512,
+ do_sample=False,
+ repetition_penalty=1.2,
+ no_repeat_ngram_size=3,
+ )
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+ output_text = processor.batch_decode(
+ generated_ids_trimmed,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=False,
+ )[0]
+
+ logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
+ objects, description = _parse_objects_and_description(output_text)
+
+ for obj in objects:
+ obj_lower = obj.lower().strip()
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+ detections.append(
+ ObjectDetection(
+ timestamp=round(timestamp, 2),
+ label=obj_lower,
+ confidence=0.95,
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
+ )
+ )
+
+ if description:
+ descriptions.append(description)
+ logger.info(f"Frame {timestamp:.1f}s description: {description}")
+
+ logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
+
+ del inputs, generated_ids
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+ elif torch_device == "cuda":
+ torch.cuda.empty_cache()
+
+ except Exception as e:
+ logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+
+ return all_objects, detections, descriptions
+
+
+ def _analyze_frames_with_context(
+ model: Any,
+ processor: Any,
+ torch_device: str,
+ frame_paths: list[str],
+ timestamps: list[float],
+ context: dict[str, str] | None,
+ progress_callback: ProgressCallback | None,
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+ """Analyze frames sequentially, passing previous description as context."""
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
+
+ all_objects: dict[str, int] = {}
+ detections: list[ObjectDetection] = []
+ descriptions: list[str] = []
+
+ total_frames = len([p for p in frame_paths if p])
+ frame_count = 0
+ previous_description: str | None = None
+
+ for frame_path, timestamp in zip(frame_paths, timestamps):
+ if not frame_path or not os.path.exists(frame_path):
+ continue
+
+ frame_count += 1
+ if progress_callback:
+ progress_callback(
+ f"Analyzing frame {frame_count}/{total_frames} (with context)...",
+ frame_count,
+ total_frames,
+ )
+
+ try:
+ # Build prompt with previous frame's description as context
+ prompt = _build_context_prompt(context, previous_description)
+
+ if frame_count == 1:
+ logger.info(f"Qwen context prompt (first frame): {prompt[:500]}")
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image", "image": f"file://{frame_path}"},
+ {"type": "text", "text": prompt},
+ ],
+ }
+ ]
+
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ image_inputs, video_inputs = process_vision_info(messages)
+ inputs = processor(
+ text=[text],
+ images=image_inputs,
+ videos=video_inputs,
+ padding=True,
+ return_tensors="pt",
+ )
+ inputs = inputs.to(torch_device)
+
+ with torch.no_grad():
+ generated_ids = model.generate(
+ **inputs,
+ max_new_tokens=512,
+ do_sample=False,
+ repetition_penalty=1.2,
+ no_repeat_ngram_size=3,
+ )
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+ output_text = processor.batch_decode(
+ generated_ids_trimmed,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=False,
+ )[0]
+
+ logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
+ objects, description = _parse_objects_and_description(output_text)
+
+ for obj in objects:
+ obj_lower = obj.lower().strip()
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+ detections.append(
+ ObjectDetection(
+ timestamp=round(timestamp, 2),
+ label=obj_lower,
+ confidence=0.95,
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
+ )
+ )
+
+ if description:
+ descriptions.append(description)
+ previous_description = description # Pass to next frame
+ logger.info(f"Frame {timestamp:.1f}s description: {description}")
+
+ logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
+
+ del inputs, generated_ids
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+ elif torch_device == "cuda":
+ torch.cuda.empty_cache()
+
+ except Exception as e:
+ logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+
+ return all_objects, detections, descriptions
+
+
+ def _analyze_frames_batch(
+ model: Any,
+ processor: Any,
+ torch_device: str,
+ frame_paths: list[str],
+ timestamps: list[float],
+ context: dict[str, str] | None,
+ progress_callback: ProgressCallback | None,
+ batch_size: int | None = None,
+ overlap: bool = False,
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+ """Analyze frames in batches for temporal understanding."""
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
+
+ all_objects: dict[str, int] = {}
+ detections: list[ObjectDetection] = []
+ descriptions: list[str] = []
+
+ # Auto-select batch size based on available memory
+ if batch_size is None:
+ batch_size = get_auto_qwen_batch_size()
+
+ # Filter to valid frames
+ valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
+ if not valid_frames:
+ return all_objects, detections, descriptions
+
+ # Group frames into batches
+ # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
+ # Without overlap: sequential non-overlapping batches (faster)
+ batches: list[list[tuple[str, float]]] = []
+ step = max(1, batch_size - 1) if overlap else batch_size
+ for i in range(0, len(valid_frames), step):
+ batch = valid_frames[i : i + batch_size]
+ if overlap:
+ if len(batch) >= 2: # Need at least 2 frames for temporal analysis
+ batches.append(batch)
+ elif not batches: # Edge case: very few frames
+ batches.append(batch)
+ else:
+ batches.append(batch)
+
+ total_batches = len(batches)
+ overlap_str = "overlapping " if overlap else ""
+ logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches (size={batch_size}, step={step})")
+
+ for batch_idx, batch in enumerate(batches):
+ if progress_callback:
+ progress_callback(
+ f"Analyzing batch {batch_idx + 1}/{total_batches}...",
+ batch_idx + 1,
+ total_batches,
+ )
+
+ try:
+ # Build multi-image message
+ prompt = _build_batch_prompt(context, len(batch))
+
+ if batch_idx == 0:
+ logger.info(f"Qwen batch prompt: {prompt[:500]}")
+
+ # Build content with all images in the batch
+ content: list[dict[str, str]] = []
+ for frame_path, _ in batch:
+ content.append({"type": "image", "image": f"file://{frame_path}"})
+ content.append({"type": "text", "text": prompt})
+
+ messages = [{"role": "user", "content": content}]
+
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ image_inputs, video_inputs = process_vision_info(messages)
+ inputs = processor(
+ text=[text],
+ images=image_inputs,
+ videos=video_inputs,
+ padding=True,
+ return_tensors="pt",
+ )
+ inputs = inputs.to(torch_device)
+
+ with torch.no_grad():
+ generated_ids = model.generate(
+ **inputs,
+ max_new_tokens=512,
+ do_sample=False,
+ repetition_penalty=1.2,
+ no_repeat_ngram_size=3,
+ )
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+ output_text = processor.batch_decode(
+ generated_ids_trimmed,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=False,
+ )[0]
+
+ logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
+ objects, description = _parse_batch_response(output_text)
+
+ # Associate objects with the middle timestamp of the batch
+ batch_timestamps = [t for _, t in batch]
+ middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]
+
+ for obj in objects:
+ obj_lower = obj.lower().strip()
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+ detections.append(
+ ObjectDetection(
+ timestamp=round(middle_timestamp, 2),
+ label=obj_lower,
+ confidence=0.95,
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
+ )
+ )
+
+ if description:
+ descriptions.append(description)
+ logger.info(f"Batch {batch_idx + 1} description: {description}")
+
+ logger.info(f"Batch {batch_idx + 1} objects: {objects}")
+
+ del inputs, generated_ids
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+ elif torch_device == "cuda":
+ torch.cuda.empty_cache()
+
+ except Exception as e:
+ logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+
+ return all_objects, detections, descriptions
+
+
+ def _analyze_frames_batch_context(
+ model: Any,
+ processor: Any,
+ torch_device: str,
+ frame_paths: list[str],
+ timestamps: list[float],
+ context: dict[str, str] | None,
+ progress_callback: ProgressCallback | None,
+ batch_size: int | None = None,
+ overlap: bool = False,
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
+ """Analyze frames in batches with context passed between batches."""
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
+
+ all_objects: dict[str, int] = {}
+ detections: list[ObjectDetection] = []
+ descriptions: list[str] = []
+
+ # Auto-select batch size based on available memory
+ if batch_size is None:
+ batch_size = get_auto_qwen_batch_size()
+
+ # Filter to valid frames
+ valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
+ if not valid_frames:
+ return all_objects, detections, descriptions
+
+ # Group frames into batches
+ # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
+ # Without overlap: sequential non-overlapping batches (faster)
+ batches: list[list[tuple[str, float]]] = []
+ step = max(1, batch_size - 1) if overlap else batch_size
+ for i in range(0, len(valid_frames), step):
+ batch = valid_frames[i : i + batch_size]
+ if overlap:
+ if len(batch) >= 2: # Need at least 2 frames for temporal analysis
+ batches.append(batch)
+ elif not batches: # Edge case: very few frames
+ batches.append(batch)
+ else:
+ batches.append(batch)
+
+ total_batches = len(batches)
+ overlap_str = "overlapping " if overlap else ""
+ logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches with context (size={batch_size}, step={step})")
+
+ group_context: str | None = None
+
+ for batch_idx, batch in enumerate(batches):
+ if progress_callback:
+ progress_callback(
+ f"Analyzing batch {batch_idx + 1}/{total_batches} (with context)...",
+ batch_idx + 1,
+ total_batches,
+ )
+
+ try:
+ # Build multi-image message with previous batch context
+ prompt = _build_batch_context_prompt(context, len(batch), group_context)
+
+ if batch_idx == 0:
+ logger.info(f"Qwen batch-context prompt: {prompt[:500]}")
+
+ # Build content with all images in the batch
+ content: list[dict[str, str]] = []
+ for frame_path, ts in batch:
+ # Verify frame exists and log size
+ if os.path.exists(frame_path):
+ size_kb = os.path.getsize(frame_path) / 1024
+ logger.info(f"Batch frame {ts:.1f}s: {size_kb:.1f}KB")
+ else:
+ logger.warning(f"Batch frame missing: {frame_path}")
+ content.append({"type": "image", "image": f"file://{frame_path}"})
+ content.append({"type": "text", "text": prompt})
+ logger.info(f"Batch {batch_idx + 1}: sending {len(batch)} images to Qwen")
+
+ messages = [{"role": "user", "content": content}]
+
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ image_inputs, video_inputs = process_vision_info(messages)
+ inputs = processor(
+ text=[text],
+ images=image_inputs,
+ videos=video_inputs,
+ padding=True,
+ return_tensors="pt",
+ )
+ inputs = inputs.to(torch_device)
+
+ with torch.no_grad():
+ generated_ids = model.generate(
+ **inputs,
+ max_new_tokens=512,
+ do_sample=False,
+ repetition_penalty=1.2,
+ no_repeat_ngram_size=3,
+ )
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+ output_text = processor.batch_decode(
+ generated_ids_trimmed,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=False,
+ )[0]
+
+ logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
+ objects, description = _parse_batch_response(output_text)
+
+ # Use description as context for next batch
+ if description:
+ group_context = description
+
+ # Associate objects with the middle timestamp of the batch
+ batch_timestamps = [t for _, t in batch]
+ middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]
+
+ for obj in objects:
+ obj_lower = obj.lower().strip()
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
+ detections.append(
+ ObjectDetection(
+ timestamp=round(middle_timestamp, 2),
+ label=obj_lower,
+ confidence=0.95,
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
+ )
+ )
+
+ if description:
+ descriptions.append(description)
+ logger.info(f"Batch {batch_idx + 1} description: {description}")
+
+ logger.info(f"Batch {batch_idx + 1} objects: {objects}")
+
+ del inputs, generated_ids
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+ elif torch_device == "cuda":
+ torch.cuda.empty_cache()
+
+ except Exception as e:
+ logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
+ if torch_device == "mps":
+ torch.mps.empty_cache()
+
+ return all_objects, detections, descriptions
+
+
+ def _fix_malformed_json(text: str) -> str:
+ """Fix common JSON malformations from VLM output."""
+ # Remove markdown code blocks
+ text = text.replace("```json", "").replace("```", "").strip()
+
+ # Remove invalid control characters (keep newlines and tabs for readability)
+ # Control chars are 0x00-0x1F except \t (0x09), \n (0x0A), \r (0x0D)
+ text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
+
+ # Fix escaped quotes before colons: "action\": -> "action":
+ text = text.replace('\\":', '":')
+
+ # Fix markdown bold in JSON keys: "action**: -> "action":
+ # Model sometimes outputs "key**: "value" instead of "key": "value"
+ text = re.sub(r'"\*+:', '":', text)
+ text = re.sub(r"(\w)\*+:", r'\1":', text) # action**: -> action":
+
+ # Replace single quotes with double quotes for keys and string values
+ # But be careful not to replace apostrophes within words
+ # First, handle keys: 'key': -> "key":
+ text = re.sub(r"'(\w+)'(\s*):", r'"\1"\2:', text)
+
+ # Handle string values: : 'value' -> : "value"
+ # This regex looks for : followed by optional whitespace and a single-quoted string
+ text = re.sub(r":\s*'([^']*)'", r': "\1"', text)
+
+ # Remove trailing commas before ] or }
+ text = re.sub(r",(\s*[\]\}])", r"\1", text)
+
+ return text
+
+
+ def _parse_batch_response(response: str) -> tuple[list[str], str | None]:
+ """Parse objects and description from batch analysis response.
+
+ Handles both standard format and batch-specific format with action field.
+ """
+ objects: list[str] = []
+ description: str | None = None
+
+ try:
+ clean_response = _fix_malformed_json(response)
+
+ if "{" in clean_response:
+ start_brace = clean_response.find("{")
+ json_str = clean_response[start_brace : clean_response.rindex("}") + 1]
+ data = json.loads(json_str)
+
+ # Extract objects
+ raw_objects = data.get("objects", [])
+ for obj in raw_objects:
+ if isinstance(obj, str) and len(obj) < 100 and obj.strip():
+ objects.append(obj)
+ elif isinstance(obj, dict):
+ name = obj.get("name", "") or obj.get("label", "")
+ if isinstance(name, str) and len(name) < 100 and name.strip():
+ objects.append(name)
+
+ # Build description from available fields
+ desc_parts = []
+
+ # Action field (batch-specific)
+ action = data.get("action", "")
+ if isinstance(action, str) and action.strip():
+ desc_parts.append(action.strip())
+
+ # Standard description
+ desc = data.get("description", "")
+ if isinstance(desc, str) and desc.strip():
+ desc_parts.append(desc.strip())
+
+ # Continues field (batch-context specific)
+ continues = data.get("continues", "")
+ if isinstance(continues, str) and continues.strip():
+ desc_parts.append(continues.strip())
+
+ # Change field (context-specific)
+ change = data.get("change", "")
+ if isinstance(change, str) and change.strip():
+ desc_parts.append(f"Change: {change.strip()}")
+
+ # Check for error field (model couldn't fully analyze)
+ error = data.get("error", "")
+ if isinstance(error, str) and error.strip():
+ logger.warning(f"Qwen reported issue: {error}")
+
+ if desc_parts:
+ description = " ".join(desc_parts)
+
+ return objects, description
+
+ except (json.JSONDecodeError, ValueError) as e:
+ logger.warning(f"Failed to parse batch JSON from Qwen response: {e}")
+
+ # Try to extract objects from partial/truncated JSON using regex
+ # Look for "name": "value" patterns in the objects array
+ name_matches = re.findall(r'"name"\s*:\s*"([^"]+)"', response)
+ if name_matches:
+ objects = [n for n in name_matches if len(n) < 100 and n.strip()]
+ logger.info(f"Extracted {len(objects)} objects from partial JSON: {objects}")
+ if objects:
+ return objects, None
+
+ # Look for simple string arrays: ["item1", "item2"]
+ array_match = re.search(r'"objects"\s*:\s*\[([^\]]*)', response)
+ if array_match:
+ items = re.findall(r'"([^"]+)"', array_match.group(1))
+ objects = [i for i in items if len(i) < 100 and i.strip() and i not in ("name", "color", "location")]
+ if objects:
+ logger.info(f"Extracted {len(objects)} objects from array: {objects}")
+
+ # Try to extract description from malformed JSON
+ desc_match = re.search(r'"description["\*]*\s*:\s*"([^"]+)"', response)
+ if desc_match:
+ description = desc_match.group(1).strip()
+ logger.info(f"Extracted description from partial JSON: {description}")
+ return objects, description
+
+ if objects:
+ return objects, None
+
+ # Fallback to standard parser
+ return _parse_objects_and_description(response)
+
+
  def extract_objects_qwen(
  file_path: str,
  timestamps: list[float] | None = None,
@@ -341,6 +1086,8 @@ def extract_objects_qwen(
  context: dict[str, str] | None = None,
  progress_callback: ProgressCallback | None = None,
  lut_path: str | None = None,
+ batch_overlap: bool = False,
+ strategy: str | None = None,
  ) -> ObjectsResult:
  """Extract objects using Qwen2-VL vision-language model.

@@ -359,13 +1106,20 @@ def extract_objects_qwen(
  - "topic": Subject matter of the video
  progress_callback: Optional callback for progress updates (message, current, total)
  lut_path: Optional path to a LUT file (.cube) to apply for log footage color correction
+ batch_overlap: If True, batches overlap by 1 frame for visual continuity.
+ Useful for unstable camera or videos with rapid scene changes.
+ Default False for faster processing.
+ strategy: Override Qwen strategy for this file. One of:
+ - "single": No temporal context (fastest)
+ - "context": Pass previous description as text
+ - "batch": Multi-frame batches
+ - "batch_context": Batches with text context between (richest)
+ If None, uses global setting from config.

  Returns:
  ObjectsResult with detected objects and contextual descriptions
  """
- from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
-
- logger.info(f"extract_objects_qwen called: file={file_path}, timestamps={timestamps}, context={context}")
+ logger.info(f"extract_objects_qwen called: file={file_path}, lut_path={lut_path}, timestamps={timestamps}")

  settings = get_settings()
  # Resolve model name (handles "auto")
@@ -384,7 +1138,7 @@ def extract_objects_qwen(
  if timestamps is None:
  duration = _get_video_duration(file_path)
  timestamps = [duration / 2]
- logger.info(f"No timestamps provided, sampling from middle ({duration/2:.1f}s)")
+ logger.info(f"No timestamps provided, sampling from middle ({duration / 2:.1f}s)")
  else:
  logger.info(f"Analyzing {len(timestamps)} provided timestamps")

@@ -398,26 +1152,22 @@ def extract_objects_qwen(
  else:
  context = context.copy() # Don't modify the original

- if lut_path and os.path.exists(lut_path):
- # LUT applied - colors are corrected but may still be slightly off
- context["log_footage_note"] = (
- "This footage was recorded in LOG profile and color-corrected with a LUT. " "Colors shown are the corrected version but may still appear slightly desaturated."
- )
- logger.info("Added log footage context hint (with LUT)")
- elif is_log_footage:
- # LOG detected but no LUT - colors are definitely off
- context["log_footage_note"] = (
- f"This footage appears to be in LOG/flat color profile ({color_transfer}). "
- "Colors are desaturated and not representative of the actual scene. "
- "Focus on describing content and action, not colors."
- )
- logger.info(f"Added log footage context hint (no LUT, color_transfer={color_transfer})")
+ # Determine if we need auto-normalization (LOG footage without LUT)
+ has_lut = lut_path and os.path.exists(lut_path)
+ auto_normalize = is_log_footage and not has_lut
+
+ if has_lut:
+ # LUT applied - colors are corrected
+ logger.info(f"LOG footage detected, applying LUT: {lut_path}")
+ elif auto_normalize:
+ # LOG detected, no LUT - will apply auto-normalization
+ logger.info(f"LOG footage detected ({color_transfer}), applying auto-normalization")

  # IMPORTANT: Extract frames BEFORE loading the model!
  # ffmpeg can crash (SIGABRT) when forked from a process with MPS/Metal loaded.
  if progress_callback:
  progress_callback("Extracting frames...", None, None)
- frame_paths = _extract_frames_at_timestamps(file_path, temp_dir, timestamps, lut_path=lut_path)
+ frame_paths = _extract_frames_at_timestamps(file_path, temp_dir, timestamps, lut_path=lut_path, auto_normalize=auto_normalize)
  total_frames = len([p for p in frame_paths if p])

  if total_frames == 0:
@@ -431,7 +1181,7 @@ def extract_objects_qwen(
  except (RuntimeError, MemoryError, OSError) as e:
  error_msg = str(e).lower()
  if "out of memory" in error_msg or "cannot allocate" in error_msg:
- logger.error(f"Out of memory loading Qwen model. " f"Close other apps or use a cloud vision API. Error: {e}")
+ logger.error(f"Out of memory loading Qwen model. Close other apps or use a cloud vision API. Error: {e}")
  # Return empty result - frontend can fall back to cloud API if configured
  return ObjectsResult(
  summary={},
@@ -443,108 +1193,41 @@ def extract_objects_qwen(

  logger.info(f"Processing {total_frames} frames for Qwen analysis")

- all_objects: dict[str, int] = {}
- detections: list[ObjectDetection] = []
- descriptions: list[str] = []
- frame_count = 0
-
- for frame_path, timestamp in zip(frame_paths, timestamps):
- if not frame_path or not os.path.exists(frame_path):
- logger.warning(f"Skipping missing frame at {timestamp}s: {frame_path}")
- continue
-
- frame_count += 1
- if progress_callback:
- progress_callback(
- f"Analyzing frame {frame_count}/{total_frames}...",
- frame_count,
- total_frames,
- )
-
- try:
- # Build the prompt with optional context
- prompt = _build_analysis_prompt(context)
-
- # Log prompt on first frame for debugging
- if frame_count == 1:
- logger.info(f"Qwen prompt: {prompt[:500]}")
-
- # Prepare message for Qwen - ask for both objects and description
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image", "image": f"file://{frame_path}"},
- {"type": "text", "text": prompt},
- ],
- }
- ]
-
- # Process inputs
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- image_inputs, video_inputs = process_vision_info(messages)
- inputs = processor(
- text=[text],
- images=image_inputs,
- videos=video_inputs,
- padding=True,
- return_tensors="pt",
- )
- inputs = inputs.to(torch_device)
-
- # Generate response with repetition penalty to prevent loops
- with torch.no_grad():
- generated_ids = model.generate(
- **inputs,
- max_new_tokens=512,
- do_sample=False, # Greedy decoding for consistent JSON
- repetition_penalty=1.2, # Penalize repetition
- no_repeat_ngram_size=3, # Prevent 3-gram repetition
- )
- generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
- output_text = processor.batch_decode(
- generated_ids_trimmed,
- skip_special_tokens=True,
- clean_up_tokenization_spaces=False,
- )[0]
-
- # Parse response
- logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
- objects, description = _parse_objects_and_description(output_text)
- if not description:
- logger.warning(f"No description parsed from Qwen output at {timestamp:.1f}s")
- for obj in objects:
- obj_lower = obj.lower().strip()
- all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
-
- detections.append(
- ObjectDetection(
- timestamp=round(timestamp, 2),
- label=obj_lower,
- confidence=0.95, # VLM confidence is generally high
- bbox=BoundingBox(x=0, y=0, width=0, height=0), # No bbox from VLM
- )
- )
-
- if description:
- descriptions.append(description)
- logger.info(f"Frame {timestamp:.1f}s description: {description}")
-
- logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
-
- # Clear memory after each frame
- del inputs, generated_ids
- if torch_device == "mps":
- torch.mps.empty_cache()
- elif torch_device == "cuda":
- torch.cuda.empty_cache()
-
- except Exception as e:
- logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
- # Try to recover memory
- if torch_device == "mps":
- torch.mps.empty_cache()
- continue
+ # Get strategy for multi-frame analysis (use override if provided)
+ if strategy is not None:
+ resolved_strategy = QwenStrategy(strategy)
+ logger.info(f"Using Qwen strategy override: {resolved_strategy}")
+ else:
+ resolved_strategy = settings.get_qwen_strategy()
+ logger.info(f"Using Qwen strategy from config: {resolved_strategy}")
+
+ # Dispatch to appropriate strategy implementation
+ if resolved_strategy == QwenStrategy.SINGLE:
+ all_objects, detections, descriptions = _analyze_frames_single(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
+ elif resolved_strategy == QwenStrategy.CONTEXT:
+ all_objects, detections, descriptions = _analyze_frames_with_context(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
+ elif resolved_strategy == QwenStrategy.BATCH:
+ all_objects, detections, descriptions = _analyze_frames_batch(
+ model,
+ processor,
+ torch_device,
+ frame_paths,
+ timestamps,
+ context,
+ progress_callback,
+ overlap=batch_overlap,
+ )
+ else: # BATCH_CONTEXT
+ all_objects, detections, descriptions = _analyze_frames_batch_context(
+ model,
+ processor,
+ torch_device,
+ frame_paths,
+ timestamps,
+ context,
+ progress_callback,
+ overlap=batch_overlap,
+ )

  # Deduplicate - count unique objects per type
  unique_objects = _deduplicate_objects(all_objects)
@@ -574,6 +1257,7 @@ def _extract_frames_at_timestamps(
  timestamps: list[float],
  max_width: int = 1280,
  lut_path: str | None = None,
+ auto_normalize: bool = False,
  ) -> list[str]:
  """Extract frames at specific timestamps, resized for VLM inference.

@@ -587,6 +1271,8 @@ def _extract_frames_at_timestamps(
  timestamps: List of timestamps to extract (in seconds)
  max_width: Maximum width for scaling (default 1280)
  lut_path: Optional path to a .cube LUT file for color correction
+ auto_normalize: If True and no LUT, apply automatic color normalization
+ for LOG footage (boosts contrast and saturation)
  """
  import subprocess

@@ -596,16 +1282,28 @@ def _extract_frames_at_timestamps(

  logger.info(f"Extracting {len(timestamps)} frames from {file_path} at timestamps {timestamps}")

- # If LUT is provided, use ffmpeg directly for extraction with LUT applied
- if lut_path and os.path.exists(lut_path):
- logger.info(f"Applying LUT: {lut_path}")
+ # Use ffmpeg with color correction if LUT provided OR auto-normalize requested
+ use_ffmpeg_color = (lut_path and os.path.exists(lut_path)) or auto_normalize
+
+ if use_ffmpeg_color:
+ # Build color correction filter
+ if lut_path and os.path.exists(lut_path):
+ logger.info(f"Applying LUT: {lut_path}")
+ color_filter = f"lut3d='{lut_path}'"
+ else:
+ # Auto-normalize for LOG footage: apply S-curve + saturation boost
+ # This converts flat LOG footage to a more viewable range for VLM analysis
+ # curves: S-curve to add contrast (lift shadows, compress highlights)
+ # eq: boost saturation since LOG footage is very desaturated
+ logger.info("Applying auto-normalization for LOG footage (no LUT configured)")
+ color_filter = "curves=master='0/0 0.15/0.30 0.5/0.5 0.85/0.70 1/1',eq=saturation=1.4:contrast=1.1"
+
  for i, ts in enumerate(timestamps):
  output_path = os.path.join(output_dir, f"frame_{i:04d}.jpg")
  try:
- # Build filter chain: LUT + scale
+ # Build filter chain: color correction + scale
  scale_filter = f"scale={max_width}:{max_width}:force_original_aspect_ratio=decrease"
- lut_filter = f"lut3d='{lut_path}'"
- vf = f"{lut_filter},{scale_filter}"
+ vf = f"{color_filter},{scale_filter}"

  cmd = [
  "ffmpeg",
@@ -628,9 +1326,10 @@ def _extract_frames_at_timestamps(

  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
  frame_paths.append(output_path)
- logger.info(f"Extracted frame {i} at {ts:.2f}s with LUT: {output_path}")
+ correction_type = "LUT" if (lut_path and os.path.exists(lut_path)) else "auto-normalized"
+ logger.info(f"Extracted frame {i} at {ts:.2f}s ({correction_type}): {output_path}")
  else:
- logger.warning(f"Frame at {ts:.2f}s: could not extract with LUT")
+ logger.warning(f"Frame at {ts:.2f}s: could not extract with color correction")
  frame_paths.append("")
  except subprocess.CalledProcessError as e:
  logger.warning(f"Frame at {ts:.2f}s: ffmpeg failed: {e}")
@@ -667,8 +1366,7 @@ def _parse_objects_and_description(response: str) -> tuple[list[str], str | None

  # Try to find and parse JSON
  try:
- # Remove markdown code block markers
- clean_response = response.replace("```json", "").replace("```", "").strip()
+ clean_response = _fix_malformed_json(response)

  # Try to parse as JSON (could be object or array)
  if "[" in clean_response or "{" in clean_response: