media-engine 0.1.1-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in the public registry; it is provided for informational purposes only.
@@ -3,6 +3,7 @@
3
3
  import json
4
4
  import logging
5
5
  import os
6
+ import re
6
7
  import shutil
7
8
  import tempfile
8
9
  from collections.abc import Callable
@@ -13,6 +14,8 @@ import torch
13
14
 
14
15
  from media_engine.config import (
15
16
  DeviceType,
17
+ QwenStrategy,
18
+ get_auto_qwen_batch_size,
16
19
  get_device,
17
20
  get_free_memory_gb,
18
21
  get_settings,
@@ -334,6 +337,690 @@ Rules for description:
334
337
  Respond with JSON only, no other text."""
335
338
 
336
339
 
340
+ def _build_context_prompt(
341
+ context: dict[str, str] | None = None,
342
+ previous_description: str | None = None,
343
+ ) -> str:
344
+ """Build prompt for CONTEXT strategy - includes previous frame description."""
345
+ base_prompt = _build_analysis_prompt(context)
346
+
347
+ if not previous_description:
348
+ return base_prompt
349
+
350
+ # Insert previous frame context before the analysis request
351
+ context_insert = f"""
352
+ Previous frame showed: {previous_description}
353
+
354
+ Describe what's happening NOW and how it relates to the previous frame.
355
+ Focus on: objects visible, actions occurring, any changes from before.
356
+
357
+ """
358
+ # Modify the JSON format to include "change" field
359
+ modified_prompt = base_prompt.replace(
360
+ '{"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}',
361
+ '{"objects": ["item1", "item2"], "description": "What\'s happening now.", "change": "How this differs from the previous frame."}',
362
+ )
363
+
364
+ # Insert context after any existing context section but before "Look at this image"
365
+ if "Look at this image" in modified_prompt:
366
+ parts = modified_prompt.split("Look at this image")
367
+ return parts[0] + context_insert + "Look at this image" + parts[1]
368
+
369
+ return context_insert + modified_prompt
370
+
371
+
372
+ def _build_batch_prompt(
373
+ context: dict[str, str] | None = None,
374
+ num_frames: int = 3,
375
+ ) -> str:
376
+ """Build prompt for BATCH strategy - analyzes multiple frames together."""
377
+ # Get person name from context for instructions
378
+ person_name = context.get("person", "") if context else ""
379
+
380
+ # Build context section if available
381
+ context_section = ""
382
+ if context:
383
+ context_lines = ["Known context about this video:"]
384
+ labels = {
385
+ "person": "Person identified",
386
+ "location": "Location",
387
+ "nearby_landmarks": "Nearby landmarks/POIs",
388
+ "activity": "Activity",
389
+ "language": "Language spoken",
390
+ "device": "Filmed with",
391
+ }
392
+ for key, value in context.items():
393
+ if value and key not in ("log_footage_note", "color_transfer"):
394
+ label = labels.get(key, key.replace("_", " ").title())
395
+ context_lines.append(f"- {label}: {value}")
396
+ context_section = "\n".join(context_lines) + "\n\n"
397
+
398
+ person_instruction = ""
399
+ if person_name:
400
+ person_instruction = f'Use "{person_name}" instead of "person" in objects and description.\n'
401
+
402
+ return f"""{context_section}These {num_frames} frames are from the same video in sequence.
403
+ {person_instruction}
404
+ Analyze what happens ACROSS these frames:
405
+ 1. What objects/people are visible throughout?
406
+ 2. What ACTION or movement occurs across the frames?
407
+ 3. How does the scene change from first to last frame?
408
+
409
+ You MUST respond with ONLY this exact JSON format:
410
+ {{"objects": ["item1", "item2"], "action": "The action happening across frames", "description": "Overall scene description"}}
411
+
412
+ Rules:
413
+ - List objects visible in ANY of the frames
414
+ - Describe the ACTION that unfolds across frames (e.g., "person walks toward camera", "car turns left")
415
+ - Keep description to 1-2 sentences summarizing the sequence
416
+
417
+ Respond with JSON only, no other text."""
418
+
419
+
420
+ def _build_batch_context_prompt(
421
+ context: dict[str, str] | None = None,
422
+ num_frames: int = 3,
423
+ group_context: str | None = None,
424
+ ) -> str:
425
+ """Build prompt for BATCH_CONTEXT strategy - batch with previous group context."""
426
+ base_prompt = _build_batch_prompt(context, num_frames)
427
+
428
+ if not group_context:
429
+ return base_prompt
430
+
431
+ context_insert = f"""Previous scene: {group_context}
432
+
433
+ What happens next in these frames? How does it continue from before?
434
+
435
+ """
436
+ # Modify JSON format to include "continues" field
437
+ modified_prompt = base_prompt.replace(
438
+ '{"objects": ["item1", "item2"], "action": "The action happening across frames", "description": "Overall scene description"}',
439
+ '{"objects": ["item1", "item2"], "action": "The action in these frames", "description": "Scene description", "continues": "How this continues from the previous scene"}',
440
+ )
441
+
442
+ # Insert after context section but before "These X frames"
443
+ if "These " in modified_prompt and " frames are" in modified_prompt:
444
+ idx = modified_prompt.find("These ")
445
+ return modified_prompt[:idx] + context_insert + modified_prompt[idx:]
446
+
447
+ return context_insert + modified_prompt
448
+
449
+
450
+ def _analyze_frames_single(
451
+ model: Any,
452
+ processor: Any,
453
+ torch_device: str,
454
+ frame_paths: list[str],
455
+ timestamps: list[float],
456
+ context: dict[str, str] | None,
457
+ progress_callback: ProgressCallback | None,
458
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
459
+ """Analyze frames one at a time without temporal context (original behavior)."""
460
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
461
+
462
+ all_objects: dict[str, int] = {}
463
+ detections: list[ObjectDetection] = []
464
+ descriptions: list[str] = []
465
+
466
+ total_frames = len([p for p in frame_paths if p])
467
+ frame_count = 0
468
+
469
+ for frame_path, timestamp in zip(frame_paths, timestamps):
470
+ if not frame_path or not os.path.exists(frame_path):
471
+ continue
472
+
473
+ frame_count += 1
474
+ if progress_callback:
475
+ progress_callback(
476
+ f"Analyzing frame {frame_count}/{total_frames}...",
477
+ frame_count,
478
+ total_frames,
479
+ )
480
+
481
+ try:
482
+ prompt = _build_analysis_prompt(context)
483
+
484
+ messages = [
485
+ {
486
+ "role": "user",
487
+ "content": [
488
+ {"type": "image", "image": f"file://{frame_path}"},
489
+ {"type": "text", "text": prompt},
490
+ ],
491
+ }
492
+ ]
493
+
494
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
495
+ image_inputs, video_inputs = process_vision_info(messages)
496
+ inputs = processor(
497
+ text=[text],
498
+ images=image_inputs,
499
+ videos=video_inputs,
500
+ padding=True,
501
+ return_tensors="pt",
502
+ )
503
+ inputs = inputs.to(torch_device)
504
+
505
+ with torch.no_grad():
506
+ generated_ids = model.generate(
507
+ **inputs,
508
+ max_new_tokens=512,
509
+ do_sample=False,
510
+ repetition_penalty=1.2,
511
+ no_repeat_ngram_size=3,
512
+ )
513
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
514
+ output_text = processor.batch_decode(
515
+ generated_ids_trimmed,
516
+ skip_special_tokens=True,
517
+ clean_up_tokenization_spaces=False,
518
+ )[0]
519
+
520
+ logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
521
+ objects, description = _parse_objects_and_description(output_text)
522
+
523
+ for obj in objects:
524
+ obj_lower = obj.lower().strip()
525
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
526
+ detections.append(
527
+ ObjectDetection(
528
+ timestamp=round(timestamp, 2),
529
+ label=obj_lower,
530
+ confidence=0.95,
531
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
532
+ )
533
+ )
534
+
535
+ if description:
536
+ descriptions.append(description)
537
+ logger.info(f"Frame {timestamp:.1f}s description: {description}")
538
+
539
+ logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
540
+
541
+ del inputs, generated_ids
542
+ if torch_device == "mps":
543
+ torch.mps.empty_cache()
544
+ elif torch_device == "cuda":
545
+ torch.cuda.empty_cache()
546
+
547
+ except Exception as e:
548
+ logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
549
+ if torch_device == "mps":
550
+ torch.mps.empty_cache()
551
+
552
+ return all_objects, detections, descriptions
553
+
554
+
555
+ def _analyze_frames_with_context(
556
+ model: Any,
557
+ processor: Any,
558
+ torch_device: str,
559
+ frame_paths: list[str],
560
+ timestamps: list[float],
561
+ context: dict[str, str] | None,
562
+ progress_callback: ProgressCallback | None,
563
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
564
+ """Analyze frames sequentially, passing previous description as context."""
565
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
566
+
567
+ all_objects: dict[str, int] = {}
568
+ detections: list[ObjectDetection] = []
569
+ descriptions: list[str] = []
570
+
571
+ total_frames = len([p for p in frame_paths if p])
572
+ frame_count = 0
573
+ previous_description: str | None = None
574
+
575
+ for frame_path, timestamp in zip(frame_paths, timestamps):
576
+ if not frame_path or not os.path.exists(frame_path):
577
+ continue
578
+
579
+ frame_count += 1
580
+ if progress_callback:
581
+ progress_callback(
582
+ f"Analyzing frame {frame_count}/{total_frames} (with context)...",
583
+ frame_count,
584
+ total_frames,
585
+ )
586
+
587
+ try:
588
+ # Build prompt with previous frame's description as context
589
+ prompt = _build_context_prompt(context, previous_description)
590
+
591
+ if frame_count == 1:
592
+ logger.info(f"Qwen context prompt (first frame): {prompt[:500]}")
593
+
594
+ messages = [
595
+ {
596
+ "role": "user",
597
+ "content": [
598
+ {"type": "image", "image": f"file://{frame_path}"},
599
+ {"type": "text", "text": prompt},
600
+ ],
601
+ }
602
+ ]
603
+
604
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
605
+ image_inputs, video_inputs = process_vision_info(messages)
606
+ inputs = processor(
607
+ text=[text],
608
+ images=image_inputs,
609
+ videos=video_inputs,
610
+ padding=True,
611
+ return_tensors="pt",
612
+ )
613
+ inputs = inputs.to(torch_device)
614
+
615
+ with torch.no_grad():
616
+ generated_ids = model.generate(
617
+ **inputs,
618
+ max_new_tokens=512,
619
+ do_sample=False,
620
+ repetition_penalty=1.2,
621
+ no_repeat_ngram_size=3,
622
+ )
623
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
624
+ output_text = processor.batch_decode(
625
+ generated_ids_trimmed,
626
+ skip_special_tokens=True,
627
+ clean_up_tokenization_spaces=False,
628
+ )[0]
629
+
630
+ logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
631
+ objects, description = _parse_objects_and_description(output_text)
632
+
633
+ for obj in objects:
634
+ obj_lower = obj.lower().strip()
635
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
636
+ detections.append(
637
+ ObjectDetection(
638
+ timestamp=round(timestamp, 2),
639
+ label=obj_lower,
640
+ confidence=0.95,
641
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
642
+ )
643
+ )
644
+
645
+ if description:
646
+ descriptions.append(description)
647
+ previous_description = description # Pass to next frame
648
+ logger.info(f"Frame {timestamp:.1f}s description: {description}")
649
+
650
+ logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
651
+
652
+ del inputs, generated_ids
653
+ if torch_device == "mps":
654
+ torch.mps.empty_cache()
655
+ elif torch_device == "cuda":
656
+ torch.cuda.empty_cache()
657
+
658
+ except Exception as e:
659
+ logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
660
+ if torch_device == "mps":
661
+ torch.mps.empty_cache()
662
+
663
+ return all_objects, detections, descriptions
664
+
665
+
666
+ def _analyze_frames_batch(
667
+ model: Any,
668
+ processor: Any,
669
+ torch_device: str,
670
+ frame_paths: list[str],
671
+ timestamps: list[float],
672
+ context: dict[str, str] | None,
673
+ progress_callback: ProgressCallback | None,
674
+ batch_size: int | None = None,
675
+ overlap: bool = False,
676
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
677
+ """Analyze frames in batches for temporal understanding."""
678
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
679
+
680
+ all_objects: dict[str, int] = {}
681
+ detections: list[ObjectDetection] = []
682
+ descriptions: list[str] = []
683
+
684
+ # Auto-select batch size based on available memory
685
+ if batch_size is None:
686
+ batch_size = get_auto_qwen_batch_size()
687
+
688
+ # Filter to valid frames
689
+ valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
690
+ if not valid_frames:
691
+ return all_objects, detections, descriptions
692
+
693
+ # Group frames into batches
694
+ # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
695
+ # Without overlap: sequential non-overlapping batches (faster)
696
+ batches: list[list[tuple[str, float]]] = []
697
+ step = max(1, batch_size - 1) if overlap else batch_size
698
+ for i in range(0, len(valid_frames), step):
699
+ batch = valid_frames[i : i + batch_size]
700
+ if overlap:
701
+ if len(batch) >= 2: # Need at least 2 frames for temporal analysis
702
+ batches.append(batch)
703
+ elif not batches: # Edge case: very few frames
704
+ batches.append(batch)
705
+ else:
706
+ batches.append(batch)
707
+
708
+ total_batches = len(batches)
709
+ overlap_str = "overlapping " if overlap else ""
710
+ logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches (size={batch_size}, step={step})")
711
+
712
+ for batch_idx, batch in enumerate(batches):
713
+ if progress_callback:
714
+ progress_callback(
715
+ f"Analyzing batch {batch_idx + 1}/{total_batches}...",
716
+ batch_idx + 1,
717
+ total_batches,
718
+ )
719
+
720
+ try:
721
+ # Build multi-image message
722
+ prompt = _build_batch_prompt(context, len(batch))
723
+
724
+ if batch_idx == 0:
725
+ logger.info(f"Qwen batch prompt: {prompt[:500]}")
726
+
727
+ # Build content with all images in the batch
728
+ content: list[dict[str, str]] = []
729
+ for frame_path, _ in batch:
730
+ content.append({"type": "image", "image": f"file://{frame_path}"})
731
+ content.append({"type": "text", "text": prompt})
732
+
733
+ messages = [{"role": "user", "content": content}]
734
+
735
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
736
+ image_inputs, video_inputs = process_vision_info(messages)
737
+ inputs = processor(
738
+ text=[text],
739
+ images=image_inputs,
740
+ videos=video_inputs,
741
+ padding=True,
742
+ return_tensors="pt",
743
+ )
744
+ inputs = inputs.to(torch_device)
745
+
746
+ with torch.no_grad():
747
+ generated_ids = model.generate(
748
+ **inputs,
749
+ max_new_tokens=512,
750
+ do_sample=False,
751
+ repetition_penalty=1.2,
752
+ no_repeat_ngram_size=3,
753
+ )
754
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
755
+ output_text = processor.batch_decode(
756
+ generated_ids_trimmed,
757
+ skip_special_tokens=True,
758
+ clean_up_tokenization_spaces=False,
759
+ )[0]
760
+
761
+ logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
762
+ objects, description = _parse_batch_response(output_text)
763
+
764
+ # Associate objects with the middle timestamp of the batch
765
+ batch_timestamps = [t for _, t in batch]
766
+ middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]
767
+
768
+ for obj in objects:
769
+ obj_lower = obj.lower().strip()
770
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
771
+ detections.append(
772
+ ObjectDetection(
773
+ timestamp=round(middle_timestamp, 2),
774
+ label=obj_lower,
775
+ confidence=0.95,
776
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
777
+ )
778
+ )
779
+
780
+ if description:
781
+ descriptions.append(description)
782
+ logger.info(f"Batch {batch_idx + 1} description: {description}")
783
+
784
+ logger.info(f"Batch {batch_idx + 1} objects: {objects}")
785
+
786
+ del inputs, generated_ids
787
+ if torch_device == "mps":
788
+ torch.mps.empty_cache()
789
+ elif torch_device == "cuda":
790
+ torch.cuda.empty_cache()
791
+
792
+ except Exception as e:
793
+ logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
794
+ if torch_device == "mps":
795
+ torch.mps.empty_cache()
796
+
797
+ return all_objects, detections, descriptions
798
+
799
+
800
+ def _analyze_frames_batch_context(
801
+ model: Any,
802
+ processor: Any,
803
+ torch_device: str,
804
+ frame_paths: list[str],
805
+ timestamps: list[float],
806
+ context: dict[str, str] | None,
807
+ progress_callback: ProgressCallback | None,
808
+ batch_size: int | None = None,
809
+ overlap: bool = False,
810
+ ) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
811
+ """Analyze frames in batches with context passed between batches."""
812
+ from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
813
+
814
+ all_objects: dict[str, int] = {}
815
+ detections: list[ObjectDetection] = []
816
+ descriptions: list[str] = []
817
+
818
+ # Auto-select batch size based on available memory
819
+ if batch_size is None:
820
+ batch_size = get_auto_qwen_batch_size()
821
+
822
+ # Filter to valid frames
823
+ valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
824
+ if not valid_frames:
825
+ return all_objects, detections, descriptions
826
+
827
+ # Group frames into batches
828
+ # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
829
+ # Without overlap: sequential non-overlapping batches (faster)
830
+ batches: list[list[tuple[str, float]]] = []
831
+ step = max(1, batch_size - 1) if overlap else batch_size
832
+ for i in range(0, len(valid_frames), step):
833
+ batch = valid_frames[i : i + batch_size]
834
+ if overlap:
835
+ if len(batch) >= 2: # Need at least 2 frames for temporal analysis
836
+ batches.append(batch)
837
+ elif not batches: # Edge case: very few frames
838
+ batches.append(batch)
839
+ else:
840
+ batches.append(batch)
841
+
842
+ total_batches = len(batches)
843
+ overlap_str = "overlapping " if overlap else ""
844
+ logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches with context (size={batch_size}, step={step})")
845
+
846
+ group_context: str | None = None
847
+
848
+ for batch_idx, batch in enumerate(batches):
849
+ if progress_callback:
850
+ progress_callback(
851
+ f"Analyzing batch {batch_idx + 1}/{total_batches} (with context)...",
852
+ batch_idx + 1,
853
+ total_batches,
854
+ )
855
+
856
+ try:
857
+ # Build multi-image message with previous batch context
858
+ prompt = _build_batch_context_prompt(context, len(batch), group_context)
859
+
860
+ if batch_idx == 0:
861
+ logger.info(f"Qwen batch-context prompt: {prompt[:500]}")
862
+
863
+ # Build content with all images in the batch
864
+ content: list[dict[str, str]] = []
865
+ for frame_path, _ in batch:
866
+ content.append({"type": "image", "image": f"file://{frame_path}"})
867
+ content.append({"type": "text", "text": prompt})
868
+
869
+ messages = [{"role": "user", "content": content}]
870
+
871
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
872
+ image_inputs, video_inputs = process_vision_info(messages)
873
+ inputs = processor(
874
+ text=[text],
875
+ images=image_inputs,
876
+ videos=video_inputs,
877
+ padding=True,
878
+ return_tensors="pt",
879
+ )
880
+ inputs = inputs.to(torch_device)
881
+
882
+ with torch.no_grad():
883
+ generated_ids = model.generate(
884
+ **inputs,
885
+ max_new_tokens=512,
886
+ do_sample=False,
887
+ repetition_penalty=1.2,
888
+ no_repeat_ngram_size=3,
889
+ )
890
+ generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
891
+ output_text = processor.batch_decode(
892
+ generated_ids_trimmed,
893
+ skip_special_tokens=True,
894
+ clean_up_tokenization_spaces=False,
895
+ )[0]
896
+
897
+ logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
898
+ objects, description = _parse_batch_response(output_text)
899
+
900
+ # Use description as context for next batch
901
+ if description:
902
+ group_context = description
903
+
904
+ # Associate objects with the middle timestamp of the batch
905
+ batch_timestamps = [t for _, t in batch]
906
+ middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]
907
+
908
+ for obj in objects:
909
+ obj_lower = obj.lower().strip()
910
+ all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
911
+ detections.append(
912
+ ObjectDetection(
913
+ timestamp=round(middle_timestamp, 2),
914
+ label=obj_lower,
915
+ confidence=0.95,
916
+ bbox=BoundingBox(x=0, y=0, width=0, height=0),
917
+ )
918
+ )
919
+
920
+ if description:
921
+ descriptions.append(description)
922
+ logger.info(f"Batch {batch_idx + 1} description: {description}")
923
+
924
+ logger.info(f"Batch {batch_idx + 1} objects: {objects}")
925
+
926
+ del inputs, generated_ids
927
+ if torch_device == "mps":
928
+ torch.mps.empty_cache()
929
+ elif torch_device == "cuda":
930
+ torch.cuda.empty_cache()
931
+
932
+ except Exception as e:
933
+ logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
934
+ if torch_device == "mps":
935
+ torch.mps.empty_cache()
936
+
937
+ return all_objects, detections, descriptions
938
+
939
+
940
+ def _fix_malformed_json(text: str) -> str:
941
+ """Fix common JSON malformations from VLM output."""
942
+ # Remove markdown code blocks
943
+ text = text.replace("```json", "").replace("```", "").strip()
944
+
945
+ # Fix escaped quotes before colons: "action\": -> "action":
946
+ text = text.replace('\\":', '":')
947
+
948
+ # Replace single quotes with double quotes for keys and string values
949
+ # But be careful not to replace apostrophes within words
950
+ # First, handle keys: 'key': -> "key":
951
+ text = re.sub(r"'(\w+)'(\s*):", r'"\1"\2:', text)
952
+
953
+ # Handle string values: : 'value' -> : "value"
954
+ # This regex looks for : followed by optional whitespace and a single-quoted string
955
+ text = re.sub(r":\s*'([^']*)'", r': "\1"', text)
956
+
957
+ # Remove trailing commas before ] or }
958
+ text = re.sub(r",(\s*[\]\}])", r"\1", text)
959
+
960
+ return text
961
+
962
+
963
+ def _parse_batch_response(response: str) -> tuple[list[str], str | None]:
964
+ """Parse objects and description from batch analysis response.
965
+
966
+ Handles both standard format and batch-specific format with action field.
967
+ """
968
+ objects: list[str] = []
969
+ description: str | None = None
970
+
971
+ try:
972
+ clean_response = _fix_malformed_json(response)
973
+
974
+ if "{" in clean_response:
975
+ start_brace = clean_response.find("{")
976
+ json_str = clean_response[start_brace : clean_response.rindex("}") + 1]
977
+ data = json.loads(json_str)
978
+
979
+ # Extract objects
980
+ raw_objects = data.get("objects", [])
981
+ for obj in raw_objects:
982
+ if isinstance(obj, str) and len(obj) < 100 and obj.strip():
983
+ objects.append(obj)
984
+ elif isinstance(obj, dict):
985
+ name = obj.get("name", "") or obj.get("label", "")
986
+ if isinstance(name, str) and len(name) < 100 and name.strip():
987
+ objects.append(name)
988
+
989
+ # Build description from available fields
990
+ desc_parts = []
991
+
992
+ # Action field (batch-specific)
993
+ action = data.get("action", "")
994
+ if isinstance(action, str) and action.strip():
995
+ desc_parts.append(action.strip())
996
+
997
+ # Standard description
998
+ desc = data.get("description", "")
999
+ if isinstance(desc, str) and desc.strip():
1000
+ desc_parts.append(desc.strip())
1001
+
1002
+ # Continues field (batch-context specific)
1003
+ continues = data.get("continues", "")
1004
+ if isinstance(continues, str) and continues.strip():
1005
+ desc_parts.append(continues.strip())
1006
+
1007
+ # Change field (context-specific)
1008
+ change = data.get("change", "")
1009
+ if isinstance(change, str) and change.strip():
1010
+ desc_parts.append(f"Change: {change.strip()}")
1011
+
1012
+ if desc_parts:
1013
+ description = " ".join(desc_parts)
1014
+
1015
+ return objects, description
1016
+
1017
+ except (json.JSONDecodeError, ValueError) as e:
1018
+ logger.warning(f"Failed to parse batch JSON from Qwen response: {e}")
1019
+
1020
+ # Fallback to standard parser
1021
+ return _parse_objects_and_description(response)
1022
+
1023
+
337
1024
  def extract_objects_qwen(
338
1025
  file_path: str,
339
1026
  timestamps: list[float] | None = None,
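
Both batch strategies added in the hunk above share one grouping rule: overlapping batches advance by batch_size - 1 frames, non-overlapping ones by batch_size. A minimal standalone sketch of just that rule, with integers standing in for frame paths (the real _analyze_frames_batch* functions also drop frames whose files are missing):

    def group(frames, batch_size=3, overlap=False):
        # Mirrors the grouping loop in _analyze_frames_batch: with overlap the
        # window advances by batch_size - 1, so each batch reuses the previous
        # batch's last frame; without overlap it advances by batch_size.
        step = max(1, batch_size - 1) if overlap else batch_size
        batches = []
        for i in range(0, len(frames), step):
            batch = frames[i : i + batch_size]
            # Overlapping mode keeps only batches with at least 2 frames,
            # unless nothing has been kept yet (very short videos).
            if not overlap or len(batch) >= 2 or not batches:
                batches.append(batch)
        return batches

    print(group(list(range(7))))                # [[0, 1, 2], [3, 4, 5], [6]]
    print(group(list(range(7)), overlap=True))  # [[0, 1, 2], [2, 3, 4], [4, 5, 6]]
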
@@ -341,6 +1028,8 @@ def extract_objects_qwen(
341
1028
  context: dict[str, str] | None = None,
342
1029
  progress_callback: ProgressCallback | None = None,
343
1030
  lut_path: str | None = None,
1031
+ batch_overlap: bool = False,
1032
+ strategy: str | None = None,
344
1033
  ) -> ObjectsResult:
345
1034
  """Extract objects using Qwen2-VL vision-language model.
346
1035
 
@@ -359,12 +1048,19 @@ def extract_objects_qwen(
359
1048
  - "topic": Subject matter of the video
360
1049
  progress_callback: Optional callback for progress updates (message, current, total)
361
1050
  lut_path: Optional path to a LUT file (.cube) to apply for log footage color correction
1051
+ batch_overlap: If True, batches overlap by 1 frame for visual continuity.
1052
+ Useful for unstable camera footage or videos with rapid scene changes.
1053
+ Defaults to False for faster processing.
1054
+ strategy: Override Qwen strategy for this file. One of:
1055
+ - "single": No temporal context (fastest)
1056
+ - "context": Pass previous description as text
1057
+ - "batch": Multi-frame batches
1058
+ - "batch_context": Batches with text context between (richest)
1059
+ If None, uses global setting from config.
362
1060
 
363
1061
  Returns:
364
1062
  ObjectsResult with detected objects and contextual descriptions
365
1063
  """
366
- from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
367
-
368
1064
  logger.info(f"extract_objects_qwen called: file={file_path}, timestamps={timestamps}, context={context}")
369
1065
 
370
1066
  settings = get_settings()
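
The hunk above documents the two new keyword arguments on extract_objects_qwen. A minimal call sketch; the import path, file path, timestamps, and context values are hypothetical (the diff does not show the module's import path):

    from media_engine.objects import extract_objects_qwen  # import path assumed

    result = extract_objects_qwen(
        "/videos/clip.mp4",                   # hypothetical input file
        timestamps=[2.0, 4.0, 6.0, 8.0],      # seconds to sample
        context={"person": "Alice", "location": "harbour"},
        strategy="batch_context",             # or "single", "context", "batch"; None keeps the config default
        batch_overlap=True,                   # overlap batches by one frame for shaky or fast-cutting footage
    )
    print(result.summary)                     # summary of detected objects in the ObjectsResult
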
@@ -384,7 +1080,7 @@ def extract_objects_qwen(
384
1080
  if timestamps is None:
385
1081
  duration = _get_video_duration(file_path)
386
1082
  timestamps = [duration / 2]
387
- logger.info(f"No timestamps provided, sampling from middle ({duration/2:.1f}s)")
1083
+ logger.info(f"No timestamps provided, sampling from middle ({duration / 2:.1f}s)")
388
1084
  else:
389
1085
  logger.info(f"Analyzing {len(timestamps)} provided timestamps")
390
1086
 
@@ -401,15 +1097,13 @@ def extract_objects_qwen(
401
1097
  if lut_path and os.path.exists(lut_path):
402
1098
  # LUT applied - colors are corrected but may still be slightly off
403
1099
  context["log_footage_note"] = (
404
- "This footage was recorded in LOG profile and color-corrected with a LUT. " "Colors shown are the corrected version but may still appear slightly desaturated."
1100
+ "This footage was recorded in LOG profile and color-corrected with a LUT. Colors shown are the corrected version but may still appear slightly desaturated."
405
1101
  )
406
1102
  logger.info("Added log footage context hint (with LUT)")
407
1103
  elif is_log_footage:
408
1104
  # LOG detected but no LUT - colors are definitely off
409
1105
  context["log_footage_note"] = (
410
- f"This footage appears to be in LOG/flat color profile ({color_transfer}). "
411
- "Colors are desaturated and not representative of the actual scene. "
412
- "Focus on describing content and action, not colors."
1106
+ f"This footage appears to be in LOG/flat color profile ({color_transfer}). Colors are desaturated and not representative of the actual scene. Focus on describing content and action, not colors."
413
1107
  )
414
1108
  logger.info(f"Added log footage context hint (no LUT, color_transfer={color_transfer})")
415
1109
 
@@ -431,7 +1125,7 @@ def extract_objects_qwen(
431
1125
  except (RuntimeError, MemoryError, OSError) as e:
432
1126
  error_msg = str(e).lower()
433
1127
  if "out of memory" in error_msg or "cannot allocate" in error_msg:
434
- logger.error(f"Out of memory loading Qwen model. " f"Close other apps or use a cloud vision API. Error: {e}")
1128
+ logger.error(f"Out of memory loading Qwen model. Close other apps or use a cloud vision API. Error: {e}")
435
1129
  # Return empty result - frontend can fall back to cloud API if configured
436
1130
  return ObjectsResult(
437
1131
  summary={},
@@ -443,108 +1137,41 @@ def extract_objects_qwen(
443
1137
 
444
1138
  logger.info(f"Processing {total_frames} frames for Qwen analysis")
445
1139
 
446
- all_objects: dict[str, int] = {}
447
- detections: list[ObjectDetection] = []
448
- descriptions: list[str] = []
449
- frame_count = 0
450
-
451
- for frame_path, timestamp in zip(frame_paths, timestamps):
452
- if not frame_path or not os.path.exists(frame_path):
453
- logger.warning(f"Skipping missing frame at {timestamp}s: {frame_path}")
454
- continue
455
-
456
- frame_count += 1
457
- if progress_callback:
458
- progress_callback(
459
- f"Analyzing frame {frame_count}/{total_frames}...",
460
- frame_count,
461
- total_frames,
462
- )
463
-
464
- try:
465
- # Build the prompt with optional context
466
- prompt = _build_analysis_prompt(context)
467
-
468
- # Log prompt on first frame for debugging
469
- if frame_count == 1:
470
- logger.info(f"Qwen prompt: {prompt[:500]}")
471
-
472
- # Prepare message for Qwen - ask for both objects and description
473
- messages = [
474
- {
475
- "role": "user",
476
- "content": [
477
- {"type": "image", "image": f"file://{frame_path}"},
478
- {"type": "text", "text": prompt},
479
- ],
480
- }
481
- ]
482
-
483
- # Process inputs
484
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
485
- image_inputs, video_inputs = process_vision_info(messages)
486
- inputs = processor(
487
- text=[text],
488
- images=image_inputs,
489
- videos=video_inputs,
490
- padding=True,
491
- return_tensors="pt",
492
- )
493
- inputs = inputs.to(torch_device)
494
-
495
- # Generate response with repetition penalty to prevent loops
496
- with torch.no_grad():
497
- generated_ids = model.generate(
498
- **inputs,
499
- max_new_tokens=512,
500
- do_sample=False, # Greedy decoding for consistent JSON
501
- repetition_penalty=1.2, # Penalize repetition
502
- no_repeat_ngram_size=3, # Prevent 3-gram repetition
503
- )
504
- generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
505
- output_text = processor.batch_decode(
506
- generated_ids_trimmed,
507
- skip_special_tokens=True,
508
- clean_up_tokenization_spaces=False,
509
- )[0]
510
-
511
- # Parse response
512
- logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
513
- objects, description = _parse_objects_and_description(output_text)
514
- if not description:
515
- logger.warning(f"No description parsed from Qwen output at {timestamp:.1f}s")
516
- for obj in objects:
517
- obj_lower = obj.lower().strip()
518
- all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
519
-
520
- detections.append(
521
- ObjectDetection(
522
- timestamp=round(timestamp, 2),
523
- label=obj_lower,
524
- confidence=0.95, # VLM confidence is generally high
525
- bbox=BoundingBox(x=0, y=0, width=0, height=0), # No bbox from VLM
526
- )
527
- )
528
-
529
- if description:
530
- descriptions.append(description)
531
- logger.info(f"Frame {timestamp:.1f}s description: {description}")
532
-
533
- logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
534
-
535
- # Clear memory after each frame
536
- del inputs, generated_ids
537
- if torch_device == "mps":
538
- torch.mps.empty_cache()
539
- elif torch_device == "cuda":
540
- torch.cuda.empty_cache()
541
-
542
- except Exception as e:
543
- logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
544
- # Try to recover memory
545
- if torch_device == "mps":
546
- torch.mps.empty_cache()
547
- continue
1140
+ # Get strategy for multi-frame analysis (use override if provided)
1141
+ if strategy is not None:
1142
+ resolved_strategy = QwenStrategy(strategy)
1143
+ logger.info(f"Using Qwen strategy override: {resolved_strategy}")
1144
+ else:
1145
+ resolved_strategy = settings.get_qwen_strategy()
1146
+ logger.info(f"Using Qwen strategy from config: {resolved_strategy}")
1147
+
1148
+ # Dispatch to appropriate strategy implementation
1149
+ if resolved_strategy == QwenStrategy.SINGLE:
1150
+ all_objects, detections, descriptions = _analyze_frames_single(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
1151
+ elif resolved_strategy == QwenStrategy.CONTEXT:
1152
+ all_objects, detections, descriptions = _analyze_frames_with_context(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
1153
+ elif resolved_strategy == QwenStrategy.BATCH:
1154
+ all_objects, detections, descriptions = _analyze_frames_batch(
1155
+ model,
1156
+ processor,
1157
+ torch_device,
1158
+ frame_paths,
1159
+ timestamps,
1160
+ context,
1161
+ progress_callback,
1162
+ overlap=batch_overlap,
1163
+ )
1164
+ else: # BATCH_CONTEXT
1165
+ all_objects, detections, descriptions = _analyze_frames_batch_context(
1166
+ model,
1167
+ processor,
1168
+ torch_device,
1169
+ frame_paths,
1170
+ timestamps,
1171
+ context,
1172
+ progress_callback,
1173
+ overlap=batch_overlap,
1174
+ )
548
1175
 
549
1176
  # Deduplicate - count unique objects per type
550
1177
  unique_objects = _deduplicate_objects(all_objects)
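
The next hunk below switches _parse_objects_and_description over to the same _fix_malformed_json helper introduced earlier in this diff. As a feel for what it repairs, a small illustrative check (made-up model output, calling the private helper directly; not taken from the package's tests):

    import json

    raw = """{'objects': ["dog", "ball"], 'description': 'A dog chases a ball',}"""
    fixed = _fix_malformed_json(raw)
    print(fixed)       # {"objects": ["dog", "ball"], "description": "A dog chases a ball"}
    json.loads(fixed)  # single-quoted keys/values and the trailing comma are gone, so this parses
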
@@ -667,8 +1294,7 @@ def _parse_objects_and_description(response: str) -> tuple[list[str], str | None
667
1294
 
668
1295
  # Try to find and parse JSON
669
1296
  try:
670
- # Remove markdown code block markers
671
- clean_response = response.replace("```json", "").replace("```", "").strip()
1297
+ clean_response = _fix_malformed_json(response)
672
1298
 
673
1299
  # Try to parse as JSON (could be object or array)
674
1300
  if "[" in clean_response or "{" in clean_response: