media-engine 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- media_engine/_version.py +2 -2
- media_engine/batch/models.py +9 -0
- media_engine/batch/processor.py +14 -12
- media_engine/batch/timing.py +1 -1
- media_engine/config.py +91 -19
- media_engine/extractors/faces.py +1 -1
- media_engine/extractors/frame_buffer.py +1 -1
- media_engine/extractors/frames.py +2 -2
- media_engine/extractors/metadata/sony.py +1 -1
- media_engine/extractors/motion.py +4 -4
- media_engine/extractors/objects.py +1 -1
- media_engine/extractors/objects_qwen.py +738 -112
- media_engine/extractors/ocr.py +1 -1
- media_engine/extractors/transcribe.py +1 -1
- media_engine/extractors/vad.py +1 -1
- media_engine/routers/models.py +27 -11
- media_engine/routers/settings.py +2 -0
- media_engine/schemas.py +2 -0
- {media_engine-0.1.0.dist-info → media_engine-0.2.0.dist-info}/METADATA +3 -2
- {media_engine-0.1.0.dist-info → media_engine-0.2.0.dist-info}/RECORD +23 -23
- {media_engine-0.1.0.dist-info → media_engine-0.2.0.dist-info}/WHEEL +0 -0
- {media_engine-0.1.0.dist-info → media_engine-0.2.0.dist-info}/entry_points.txt +0 -0
- {media_engine-0.1.0.dist-info → media_engine-0.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import re
|
|
6
7
|
import shutil
|
|
7
8
|
import tempfile
|
|
8
9
|
from collections.abc import Callable
|
|
@@ -13,6 +14,8 @@ import torch
|
|
|
13
14
|
|
|
14
15
|
from media_engine.config import (
|
|
15
16
|
DeviceType,
|
|
17
|
+
QwenStrategy,
|
|
18
|
+
get_auto_qwen_batch_size,
|
|
16
19
|
get_device,
|
|
17
20
|
get_free_memory_gb,
|
|
18
21
|
get_settings,
|
|
@@ -334,6 +337,690 @@ Rules for description:
|
|
|
334
337
|
Respond with JSON only, no other text."""
|
|
335
338
|
|
|
336
339
|
|
|
340
|
+
def _build_context_prompt(
    context: dict[str, str] | None = None,
    previous_description: str | None = None,
) -> str:
    """Build prompt for CONTEXT strategy - includes previous frame description.

    The prompt starts from ``_build_analysis_prompt(context)``. When a previous
    description is supplied, it is added to the prompt and the requested JSON
    format is extended with a ``"change"`` field.

    Args:
        context: Optional known facts about the video, forwarded unchanged to
            ``_build_analysis_prompt``.
        previous_description: Description produced for the preceding frame.
            When empty or ``None`` the base prompt is returned untouched.

    Returns:
        The prompt text for the current frame.
    """
    base_prompt = _build_analysis_prompt(context)

    if not previous_description:
        return base_prompt

    # Insert previous frame context before the analysis request
    context_insert = f"""
Previous frame showed: {previous_description}

Describe what's happening NOW and how it relates to the previous frame.
Focus on: objects visible, actions occurring, any changes from before.

"""
    # Modify the JSON format to include "change" field
    modified_prompt = base_prompt.replace(
        '{"objects": ["item1", "item2"], "description": "One or two sentences describing the scene."}',
        '{"objects": ["item1", "item2"], "description": "What\'s happening now.", "change": "How this differs from the previous frame."}',
    )

    # Insert context after any existing context section but before "Look at this image".
    # partition() splits on the first occurrence only and keeps the whole
    # remainder, so nothing is lost if the marker text shows up again later
    # (an unbounded split() followed by parts[0]/parts[1] dropped that tail).
    head, marker, tail = modified_prompt.partition("Look at this image")
    if marker:
        return head + context_insert + marker + tail

    return context_insert + modified_prompt
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _build_batch_prompt(
|
|
373
|
+
context: dict[str, str] | None = None,
|
|
374
|
+
num_frames: int = 3,
|
|
375
|
+
) -> str:
|
|
376
|
+
"""Build prompt for BATCH strategy - analyzes multiple frames together."""
|
|
377
|
+
# Get person name from context for instructions
|
|
378
|
+
person_name = context.get("person", "") if context else ""
|
|
379
|
+
|
|
380
|
+
# Build context section if available
|
|
381
|
+
context_section = ""
|
|
382
|
+
if context:
|
|
383
|
+
context_lines = ["Known context about this video:"]
|
|
384
|
+
labels = {
|
|
385
|
+
"person": "Person identified",
|
|
386
|
+
"location": "Location",
|
|
387
|
+
"nearby_landmarks": "Nearby landmarks/POIs",
|
|
388
|
+
"activity": "Activity",
|
|
389
|
+
"language": "Language spoken",
|
|
390
|
+
"device": "Filmed with",
|
|
391
|
+
}
|
|
392
|
+
for key, value in context.items():
|
|
393
|
+
if value and key not in ("log_footage_note", "color_transfer"):
|
|
394
|
+
label = labels.get(key, key.replace("_", " ").title())
|
|
395
|
+
context_lines.append(f"- {label}: {value}")
|
|
396
|
+
context_section = "\n".join(context_lines) + "\n\n"
|
|
397
|
+
|
|
398
|
+
person_instruction = ""
|
|
399
|
+
if person_name:
|
|
400
|
+
person_instruction = f'Use "{person_name}" instead of "person" in objects and description.\n'
|
|
401
|
+
|
|
402
|
+
return f"""{context_section}These {num_frames} frames are from the same video in sequence.
|
|
403
|
+
{person_instruction}
|
|
404
|
+
Analyze what happens ACROSS these frames:
|
|
405
|
+
1. What objects/people are visible throughout?
|
|
406
|
+
2. What ACTION or movement occurs across the frames?
|
|
407
|
+
3. How does the scene change from first to last frame?
|
|
408
|
+
|
|
409
|
+
You MUST respond with ONLY this exact JSON format:
|
|
410
|
+
{{"objects": ["item1", "item2"], "action": "The action happening across frames", "description": "Overall scene description"}}
|
|
411
|
+
|
|
412
|
+
Rules:
|
|
413
|
+
- List objects visible in ANY of the frames
|
|
414
|
+
- Describe the ACTION that unfolds across frames (e.g., "person walks toward camera", "car turns left")
|
|
415
|
+
- Keep description to 1-2 sentences summarizing the sequence
|
|
416
|
+
|
|
417
|
+
Respond with JSON only, no other text."""
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _build_batch_context_prompt(
    context: dict[str, str] | None = None,
    num_frames: int = 3,
    group_context: str | None = None,
) -> str:
    """Build prompt for BATCH_CONTEXT strategy - batch with previous group context.

    Args:
        context: Optional known facts about the video, forwarded to
            ``_build_batch_prompt``.
        num_frames: Number of frames in the batch, forwarded to
            ``_build_batch_prompt``.
        group_context: Description of the previous batch. When empty or
            ``None`` the plain batch prompt is returned.

    Returns:
        The batch prompt, extended with the previous-scene text and a
        ``"continues"`` field in the requested JSON format.
    """
    base_prompt = _build_batch_prompt(context, num_frames)

    if not group_context:
        return base_prompt

    context_insert = f"""Previous scene: {group_context}

What happens next in these frames? How does it continue from before?

"""
    # Modify JSON format to include "continues" field
    modified_prompt = base_prompt.replace(
        '{"objects": ["item1", "item2"], "action": "The action happening across frames", "description": "Overall scene description"}',
        '{"objects": ["item1", "item2"], "action": "The action in these frames", "description": "Scene description", "continues": "How this continues from the previous scene"}',
    )

    # Insert after context section but before "These X frames".
    # Anchor on the full opening sentence rather than the bare word "These ":
    # the context section precedes it and may itself contain "These " (for
    # example inside a location name), which would put the insert mid-list.
    idx = modified_prompt.find(f"These {num_frames} frames are from the same video in sequence.")
    if idx == -1 and " frames are" in modified_prompt:
        # Fall back to the looser heuristic if the exact sentence is absent
        idx = modified_prompt.find("These ")
    if idx != -1:
        return modified_prompt[:idx] + context_insert + modified_prompt[idx:]

    return context_insert + modified_prompt
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def _analyze_frames_single(
    model: Any,
    processor: Any,
    torch_device: str,
    frame_paths: list[str],
    timestamps: list[float],
    context: dict[str, str] | None,
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
    """Analyze frames one at a time without temporal context (original behavior).

    Args:
        model: Loaded vision-language model; only ``generate`` is called on it.
        processor: Matching processor, used for chat templating, building the
            input tensors and decoding the generated ids.
        torch_device: Device string the inputs are moved to. ``"mps"`` and
            ``"cuda"`` additionally trigger a cache flush after every frame.
        frame_paths: Frame image paths; falsy or non-existent paths are skipped.
        timestamps: Timestamps paired with ``frame_paths`` by position.
        context: Optional known facts forwarded to ``_build_analysis_prompt``.
        progress_callback: Optional callback invoked as
            ``(message, current, total)`` before each frame.

    Returns:
        ``(all_objects, detections, descriptions)``: a count per lower-cased
        label, one detection per reported object, and the frame descriptions.
        A frame that fails is logged and skipped.
    """
    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]

    all_objects: dict[str, int] = {}
    detections: list[ObjectDetection] = []
    descriptions: list[str] = []

    # Filter up front (as the batch variants do) so the progress total matches
    # the number of frames that are actually analyzed.
    valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
    total_frames = len(valid_frames)

    for frame_count, (frame_path, timestamp) in enumerate(valid_frames, start=1):
        if progress_callback:
            progress_callback(
                f"Analyzing frame {frame_count}/{total_frames}...",
                frame_count,
                total_frames,
            )

        try:
            prompt = _build_analysis_prompt(context)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": f"file://{frame_path}"},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(torch_device)

            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    repetition_penalty=1.2,
                    no_repeat_ngram_size=3,
                )
            # Drop the prompt tokens so only the newly generated part is decoded
            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
            output_text = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
            objects, description = _parse_objects_and_description(output_text)

            for obj in objects:
                obj_lower = obj.lower().strip()
                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
                # Fixed confidence and an all-zero box are recorded for every label
                detections.append(
                    ObjectDetection(
                        timestamp=round(timestamp, 2),
                        label=obj_lower,
                        confidence=0.95,
                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
                    )
                )

            if description:
                descriptions.append(description)
                logger.info(f"Frame {timestamp:.1f}s description: {description}")

            logger.info(f"Frame {timestamp:.1f}s objects: {objects}")

        except Exception as e:
            # Best effort: one bad frame must not abort the remaining frames
            logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
        finally:
            # Release tensor references and flush the device cache on success
            # AND failure; previously a failed frame flushed only on MPS.
            inputs = generated_ids = generated_ids_trimmed = None
            if torch_device == "mps":
                torch.mps.empty_cache()
            elif torch_device == "cuda":
                torch.cuda.empty_cache()

    return all_objects, detections, descriptions
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
def _analyze_frames_with_context(
    model: Any,
    processor: Any,
    torch_device: str,
    frame_paths: list[str],
    timestamps: list[float],
    context: dict[str, str] | None,
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
    """Analyze frames sequentially, passing previous description as context.

    Args:
        model: Loaded vision-language model; only ``generate`` is called on it.
        processor: Matching processor, used for chat templating, building the
            input tensors and decoding the generated ids.
        torch_device: Device string the inputs are moved to. ``"mps"`` and
            ``"cuda"`` additionally trigger a cache flush after every frame.
        frame_paths: Frame image paths; falsy or non-existent paths are skipped.
        timestamps: Timestamps paired with ``frame_paths`` by position.
        context: Optional known facts forwarded to ``_build_context_prompt``.
        progress_callback: Optional callback invoked as
            ``(message, current, total)`` before each frame.

    Returns:
        ``(all_objects, detections, descriptions)``: a count per lower-cased
        label, one detection per reported object, and the frame descriptions.
        A frame that fails is logged and skipped; the description carried
        forward then stays the last one that was successfully produced.
    """
    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]

    all_objects: dict[str, int] = {}
    detections: list[ObjectDetection] = []
    descriptions: list[str] = []

    # Filter up front (as the batch variants do) so the progress total matches
    # the number of frames that are actually analyzed.
    valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
    total_frames = len(valid_frames)
    previous_description: str | None = None

    for frame_count, (frame_path, timestamp) in enumerate(valid_frames, start=1):
        if progress_callback:
            progress_callback(
                f"Analyzing frame {frame_count}/{total_frames} (with context)...",
                frame_count,
                total_frames,
            )

        try:
            # Build prompt with previous frame's description as context
            prompt = _build_context_prompt(context, previous_description)

            if frame_count == 1:
                logger.info(f"Qwen context prompt (first frame): {prompt[:500]}")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": f"file://{frame_path}"},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(torch_device)

            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    repetition_penalty=1.2,
                    no_repeat_ngram_size=3,
                )
            # Drop the prompt tokens so only the newly generated part is decoded
            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
            output_text = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
            objects, description = _parse_objects_and_description(output_text)

            for obj in objects:
                obj_lower = obj.lower().strip()
                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
                # Fixed confidence and an all-zero box are recorded for every label
                detections.append(
                    ObjectDetection(
                        timestamp=round(timestamp, 2),
                        label=obj_lower,
                        confidence=0.95,
                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
                    )
                )

            if description:
                descriptions.append(description)
                previous_description = description  # Pass to next frame
                logger.info(f"Frame {timestamp:.1f}s description: {description}")

            logger.info(f"Frame {timestamp:.1f}s objects: {objects}")

        except Exception as e:
            # Best effort: one bad frame must not abort the remaining frames
            logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
        finally:
            # Release tensor references and flush the device cache on success
            # AND failure; previously a failed frame flushed only on MPS.
            inputs = generated_ids = generated_ids_trimmed = None
            if torch_device == "mps":
                torch.mps.empty_cache()
            elif torch_device == "cuda":
                torch.cuda.empty_cache()

    return all_objects, detections, descriptions
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def _analyze_frames_batch(
    model: Any,
    processor: Any,
    torch_device: str,
    frame_paths: list[str],
    timestamps: list[float],
    context: dict[str, str] | None,
    progress_callback: ProgressCallback | None,
    batch_size: int | None = None,
    overlap: bool = False,
) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
    """Analyze frames in batches for temporal understanding.

    Args:
        model: Loaded vision-language model; only ``generate`` is called on it.
        processor: Matching processor, used for chat templating, building the
            input tensors and decoding the generated ids.
        torch_device: Device string the inputs are moved to. ``"mps"`` and
            ``"cuda"`` additionally trigger a cache flush after every batch.
        frame_paths: Frame image paths; falsy or non-existent paths are skipped.
        timestamps: Timestamps paired with ``frame_paths`` by position.
        context: Optional known facts forwarded to ``_build_batch_prompt``.
        progress_callback: Optional callback invoked as
            ``(message, current, total)`` before each batch.
        batch_size: Frames per batch. ``None`` asks
            ``get_auto_qwen_batch_size()``; values below 1 are treated as 1.
        overlap: If True, consecutive batches share one frame. Ignored when
            the batch size is 1, because a one-frame batch cannot overlap.

    Returns:
        ``(all_objects, detections, descriptions)``. Detections of a batch are
        stamped with the timestamp of the batch's middle frame. A batch that
        fails is logged and skipped.
    """
    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]

    all_objects: dict[str, int] = {}
    detections: list[ObjectDetection] = []
    descriptions: list[str] = []

    # Auto-select batch size based on available memory
    if batch_size is None:
        batch_size = get_auto_qwen_batch_size()
    # A size of 0 would make range() raise and a negative one would yield no
    # batches at all, so clamp to the smallest meaningful value.
    batch_size = max(1, batch_size)
    # With one frame per batch there is nothing to share; honouring overlap
    # there used to keep only the very first frame and drop all the others.
    use_overlap = overlap and batch_size >= 2

    # Filter to valid frames
    valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
    if not valid_frames:
        return all_objects, detections, descriptions

    # Group frames into batches
    # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
    # Without overlap: sequential non-overlapping batches (faster)
    batches: list[list[tuple[str, float]]] = []
    step = batch_size - 1 if use_overlap else batch_size
    for i in range(0, len(valid_frames), step):
        batch = valid_frames[i : i + batch_size]
        # In overlap mode a trailing single frame was already the last frame of
        # the previous batch; keep it only when it is the sole batch.
        if use_overlap and len(batch) < 2 and batches:
            continue
        batches.append(batch)

    total_batches = len(batches)
    overlap_str = "overlapping " if use_overlap else ""
    logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches (size={batch_size}, step={step})")

    for batch_idx, batch in enumerate(batches):
        if progress_callback:
            progress_callback(
                f"Analyzing batch {batch_idx + 1}/{total_batches}...",
                batch_idx + 1,
                total_batches,
            )

        try:
            # Build multi-image message
            prompt = _build_batch_prompt(context, len(batch))

            if batch_idx == 0:
                logger.info(f"Qwen batch prompt: {prompt[:500]}")

            # Build content with all images in the batch
            content: list[dict[str, str]] = [{"type": "image", "image": f"file://{frame_path}"} for frame_path, _ in batch]
            content.append({"type": "text", "text": prompt})

            messages = [{"role": "user", "content": content}]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(torch_device)

            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    repetition_penalty=1.2,
                    no_repeat_ngram_size=3,
                )
            # Drop the prompt tokens so only the newly generated part is decoded
            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
            output_text = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
            objects, description = _parse_batch_response(output_text)

            # Associate objects with the middle timestamp of the batch
            batch_timestamps = [t for _, t in batch]
            middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]

            for obj in objects:
                obj_lower = obj.lower().strip()
                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
                # Fixed confidence and an all-zero box are recorded for every label
                detections.append(
                    ObjectDetection(
                        timestamp=round(middle_timestamp, 2),
                        label=obj_lower,
                        confidence=0.95,
                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
                    )
                )

            if description:
                descriptions.append(description)
                logger.info(f"Batch {batch_idx + 1} description: {description}")

            logger.info(f"Batch {batch_idx + 1} objects: {objects}")

        except Exception as e:
            # Best effort: one bad batch must not abort the remaining batches
            logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
        finally:
            # Release tensor references and flush the device cache on success
            # AND failure; previously a failed batch flushed only on MPS.
            inputs = generated_ids = generated_ids_trimmed = None
            if torch_device == "mps":
                torch.mps.empty_cache()
            elif torch_device == "cuda":
                torch.cuda.empty_cache()

    return all_objects, detections, descriptions
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def _analyze_frames_batch_context(
    model: Any,
    processor: Any,
    torch_device: str,
    frame_paths: list[str],
    timestamps: list[float],
    context: dict[str, str] | None,
    progress_callback: ProgressCallback | None,
    batch_size: int | None = None,
    overlap: bool = False,
) -> tuple[dict[str, int], list[ObjectDetection], list[str]]:
    """Analyze frames in batches with context passed between batches.

    Args:
        model: Loaded vision-language model; only ``generate`` is called on it.
        processor: Matching processor, used for chat templating, building the
            input tensors and decoding the generated ids.
        torch_device: Device string the inputs are moved to. ``"mps"`` and
            ``"cuda"`` additionally trigger a cache flush after every batch.
        frame_paths: Frame image paths; falsy or non-existent paths are skipped.
        timestamps: Timestamps paired with ``frame_paths`` by position.
        context: Optional known facts forwarded to
            ``_build_batch_context_prompt``.
        progress_callback: Optional callback invoked as
            ``(message, current, total)`` before each batch.
        batch_size: Frames per batch. ``None`` asks
            ``get_auto_qwen_batch_size()``; values below 1 are treated as 1.
        overlap: If True, consecutive batches share one frame. Ignored when
            the batch size is 1, because a one-frame batch cannot overlap.

    Returns:
        ``(all_objects, detections, descriptions)``. Detections of a batch are
        stamped with the timestamp of the batch's middle frame. A batch that
        fails is logged and skipped; the context handed to the next batch then
        stays the last description that was successfully produced.
    """
    from qwen_vl_utils import process_vision_info  # type: ignore[import-not-found]

    all_objects: dict[str, int] = {}
    detections: list[ObjectDetection] = []
    descriptions: list[str] = []

    # Auto-select batch size based on available memory
    if batch_size is None:
        batch_size = get_auto_qwen_batch_size()
    # A size of 0 would make range() raise and a negative one would yield no
    # batches at all, so clamp to the smallest meaningful value.
    batch_size = max(1, batch_size)
    # With one frame per batch there is nothing to share; honouring overlap
    # there used to keep only the very first frame and drop all the others.
    use_overlap = overlap and batch_size >= 2

    # Filter to valid frames
    valid_frames = [(p, t) for p, t in zip(frame_paths, timestamps) if p and os.path.exists(p)]
    if not valid_frames:
        return all_objects, detections, descriptions

    # Group frames into batches
    # With overlap: last frame of batch N = first frame of batch N+1 (visual continuity)
    # Without overlap: sequential non-overlapping batches (faster)
    batches: list[list[tuple[str, float]]] = []
    step = batch_size - 1 if use_overlap else batch_size
    for i in range(0, len(valid_frames), step):
        batch = valid_frames[i : i + batch_size]
        # In overlap mode a trailing single frame was already the last frame of
        # the previous batch; keep it only when it is the sole batch.
        if use_overlap and len(batch) < 2 and batches:
            continue
        batches.append(batch)

    total_batches = len(batches)
    overlap_str = "overlapping " if use_overlap else ""
    logger.info(f"Processing {len(valid_frames)} frames in {total_batches} {overlap_str}batches with context (size={batch_size}, step={step})")

    group_context: str | None = None

    for batch_idx, batch in enumerate(batches):
        if progress_callback:
            progress_callback(
                f"Analyzing batch {batch_idx + 1}/{total_batches} (with context)...",
                batch_idx + 1,
                total_batches,
            )

        try:
            # Build multi-image message with previous batch context
            prompt = _build_batch_context_prompt(context, len(batch), group_context)

            if batch_idx == 0:
                logger.info(f"Qwen batch-context prompt: {prompt[:500]}")

            # Build content with all images in the batch
            content: list[dict[str, str]] = [{"type": "image", "image": f"file://{frame_path}"} for frame_path, _ in batch]
            content.append({"type": "text", "text": prompt})

            messages = [{"role": "user", "content": content}]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(torch_device)

            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,
                    repetition_penalty=1.2,
                    no_repeat_ngram_size=3,
                )
            # Drop the prompt tokens so only the newly generated part is decoded
            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
            output_text = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            logger.info(f"Qwen batch {batch_idx + 1} raw output: {output_text[:500]}")
            objects, description = _parse_batch_response(output_text)

            # Use description as context for next batch
            if description:
                group_context = description

            # Associate objects with the middle timestamp of the batch
            batch_timestamps = [t for _, t in batch]
            middle_timestamp = batch_timestamps[len(batch_timestamps) // 2]

            for obj in objects:
                obj_lower = obj.lower().strip()
                all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
                # Fixed confidence and an all-zero box are recorded for every label
                detections.append(
                    ObjectDetection(
                        timestamp=round(middle_timestamp, 2),
                        label=obj_lower,
                        confidence=0.95,
                        bbox=BoundingBox(x=0, y=0, width=0, height=0),
                    )
                )

            if description:
                descriptions.append(description)
                logger.info(f"Batch {batch_idx + 1} description: {description}")

            logger.info(f"Batch {batch_idx + 1} objects: {objects}")

        except Exception as e:
            # Best effort: one bad batch must not abort the remaining batches
            logger.error(f"Failed to process batch {batch_idx + 1}: {e}", exc_info=True)
        finally:
            # Release tensor references and flush the device cache on success
            # AND failure; previously a failed batch flushed only on MPS.
            inputs = generated_ids = generated_ids_trimmed = None
            if torch_device == "mps":
                torch.mps.empty_cache()
            elif torch_device == "cuda":
                torch.cuda.empty_cache()

    return all_objects, detections, descriptions
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
def _fix_malformed_json(text: str) -> str:
    """Fix common JSON malformations from VLM output.

    Markdown code fences are always removed. The remaining repairs are applied
    only when the text does not already contain a parseable JSON object, so
    valid output is never rewritten (the regexes below can otherwise alter
    quotes and commas that sit inside legitimate string values).

    Args:
        text: Raw model output, possibly with prose around the JSON object.

    Returns:
        The cleaned text. It is not guaranteed to be valid JSON; callers still
        have to handle a parse failure.
    """

    def _has_valid_object(candidate: str) -> bool:
        """Return True if the span from the first "{" to the last "}" parses."""
        start = candidate.find("{")
        end = candidate.rfind("}")
        if start == -1 or end < start:
            return False
        try:
            json.loads(candidate[start : end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return False
        return True

    # Remove markdown code blocks
    text = text.replace("```json", "").replace("```", "").strip()
    if _has_valid_object(text):
        return text

    # Fix escaped quotes before colons: "action\": -> "action":
    text = text.replace('\\":', '":')

    # Replace single quotes with double quotes for keys and string values
    # But be careful not to replace apostrophes within words
    # First, handle keys: 'key': -> "key":
    text = re.sub(r"'(\w+)'(\s*):", r'"\1"\2:', text)

    # Handle string values: : 'value' -> : "value"
    # This regex looks for : followed by optional whitespace and a single-quoted string
    text = re.sub(r":\s*'([^']*)'", r': "\1"', text)

    # Remove trailing commas before ] or }
    text = re.sub(r",(\s*[\]\}])", r"\1", text)
    if _has_valid_object(text):
        return text

    # Last resort: single-quoted list items, ['a', 'b'] -> ["a", "b"].
    # Kept only if it actually yields a parseable object, so the result is
    # never worse than the repairs above.
    repaired = re.sub(r"([\[,]\s*)'([^']*)'(?=\s*[,\]])", r'\1"\2"', text)
    return repaired if _has_valid_object(repaired) else text
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
def _parse_batch_response(response: str) -> tuple[list[str], str | None]:
    """Parse objects and description from batch analysis response.

    Handles both standard format and batch-specific format with action field.

    Args:
        response: Raw text generated by the model.

    Returns:
        ``(objects, description)``. ``description`` joins the ``action``,
        ``description``, ``continues`` and ``change`` fields (in that order,
        the last one prefixed with ``"Change: "``) and is ``None`` when none
        of them holds text. If the response contains no ``{`` or cannot be
        parsed, the result of ``_parse_objects_and_description`` is returned.
    """
    objects: list[str] = []
    description: str | None = None

    try:
        clean_response = _fix_malformed_json(response)

        if "{" in clean_response:
            start_brace = clean_response.find("{")
            # rindex raises ValueError when no "}" exists -> handled below
            json_str = clean_response[start_brace : clean_response.rindex("}") + 1]
            data = json.loads(json_str)

            # Extract objects. The model does not always return a list:
            # a bare string used to be iterated character by character, and
            # null/number values raised an uncaught TypeError.
            raw_objects = data.get("objects", [])
            if isinstance(raw_objects, str):
                raw_objects = [part.strip() for part in raw_objects.split(",")]
            elif not isinstance(raw_objects, list):
                raw_objects = []

            for obj in raw_objects:
                if isinstance(obj, dict):
                    # Object given as a mapping: take its name, else its label
                    obj = obj.get("name", "") or obj.get("label", "")
                if isinstance(obj, str) and len(obj) < 100 and obj.strip():
                    objects.append(obj)

            # Build description from available fields
            desc_parts = []
            field_prefixes = (
                ("action", ""),  # batch-specific
                ("description", ""),  # standard
                ("continues", ""),  # batch-context specific
                ("change", "Change: "),  # context-specific
            )
            for field, prefix in field_prefixes:
                value = data.get(field, "")
                if isinstance(value, str) and value.strip():
                    desc_parts.append(f"{prefix}{value.strip()}")

            if desc_parts:
                description = " ".join(desc_parts)

            return objects, description

    except (json.JSONDecodeError, ValueError) as e:
        logger.warning(f"Failed to parse batch JSON from Qwen response: {e}")

    # Fallback to standard parser
    return _parse_objects_and_description(response)
|
|
1022
|
+
|
|
1023
|
+
|
|
337
1024
|
def extract_objects_qwen(
|
|
338
1025
|
file_path: str,
|
|
339
1026
|
timestamps: list[float] | None = None,
|
|
@@ -341,6 +1028,8 @@ def extract_objects_qwen(
|
|
|
341
1028
|
context: dict[str, str] | None = None,
|
|
342
1029
|
progress_callback: ProgressCallback | None = None,
|
|
343
1030
|
lut_path: str | None = None,
|
|
1031
|
+
batch_overlap: bool = False,
|
|
1032
|
+
strategy: str | None = None,
|
|
344
1033
|
) -> ObjectsResult:
|
|
345
1034
|
"""Extract objects using Qwen2-VL vision-language model.
|
|
346
1035
|
|
|
@@ -359,12 +1048,19 @@ def extract_objects_qwen(
|
|
|
359
1048
|
- "topic": Subject matter of the video
|
|
360
1049
|
progress_callback: Optional callback for progress updates (message, current, total)
|
|
361
1050
|
lut_path: Optional path to a LUT file (.cube) to apply for log footage color correction
|
|
1051
|
+
batch_overlap: If True, batches overlap by 1 frame for visual continuity.
|
|
1052
|
+
Useful for unstable camera or videos with rapid scene changes.
|
|
1053
|
+
Default False for faster processing.
|
|
1054
|
+
strategy: Override Qwen strategy for this file. One of:
|
|
1055
|
+
- "single": No temporal context (fastest)
|
|
1056
|
+
- "context": Pass previous description as text
|
|
1057
|
+
- "batch": Multi-frame batches
|
|
1058
|
+
- "batch_context": Batches with text context between (richest)
|
|
1059
|
+
If None, uses global setting from config.
|
|
362
1060
|
|
|
363
1061
|
Returns:
|
|
364
1062
|
ObjectsResult with detected objects and contextual descriptions
|
|
365
1063
|
"""
|
|
366
|
-
from qwen_vl_utils import process_vision_info # type: ignore[import-not-found]
|
|
367
|
-
|
|
368
1064
|
logger.info(f"extract_objects_qwen called: file={file_path}, timestamps={timestamps}, context={context}")
|
|
369
1065
|
|
|
370
1066
|
settings = get_settings()
|
|
@@ -384,7 +1080,7 @@ def extract_objects_qwen(
|
|
|
384
1080
|
if timestamps is None:
|
|
385
1081
|
duration = _get_video_duration(file_path)
|
|
386
1082
|
timestamps = [duration / 2]
|
|
387
|
-
logger.info(f"No timestamps provided, sampling from middle ({duration/2:.1f}s)")
|
|
1083
|
+
logger.info(f"No timestamps provided, sampling from middle ({duration / 2:.1f}s)")
|
|
388
1084
|
else:
|
|
389
1085
|
logger.info(f"Analyzing {len(timestamps)} provided timestamps")
|
|
390
1086
|
|
|
@@ -401,15 +1097,13 @@ def extract_objects_qwen(
|
|
|
401
1097
|
if lut_path and os.path.exists(lut_path):
|
|
402
1098
|
# LUT applied - colors are corrected but may still be slightly off
|
|
403
1099
|
context["log_footage_note"] = (
|
|
404
|
-
"This footage was recorded in LOG profile and color-corrected with a LUT.
|
|
1100
|
+
"This footage was recorded in LOG profile and color-corrected with a LUT. Colors shown are the corrected version but may still appear slightly desaturated."
|
|
405
1101
|
)
|
|
406
1102
|
logger.info("Added log footage context hint (with LUT)")
|
|
407
1103
|
elif is_log_footage:
|
|
408
1104
|
# LOG detected but no LUT - colors are definitely off
|
|
409
1105
|
context["log_footage_note"] = (
|
|
410
|
-
f"This footage appears to be in LOG/flat color profile ({color_transfer}). "
|
|
411
|
-
"Colors are desaturated and not representative of the actual scene. "
|
|
412
|
-
"Focus on describing content and action, not colors."
|
|
1106
|
+
f"This footage appears to be in LOG/flat color profile ({color_transfer}). Colors are desaturated and not representative of the actual scene. Focus on describing content and action, not colors."
|
|
413
1107
|
)
|
|
414
1108
|
logger.info(f"Added log footage context hint (no LUT, color_transfer={color_transfer})")
|
|
415
1109
|
|
|
@@ -431,7 +1125,7 @@ def extract_objects_qwen(
|
|
|
431
1125
|
except (RuntimeError, MemoryError, OSError) as e:
|
|
432
1126
|
error_msg = str(e).lower()
|
|
433
1127
|
if "out of memory" in error_msg or "cannot allocate" in error_msg:
|
|
434
|
-
logger.error(f"Out of memory loading Qwen model.
|
|
1128
|
+
logger.error(f"Out of memory loading Qwen model. Close other apps or use a cloud vision API. Error: {e}")
|
|
435
1129
|
# Return empty result - frontend can fall back to cloud API if configured
|
|
436
1130
|
return ObjectsResult(
|
|
437
1131
|
summary={},
|
|
@@ -443,108 +1137,41 @@ def extract_objects_qwen(
|
|
|
443
1137
|
|
|
444
1138
|
logger.info(f"Processing {total_frames} frames for Qwen analysis")
|
|
445
1139
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
]
|
|
482
|
-
|
|
483
|
-
# Process inputs
|
|
484
|
-
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
485
|
-
image_inputs, video_inputs = process_vision_info(messages)
|
|
486
|
-
inputs = processor(
|
|
487
|
-
text=[text],
|
|
488
|
-
images=image_inputs,
|
|
489
|
-
videos=video_inputs,
|
|
490
|
-
padding=True,
|
|
491
|
-
return_tensors="pt",
|
|
492
|
-
)
|
|
493
|
-
inputs = inputs.to(torch_device)
|
|
494
|
-
|
|
495
|
-
# Generate response with repetition penalty to prevent loops
|
|
496
|
-
with torch.no_grad():
|
|
497
|
-
generated_ids = model.generate(
|
|
498
|
-
**inputs,
|
|
499
|
-
max_new_tokens=512,
|
|
500
|
-
do_sample=False, # Greedy decoding for consistent JSON
|
|
501
|
-
repetition_penalty=1.2, # Penalize repetition
|
|
502
|
-
no_repeat_ngram_size=3, # Prevent 3-gram repetition
|
|
503
|
-
)
|
|
504
|
-
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
|
505
|
-
output_text = processor.batch_decode(
|
|
506
|
-
generated_ids_trimmed,
|
|
507
|
-
skip_special_tokens=True,
|
|
508
|
-
clean_up_tokenization_spaces=False,
|
|
509
|
-
)[0]
|
|
510
|
-
|
|
511
|
-
# Parse response
|
|
512
|
-
logger.info(f"Qwen raw output for {timestamp:.1f}s: {output_text[:500]}")
|
|
513
|
-
objects, description = _parse_objects_and_description(output_text)
|
|
514
|
-
if not description:
|
|
515
|
-
logger.warning(f"No description parsed from Qwen output at {timestamp:.1f}s")
|
|
516
|
-
for obj in objects:
|
|
517
|
-
obj_lower = obj.lower().strip()
|
|
518
|
-
all_objects[obj_lower] = all_objects.get(obj_lower, 0) + 1
|
|
519
|
-
|
|
520
|
-
detections.append(
|
|
521
|
-
ObjectDetection(
|
|
522
|
-
timestamp=round(timestamp, 2),
|
|
523
|
-
label=obj_lower,
|
|
524
|
-
confidence=0.95, # VLM confidence is generally high
|
|
525
|
-
bbox=BoundingBox(x=0, y=0, width=0, height=0), # No bbox from VLM
|
|
526
|
-
)
|
|
527
|
-
)
|
|
528
|
-
|
|
529
|
-
if description:
|
|
530
|
-
descriptions.append(description)
|
|
531
|
-
logger.info(f"Frame {timestamp:.1f}s description: {description}")
|
|
532
|
-
|
|
533
|
-
logger.info(f"Frame {timestamp:.1f}s objects: {objects}")
|
|
534
|
-
|
|
535
|
-
# Clear memory after each frame
|
|
536
|
-
del inputs, generated_ids
|
|
537
|
-
if torch_device == "mps":
|
|
538
|
-
torch.mps.empty_cache()
|
|
539
|
-
elif torch_device == "cuda":
|
|
540
|
-
torch.cuda.empty_cache()
|
|
541
|
-
|
|
542
|
-
except Exception as e:
|
|
543
|
-
logger.error(f"Failed to process frame {frame_path}: {e}", exc_info=True)
|
|
544
|
-
# Try to recover memory
|
|
545
|
-
if torch_device == "mps":
|
|
546
|
-
torch.mps.empty_cache()
|
|
547
|
-
continue
|
|
1140
|
+
# Get strategy for multi-frame analysis (use override if provided)
|
|
1141
|
+
if strategy is not None:
|
|
1142
|
+
resolved_strategy = QwenStrategy(strategy)
|
|
1143
|
+
logger.info(f"Using Qwen strategy override: {resolved_strategy}")
|
|
1144
|
+
else:
|
|
1145
|
+
resolved_strategy = settings.get_qwen_strategy()
|
|
1146
|
+
logger.info(f"Using Qwen strategy from config: {resolved_strategy}")
|
|
1147
|
+
|
|
1148
|
+
# Dispatch to appropriate strategy implementation
|
|
1149
|
+
if resolved_strategy == QwenStrategy.SINGLE:
|
|
1150
|
+
all_objects, detections, descriptions = _analyze_frames_single(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
|
|
1151
|
+
elif resolved_strategy == QwenStrategy.CONTEXT:
|
|
1152
|
+
all_objects, detections, descriptions = _analyze_frames_with_context(model, processor, torch_device, frame_paths, timestamps, context, progress_callback)
|
|
1153
|
+
elif resolved_strategy == QwenStrategy.BATCH:
|
|
1154
|
+
all_objects, detections, descriptions = _analyze_frames_batch(
|
|
1155
|
+
model,
|
|
1156
|
+
processor,
|
|
1157
|
+
torch_device,
|
|
1158
|
+
frame_paths,
|
|
1159
|
+
timestamps,
|
|
1160
|
+
context,
|
|
1161
|
+
progress_callback,
|
|
1162
|
+
overlap=batch_overlap,
|
|
1163
|
+
)
|
|
1164
|
+
else: # BATCH_CONTEXT
|
|
1165
|
+
all_objects, detections, descriptions = _analyze_frames_batch_context(
|
|
1166
|
+
model,
|
|
1167
|
+
processor,
|
|
1168
|
+
torch_device,
|
|
1169
|
+
frame_paths,
|
|
1170
|
+
timestamps,
|
|
1171
|
+
context,
|
|
1172
|
+
progress_callback,
|
|
1173
|
+
overlap=batch_overlap,
|
|
1174
|
+
)
|
|
548
1175
|
|
|
549
1176
|
# Deduplicate - count unique objects per type
|
|
550
1177
|
unique_objects = _deduplicate_objects(all_objects)
|
|
@@ -667,8 +1294,7 @@ def _parse_objects_and_description(response: str) -> tuple[list[str], str | None
|
|
|
667
1294
|
|
|
668
1295
|
# Try to find and parse JSON
|
|
669
1296
|
try:
|
|
670
|
-
|
|
671
|
-
clean_response = response.replace("```json", "").replace("```", "").strip()
|
|
1297
|
+
clean_response = _fix_malformed_json(response)
|
|
672
1298
|
|
|
673
1299
|
# Try to parse as JSON (could be object or array)
|
|
674
1300
|
if "[" in clean_response or "{" in clean_response:
|