media-engine 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
media_engine/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.1'
-__version_tuple__ = version_tuple = (0, 1, 1)
+__version__ = version = '0.2.1'
+__version_tuple__ = version_tuple = (0, 2, 1)
 
 __commit_id__ = commit_id = None
@@ -57,6 +57,15 @@ class BatchRequest(BaseModel):
     # Optional LUT path for visual analysis (e.g., for log footage color correction)
     # Applied to extracted frames before sending to Qwen
     lut_path: str | None = None
+    # Per-file batch overlap setting for visual analysis (file path -> bool)
+    # When True, batches overlap by 1 frame for visual continuity (useful for unstable camera)
+    # Example: {"/path/shaky_video.mp4": True}
+    visual_batch_overlap: dict[str, bool] | None = None
+    # Per-file Qwen strategy override (file path -> strategy)
+    # Overrides global qwen_strategy setting for specific files
+    # Values: "single", "context", "batch", "batch_context"
+    # Example: {"/path/action_video.mp4": "batch_context"}
+    visual_strategy: dict[str, str] | None = None
 
 
 class BatchFileStatus(BaseModel):
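
Both new fields are keyed by file path, so a caller can tune visual analysis for individual files while the rest of the batch stays on defaults. A minimal usage sketch (field names come from the model above; the file paths are illustrative, and any other required `BatchRequest` fields are assumed to have defaults):

```python
# Hypothetical sketch: per-file visual-analysis overrides on BatchRequest.
request = BatchRequest(
    lut_path="/luts/slog3_to_rec709.cube",  # existing field, applied to all extracted frames
    # Overlap batches by 1 frame for the shaky clip only:
    visual_batch_overlap={"/media/shaky_video.mp4": True},
    # Force the richest Qwen strategy for the action clip; files without an
    # entry fall back to the global qwen_strategy setting:
    visual_strategy={"/media/action_video.mp4": "batch_context"},
)
```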
@@ -85,7 +85,7 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
     yolo_model = settings.get_yolo_model()
     clip_model = settings.get_clip_model()
 
-    logger.info(f"Batch {batch_id} models: whisper={whisper_model}, qwen={qwen_model}, " f"yolo={yolo_model}, clip={clip_model}")
+    logger.info(f"Batch {batch_id} models: whisper={whisper_model}, qwen={qwen_model}, yolo={yolo_model}, clip={clip_model}")
 
     batch_start_time = time.time()
     peak_memory = get_memory_mb()
@@ -117,7 +117,7 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
 
     # Add time for remaining extractors (after current one)
     remaining_extractors = EXTRACTOR_ORDER[current_ext_idx + 1 :]
-    logger.info(f"ETA calc: current={current_extractor}, remaining={remaining_extractors}, " f"enabled={enabled_extractors}")
+    logger.info(f"ETA calc: current={current_extractor}, remaining={remaining_extractors}, enabled={enabled_extractors}")
 
     for ext in remaining_extractors:
         if ext not in enabled_extractors:
@@ -210,7 +210,7 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
 
     # Debug logging for ETA calculation (use INFO level to see it)
     if total_eta and total_eta > 0:
-        logger.info(f"ETA: {extractor} stage={eta}s, total={total_eta}s, " f"subs={enabled_sub_extractors}, files={len(file_durations)}")
+        logger.info(f"ETA: {extractor} stage={eta}s, total={total_eta}s, subs={enabled_sub_extractors}, files={len(file_durations)}")
 
     # Calculate queue ETA (for all queued batches)
     queue_eta, queued_count = calculate_queue_eta()
@@ -565,7 +565,7 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
             }
             update_file_status(i, "running", "motion", motion_result)
             update_extractor_status(i, "motion", "completed")
-            logger.info(f"Motion for {fname}: stable={motion.is_stable}, " f"timestamps={len(adaptive_timestamps[i])}")
+            logger.info(f"Motion for {fname}: stable={motion.is_stable}, timestamps={len(adaptive_timestamps[i])}")
         except Exception as e:
             logger.warning(f"Motion analysis failed for {file_path}: {e}")
             update_extractor_status(i, "motion", "failed")
@@ -743,7 +743,7 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
                 face_buffer = decode_frames(file_path, timestamps=face_timestamps)
                 faces = extract_faces(file_path, frame_buffer=face_buffer)
                 face_frame_count = len(face_buffer.frames)
-                logger.info(f"Face detection on {face_frame_count} frames for {fname} " f"(short video, {face_fps} FPS)")
+                logger.info(f"Face detection on {face_frame_count} frames for {fname} (short video, {face_fps} FPS)")
             else:
                 # Long video - use adaptive batching
                 current_time = 0.0
@@ -802,14 +802,14 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
                     known_embeddings.extend(new_embs)
                     consistent_batches = 0
                     if in_verification_mode:
-                        logger.info(f"New face detected at {current_time:.1f}s, " "exiting verification mode")
+                        logger.info(f"New face detected at {current_time:.1f}s, exiting verification mode")
                         in_verification_mode = False
                 elif all_known and known_embeddings:
                     # All faces are known
                     consistent_batches += 1
                     if consistent_batches >= min_consistent_batches and not in_verification_mode:
                         in_verification_mode = True
-                        logger.info(f"Faces stable after {current_time:.1f}s, " "switching to verification mode (every 10s)")
+                        logger.info(f"Faces stable after {current_time:.1f}s, switching to verification mode (every 10s)")
                 elif not known_embeddings:
                     # No faces in this batch and no known faces yet
                     consistent_batches += 1
@@ -841,15 +841,13 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
                 )
 
                 mode_info = "verification" if in_verification_mode else "normal"
-                logger.info(
-                    f"Face detection on {total_frames} frames for {fname} " f"(adaptive batching, {len(known_embeddings)} unique, " f"ended in {mode_info} mode)"
-                )
+                logger.info(f"Face detection on {total_frames} frames for {fname} (adaptive batching, {len(known_embeddings)} unique, ended in {mode_info} mode)")
 
             # Fallback if no duration info
             if faces is None and buffer is not None:
                 faces = extract_faces(file_path, frame_buffer=buffer)
                 face_frame_count = len(buffer.frames)
-                logger.info(f"Face detection on {len(buffer.frames)} frames for {fname} " "(using shared buffer)")
+                logger.info(f"Face detection on {len(buffer.frames)} frames for {fname} (using shared buffer)")
 
             if faces:
                 faces_data = {
@@ -966,13 +964,17 @@ def run_batch_job(batch_id: str, request: BatchRequest) -> None:
                 timestamps = get_sample_timestamps(motion, max_samples=5)
 
                 file_context = request.contexts.get(file_path) if request.contexts else None
-                logger.info(f"Calling Qwen with context for {fname}: {file_context}, lut_path={request.lut_path}")
+                file_batch_overlap = request.visual_batch_overlap.get(file_path, False) if request.visual_batch_overlap else False
+                file_strategy = request.visual_strategy.get(file_path) if request.visual_strategy else None
+                logger.info(f"Calling Qwen for {fname}: context={file_context}, lut_path={request.lut_path}, batch_overlap={file_batch_overlap}, strategy={file_strategy}")
                 visual_result = extract_objects_qwen(
                     file_path,
                     timestamps=timestamps,
                     model_name=qwen_model,
                     context=file_context,
                     lut_path=request.lut_path,
+                    batch_overlap=file_batch_overlap,
+                    strategy=file_strategy,
                 )
                 visual_data: dict[str, Any] = {"summary": visual_result.summary}
                 if visual_result.descriptions:
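
The wiring above resolves each override in two steps: a missing dict yields the hard default (`False` / `None`), and a per-file miss falls back the same way, with `strategy=None` presumably letting `extract_objects_qwen` defer to the global setting. The same lookup in isolation (paths and values are illustrative):

```python
# Hypothetical illustration of the per-file fallback used above.
visual_strategy = {"/media/a.mp4": "batch"}  # as it would arrive on the request

for path in ("/media/a.mp4", "/media/b.mp4"):
    strategy = visual_strategy.get(path) if visual_strategy else None
    print(path, strategy)
# /media/a.mp4 batch  <- explicit per-file override
# /media/b.mp4 None   <- defers to the global qwen_strategy
```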
@@ -126,7 +126,7 @@ def record_timing(
     _timing_history_dirty = True
 
     unit_label = "/unit" if units else "s"
-    logger.debug(f"Recorded timing: {extractor}@{resolution_bucket} = {rate:.2f}{unit_label} " f"(avg: {avg:.2f}{unit_label} from {sample_count} samples)")
+    logger.debug(f"Recorded timing: {extractor}@{resolution_bucket} = {rate:.2f}{unit_label} (avg: {avg:.2f}{unit_label} from {sample_count} samples)")
     # Save periodically (not on every update to reduce disk I/O)
     if _timing_history_dirty and time.time() - _timing_history_last_save > _TIMING_SAVE_INTERVAL:
         save_timing_history()
media_engine/config.py CHANGED
@@ -81,6 +81,15 @@ class ObjectDetector(StrEnum):
     QWEN = "qwen"
 
 
+class QwenStrategy(StrEnum):
+    """Qwen temporal context strategy for multi-frame analysis."""
+
+    SINGLE = "single"  # No context (current behavior)
+    CONTEXT = "context"  # Pass previous description as text
+    BATCH = "batch"  # Multi-frame batch (2-3 frames together)
+    BATCH_CONTEXT = "batch_context"  # Batch + pass context between groups
+
+
 # =============================================================================
 # Settings (loaded from JSON config file)
 # =============================================================================
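
Because `QwenStrategy` subclasses `StrEnum`, config strings round-trip to members with no mapping layer; a quick sketch of the behavior this relies on:

```python
from media_engine.config import QwenStrategy

QwenStrategy("batch_context")    # -> QwenStrategy.BATCH_CONTEXT (lookup by value)
QwenStrategy.BATCH == "batch"    # -> True (members are str instances)
f"using {QwenStrategy.CONTEXT}"  # -> "using context" (StrEnum formats as its value)
```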
@@ -120,6 +129,7 @@ class Settings(BaseModel):
     object_detector: str = DEFAULT_OBJECT_DETECTOR  # "auto", "yolo", or "qwen"
     qwen_model: str = DEFAULT_QWEN_MODEL
     qwen_frames_per_scene: int = DEFAULT_QWEN_FRAMES_PER_SCENE
+    qwen_strategy: str = "auto"  # "auto", "single", "context", "batch", "batch_context"
 
     # YOLO model ("auto" = select based on VRAM)
     yolo_model: str = "auto"
@@ -163,6 +173,12 @@
             return get_auto_object_detector()
         return ObjectDetector(self.object_detector)
 
+    def get_qwen_strategy(self) -> "QwenStrategy":
+        """Get resolved Qwen strategy (handles 'auto')."""
+        if self.qwen_strategy == "auto":
+            return get_auto_qwen_strategy()
+        return QwenStrategy(self.qwen_strategy)
+
 
 def get_config_path() -> Path:
     """Get the config file path."""
@@ -369,7 +385,7 @@ def get_free_memory_gb() -> float:
         # Leave a 1GB buffer for system processes
         available_for_models = max(0.0, available_gb - 1.0)
 
-        logger.info(f"Memory: {mem.total / (1024**3):.0f}GB total, " f"{mem.available / (1024**3):.1f}GB available, " f"{available_for_models:.1f}GB for models")
+        logger.info(f"Memory: {mem.total / (1024**3):.0f}GB total, {mem.available / (1024**3):.1f}GB available, {available_for_models:.1f}GB for models")
         return available_for_models
 
     except ImportError:
@@ -432,46 +448,101 @@ def get_auto_whisper_model() -> str:
 
 
 def get_auto_qwen_model() -> str:
-    """Select Qwen2-VL model based on available VRAM.
+    """Select Qwen2-VL model based on available free memory.
 
-    | VRAM     | Model          | Size  | Quality |
-    |----------|----------------|-------|---------|
-    | <8GB     | (use YOLO)     | -     | Basic   |
-    | 8-16GB   | Qwen2-VL-2B    | ~5GB  | Good    |
-    | 16GB+    | Qwen2-VL-7B    | ~15GB | Best    |
+    | Free Memory | Model          | Size  | Quality |
+    |-------------|----------------|-------|---------|
+    | <8GB        | (use YOLO)     | -     | Basic   |
+    | 8-16GB      | Qwen2-VL-2B    | ~5GB  | Good    |
+    | 16GB+       | Qwen2-VL-7B    | ~15GB | Best    |
     """
-    vram = get_available_vram_gb()
+    free_mem = get_free_memory_gb()
 
-    if vram >= 16:
+    if free_mem >= 16:
         model = "Qwen/Qwen2-VL-7B-Instruct"
-    elif vram >= 8:
+    elif free_mem >= 8:
         model = "Qwen/Qwen2-VL-2B-Instruct"
     else:
-        # Not enough VRAM for Qwen, should use YOLO instead
+        # Not enough free memory for Qwen, should use YOLO instead
         model = "Qwen/Qwen2-VL-2B-Instruct"
-        logger.warning(f"Low VRAM ({vram:.1f}GB) - consider using YOLO instead of Qwen")
+        logger.warning(f"Low free memory ({free_mem:.1f}GB) - consider using YOLO instead of Qwen")
 
-    logger.info(f"Auto-selected Qwen model: {model} (VRAM: {vram:.1f}GB)")
+    logger.info(f"Auto-selected Qwen model: {model} (free memory: {free_mem:.1f}GB)")
     return model
 
 
 def get_auto_object_detector() -> ObjectDetector:
-    """Select object detector based on available VRAM.
+    """Select object detector based on available free memory.
 
     YOLO is faster and uses less memory.
-    Qwen provides better scene understanding but needs more VRAM.
+    Qwen provides better scene understanding but needs more memory.
     """
-    vram = get_available_vram_gb()
+    free_mem = get_free_memory_gb()
 
-    if vram >= 8:
+    if free_mem >= 8:
         detector = ObjectDetector.QWEN
     else:
         detector = ObjectDetector.YOLO
 
-    logger.info(f"Auto-selected object detector: {detector} (VRAM: {vram:.1f}GB)")
+    logger.info(f"Auto-selected object detector: {detector} (free memory: {free_mem:.1f}GB)")
     return detector
 
 
+def get_auto_qwen_strategy() -> QwenStrategy:
+    """Select Qwen temporal context strategy based on available free memory.
+
+    Thresholds based on Qwen 2B with 1080p images (max 1280px width).
+
+    | Free Memory | Strategy      | Frames per Call | Description              |
+    |-------------|---------------|-----------------|--------------------------|
+    | <8GB        | CONTEXT       | 1               | Text context only        |
+    | 8-12GB      | BATCH         | 2-3             | Multi-frame batches      |
+    | 12GB+       | BATCH_CONTEXT | 2-3             | Batches + text context   |
+    """
+    free_mem = get_free_memory_gb()
+
+    if free_mem >= 12:
+        strategy = QwenStrategy.BATCH_CONTEXT
+    elif free_mem >= 8:
+        strategy = QwenStrategy.BATCH
+    else:
+        strategy = QwenStrategy.CONTEXT
+
+    logger.info(f"Auto-selected Qwen strategy: {strategy} (free memory: {free_mem:.1f}GB)")
+    return strategy
+
+
+def get_auto_qwen_batch_size() -> int:
+    """Select Qwen batch size based on available free memory.
+
+    Larger batches provide better temporal context but use more memory.
+    Each additional frame in a batch adds ~0.5-1GB memory overhead.
+
+    | Free Memory | Batch Size | Notes                    |
+    |-------------|------------|--------------------------|
+    | <10GB       | 2          | Minimal batching         |
+    | 10-15GB     | 3          | Default batch size       |
+    | 15-25GB     | 4          | Good temporal context    |
+    | 25-40GB     | 5          | Rich temporal context    |
+    | 40GB+       | 6          | Maximum temporal context |
+    """
+    free_mem = get_free_memory_gb()
+
+    if free_mem >= 40:
+        batch_size = 6
+    elif free_mem >= 25:
+        batch_size = 5
+    elif free_mem >= 15:
+        batch_size = 4
+    elif free_mem >= 10:
+        batch_size = 3
+    else:
+        batch_size = 2
+
+    logger.info(f"Auto-selected Qwen batch size: {batch_size} (free memory: {free_mem:.1f}GB)")
+    return batch_size
+
+
 def get_auto_yolo_model() -> str:
     """Select YOLO model based on available VRAM.
 
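
Both heuristics read the same `get_free_memory_gb()` measurement, so a host lands on a consistent strategy/batch-size pair. For instance, per the tables above, a machine reporting ~13GB free would get:

```python
get_auto_qwen_strategy()    # -> QwenStrategy.BATCH_CONTEXT  (13 >= 12)
get_auto_qwen_batch_size()  # -> 3                           (10 <= 13 < 15)
```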
@@ -546,6 +617,7 @@ def get_vram_summary() -> dict:
         "free_memory_gb": round(free_mem, 1),
         "auto_whisper_model": get_auto_whisper_model(),
         "auto_qwen_model": get_auto_qwen_model() if vram >= 8 else None,
+        "auto_qwen_strategy": str(get_auto_qwen_strategy()),
         "auto_yolo_model": get_auto_yolo_model(),
         "auto_clip_model": get_auto_clip_model(),
         "auto_object_detector": str(get_auto_object_detector()),
@@ -654,7 +726,7 @@ def check_memory_before_load(model_name: str, clear_memory_func: Any | None = No
     available = vram if device != DeviceType.CPU else ram
 
     if available < required_gb:
-        logger.warning(f"Low memory ({available:.1f}GB available) for {model_name} " f"({required_gb:.1f}GB required)")
+        logger.warning(f"Low memory ({available:.1f}GB available) for {model_name} ({required_gb:.1f}GB required)")
 
         # Try to free memory
         if clear_memory_func is not None:
@@ -200,7 +200,7 @@ def extract_faces(
     unique_faces, unique_estimate = _deduplicate_faces(detections, all_embeddings, frame_size=frame_size)
 
     needs_review = sum(1 for f in unique_faces if f.needs_review)
-    logger.info(f"Detected {len(detections)} faces, {unique_estimate} unique, " f"{needs_review} need review")
+    logger.info(f"Detected {len(detections)} faces, {unique_estimate} unique, {needs_review} need review")
 
     return FacesResult(
         count=len(detections),
@@ -286,7 +286,7 @@ def decode_frames(
     out_width = out_width - (out_width % 2)
     out_height = out_height - (out_height % 2)
 
-    logger.info(f"Decoding {len(timestamps)} frames from {file_path} " f"at {out_width}x{out_height}" + (f" (hwaccel={hwaccel})" if hwaccel else ""))
+    logger.info(f"Decoding {len(timestamps)} frames from {file_path} at {out_width}x{out_height}" + (f" (hwaccel={hwaccel})" if hwaccel else ""))
 
     frames: dict[float, SharedFrame] = {}
 
@@ -99,7 +99,7 @@ class FrameExtractor:
         pixels = self._width * self._height
         max_dim = max(self._width, self._height)
         if pixels > HIGH_RES_THRESHOLD and max_dim > self.max_dimension:
-            logger.info(f"High-res video ({self._width}x{self._height}), " f"using FFmpeg decode at {self.max_dimension}px")
+            logger.info(f"High-res video ({self._width}x{self._height}), using FFmpeg decode at {self.max_dimension}px")
             self._use_ffmpeg_decode = True
             # Release opencv capture - we'll use FFmpeg instead
             self.cap.release()
@@ -279,7 +279,7 @@
         try:
             # Scale filter that maintains aspect ratio
             # scale=W:H:force_original_aspect_ratio=decrease
-            scale_filter = f"scale={self.max_dimension}:{self.max_dimension}" f":force_original_aspect_ratio=decrease"
+            scale_filter = f"scale={self.max_dimension}:{self.max_dimension}:force_original_aspect_ratio=decrease"
 
             cmd = [
                 "ffmpeg",
@@ -416,7 +416,7 @@ class SonyExtractor:
                 total_duration=recording.total_duration,
                 file_index=file_index,
             )
-            logger.info(f"Detected spanned recording: file {file_index + 1} of {len(recording.clips)}, " f"total duration {recording.total_duration:.1f}s")
+            logger.info(f"Detected spanned recording: file {file_index + 1} of {len(recording.clips)}, total duration {recording.total_duration:.1f}s")
 
         return Metadata(
             duration=base_metadata.duration,
@@ -377,7 +377,7 @@ def analyze_motion(
         total_flow_time += time.perf_counter() - flow_start
 
     # Log timing breakdown
-    logger.info(f"Motion analysis timing: decode={total_load_time:.2f}s, " f"optical_flow={total_flow_time:.2f}s, frames={global_frame_idx}")
+    logger.info(f"Motion analysis timing: decode={total_load_time:.2f}s, optical_flow={total_flow_time:.2f}s, frames={global_frame_idx}")
 
     if not frame_motions:
         return MotionAnalysis(
@@ -693,7 +693,7 @@ def get_adaptive_timestamps(
             motion.duration * 0.5,
             motion.duration * 0.85,
         ]
-        logger.info(f"Stable video optimization: {len(timestamps)} frames only " f"(avg_intensity={motion.avg_intensity:.1f})")
+        logger.info(f"Stable video optimization: {len(timestamps)} frames only (avg_intensity={motion.avg_intensity:.1f})")
         return timestamps
 
     if motion.is_stable:
@@ -704,7 +704,7 @@
         else:
             step = motion.duration / (num_samples + 1)
             timestamps = [step * (i + 1) for i in range(num_samples)]
-        logger.info(f"Stable video: {len(timestamps)} frames " f"(avg_intensity={motion.avg_intensity:.1f})")
+        logger.info(f"Stable video: {len(timestamps)} frames (avg_intensity={motion.avg_intensity:.1f})")
         return timestamps
 
     if not motion.segments:
@@ -753,7 +753,7 @@
     # Ensure timestamps are within video bounds
     timestamps = [max(0.1, min(t, motion.duration - 0.1)) for t in timestamps]
 
-    logger.info(f"Adaptive sampling: {len(timestamps)} frames " f"(avg_intensity={motion.avg_intensity:.1f}, stable={motion.is_stable})")
+    logger.info(f"Adaptive sampling: {len(timestamps)} frames (avg_intensity={motion.avg_intensity:.1f}, stable={motion.is_stable})")
 
     return timestamps
 
@@ -157,7 +157,7 @@ def extract_objects(
     # Deduplicate - track unique objects
     unique_detections, summary = _deduplicate_objects(raw_detections)
 
-    logger.info(f"Detected {len(raw_detections)} objects, " f"{len(unique_detections)} unique across {len(summary)} types")
+    logger.info(f"Detected {len(raw_detections)} objects, {len(unique_detections)} unique across {len(summary)} types")
 
     return ObjectsResult(
         summary=summary,