videopython 0.22.2__tar.gz → 0.22.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {videopython-0.22.2 → videopython-0.22.4}/PKG-INFO +1 -1
  2. {videopython-0.22.2 → videopython-0.22.4}/pyproject.toml +1 -1
  3. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/image.py +22 -2
  4. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/video_analysis.py +143 -79
  5. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/effects.py +2 -2
  6. {videopython-0.22.2 → videopython-0.22.4}/.gitignore +0 -0
  7. {videopython-0.22.2 → videopython-0.22.4}/LICENSE +0 -0
  8. {videopython-0.22.2 → videopython-0.22.4}/README.md +0 -0
  9. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/__init__.py +0 -0
  10. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/__init__.py +0 -0
  11. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/_device.py +0 -0
  12. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/__init__.py +0 -0
  13. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/dubber.py +0 -0
  14. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/models.py +0 -0
  15. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/pipeline.py +0 -0
  16. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/timing.py +0 -0
  17. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/__init__.py +0 -0
  18. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/audio.py +0 -0
  19. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/translation.py +0 -0
  21. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/video.py +0 -0
  22. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/registry.py +0 -0
  23. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/__init__.py +0 -0
  24. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/inpainter.py +0 -0
  25. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/models.py +0 -0
  26. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/segmenter.py +0 -0
  27. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/swapper.py +0 -0
  28. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/transforms.py +0 -0
  29. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/__init__.py +0 -0
  30. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/audio.py +0 -0
  31. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/separation.py +0 -0
  32. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/temporal.py +0 -0
  33. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/description.py +0 -0
  39. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/edit.py +0 -0
  40. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/text/__init__.py +0 -0
  45. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/text/overlay.py +0 -0
  46. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/text/transcription.py +0 -0
  47. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/transforms.py +0 -0
  48. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/transitions.py +0 -0
  49. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/utils.py +0 -0
  50. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/video.py +0 -0
  51. {videopython-0.22.2 → videopython-0.22.4}/src/videopython/py.typed +0 -0
{videopython-0.22.2 → videopython-0.22.4}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.22.2
+Version: 0.22.4
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
{videopython-0.22.2 → videopython-0.22.4}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.22.2"
+version = "0.22.4"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
{videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/image.py
@@ -28,6 +28,11 @@ _DEFAULT_PROMPT = (
 class SceneVLM:
     """Generates scene captions with local Qwen3-VL."""
 
+    # Default pixel budget per image for scene captioning. Qwen3-VL tiles
+    # images into 28x28 patches; fewer pixels = fewer vision tokens = faster
+    # inference. 384x384 = 147456 is plenty for scene-level captioning.
+    DEFAULT_MAX_IMAGE_PIXELS: int = 384 * 384
+
     def __init__(
         self,
         model_name: str | None = None,
@@ -35,6 +40,7 @@ class SceneVLM:
         max_new_tokens: int = 128,
         temperature: float = 0.0,
         model_size: Literal["2b", "4b"] = DEFAULT_SCENE_VLM_MODEL_SIZE,
+        max_image_pixels: int | None = None,
     ):
         if model_size not in SCENE_VLM_MODEL_IDS:
             supported = ", ".join(sorted(SCENE_VLM_MODEL_IDS))
@@ -45,6 +51,7 @@ class SceneVLM:
         self.device = device
         self.max_new_tokens = max_new_tokens
         self.temperature = temperature
+        self.max_image_pixels = max_image_pixels if max_image_pixels is not None else self.DEFAULT_MAX_IMAGE_PIXELS
         self._processor: Any = None
         self._model: Any = None
 
@@ -54,7 +61,7 @@ class SceneVLM:
 
         t0 = time.perf_counter()
         requested_device = self.device
-        resolved_device = select_device(self.device, mps_allowed=False)
+        resolved_device = select_device(self.device, mps_allowed=True)
 
         self._processor = AutoProcessor.from_pretrained(self.model_name)
         self._model = AutoModelForImageTextToText.from_pretrained(self.model_name, dtype="auto")
@@ -69,6 +76,17 @@ class SceneVLM:
         )
         logger.info("SceneVLM model weights loaded in %.2fs", time.perf_counter() - t0)
 
+    def _downscale_image(self, img: Image.Image) -> Image.Image:
+        """Downscale image to fit within max_image_pixels budget, preserving aspect ratio."""
+        w, h = img.size
+        pixels = w * h
+        if pixels <= self.max_image_pixels:
+            return img
+        scale = (self.max_image_pixels / pixels) ** 0.5
+        new_w = max(1, int(w * scale))
+        new_h = max(1, int(h * scale))
+        return img.resize((new_w, new_h), Image.LANCZOS)
+
     def _generation_config_for_run(self) -> Any | None:
         base_config = getattr(self._model, "generation_config", None)
         if base_config is None or not hasattr(base_config, "to_dict"):
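The scale factor is the square root of the budget ratio, so the area lands at (or just under) the budget while the aspect ratio is preserved: a 1920x1080 frame under the default 384*384 = 147,456-pixel budget comes out at roughly 512x288, which is about 188 of the 28x28 patches the comment above mentions. A standalone sketch of the same arithmetic (the function name is illustrative, not part of the package):

    # Standalone sketch of the aspect-preserving downscale above.
    # `budget` mirrors DEFAULT_MAX_IMAGE_PIXELS.
    def fit_to_pixel_budget(size: tuple[int, int], budget: int = 384 * 384) -> tuple[int, int]:
        w, h = size
        if w * h <= budget:
            return (w, h)
        scale = (budget / (w * h)) ** 0.5
        return (max(1, int(w * scale)), max(1, int(h * scale)))

    print(fit_to_pixel_budget((1920, 1080)))  # about (512, 288); int() truncation may shave a pixel
    print(fit_to_pixel_budget((320, 240)))    # (320, 240): already within budget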
@@ -104,7 +122,9 @@ class SceneVLM:
         if not images:
             raise ValueError("`images` must contain at least one frame")
 
-        pil_images = [Image.fromarray(img) if isinstance(img, np.ndarray) else img for img in images]
+        pil_images = [
+            self._downscale_image(Image.fromarray(img) if isinstance(img, np.ndarray) else img) for img in images
+        ]
         user_prompt = prompt or _DEFAULT_PROMPT
         content: list[dict[str, Any]] = [{"type": "image", "image": img} for img in pil_images]
         content.append({"type": "text", "text": user_prompt})
{videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/video_analysis.py
@@ -63,6 +63,7 @@ _GEO_TAG_KEYS: tuple[str, ...] = (
 _SCENE_VLM_FRAME_SCALE = 3.0  # controls log curve steepness for frame sampling
 _SCENE_VLM_FRAME_BASE = 5.0  # seconds per unit in log formula
 _SCENE_VLM_MAX_FRAMES = 30  # hard cap on frames per scene
+_SCENE_VLM_GROUP_THRESHOLD = 10.0  # seconds; adjacent scenes shorter than this get merged for one VLM call
 
 
 @dataclass
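These constants set the per-scene frame budget used further down: frames = min(30, max(1, ceil(3.0 * ln(duration / 5.0 + 1)))). A quick standalone check of what the curve yields:

    import math

    SCALE, BASE, MAX_FRAMES = 3.0, 5.0, 30  # mirror the module constants

    def frames_for(duration: float) -> int:
        return min(MAX_FRAMES, max(1, math.ceil(SCALE * math.log(duration / BASE + 1))))

    for d in (2, 5, 10, 30, 60, 300, 3600):
        print(d, frames_for(d))  # 2->2, 5->3, 10->4, 30->6, 60->8, 300->13, 3600->20

The 30-frame cap only binds for spans longer than 5 * (e^10 - 1) ≈ 110,000 seconds, so for ordinary scenes the log curve alone controls the budget.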
@@ -397,10 +398,31 @@ class VideoAnalyzer:
 
         run_whisper = AUDIO_TO_TEXT in enabled
         run_scene_det = SEMANTIC_SCENE_DETECTOR in enabled
+        run_vlm = SCENE_VLM in enabled
 
         transcription: Transcription | None = None
         detected: list[SceneBoundary] | None = None
 
+        # Start loading SceneVLM weights in a background thread while
+        # Whisper and TransNetV2 run. Model loading is pure I/O + CPU
+        # weight deserialization, so it overlaps well with GPU inference.
+        vlm_preload_future = None
+        scene_vlm_holder: list[SceneVLM | None] = [None]
+        if run_vlm and (run_whisper or run_scene_det):
+            from concurrent.futures import ThreadPoolExecutor as _TPE
+
+            _vlm_pool = _TPE(max_workers=1)
+
+            def _preload_vlm() -> None:
+                try:
+                    vlm = SceneVLM(**self.config.get_params(SCENE_VLM))
+                    vlm._init_local()
+                    scene_vlm_holder[0] = vlm
+                except Exception:
+                    logger.warning("SceneVLM preload failed", exc_info=True)
+
+            vlm_preload_future = _vlm_pool.submit(_preload_vlm)
+
         # Whisper and TransNetV2 operate on independent data (audio vs video
         # frames) and both fit comfortably in GPU memory together. Run them
         # concurrently via threads -- the GIL is released during GPU compute
@@ -421,6 +443,11 @@ class VideoAnalyzer:
         if run_scene_det:
             self._reset_transnetv2_torch_state()
 
+        # Wait for VLM preload to finish before freeing GPU memory.
+        if vlm_preload_future is not None:
+            vlm_preload_future.result()
+            _vlm_pool.shutdown(wait=False)
+
         # Whisper and TransNetV2 are done -- free their GPU memory before
         # loading SceneVLM (~9GB). Python GC doesn't guarantee immediate
         # cleanup, so force it and release the CUDA cache.
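The hunk stops short of the cleanup itself; the comment points at the usual PyTorch pattern, which (as an assumption, not the package's verbatim code) looks like:

    import gc

    import torch

    # Sketch of the cleanup the comment describes: drop the last references
    # to the finished models, force collection, then return cached CUDA
    # blocks to the driver so SceneVLM's ~9GB of weights can fit.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()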
@@ -441,6 +468,7 @@ class VideoAnalyzer:
             video=video,
             metadata=metadata,
             scenes=scenes,
+            preloaded_scene_vlm=scene_vlm_holder[0] if run_vlm else None,
         )
         logger.info("Scene analysis completed in %.2fs", time.perf_counter() - t0)
 
@@ -528,14 +556,19 @@ class VideoAnalyzer:
         video: Video | None,
         metadata: VideoMetadata,
         scenes: list[SceneBoundary],
+        preloaded_scene_vlm: SceneVLM | None = None,
     ) -> SceneAnalysisSection:
         enabled = self.config.enabled_analyzers
 
-        try:
-            scene_vlm = SceneVLM(**self.config.get_params(SCENE_VLM)) if SCENE_VLM in enabled else None
-        except Exception:
-            logger.warning("Failed to initialize SceneVLM, skipping visual understanding", exc_info=True)
-            scene_vlm = None
+        scene_vlm: SceneVLM | None
+        if preloaded_scene_vlm is not None:
+            scene_vlm = preloaded_scene_vlm
+        else:
+            try:
+                scene_vlm = SceneVLM(**self.config.get_params(SCENE_VLM)) if SCENE_VLM in enabled else None
+            except Exception:
+                logger.warning("Failed to initialize SceneVLM, skipping visual understanding", exc_info=True)
+                scene_vlm = None
 
         try:
             audio_classifier = (
@@ -555,6 +588,20 @@ class VideoAnalyzer:
             )
             path_audio = None
 
+        # -- Batched SceneVLM: collect all timestamps, extract frames once, run one forward pass --
+        captions: list[str | None] = [None] * len(scenes)
+        if scene_vlm is not None:
+            try:
+                captions = self._run_scene_vlm_batched(
+                    scene_vlm=scene_vlm,
+                    source_path=source_path,
+                    video=video,
+                    metadata=metadata,
+                    scenes=scenes,
+                )
+            except Exception:
+                logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
+
         samples: list[SceneAnalysisSample] = []
         t_audio_total = 0.0
         for index, scene in enumerate(scenes):
@@ -564,38 +611,23 @@ class VideoAnalyzer:
                 end_second=float(scene.end),
                 start_frame=int(scene.start_frame),
                 end_frame=int(scene.end_frame),
+                caption=captions[index],
             )
 
-            scene_clip: Video | None = None
-            if audio_classifier is not None and path_audio is None:
-                try:
-                    scene_clip = self._load_scene_video_clip(
-                        source_path=source_path,
-                        video=video,
-                        start_second=scene.start,
-                        end_second=scene.end,
-                    )
-                except Exception:
-                    scene_clip = None
-
-            if scene_vlm is not None:
-                try:
-                    sample.caption = self._run_scene_vlm(
-                        scene_vlm=scene_vlm,
-                        source_path=source_path,
-                        video=video,
-                        metadata=metadata,
-                        start_second=scene.start,
-                        end_second=scene.end,
-                    )
-                except Exception:
-                    logger.warning(
-                        "SceneVLM failed for scene %d (%.1f-%.1fs)", index, scene.start, scene.end, exc_info=True
-                    )
-
             if audio_classifier is not None:
                 t0 = time.perf_counter()
                 try:
+                    scene_clip: Video | None = None
+                    if path_audio is None:
+                        try:
+                            scene_clip = self._load_scene_video_clip(
+                                source_path=source_path,
+                                video=video,
+                                start_second=scene.start,
+                                end_second=scene.end,
+                            )
+                        except Exception:
+                            scene_clip = None
                     sample.audio_classification = self._run_scene_audio_classification(
                         audio_classifier=audio_classifier,
                         path_audio=path_audio,
@@ -616,33 +648,91 @@ class VideoAnalyzer:
 
         return SceneAnalysisSection(samples=samples)
 
-    def _run_scene_vlm(
+    def _run_scene_vlm_batched(
         self,
         *,
         scene_vlm: SceneVLM,
         source_path: Path | None,
         video: Video | None,
         metadata: VideoMetadata,
-        start_second: float,
-        end_second: float,
-    ) -> str | None:
-        duration = max(0.0, end_second - start_second)
-        frame_count = min(
-            _SCENE_VLM_MAX_FRAMES,
-            max(1, math.ceil(_SCENE_VLM_FRAME_SCALE * math.log(duration / _SCENE_VLM_FRAME_BASE + 1))),
-        )
-        frames = self._sample_scene_frames(
-            source_path=source_path,
-            video=video,
-            metadata=metadata,
-            start_second=start_second,
-            end_second=end_second,
-            frame_count=frame_count,
-        )
-        if not frames:
-            return None
-        caption = scene_vlm.analyze_scene(frames)
-        return caption or None
+        scenes: list[SceneBoundary],
+    ) -> list[str | None]:
+        """Extract frames for all scenes in one ffmpeg call, then caption each group.
+
+        Adjacent short scenes (< _SCENE_VLM_GROUP_THRESHOLD seconds) are merged
+        into a single VLM call to reduce per-call overhead.
+        """
+        # Group adjacent short scenes to reduce VLM call count.
+        # Each group is a list of scene indices that share one VLM call.
+        groups: list[list[int]] = []
+        current_group: list[int] = []
+        current_group_duration = 0.0
+        for i, scene in enumerate(scenes):
+            dur = max(0.0, scene.end - scene.start)
+            if current_group and current_group_duration + dur > _SCENE_VLM_GROUP_THRESHOLD:
+                groups.append(current_group)
+                current_group = [i]
+                current_group_duration = dur
+            else:
+                current_group.append(i)
+                current_group_duration += dur
+        if current_group:
+            groups.append(current_group)
+
+        # Compute timestamps for each group (treating merged scenes as one span)
+        group_timestamps: list[list[float]] = []
+        all_timestamps: list[float] = []
+        for group in groups:
+            span_start = scenes[group[0]].start
+            span_end = scenes[group[-1]].end
+            duration = max(0.0, span_end - span_start)
+            frame_count = min(
+                _SCENE_VLM_MAX_FRAMES,
+                max(1, math.ceil(_SCENE_VLM_FRAME_SCALE * math.log(duration / _SCENE_VLM_FRAME_BASE + 1))),
+            )
+            timestamps = self._sample_timestamps(start_second=span_start, end_second=span_end, frame_count=frame_count)
+            group_timestamps.append(timestamps)
+            all_timestamps.extend(timestamps)
+
+        if not all_timestamps:
+            return [None] * len(scenes)
+
+        # Extract all frames in a single ffmpeg call
+        if source_path is not None:
+            all_frames_array = extract_frames_at_times(source_path, all_timestamps)
+            all_frames: list[np.ndarray | Image.Image] = list(all_frames_array)
+        else:
+            current_video = _require_video(video)
+            max_frame = max(len(current_video.frames) - 1, 0)
+            indices = [max(0, min(max_frame, int(ts * metadata.fps))) for ts in all_timestamps]
+            all_frames = [current_video.frames[idx] for idx in indices]
+
+        # Caption each group and assign to all scenes in that group
+        captions: list[str | None] = [None] * len(scenes)
+        offset = 0
+        for group, timestamps in zip(groups, group_timestamps):
+            frame_count = len(timestamps)
+            group_frames = all_frames[offset : offset + frame_count]
+            offset += frame_count
+            if not group_frames:
+                continue
+            caption: str | None = None
+            try:
+                caption = scene_vlm.analyze_scene(group_frames) or None
+            except Exception:
+                logger.warning(
+                    "SceneVLM failed for scenes %d-%d (%.1f-%.1fs)",
+                    group[0],
+                    group[-1],
+                    scenes[group[0]].start,
+                    scenes[group[-1]].end,
+                    exc_info=True,
+                )
+                caption = None
+            for i in group:
+                captions[i] = caption
+        logger.info("SceneVLM: %d groups from %d scenes", len(groups), len(scenes))
+        return captions
 
     def _run_scene_audio_classification(
         self,
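Note that the grouping loop keys on cumulative duration rather than each scene's own length: indices are appended until adding the next scene would push the group past _SCENE_VLM_GROUP_THRESHOLD, and a scene longer than the threshold still forms its own group. For durations [3, 4, 5, 12, 2] this yields [[0, 1], [2], [3], [4]]. A standalone replica of the loop:

    # Standalone replica of the grouping logic above, using plain durations.
    THRESHOLD = 10.0  # mirrors _SCENE_VLM_GROUP_THRESHOLD

    def group_scenes(durations: list[float]) -> list[list[int]]:
        groups: list[list[int]] = []
        current: list[int] = []
        total = 0.0
        for i, dur in enumerate(durations):
            if current and total + dur > THRESHOLD:
                groups.append(current)
                current, total = [i], dur
            else:
                current.append(i)
                total += dur
        if current:
            groups.append(current)
        return groups

    print(group_scenes([3, 4, 5, 12, 2]))  # [[0, 1], [2], [3], [4]]

Every scene in a group receives the group's caption, so the returned list stays index-aligned with `scenes` even after merging.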
@@ -675,32 +765,6 @@ class VideoAnalyzer:
         ]
         return AudioClassification(events=offset_events, clip_predictions=classification.clip_predictions)
 
-    def _sample_scene_frames(
-        self,
-        *,
-        source_path: Path | None,
-        video: Video | None,
-        metadata: VideoMetadata,
-        start_second: float,
-        end_second: float,
-        frame_count: int,
-    ) -> list[np.ndarray | Image.Image]:
-        timestamps = self._sample_timestamps(start_second=start_second, end_second=end_second, frame_count=frame_count)
-        if not timestamps:
-            return []
-
-        if source_path is not None:
-            sampled_frames: list[np.ndarray | Image.Image] = []
-            sampled_frames.extend(extract_frames_at_times(source_path, timestamps))
-            return sampled_frames
-
-        current_video = _require_video(video)
-        max_frame = max(len(current_video.frames) - 1, 0)
-        indices = [max(0, min(max_frame, int(ts * metadata.fps))) for ts in timestamps]
-        in_memory_frames: list[np.ndarray | Image.Image] = []
-        in_memory_frames.extend(current_video.frames[idx] for idx in indices)
-        return in_memory_frames
-
     @staticmethod
     def _sample_timestamps(*, start_second: float, end_second: float, frame_count: int) -> list[float]:
         duration = max(0.0, end_second - start_second)
{videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/effects.py
@@ -582,7 +582,7 @@ class Fade(Effect):
         if video.audio is not None and not video.audio.is_silent:
             sample_rate = video.audio.metadata.sample_rate
             audio_start = round(start_s * sample_rate)
-            audio_end = round(stop_s * sample_rate)
+            audio_end = min(round(stop_s * sample_rate), len(video.audio.data))
             n_audio_samples = audio_end - audio_start
             fade_samples = min(round(self.duration * sample_rate), n_audio_samples)
 
@@ -656,7 +656,7 @@ class VolumeAdjust(AudioEffect):
 
         sample_rate = audio.metadata.sample_rate
         start_sample = round(start * sample_rate)
-        end_sample = round(stop * sample_rate)
+        end_sample = min(round(stop * sample_rate), len(audio.data))
         n_samples = end_sample - start_sample
 
         # Build volume envelope
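Both one-line fixes clamp the same rounding overshoot: round(stop * sample_rate) can land past the end of the buffer when the requested stop time slightly exceeds the audio's true length (for example, a duration derived from frame count divided by fps). The slice silently truncates while the envelope keeps the unclamped length, so the two no longer match. A small illustration with made-up values:

    import numpy as np

    sr = 44100
    data = np.zeros(110250)  # exactly 2.5 s of audio
    stop = 2.50004           # stop time derived elsewhere, slightly past the buffer

    end = round(stop * sr)                           # 110252 -- two samples past the end
    end_clamped = min(round(stop * sr), len(data))   # 110250, as in the fix above

    # Without the clamp, an envelope sized end - start no longer matches
    # the slice numpy actually returns, and the multiply raises ValueError:
    start = 0
    envelope = np.linspace(1.0, 0.0, end - start)
    # data[start:end] has length 110250, envelope has 110252 -> shape mismatch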