videopython 0.22.2__tar.gz → 0.22.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.22.2 → videopython-0.22.4}/PKG-INFO +1 -1
- {videopython-0.22.2 → videopython-0.22.4}/pyproject.toml +1 -1
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/image.py +22 -2
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/video_analysis.py +143 -79
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/effects.py +2 -2
- {videopython-0.22.2 → videopython-0.22.4}/.gitignore +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/LICENSE +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/README.md +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/_device.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/registry.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/combine.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/description.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/edit.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/progress.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/registry.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/scene.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/transforms.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/transitions.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/utils.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/base/video.py +0 -0
- {videopython-0.22.2 → videopython-0.22.4}/src/videopython/py.typed +0 -0
```diff
--- videopython-0.22.2/src/videopython/ai/understanding/image.py
+++ videopython-0.22.4/src/videopython/ai/understanding/image.py
@@ -28,6 +28,11 @@ _DEFAULT_PROMPT = (
 class SceneVLM:
     """Generates scene captions with local Qwen3-VL."""
 
+    # Default pixel budget per image for scene captioning. Qwen3-VL tiles
+    # images into 28x28 patches; fewer pixels = fewer vision tokens = faster
+    # inference. 384x384 = 147456 is plenty for scene-level captioning.
+    DEFAULT_MAX_IMAGE_PIXELS: int = 384 * 384
+
     def __init__(
         self,
         model_name: str | None = None,
@@ -35,6 +40,7 @@ class SceneVLM:
         max_new_tokens: int = 128,
         temperature: float = 0.0,
         model_size: Literal["2b", "4b"] = DEFAULT_SCENE_VLM_MODEL_SIZE,
+        max_image_pixels: int | None = None,
     ):
         if model_size not in SCENE_VLM_MODEL_IDS:
             supported = ", ".join(sorted(SCENE_VLM_MODEL_IDS))
@@ -45,6 +51,7 @@
         self.device = device
         self.max_new_tokens = max_new_tokens
         self.temperature = temperature
+        self.max_image_pixels = max_image_pixels if max_image_pixels is not None else self.DEFAULT_MAX_IMAGE_PIXELS
         self._processor: Any = None
         self._model: Any = None
 
@@ -54,7 +61,7 @@
 
         t0 = time.perf_counter()
         requested_device = self.device
-        resolved_device = select_device(self.device, mps_allowed=False)
+        resolved_device = select_device(self.device, mps_allowed=True)
 
         self._processor = AutoProcessor.from_pretrained(self.model_name)
         self._model = AutoModelForImageTextToText.from_pretrained(self.model_name, dtype="auto")
@@ -69,6 +76,17 @@
         )
         logger.info("SceneVLM model weights loaded in %.2fs", time.perf_counter() - t0)
 
+    def _downscale_image(self, img: Image.Image) -> Image.Image:
+        """Downscale image to fit within max_image_pixels budget, preserving aspect ratio."""
+        w, h = img.size
+        pixels = w * h
+        if pixels <= self.max_image_pixels:
+            return img
+        scale = (self.max_image_pixels / pixels) ** 0.5
+        new_w = max(1, int(w * scale))
+        new_h = max(1, int(h * scale))
+        return img.resize((new_w, new_h), Image.LANCZOS)
+
     def _generation_config_for_run(self) -> Any | None:
         base_config = getattr(self._model, "generation_config", None)
         if base_config is None or not hasattr(base_config, "to_dict"):
@@ -104,7 +122,9 @@
         if not images:
             raise ValueError("`images` must contain at least one frame")
 
-        pil_images = [Image.fromarray(img) if isinstance(img, np.ndarray) else img for img in images]
+        pil_images = [
+            self._downscale_image(Image.fromarray(img) if isinstance(img, np.ndarray) else img) for img in images
+        ]
         user_prompt = prompt or _DEFAULT_PROMPT
         content: list[dict[str, Any]] = [{"type": "image", "image": img} for img in pil_images]
         content.append({"type": "text", "text": user_prompt})
```
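The new `DEFAULT_MAX_IMAGE_PIXELS` budget caps the number of 28x28 vision patches per frame at roughly 147456 / 784 ≈ 188, versus ≈ 2645 for an unscaled 1920x1080 frame. A standalone sketch of the same pixel-budget math (the `downscale` helper and the example frame are illustrative, not part of the package API):

```python
# Illustrative sketch of the pixel-budget downscale added in 0.22.4
# (mirrors SceneVLM._downscale_image); only Pillow is required.
from PIL import Image

MAX_IMAGE_PIXELS = 384 * 384  # 147456-pixel budget per frame

def downscale(img: Image.Image, budget: int = MAX_IMAGE_PIXELS) -> Image.Image:
    w, h = img.size
    if w * h <= budget:
        return img  # already within budget, no resampling needed
    # Uniform scale s satisfying (s*w) * (s*h) = budget  =>  s = sqrt(budget / (w*h))
    scale = (budget / (w * h)) ** 0.5
    return img.resize((max(1, int(w * scale)), max(1, int(h * scale))), Image.LANCZOS)

frame = Image.new("RGB", (1920, 1080))
small = downscale(frame)
print(small.size)  # roughly 512x288; the 16:9 aspect ratio is preserved
assert small.size[0] * small.size[1] <= MAX_IMAGE_PIXELS
```

Because both dimensions are floored after scaling, the result never exceeds the budget, and aspect ratio is preserved up to integer rounding.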
```diff
--- videopython-0.22.2/src/videopython/ai/video_analysis.py
+++ videopython-0.22.4/src/videopython/ai/video_analysis.py
@@ -63,6 +63,7 @@ _GEO_TAG_KEYS: tuple[str, ...] = (
 _SCENE_VLM_FRAME_SCALE = 3.0  # controls log curve steepness for frame sampling
 _SCENE_VLM_FRAME_BASE = 5.0  # seconds per unit in log formula
 _SCENE_VLM_MAX_FRAMES = 30  # hard cap on frames per scene
+_SCENE_VLM_GROUP_THRESHOLD = 10.0  # seconds; adjacent scenes shorter than this get merged for one VLM call
 
 
 @dataclass
```
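For reference, these constants feed the sampling formula used in `_run_scene_vlm_batched` below: `frame_count = min(30, max(1, ceil(3.0 * ln(duration / 5.0 + 1))))`. A quick sketch of what that curve yields (illustrative, not package code):

```python
import math

def frame_count(duration: float) -> int:
    # min(cap, max(1, ceil(scale * ln(duration/base + 1)))) with the
    # constants above: scale=3.0, base=5.0, cap=30.
    return min(30, max(1, math.ceil(3.0 * math.log(duration / 5.0 + 1))))

for d in (1, 5, 10, 30, 60, 300, 3600):
    print(f"{d:>5}s -> {frame_count(d)} frames")
# 1s->1, 5s->3, 10s->4, 30s->6, 60s->8, 300s->13, 3600s->20
```

The 30-frame cap only binds for extremely long spans (about 5 * (e^10 - 1) ≈ 110000 seconds), so in practice the logarithm alone keeps per-call frame counts small.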
```diff
@@ -397,10 +398,31 @@ class VideoAnalyzer:
 
         run_whisper = AUDIO_TO_TEXT in enabled
         run_scene_det = SEMANTIC_SCENE_DETECTOR in enabled
+        run_vlm = SCENE_VLM in enabled
 
         transcription: Transcription | None = None
         detected: list[SceneBoundary] | None = None
 
+        # Start loading SceneVLM weights in a background thread while
+        # Whisper and TransNetV2 run. Model loading is pure I/O + CPU
+        # weight deserialization, so it overlaps well with GPU inference.
+        vlm_preload_future = None
+        scene_vlm_holder: list[SceneVLM | None] = [None]
+        if run_vlm and (run_whisper or run_scene_det):
+            from concurrent.futures import ThreadPoolExecutor as _TPE
+
+            _vlm_pool = _TPE(max_workers=1)
+
+            def _preload_vlm() -> None:
+                try:
+                    vlm = SceneVLM(**self.config.get_params(SCENE_VLM))
+                    vlm._init_local()
+                    scene_vlm_holder[0] = vlm
+                except Exception:
+                    logger.warning("SceneVLM preload failed", exc_info=True)
+
+            vlm_preload_future = _vlm_pool.submit(_preload_vlm)
+
         # Whisper and TransNetV2 operate on independent data (audio vs video
         # frames) and both fit comfortably in GPU memory together. Run them
         # concurrently via threads -- the GIL is released during GPU compute
@@ -421,6 +443,11 @@
         if run_scene_det:
             self._reset_transnetv2_torch_state()
 
+        # Wait for VLM preload to finish before freeing GPU memory.
+        if vlm_preload_future is not None:
+            vlm_preload_future.result()
+            _vlm_pool.shutdown(wait=False)
+
         # Whisper and TransNetV2 are done -- free their GPU memory before
         # loading SceneVLM (~9GB). Python GC doesn't guarantee immediate
         # cleanup, so force it and release the CUDA cache.
@@ -441,6 +468,7 @@
             video=video,
             metadata=metadata,
             scenes=scenes,
+            preloaded_scene_vlm=scene_vlm_holder[0] if run_vlm else None,
         )
         logger.info("Scene analysis completed in %.2fs", time.perf_counter() - t0)
 
```
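The preload added above is plain `concurrent.futures` usage: submit the weight load to a single-worker pool, run Whisper/TransNetV2 as before, then block on `.result()` at the point where the VLM is actually needed. A minimal self-contained sketch of the same overlap pattern (all names here are illustrative stand-ins):

```python
from concurrent.futures import ThreadPoolExecutor
import time

def load_model() -> str:
    time.sleep(2.0)          # stands in for weight deserialization / disk I/O
    return "model weights"

pool = ThreadPoolExecutor(max_workers=1)
future = pool.submit(load_model)   # starts loading in the background

time.sleep(2.0)                    # stands in for Whisper/TransNetV2 inference
model = future.result()            # ready by now; blocks only if the load is slower
pool.shutdown(wait=False)
print(model)                       # total wall time ~2s instead of ~4s sequential
```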
```diff
@@ -528,14 +556,19 @@
         video: Video | None,
         metadata: VideoMetadata,
         scenes: list[SceneBoundary],
+        preloaded_scene_vlm: SceneVLM | None = None,
     ) -> SceneAnalysisSection:
         enabled = self.config.enabled_analyzers
 
-        try:
-            scene_vlm = SceneVLM(**self.config.get_params(SCENE_VLM)) if SCENE_VLM in enabled else None
-        except Exception:
-            logger.warning("Failed to initialize SceneVLM, skipping visual understanding", exc_info=True)
-            scene_vlm = None
+        scene_vlm: SceneVLM | None
+        if preloaded_scene_vlm is not None:
+            scene_vlm = preloaded_scene_vlm
+        else:
+            try:
+                scene_vlm = SceneVLM(**self.config.get_params(SCENE_VLM)) if SCENE_VLM in enabled else None
+            except Exception:
+                logger.warning("Failed to initialize SceneVLM, skipping visual understanding", exc_info=True)
+                scene_vlm = None
 
         try:
             audio_classifier = (
```
```diff
@@ -555,6 +588,20 @@
         )
         path_audio = None
 
+        # -- Batched SceneVLM: collect all timestamps, extract frames once, run one forward pass --
+        captions: list[str | None] = [None] * len(scenes)
+        if scene_vlm is not None:
+            try:
+                captions = self._run_scene_vlm_batched(
+                    scene_vlm=scene_vlm,
+                    source_path=source_path,
+                    video=video,
+                    metadata=metadata,
+                    scenes=scenes,
+                )
+            except Exception:
+                logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
+
         samples: list[SceneAnalysisSample] = []
         t_audio_total = 0.0
         for index, scene in enumerate(scenes):
```
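The call above replaces the old per-scene loop: timestamps for every scene group are collected first, all frames come back from a single `extract_frames_at_times` call, and the flat result is partitioned back into per-group chunks by a running offset. A toy sketch of that partitioning (stand-in data, not package code):

```python
# Nine "frames" extracted in one batched call, split back into per-group
# chunks the same way _run_scene_vlm_batched walks its offset counter.
all_frames = [f"frame{i}" for i in range(9)]
group_timestamps = [[0.5, 1.5], [3.0, 4.0, 5.0], [7.0, 8.0, 9.0, 10.0]]

offset = 0
for timestamps in group_timestamps:
    chunk = all_frames[offset : offset + len(timestamps)]
    offset += len(timestamps)
    print(chunk)
# ['frame0', 'frame1']
# ['frame2', 'frame3', 'frame4']
# ['frame5', 'frame6', 'frame7', 'frame8']
```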
```diff
@@ -564,38 +611,23 @@
                 end_second=float(scene.end),
                 start_frame=int(scene.start_frame),
                 end_frame=int(scene.end_frame),
+                caption=captions[index],
             )
 
-            scene_clip: Video | None = None
-            if audio_classifier is not None and path_audio is None:
-                try:
-                    scene_clip = self._load_scene_video_clip(
-                        source_path=source_path,
-                        video=video,
-                        start_second=scene.start,
-                        end_second=scene.end,
-                    )
-                except Exception:
-                    scene_clip = None
-
-            if scene_vlm is not None:
-                try:
-                    sample.caption = self._run_scene_vlm(
-                        scene_vlm=scene_vlm,
-                        source_path=source_path,
-                        video=video,
-                        metadata=metadata,
-                        start_second=scene.start,
-                        end_second=scene.end,
-                    )
-                except Exception:
-                    logger.warning(
-                        "SceneVLM failed for scene %d (%.1f-%.1fs)", index, scene.start, scene.end, exc_info=True
-                    )
-
             if audio_classifier is not None:
                 t0 = time.perf_counter()
                 try:
+                    scene_clip: Video | None = None
+                    if path_audio is None:
+                        try:
+                            scene_clip = self._load_scene_video_clip(
+                                source_path=source_path,
+                                video=video,
+                                start_second=scene.start,
+                                end_second=scene.end,
+                            )
+                        except Exception:
+                            scene_clip = None
                     sample.audio_classification = self._run_scene_audio_classification(
                         audio_classifier=audio_classifier,
                         path_audio=path_audio,
```
```diff
@@ -616,33 +648,91 @@
 
         return SceneAnalysisSection(samples=samples)
 
-    def _run_scene_vlm(
+    def _run_scene_vlm_batched(
         self,
         *,
         scene_vlm: SceneVLM,
         source_path: Path | None,
         video: Video | None,
         metadata: VideoMetadata,
-        [remaining signature and body of the old per-scene _run_scene_vlm; collapsed in the diff viewer]
+        scenes: list[SceneBoundary],
+    ) -> list[str | None]:
+        """Extract frames for all scenes in one ffmpeg call, then caption each group.
+
+        Adjacent short scenes (< _SCENE_VLM_GROUP_THRESHOLD seconds) are merged
+        into a single VLM call to reduce per-call overhead.
+        """
+        # Group adjacent short scenes to reduce VLM call count.
+        # Each group is a list of scene indices that share one VLM call.
+        groups: list[list[int]] = []
+        current_group: list[int] = []
+        current_group_duration = 0.0
+        for i, scene in enumerate(scenes):
+            dur = max(0.0, scene.end - scene.start)
+            if current_group and current_group_duration + dur > _SCENE_VLM_GROUP_THRESHOLD:
+                groups.append(current_group)
+                current_group = [i]
+                current_group_duration = dur
+            else:
+                current_group.append(i)
+                current_group_duration += dur
+        if current_group:
+            groups.append(current_group)
+
+        # Compute timestamps for each group (treating merged scenes as one span)
+        group_timestamps: list[list[float]] = []
+        all_timestamps: list[float] = []
+        for group in groups:
+            span_start = scenes[group[0]].start
+            span_end = scenes[group[-1]].end
+            duration = max(0.0, span_end - span_start)
+            frame_count = min(
+                _SCENE_VLM_MAX_FRAMES,
+                max(1, math.ceil(_SCENE_VLM_FRAME_SCALE * math.log(duration / _SCENE_VLM_FRAME_BASE + 1))),
+            )
+            timestamps = self._sample_timestamps(start_second=span_start, end_second=span_end, frame_count=frame_count)
+            group_timestamps.append(timestamps)
+            all_timestamps.extend(timestamps)
+
+        if not all_timestamps:
+            return [None] * len(scenes)
+
+        # Extract all frames in a single ffmpeg call
+        if source_path is not None:
+            all_frames_array = extract_frames_at_times(source_path, all_timestamps)
+            all_frames: list[np.ndarray | Image.Image] = list(all_frames_array)
+        else:
+            current_video = _require_video(video)
+            max_frame = max(len(current_video.frames) - 1, 0)
+            indices = [max(0, min(max_frame, int(ts * metadata.fps))) for ts in all_timestamps]
+            all_frames = [current_video.frames[idx] for idx in indices]
+
+        # Caption each group and assign to all scenes in that group
+        captions: list[str | None] = [None] * len(scenes)
+        offset = 0
+        for group, timestamps in zip(groups, group_timestamps):
+            frame_count = len(timestamps)
+            group_frames = all_frames[offset : offset + frame_count]
+            offset += frame_count
+            if not group_frames:
+                continue
+            caption: str | None = None
+            try:
+                caption = scene_vlm.analyze_scene(group_frames) or None
+            except Exception:
+                logger.warning(
+                    "SceneVLM failed for scenes %d-%d (%.1f-%.1fs)",
+                    group[0],
+                    group[-1],
+                    scenes[group[0]].start,
+                    scenes[group[-1]].end,
+                    exc_info=True,
+                )
+                caption = None
+            for i in group:
+                captions[i] = caption
+        logger.info("SceneVLM: %d groups from %d scenes", len(groups), len(scenes))
+        return captions
 
     def _run_scene_audio_classification(
         self,
```
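The grouping pass above is greedy: scene indices accumulate in the current group until adding the next scene would push the group's summed duration past `_SCENE_VLM_GROUP_THRESHOLD`, at which point a new group starts; a scene longer than the threshold therefore ends up alone in its own group, never split. A standalone sketch on toy durations:

```python
THRESHOLD = 10.0  # mirrors _SCENE_VLM_GROUP_THRESHOLD

def group_scenes(durations: list[float]) -> list[list[int]]:
    groups: list[list[int]] = []
    current: list[int] = []
    total = 0.0
    for i, dur in enumerate(durations):
        if current and total + dur > THRESHOLD:
            groups.append(current)      # close the current group
            current, total = [i], dur   # start a new one with this scene
        else:
            current.append(i)
            total += dur
    if current:
        groups.append(current)
    return groups

# Six scenes of 2s, 3s, 4s, 8s, 1s, 20s:
print(group_scenes([2, 3, 4, 8, 1, 20]))
# [[0, 1, 2], [3, 4], [5]] -- 9s, 9s, and one long scene alone
```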
```diff
@@ -675,32 +765,6 @@
         ]
         return AudioClassification(events=offset_events, clip_predictions=classification.clip_predictions)
 
-    def _sample_scene_frames(
-        self,
-        *,
-        source_path: Path | None,
-        video: Video | None,
-        metadata: VideoMetadata,
-        start_second: float,
-        end_second: float,
-        frame_count: int,
-    ) -> list[np.ndarray | Image.Image]:
-        timestamps = self._sample_timestamps(start_second=start_second, end_second=end_second, frame_count=frame_count)
-        if not timestamps:
-            return []
-
-        if source_path is not None:
-            sampled_frames: list[np.ndarray | Image.Image] = []
-            sampled_frames.extend(extract_frames_at_times(source_path, timestamps))
-            return sampled_frames
-
-        current_video = _require_video(video)
-        max_frame = max(len(current_video.frames) - 1, 0)
-        indices = [max(0, min(max_frame, int(ts * metadata.fps))) for ts in timestamps]
-        in_memory_frames: list[np.ndarray | Image.Image] = []
-        in_memory_frames.extend(current_video.frames[idx] for idx in indices)
-        return in_memory_frames
-
     @staticmethod
     def _sample_timestamps(*, start_second: float, end_second: float, frame_count: int) -> list[float]:
         duration = max(0.0, end_second - start_second)
```
```diff
--- videopython-0.22.2/src/videopython/base/effects.py
+++ videopython-0.22.4/src/videopython/base/effects.py
@@ -582,7 +582,7 @@ class Fade(Effect):
         if video.audio is not None and not video.audio.is_silent:
             sample_rate = video.audio.metadata.sample_rate
             audio_start = round(start_s * sample_rate)
-            audio_end = round(stop_s * sample_rate)
+            audio_end = min(round(stop_s * sample_rate), len(video.audio.data))
             n_audio_samples = audio_end - audio_start
             fade_samples = min(round(self.duration * sample_rate), n_audio_samples)
 
@@ -656,7 +656,7 @@ class VolumeAdjust(AudioEffect):
 
         sample_rate = audio.metadata.sample_rate
         start_sample = round(start * sample_rate)
-        end_sample = round(stop * sample_rate)
+        end_sample = min(round(stop * sample_rate), len(audio.data))
         n_samples = end_sample - start_sample
 
         # Build volume envelope
```
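Both one-line fixes in effects.py guard the same hazard: `round(stop * sample_rate)` can point past the end of the audio buffer, and since NumPy silently truncates out-of-range slices, the sliced segment comes back shorter than the `n_samples`-length envelope built from the unclamped value. A sketch of the failure mode (the numbers and the `linspace` envelope are illustrative, not the package's actual envelope code):

```python
import numpy as np

sample_rate = 44100
data = np.zeros(100_000)                 # ~2.2676 s of audio
stop = 2.268                             # a stop time slightly past the buffer end

end_sample = round(stop * sample_rate)   # 100019 -- past len(data)
envelope = np.linspace(1.0, 0.0, end_sample - 0)

# NumPy slicing silently truncates past-the-end slices:
segment = data[0:end_sample]             # only 100000 samples come back
# segment * envelope                     # -> ValueError: shape mismatch (100000,) vs (100019,)

# The 0.22.4 fix clamps before computing the sample count, keeping both in agreement:
end_sample = min(round(stop * sample_rate), len(data))
envelope = np.linspace(1.0, 0.0, end_sample)
assert data[0:end_sample].shape == envelope.shape
```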
All other files are unchanged between 0.22.2 and 0.22.4 (listed above with +0 -0).