videopython 0.33.0__tar.gz → 0.33.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.33.0 → videopython-0.33.2}/PKG-INFO +1 -1
- {videopython-0.33.0 → videopython-0.33.2}/pyproject.toml +4 -1
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/audio.py +14 -9
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/image.py +6 -1
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/translation.py +2 -2
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/video.py +21 -13
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/audio.py +11 -2
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/faces.py +11 -16
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/image.py +4 -13
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/temporal.py +12 -6
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/audio/audio.py +4 -4
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/_ffmpeg.py +5 -5
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/_video_io.py +1 -1
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/description.py +21 -20
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/transcription.py +10 -8
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/video.py +2 -2
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/editing/__init__.py +20 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/editing/effects.py +649 -2
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/editing/operation.py +4 -5
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/editing/streaming.py +8 -2
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/editing/transforms.py +2 -2
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/editing/video_edit.py +2 -2
- {videopython-0.33.0 → videopython-0.33.2}/.gitignore +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/LICENSE +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/README.md +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/_device.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/config.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/expressiveness.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/loudness.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/dubbing/voice_sample.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/video_analysis/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/video_analysis/analyzer.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/video_analysis/models.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/video_analysis/sampling.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/ai/video_analysis/stages.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/__init__.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/base/image_text.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/editing/transcription_overlay.py +0 -0
- {videopython-0.33.0 → videopython-0.33.2}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.33.0"
|
|
3
|
+
version = "0.33.2"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -137,6 +137,9 @@ Documentation = "https://videopython.com"
|
|
|
137
137
|
[tool.mypy]
|
|
138
138
|
mypy_path = "src/stubs"
|
|
139
139
|
plugins = ["pydantic.mypy"]
|
|
140
|
+
warn_unused_ignores = true
|
|
141
|
+
warn_redundant_casts = true
|
|
142
|
+
disallow_any_generics = true
|
|
140
143
|
|
|
141
144
|
[[tool.mypy.overrides]]
|
|
142
145
|
module = [
|
|
@@ -32,8 +32,8 @@ class TextToSpeech:
|
|
|
32
32
|
self.language = language
|
|
33
33
|
self._model: Any = None
|
|
34
34
|
|
|
35
|
-
def
|
|
36
|
-
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
|
35
|
+
def _init_local(self) -> None:
|
|
36
|
+
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
|
37
37
|
|
|
38
38
|
requested_device = self.device
|
|
39
39
|
device = select_device(self.device, mps_allowed=False)
|
|
@@ -83,7 +83,7 @@ class TextToSpeech:
|
|
|
83
83
|
import numpy as np
|
|
84
84
|
|
|
85
85
|
if self._model is None:
|
|
86
|
-
self.
|
|
86
|
+
self._init_local()
|
|
87
87
|
|
|
88
88
|
speaker_wav_path: Path | None = None
|
|
89
89
|
cleanup_path = False
|
|
@@ -149,7 +149,6 @@ class TextToMusic:
|
|
|
149
149
|
self.device = device
|
|
150
150
|
self._processor: Any = None
|
|
151
151
|
self._model: Any = None
|
|
152
|
-
self._device: str | None = None
|
|
153
152
|
|
|
154
153
|
def _init_local(self) -> None:
|
|
155
154
|
"""Initialize local MusicGen model."""
|
|
@@ -160,17 +159,17 @@ class TextToMusic:
|
|
|
160
159
|
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
161
160
|
|
|
162
161
|
requested_device = self.device
|
|
163
|
-
|
|
162
|
+
device = select_device(self.device, mps_allowed=True)
|
|
164
163
|
|
|
165
164
|
model_name = "facebook/musicgen-small"
|
|
166
165
|
self._processor = AutoProcessor.from_pretrained(model_name)
|
|
167
166
|
self._model = MusicgenForConditionalGeneration.from_pretrained(model_name)
|
|
168
|
-
self._model.to(
|
|
169
|
-
self.device =
|
|
167
|
+
self._model.to(device)
|
|
168
|
+
self.device = device
|
|
170
169
|
log_device_initialization(
|
|
171
170
|
"TextToMusic",
|
|
172
171
|
requested_device=requested_device,
|
|
173
|
-
resolved_device=
|
|
172
|
+
resolved_device=device,
|
|
174
173
|
)
|
|
175
174
|
|
|
176
175
|
def generate_audio(self, text: str, max_new_tokens: int = 256) -> Audio:
|
|
@@ -179,7 +178,7 @@ class TextToMusic:
|
|
|
179
178
|
self._init_local()
|
|
180
179
|
|
|
181
180
|
inputs = self._processor(text=[text], padding=True, return_tensors="pt")
|
|
182
|
-
inputs = {k: v.to(self.
|
|
181
|
+
inputs = {k: v.to(self.device) if hasattr(v, "to") else v for k, v in inputs.items()}
|
|
183
182
|
audio_values = self._model.generate(**inputs, max_new_tokens=max_new_tokens)
|
|
184
183
|
sampling_rate = self._model.config.audio_encoder.sampling_rate
|
|
185
184
|
|
|
@@ -193,3 +192,9 @@ class TextToMusic:
|
|
|
193
192
|
frame_count=len(audio_data),
|
|
194
193
|
)
|
|
195
194
|
return Audio(audio_data, metadata)
|
|
195
|
+
|
|
196
|
+
def unload(self) -> None:
|
|
197
|
+
"""Release the MusicGen model so the next generate_audio() re-initializes."""
|
|
198
|
+
self._model = None
|
|
199
|
+
self._processor = None
|
|
200
|
+
release_device_memory(self.device)
|
|
@@ -6,7 +6,7 @@ from typing import Any
|
|
|
6
6
|
|
|
7
7
|
from PIL import Image
|
|
8
8
|
|
|
9
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
9
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class TextToImage:
|
|
@@ -49,3 +49,8 @@ class TextToImage:
|
|
|
49
49
|
if self._pipeline is None:
|
|
50
50
|
self._init_local()
|
|
51
51
|
return self._pipeline(prompt=prompt).images[0]
|
|
52
|
+
|
|
53
|
+
def unload(self) -> None:
|
|
54
|
+
"""Release the diffusion pipeline so the next generate_image() re-initializes."""
|
|
55
|
+
self._pipeline = None
|
|
56
|
+
release_device_memory(self.device)
|
|
@@ -170,7 +170,7 @@ class MarianTranslator:
|
|
|
170
170
|
return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
|
|
171
171
|
|
|
172
172
|
def _init_local(self, source_lang: str, target_lang: str) -> None:
|
|
173
|
-
from transformers import MarianMTModel, MarianTokenizer
|
|
173
|
+
from transformers import MarianMTModel, MarianTokenizer
|
|
174
174
|
|
|
175
175
|
model_name = self._get_local_model_name(source_lang, target_lang)
|
|
176
176
|
|
|
@@ -181,7 +181,7 @@ class MarianTranslator:
|
|
|
181
181
|
self._model = MarianMTModel.from_pretrained(model_name).to(device)
|
|
182
182
|
self.device = device
|
|
183
183
|
log_device_initialization(
|
|
184
|
-
"
|
|
184
|
+
"MarianTranslator",
|
|
185
185
|
requested_device=requested_device,
|
|
186
186
|
resolved_device=device,
|
|
187
187
|
)
|
|
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any
|
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
|
|
9
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
9
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
10
10
|
from videopython.base.video import Video
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
@@ -29,22 +29,21 @@ class TextToVideo:
|
|
|
29
29
|
def __init__(self, device: str | None = None):
|
|
30
30
|
self.device = device
|
|
31
31
|
self._pipeline: Any = None
|
|
32
|
-
self._device: str | None = None
|
|
33
32
|
|
|
34
33
|
def _init_local(self) -> None:
|
|
35
34
|
from diffusers import CogVideoXPipeline
|
|
36
35
|
|
|
37
36
|
requested_device = self.device
|
|
38
|
-
|
|
37
|
+
device, dtype = _get_torch_device_and_dtype(self.device)
|
|
39
38
|
|
|
40
39
|
model_name = "THUDM/CogVideoX1.5-5B"
|
|
41
40
|
self._pipeline = CogVideoXPipeline.from_pretrained(model_name, torch_dtype=dtype)
|
|
42
|
-
self._pipeline.to(
|
|
43
|
-
self.device =
|
|
41
|
+
self._pipeline.to(device)
|
|
42
|
+
self.device = device
|
|
44
43
|
log_device_initialization(
|
|
45
44
|
"TextToVideo",
|
|
46
45
|
requested_device=requested_device,
|
|
47
|
-
resolved_device=
|
|
46
|
+
resolved_device=device,
|
|
48
47
|
)
|
|
49
48
|
|
|
50
49
|
def generate_video(
|
|
@@ -65,11 +64,16 @@ class TextToVideo:
|
|
|
65
64
|
num_inference_steps=num_steps,
|
|
66
65
|
num_frames=num_frames,
|
|
67
66
|
guidance_scale=guidance_scale,
|
|
68
|
-
generator=torch.Generator(device=self.
|
|
67
|
+
generator=torch.Generator(device=self.device).manual_seed(42),
|
|
69
68
|
).frames[0]
|
|
70
69
|
video_frames = np.asarray(video_frames, dtype=np.uint8)
|
|
71
70
|
return Video.from_frames(video_frames, fps=16.0)
|
|
72
71
|
|
|
72
|
+
def unload(self) -> None:
|
|
73
|
+
"""Release the diffusion pipeline so the next generate_video() re-initializes."""
|
|
74
|
+
self._pipeline = None
|
|
75
|
+
release_device_memory(self.device)
|
|
76
|
+
|
|
73
77
|
|
|
74
78
|
class ImageToVideo:
|
|
75
79
|
"""Generates videos from static images using local video diffusion."""
|
|
@@ -77,22 +81,21 @@ class ImageToVideo:
|
|
|
77
81
|
def __init__(self, device: str | None = None):
|
|
78
82
|
self.device = device
|
|
79
83
|
self._pipeline: Any = None
|
|
80
|
-
self._device: str | None = None
|
|
81
84
|
|
|
82
85
|
def _init_local(self) -> None:
|
|
83
86
|
from diffusers import CogVideoXImageToVideoPipeline
|
|
84
87
|
|
|
85
88
|
requested_device = self.device
|
|
86
|
-
|
|
89
|
+
device, dtype = _get_torch_device_and_dtype(self.device)
|
|
87
90
|
|
|
88
91
|
model_name = "THUDM/CogVideoX1.5-5B-I2V"
|
|
89
92
|
self._pipeline = CogVideoXImageToVideoPipeline.from_pretrained(model_name, torch_dtype=dtype)
|
|
90
|
-
self._pipeline.to(
|
|
91
|
-
self.device =
|
|
93
|
+
self._pipeline.to(device)
|
|
94
|
+
self.device = device
|
|
92
95
|
log_device_initialization(
|
|
93
96
|
"ImageToVideo",
|
|
94
97
|
requested_device=requested_device,
|
|
95
|
-
resolved_device=
|
|
98
|
+
resolved_device=device,
|
|
96
99
|
)
|
|
97
100
|
|
|
98
101
|
def generate_video(
|
|
@@ -115,7 +118,12 @@ class ImageToVideo:
|
|
|
115
118
|
num_inference_steps=num_steps,
|
|
116
119
|
num_frames=num_frames,
|
|
117
120
|
guidance_scale=guidance_scale,
|
|
118
|
-
generator=torch.Generator(device=self.
|
|
121
|
+
generator=torch.Generator(device=self.device).manual_seed(42),
|
|
119
122
|
).frames[0]
|
|
120
123
|
video_frames = np.asarray(video_frames, dtype=np.uint8)
|
|
121
124
|
return Video.from_frames(video_frames, fps=16.0)
|
|
125
|
+
|
|
126
|
+
def unload(self) -> None:
|
|
127
|
+
"""Release the diffusion pipeline so the next generate_video() re-initializes."""
|
|
128
|
+
self._pipeline = None
|
|
129
|
+
release_device_memory(self.device)
|
|
@@ -188,7 +188,7 @@ class AudioToText:
|
|
|
188
188
|
def _init_diarization(self) -> None:
|
|
189
189
|
"""Initialize pyannote speaker diarization pipeline."""
|
|
190
190
|
import torch
|
|
191
|
-
from pyannote.audio import Pipeline
|
|
191
|
+
from pyannote.audio import Pipeline
|
|
192
192
|
|
|
193
193
|
self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
|
|
194
194
|
self._diarization_pipeline.to(torch.device(self.device))
|
|
@@ -214,7 +214,7 @@ class AudioToText:
|
|
|
214
214
|
self._vad_model = None
|
|
215
215
|
release_device_memory(self.device)
|
|
216
216
|
|
|
217
|
-
def _process_transcription_result(self, transcription_result: dict) -> Transcription:
|
|
217
|
+
def _process_transcription_result(self, transcription_result: dict[str, Any]) -> Transcription:
|
|
218
218
|
"""Process raw transcription result into a Transcription object."""
|
|
219
219
|
transcription_segments = []
|
|
220
220
|
for segment in transcription_result["segments"]:
|
|
@@ -520,6 +520,15 @@ class AudioClassifier:
|
|
|
520
520
|
|
|
521
521
|
self._labels = [self._model.config.id2label[i] for i in range(len(self._model.config.id2label))]
|
|
522
522
|
|
|
523
|
+
def unload(self) -> None:
|
|
524
|
+
"""Release the AST model so the next classify() re-initializes.
|
|
525
|
+
|
|
526
|
+
Used by low-memory dubbing to free VRAM between pipeline stages.
|
|
527
|
+
"""
|
|
528
|
+
self._model = None
|
|
529
|
+
self._processor = None
|
|
530
|
+
release_device_memory(self.device)
|
|
531
|
+
|
|
523
532
|
def _merge_events(self, events: list[AudioEvent], gap_threshold: float = 0.5) -> list[AudioEvent]:
|
|
524
533
|
"""Merge consecutive events of the same class."""
|
|
525
534
|
if not events:
|
|
@@ -237,7 +237,7 @@ class FaceTracker:
|
|
|
237
237
|
|
|
238
238
|
def _select_face(
|
|
239
239
|
self,
|
|
240
|
-
faces: list,
|
|
240
|
+
faces: list[DetectedFace],
|
|
241
241
|
frame_width: int,
|
|
242
242
|
frame_height: int,
|
|
243
243
|
) -> tuple[float, float, float, float] | None:
|
|
@@ -251,29 +251,24 @@ class FaceTracker:
|
|
|
251
251
|
Returns:
|
|
252
252
|
Tuple of (center_x, center_y, width, height) in normalized coords, or None.
|
|
253
253
|
"""
|
|
254
|
-
if not
|
|
254
|
+
faces_with_box = [(f, f.bounding_box) for f in faces if f.bounding_box is not None]
|
|
255
|
+
if not faces_with_box:
|
|
255
256
|
return None
|
|
256
257
|
|
|
257
258
|
if self.selection_strategy == "largest":
|
|
258
|
-
|
|
259
|
+
_, bbox = faces_with_box[0]
|
|
259
260
|
elif self.selection_strategy == "centered":
|
|
260
261
|
frame_center = (0.5, 0.5)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
key=lambda
|
|
264
|
-
(f.bounding_box.center[0] - frame_center[0]) ** 2
|
|
265
|
-
+ (f.bounding_box.center[1] - frame_center[1]) ** 2
|
|
266
|
-
),
|
|
262
|
+
_, bbox = min(
|
|
263
|
+
faces_with_box,
|
|
264
|
+
key=lambda fb: ((fb[1].center[0] - frame_center[0]) ** 2 + (fb[1].center[1] - frame_center[1]) ** 2),
|
|
267
265
|
)
|
|
268
266
|
elif self.selection_strategy == "index":
|
|
269
|
-
if self.face_index < len(
|
|
270
|
-
|
|
271
|
-
else:
|
|
272
|
-
face = faces[0]
|
|
267
|
+
idx = self.face_index if self.face_index < len(faces_with_box) else 0
|
|
268
|
+
_, bbox = faces_with_box[idx]
|
|
273
269
|
else:
|
|
274
|
-
|
|
270
|
+
_, bbox = faces_with_box[0]
|
|
275
271
|
|
|
276
|
-
bbox = face.bounding_box
|
|
277
272
|
return (bbox.center[0], bbox.center[1], bbox.width, bbox.height)
|
|
278
273
|
|
|
279
274
|
def detect_and_track(
|
|
@@ -407,7 +402,7 @@ class FaceTracker:
|
|
|
407
402
|
|
|
408
403
|
sampled_frames = [frames[i] for i in sample_indices]
|
|
409
404
|
|
|
410
|
-
sampled_detections: list[list] = []
|
|
405
|
+
sampled_detections: list[list[DetectedFace]] = []
|
|
411
406
|
for batch_start in range(0, len(sampled_frames), self.batch_size):
|
|
412
407
|
batch_end = min(batch_start + self.batch_size, len(sampled_frames))
|
|
413
408
|
batch = sampled_frames[batch_start:batch_end]
|
|
@@ -11,7 +11,7 @@ from typing import Any, Literal
|
|
|
11
11
|
import numpy as np
|
|
12
12
|
from PIL import Image
|
|
13
13
|
|
|
14
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
14
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
15
15
|
from videopython.base.description import SceneDescription
|
|
16
16
|
|
|
17
17
|
logger = logging.getLogger(__name__)
|
|
@@ -151,7 +151,7 @@ class SceneVLM:
|
|
|
151
151
|
def _init_local(self) -> None:
|
|
152
152
|
"""Initialize local Qwen3.5 model."""
|
|
153
153
|
import torch
|
|
154
|
-
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
154
|
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
155
155
|
|
|
156
156
|
t0 = time.perf_counter()
|
|
157
157
|
requested_device = self.device
|
|
@@ -190,16 +190,7 @@ class SceneVLM:
|
|
|
190
190
|
"""
|
|
191
191
|
self._model = None
|
|
192
192
|
self._processor = None
|
|
193
|
-
|
|
194
|
-
import gc
|
|
195
|
-
|
|
196
|
-
import torch
|
|
197
|
-
|
|
198
|
-
gc.collect()
|
|
199
|
-
if torch.cuda.is_available():
|
|
200
|
-
torch.cuda.empty_cache()
|
|
201
|
-
except ImportError:
|
|
202
|
-
pass
|
|
193
|
+
release_device_memory(self.device)
|
|
203
194
|
|
|
204
195
|
def _downscale_image(self, img: Image.Image) -> Image.Image:
|
|
205
196
|
"""Downscale image to fit within max_image_pixels budget, preserving aspect ratio."""
|
|
@@ -284,7 +275,7 @@ class SceneVLM:
|
|
|
284
275
|
def _generate_from_message_batch(self, messages_batch: list[list[dict[str, Any]]]) -> list[str]:
|
|
285
276
|
"""Run batch generation for one or more multimodal chat messages."""
|
|
286
277
|
import torch
|
|
287
|
-
from qwen_vl_utils import process_vision_info
|
|
278
|
+
from qwen_vl_utils import process_vision_info
|
|
288
279
|
|
|
289
280
|
if self._model is None:
|
|
290
281
|
self._init_local()
|
|
@@ -9,7 +9,7 @@ from __future__ import annotations
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import TYPE_CHECKING, Any
|
|
11
11
|
|
|
12
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
12
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
13
13
|
from videopython.base.description import SceneBoundary
|
|
14
14
|
|
|
15
15
|
if TYPE_CHECKING:
|
|
@@ -56,26 +56,32 @@ class SemanticSceneDetector:
|
|
|
56
56
|
|
|
57
57
|
self.threshold = threshold
|
|
58
58
|
self.min_scene_length = min_scene_length
|
|
59
|
-
self.
|
|
59
|
+
self.device: str | None = device
|
|
60
60
|
self._model: Any = None
|
|
61
61
|
|
|
62
|
-
def
|
|
62
|
+
def _init_local(self) -> None:
|
|
63
63
|
"""Load the TransNetV2 model with pretrained weights."""
|
|
64
64
|
if self._model is not None:
|
|
65
65
|
return
|
|
66
66
|
|
|
67
67
|
from transnetv2_pytorch import TransNetV2
|
|
68
68
|
|
|
69
|
-
requested_device = self.
|
|
70
|
-
device = select_device(self.
|
|
69
|
+
requested_device = self.device
|
|
70
|
+
device = select_device(self.device, mps_allowed=True)
|
|
71
71
|
log_device_initialization(
|
|
72
72
|
"SemanticSceneDetector",
|
|
73
73
|
requested_device=requested_device,
|
|
74
74
|
resolved_device=device,
|
|
75
75
|
)
|
|
76
|
+
self.device = device
|
|
76
77
|
self._model = TransNetV2(device=device)
|
|
77
78
|
self._model.eval()
|
|
78
79
|
|
|
80
|
+
def unload(self) -> None:
|
|
81
|
+
"""Release the TransNetV2 model so the next call re-initializes."""
|
|
82
|
+
self._model = None
|
|
83
|
+
release_device_memory(self.device)
|
|
84
|
+
|
|
79
85
|
def detect(self, video: Video) -> list[SceneBoundary]:
|
|
80
86
|
"""Detect scenes in a video using ML-based boundary detection.
|
|
81
87
|
|
|
@@ -114,7 +120,7 @@ class SemanticSceneDetector:
|
|
|
114
120
|
Returns:
|
|
115
121
|
List of SceneBoundary objects representing detected scenes.
|
|
116
122
|
"""
|
|
117
|
-
self.
|
|
123
|
+
self._init_local()
|
|
118
124
|
|
|
119
125
|
# Use TransNetV2's detect_scenes which handles everything internally
|
|
120
126
|
raw_scenes = self._model.detect_scenes(str(path), threshold=self.threshold)
|
|
@@ -5,7 +5,7 @@ import subprocess
|
|
|
5
5
|
import wave
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
@@ -69,7 +69,7 @@ class Audio:
|
|
|
69
69
|
return bool(np.all(np.abs(self.data) < 1e-7))
|
|
70
70
|
|
|
71
71
|
@staticmethod
|
|
72
|
-
def _get_ffmpeg_info(file_path: Path) -> dict:
|
|
72
|
+
def _get_ffmpeg_info(file_path: Path) -> dict[str, Any]:
|
|
73
73
|
"""Get audio metadata using ffprobe"""
|
|
74
74
|
try:
|
|
75
75
|
info = _ffmpeg.probe(file_path)
|
|
@@ -483,7 +483,7 @@ class Audio:
|
|
|
483
483
|
if first.metadata.channels == 1:
|
|
484
484
|
output = np.zeros(total_samples, dtype=np.float32)
|
|
485
485
|
else:
|
|
486
|
-
output = np.zeros((total_samples, 2), dtype=np.float32)
|
|
486
|
+
output = np.zeros((total_samples, 2), dtype=np.float32)
|
|
487
487
|
|
|
488
488
|
# Copy non-crossfaded portions
|
|
489
489
|
crossfade_start = len(first.data) - crossfade_samples
|
|
@@ -761,7 +761,7 @@ class Audio:
|
|
|
761
761
|
if base.metadata.channels == 1:
|
|
762
762
|
output = np.zeros(total_length, dtype=np.float32)
|
|
763
763
|
else:
|
|
764
|
-
output = np.zeros((total_length, 2), dtype=np.float32)
|
|
764
|
+
output = np.zeros((total_length, 2), dtype=np.float32)
|
|
765
765
|
|
|
766
766
|
# Copy base audio
|
|
767
767
|
output[: len(base.data)] = base.data
|
|
@@ -13,7 +13,7 @@ import json
|
|
|
13
13
|
import subprocess
|
|
14
14
|
from contextlib import contextmanager
|
|
15
15
|
from pathlib import Path
|
|
16
|
-
from typing import Iterator, Sequence
|
|
16
|
+
from typing import Any, Iterator, Sequence
|
|
17
17
|
|
|
18
18
|
from videopython.base.exceptions import FFmpegProbeError, FFmpegRunError
|
|
19
19
|
|
|
@@ -44,7 +44,7 @@ def run(cmd: Sequence[str], *, stdin: bytes | None = None) -> bytes:
|
|
|
44
44
|
return result.stdout
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict:
|
|
47
|
+
def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict[str, Any]:
|
|
48
48
|
"""Run ffprobe and return the parsed JSON payload.
|
|
49
49
|
|
|
50
50
|
Args:
|
|
@@ -76,7 +76,7 @@ def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict:
|
|
|
76
76
|
raise FFmpegProbeError(f"Error parsing ffprobe output: {e}") from e
|
|
77
77
|
|
|
78
78
|
|
|
79
|
-
def _terminate(proc: subprocess.Popen, *, timeout: float = 5) -> None:
|
|
79
|
+
def _terminate(proc: subprocess.Popen[bytes], *, timeout: float = 5) -> None:
|
|
80
80
|
"""Terminate a still-running process, escalating to kill after ``timeout``."""
|
|
81
81
|
if proc.poll() is None:
|
|
82
82
|
proc.terminate()
|
|
@@ -88,7 +88,7 @@ def _terminate(proc: subprocess.Popen, *, timeout: float = 5) -> None:
|
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
@contextmanager
|
|
91
|
-
def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subprocess.Popen]:
|
|
91
|
+
def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subprocess.Popen[bytes]]:
|
|
92
92
|
"""Context manager wrapping an ffmpeg decode process.
|
|
93
93
|
|
|
94
94
|
Yields a Popen with ``stdout=PIPE`` and ``stderr=DEVNULL``. Callers
|
|
@@ -116,7 +116,7 @@ def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subproces
|
|
|
116
116
|
|
|
117
117
|
|
|
118
118
|
@contextmanager
|
|
119
|
-
def popen_encode(cmd: Sequence[str]) -> Iterator[subprocess.Popen]:
|
|
119
|
+
def popen_encode(cmd: Sequence[str]) -> Iterator[subprocess.Popen[bytes]]:
|
|
120
120
|
"""Context manager wrapping an ffmpeg encode process via stdin pipe.
|
|
121
121
|
|
|
122
122
|
Yields a Popen with ``stdin=PIPE``, ``stdout=DEVNULL``, and
|