videopython 0.26.1__tar.gz → 0.26.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.1 → videopython-0.26.2}/PKG-INFO +1 -1
- {videopython-0.26.1 → videopython-0.26.2}/pyproject.toml +1 -1
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/_device.py +27 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/dubbing/dubber.py +15 -5
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/dubbing/pipeline.py +39 -3
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/generation/audio.py +11 -1
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/generation/translation.py +11 -1
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/understanding/audio.py +10 -1
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/understanding/separation.py +9 -1
- {videopython-0.26.1 → videopython-0.26.2}/.gitignore +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/LICENSE +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/README.md +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/description.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/base/video.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.1 → videopython-0.26.2}/src/videopython/py.typed +0 -0
|
@@ -25,6 +25,33 @@ def log_device_initialization(
|
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
def release_device_memory(device: str | None) -> None:
    """Release cached allocator memory for the given device.

    A garbage-collection pass runs first, then the torch backend matching
    ``device`` is asked to empty its allocator cache.

    Args:
        device: Device string such as ``"cpu"``, ``"cuda"``, ``"cuda:0"`` or
            ``"mps"``. Matching is case-insensitive and ignores a trailing
            ``:<index>`` suffix, so ``"cuda:1"`` and ``"CUDA"`` are treated
            like ``"cuda"``. ``None`` or any other backend only triggers the
            garbage-collection pass.

    Safe to call when torch is not importable (returns immediately) or the
    device is CPU/None.
    """
    try:
        import torch
    except ImportError:
        return

    import gc

    # Collect before touching the allocator so objects that became
    # unreachable just before this call are gone when the cache is emptied.
    gc.collect()

    if not isinstance(device, str):
        return

    # "cuda:0" -> "cuda", " MPS " -> "mps"; the index is irrelevant for
    # deciding which backend cache to empty.
    backend = device.partition(":")[0].strip().lower()

    if backend == "cuda":
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return

    if backend == "mps":
        # Resolve the MPS pieces defensively: not every torch build exposes
        # ``torch.backends.mps`` or ``torch.mps.empty_cache``.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is None or not mps_backend.is_available():
            return
        empty_cache = getattr(getattr(torch, "mps", None), "empty_cache", None)
        if callable(empty_cache):
            empty_cache()
|
|
53
|
+
|
|
54
|
+
|
|
28
55
|
def select_device(
|
|
29
56
|
device: str | None,
|
|
30
57
|
*,
|
|
@@ -14,18 +14,28 @@ logger = logging.getLogger(__name__)
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class VideoDubber:
|
|
17
|
-
"""Dubs videos into different languages using the local pipeline.
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
"""Dubs videos into different languages using the local pipeline.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
|
|
21
|
+
low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
|
|
22
|
+
Chatterbox TTS) is unloaded from memory after it runs, so only one
|
|
23
|
+
model is resident at a time. Trades per-run latency (~10-30s of
|
|
24
|
+
extra model loads) for a much lower memory ceiling. Recommended for
|
|
25
|
+
GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(self, device: str | None = None, low_memory: bool = False):
|
|
20
29
|
self.device = device
|
|
30
|
+
self.low_memory = low_memory
|
|
21
31
|
self._local_pipeline: Any = None
|
|
22
32
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
23
|
-
logger.info("VideoDubber initialized with device=%s", requested)
|
|
33
|
+
logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
|
|
24
34
|
|
|
25
35
|
def _init_local_pipeline(self) -> None:
|
|
26
36
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
27
37
|
|
|
28
|
-
self._local_pipeline = LocalDubbingPipeline(device=self.device)
|
|
38
|
+
self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
|
|
29
39
|
|
|
30
40
|
def dub(
|
|
31
41
|
self,
|
|
@@ -15,12 +15,23 @@ logger = logging.getLogger(__name__)
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class LocalDubbingPipeline:
|
|
18
|
-
"""Local pipeline for video dubbing.
|
|
18
|
+
"""Local pipeline for video dubbing.
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
When ``low_memory=True``, each stage's model is unloaded after it runs, so
|
|
21
|
+
only one model is resident at a time. This trades per-run latency (models
|
|
22
|
+
re-load from disk between stages) for peak memory. Recommended for GPUs
|
|
23
|
+
with <=12GB VRAM or hosts with <32GB RAM.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
def __init__(self, device: str | None = None, low_memory: bool = False):
|
|
21
27
|
self.device = device
|
|
28
|
+
self.low_memory = low_memory
|
|
22
29
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
23
|
-
logger.info(
|
|
30
|
+
logger.info(
|
|
31
|
+
"LocalDubbingPipeline initialized with device=%s low_memory=%s",
|
|
32
|
+
requested,
|
|
33
|
+
low_memory,
|
|
34
|
+
)
|
|
24
35
|
|
|
25
36
|
self._transcriber: Any = None
|
|
26
37
|
self._transcriber_diarization: bool | None = None
|
|
@@ -31,6 +42,23 @@ class LocalDubbingPipeline:
|
|
|
31
42
|
self._separator: Any = None
|
|
32
43
|
self._synchronizer: TimingSynchronizer | None = None
|
|
33
44
|
|
|
45
|
+
def _maybe_unload(self, component_name: str) -> None:
    """Unload the named stage component if low-memory mode is active.

    Args:
        component_name: Attribute name on this pipeline holding the stage
            object (e.g. ``"_transcriber"``).

    Does nothing when ``low_memory`` is false, when the attribute is missing
    or ``None`` (the stage was never initialized), or when the component has
    no callable ``unload`` attribute.
    """
    stage = getattr(self, component_name, None) if self.low_memory else None
    if stage is None:
        return

    unload_fn = getattr(stage, "unload", None)
    if not callable(unload_fn):
        return

    # Log the public-looking name, without the private-attribute underscore.
    logger.info("low_memory: unloading %s", component_name.lstrip("_"))
    unload_fn()
|
|
61
|
+
|
|
34
62
|
def _init_transcriber(self, enable_diarization: bool = False) -> None:
|
|
35
63
|
"""Initialize the transcription model."""
|
|
36
64
|
from videopython.ai.understanding.audio import AudioToText
|
|
@@ -141,6 +169,7 @@ class LocalDubbingPipeline:
|
|
|
141
169
|
self._transcriber_diarization = enable_diarization
|
|
142
170
|
|
|
143
171
|
transcription = self._transcriber.transcribe(source_audio)
|
|
172
|
+
self._maybe_unload("_transcriber")
|
|
144
173
|
|
|
145
174
|
if not transcription.segments:
|
|
146
175
|
return DubbingResult(
|
|
@@ -162,6 +191,7 @@ class LocalDubbingPipeline:
|
|
|
162
191
|
self._init_separator()
|
|
163
192
|
|
|
164
193
|
separated_audio = self._separator.separate(source_audio)
|
|
194
|
+
self._maybe_unload("_separator")
|
|
165
195
|
vocal_audio = separated_audio.vocals
|
|
166
196
|
|
|
167
197
|
voice_samples: dict[str, Audio] = {}
|
|
@@ -178,6 +208,7 @@ class LocalDubbingPipeline:
|
|
|
178
208
|
target_lang=target_lang,
|
|
179
209
|
source_lang=detected_lang,
|
|
180
210
|
)
|
|
211
|
+
self._maybe_unload("_translator")
|
|
181
212
|
|
|
182
213
|
report_progress("Generating dubbed speech", 0.50)
|
|
183
214
|
if self._tts is None or self._tts_voice_clone != voice_clone or self._tts_language != target_lang:
|
|
@@ -208,6 +239,8 @@ class LocalDubbingPipeline:
|
|
|
208
239
|
target_durations.append(segment.duration)
|
|
209
240
|
start_times.append(segment.start)
|
|
210
241
|
|
|
242
|
+
self._maybe_unload("_tts")
|
|
243
|
+
|
|
211
244
|
report_progress("Synchronizing timing", 0.85)
|
|
212
245
|
if self._synchronizer is None:
|
|
213
246
|
self._init_synchronizer()
|
|
@@ -263,6 +296,7 @@ class LocalDubbingPipeline:
|
|
|
263
296
|
self._transcriber_diarization = False
|
|
264
297
|
|
|
265
298
|
transcription = self._transcriber.transcribe(source_audio)
|
|
299
|
+
self._maybe_unload("_transcriber")
|
|
266
300
|
|
|
267
301
|
separated_audio: SeparatedAudio | None = None
|
|
268
302
|
vocal_audio = source_audio
|
|
@@ -273,6 +307,7 @@ class LocalDubbingPipeline:
|
|
|
273
307
|
self._init_separator()
|
|
274
308
|
|
|
275
309
|
separated_audio = self._separator.separate(source_audio)
|
|
310
|
+
self._maybe_unload("_separator")
|
|
276
311
|
vocal_audio = separated_audio.vocals
|
|
277
312
|
|
|
278
313
|
report_progress("Extracting voice sample", 0.40)
|
|
@@ -295,6 +330,7 @@ class LocalDubbingPipeline:
|
|
|
295
330
|
|
|
296
331
|
generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
|
|
297
332
|
speech_duration = generated_speech.metadata.duration_seconds
|
|
333
|
+
self._maybe_unload("_tts")
|
|
298
334
|
|
|
299
335
|
report_progress("Assembling audio", 0.85)
|
|
300
336
|
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.base.audio import Audio, AudioMetadata
|
|
9
9
|
|
|
10
10
|
|
|
@@ -151,6 +151,16 @@ class TextToSpeech:
|
|
|
151
151
|
|
|
152
152
|
return self._generate_local(text, effective_voice)
|
|
153
153
|
|
|
154
|
+
def unload(self) -> None:
    """Drop the held TTS model objects and release cached device memory.

    Resets ``_model``, ``_processor`` and ``_chatterbox_model`` to ``None``
    and then calls ``release_device_memory`` for this instance's device.
    Used by low-memory dubbing to free VRAM between pipeline stages; the
    next ``generate_audio()`` is expected to re-initialize the model (the
    lazy-load path is outside this block).
    """
    for attr_name in ("_model", "_processor", "_chatterbox_model"):
        setattr(self, attr_name, None)
    release_device_memory(self.device)
|
|
163
|
+
|
|
154
164
|
|
|
155
165
|
class TextToMusic:
|
|
156
166
|
"""Generates music from text descriptions using MusicGen."""
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.ai.dubbing.models import TranslatedSegment
|
|
9
9
|
from videopython.base.text.transcription import TranscriptionSegment
|
|
10
10
|
|
|
@@ -180,6 +180,16 @@ class TextTranslator:
|
|
|
180
180
|
|
|
181
181
|
return translated_segments
|
|
182
182
|
|
|
183
|
+
def unload(self) -> None:
    """Drop the held translation model state and release cached device memory.

    Resets ``_model``, ``_tokenizer`` and ``_current_lang_pair`` to ``None``
    and then calls ``release_device_memory`` for this instance's device.
    Used by low-memory dubbing to free VRAM between pipeline stages; the
    next ``translate()`` is expected to re-initialize the model (the
    lazy-load path is outside this block).
    """
    for attr_name in ("_model", "_tokenizer", "_current_lang_pair"):
        setattr(self, attr_name, None)
    release_device_memory(self.device)
|
|
192
|
+
|
|
183
193
|
@staticmethod
|
|
184
194
|
def get_supported_languages() -> dict[str, str]:
|
|
185
195
|
return LANGUAGE_NAMES.copy()
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any, Literal
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.base.audio import Audio
|
|
9
9
|
from videopython.base.description import AudioClassification, AudioEvent
|
|
10
10
|
from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
|
|
@@ -51,6 +51,15 @@ class AudioToText:
|
|
|
51
51
|
self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
|
|
52
52
|
self._diarization_pipeline.to(torch.device(self.device))
|
|
53
53
|
|
|
54
|
+
def unload(self) -> None:
    """Drop the Whisper and diarization models and release cached device memory.

    Resets ``_model`` and ``_diarization_pipeline`` to ``None`` and then
    calls ``release_device_memory`` for this instance's device. Used by
    low-memory dubbing to free VRAM between pipeline stages; the next call
    is expected to re-initialize the models (the lazy-load path is outside
    this block).
    """
    # Chained assignment binds left to right: _model first, then the pipeline.
    self._model = self._diarization_pipeline = None
    release_device_memory(self.device)
|
|
62
|
+
|
|
54
63
|
def _process_transcription_result(self, transcription_result: dict) -> Transcription:
|
|
55
64
|
"""Process raw transcription result into a Transcription object."""
|
|
56
65
|
transcription_segments = []
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.ai.dubbing.models import SeparatedAudio
|
|
9
9
|
from videopython.base.audio import Audio, AudioMetadata
|
|
10
10
|
|
|
@@ -134,3 +134,11 @@ class AudioSeparator:
|
|
|
134
134
|
def extract_background(self, audio: Audio) -> Audio:
|
|
135
135
|
"""Convenience method to extract only background from audio."""
|
|
136
136
|
return self.separate(audio).background
|
|
137
|
+
|
|
138
|
+
def unload(self) -> None:
    """Drop the Demucs model reference and release cached device memory.

    Resets ``_model`` to ``None`` and then calls ``release_device_memory``
    for this instance's device. Used by low-memory dubbing to free VRAM
    between pipeline stages; the next ``separate()`` is expected to
    re-initialize the model (the lazy-load path is outside this block).
    """
    setattr(self, "_model", None)
    target_device = self.device
    release_device_memory(target_device)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|