videopython 0.26.1__tar.gz → 0.26.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.1 → videopython-0.26.3}/PKG-INFO +1 -1
- {videopython-0.26.1 → videopython-0.26.3}/pyproject.toml +1 -1
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/_device.py +27 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/dubbing/dubber.py +97 -7
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/dubbing/pipeline.py +52 -13
- videopython-0.26.3/src/videopython/ai/dubbing/remux.py +73 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/generation/audio.py +11 -1
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/generation/translation.py +11 -1
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/understanding/audio.py +10 -1
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/understanding/separation.py +9 -1
- {videopython-0.26.1 → videopython-0.26.3}/.gitignore +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/LICENSE +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/README.md +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/description.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/base/video.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.1 → videopython-0.26.3}/src/videopython/py.typed +0 -0
|
@@ -25,6 +25,33 @@ def log_device_initialization(
|
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
def release_device_memory(device: str | None) -> None:
    """Release cached allocator memory for the given device.

    Runs a garbage-collection pass, then asks the matching torch backend
    (CUDA or MPS) to drop its cached memory. The device name is matched
    case-insensitively and any ``:<index>`` suffix is ignored, so ``"cuda"``,
    ``"CUDA"`` and ``"cuda:0"`` are all treated as CUDA.

    Safe to call when torch is not importable or the device is CPU/None.

    Args:
        device: Device name such as ``"cpu"``, ``"cuda"``, ``"cuda:1"`` or
            ``"mps"``. ``None`` (or any non-string) only triggers the
            garbage-collection pass.
    """
    try:
        import torch
    except ImportError:
        return

    import gc

    # Collect first so unreachable objects are gone before the cache is emptied.
    gc.collect()

    if not isinstance(device, str):
        return

    # "cuda:0" / "CUDA" -> "cuda"; the cache calls below take no device argument.
    backend = device.strip().lower().partition(":")[0]

    if backend == "cuda":
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return

    if backend == "mps":
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            # Looked up defensively in case this torch build does not expose torch.mps.empty_cache.
            empty_cache = getattr(getattr(torch, "mps", None), "empty_cache", None)
            if callable(empty_cache):
                empty_cache()
|
|
53
|
+
|
|
54
|
+
|
|
28
55
|
def select_device(
|
|
29
56
|
device: str | None,
|
|
30
57
|
*,
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
6
8
|
from typing import TYPE_CHECKING, Any, Callable
|
|
7
9
|
|
|
8
10
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
|
|
@@ -14,18 +16,28 @@ logger = logging.getLogger(__name__)
|
|
|
14
16
|
|
|
15
17
|
|
|
16
18
|
class VideoDubber:
|
|
17
|
-
"""Dubs videos into different languages using the local pipeline.
|
|
18
|
-
|
|
19
|
-
|
|
19
|
+
"""Dubs videos into different languages using the local pipeline.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
|
|
23
|
+
low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
|
|
24
|
+
Chatterbox TTS) is unloaded from memory after it runs, so only one
|
|
25
|
+
model is resident at a time. Trades per-run latency (~10-30s of
|
|
26
|
+
extra model loads) for a much lower memory ceiling. Recommended for
|
|
27
|
+
GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def __init__(self, device: str | None = None, low_memory: bool = False):
|
|
20
31
|
self.device = device
|
|
32
|
+
self.low_memory = low_memory
|
|
21
33
|
self._local_pipeline: Any = None
|
|
22
34
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
23
|
-
logger.info("VideoDubber initialized with device=%s", requested)
|
|
35
|
+
logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
|
|
24
36
|
|
|
25
37
|
def _init_local_pipeline(self) -> None:
|
|
26
38
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
27
39
|
|
|
28
|
-
self._local_pipeline = LocalDubbingPipeline(device=self.device)
|
|
40
|
+
self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
|
|
29
41
|
|
|
30
42
|
def dub(
|
|
31
43
|
self,
|
|
@@ -50,7 +62,7 @@ class VideoDubber:
|
|
|
50
62
|
self._init_local_pipeline()
|
|
51
63
|
|
|
52
64
|
return self._local_pipeline.process(
|
|
53
|
-
|
|
65
|
+
source_audio=video.audio,
|
|
54
66
|
target_lang=target_lang,
|
|
55
67
|
source_lang=source_lang,
|
|
56
68
|
preserve_background=preserve_background,
|
|
@@ -89,6 +101,84 @@ class VideoDubber:
|
|
|
89
101
|
)
|
|
90
102
|
return video.add_audio(result.dubbed_audio, overlay=False)
|
|
91
103
|
|
|
104
|
+
def dub_file(
    self,
    input_path: str | Path,
    output_path: str | Path,
    target_lang: str,
    source_lang: str | None = None,
    preserve_background: bool = True,
    voice_clone: bool = True,
    enable_diarization: bool = False,
    progress_callback: Callable[[str, float], None] | None = None,
    transcription: Any = None,
) -> DubbingResult:
    """Dub a video file on disk without loading video frames into memory.

    Loads only the audio track from ``input_path``, runs the dubbing pipeline
    on it, writes the dubbed audio to a temporary WAV file and hands that file
    to ``replace_audio_stream`` to produce ``output_path``. The temporary WAV
    is always removed afterwards.

    ``output_path`` may name the same file as ``input_path``: in that case the
    result is first written next to the destination and then moved over it,
    because ffmpeg cannot safely write to a file it is still reading.

    Use this instead of ``dub_and_replace`` when the source video is long
    or high-resolution and you don't need frame-level access in Python.

    Args:
        input_path: Path to the source video file.
        output_path: Path to write the dubbed video. Overwritten if it exists.
        target_lang: Target language code (e.g. ``"es"``, ``"fr"``).
        source_lang: Source language code, or ``None`` to auto-detect.
        preserve_background: Preserve background music/effects via source separation.
        voice_clone: Clone the source speaker's voice for the dubbed track.
        enable_diarization: Enable speaker diarization for per-speaker voice cloning.
        progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
        transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.

    Returns:
        The ``DubbingResult`` produced by the pipeline. The output video is
        written to ``output_path`` as a side effect.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    # Validate the cheap precondition before importing the pipeline modules.
    if not input_path.exists():
        raise FileNotFoundError(f"Input video not found: {input_path}")

    from videopython.ai.dubbing.remux import replace_audio_stream
    from videopython.base.audio import Audio

    logger.info("dub_file: loading audio from %s", input_path)
    source_audio = Audio.from_path(input_path)

    if self._local_pipeline is None:
        self._init_local_pipeline()

    result = self._local_pipeline.process(
        source_audio=source_audio,
        target_lang=target_lang,
        source_lang=source_lang,
        preserve_background=preserve_background,
        voice_clone=voice_clone,
        enable_diarization=enable_diarization,
        progress_callback=progress_callback,
        transcription=transcription,
    )

    # Reserve a temp name only; the handle is closed so other writers can use the path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        dubbed_audio_path = Path(tmp.name)

    staged_output: Path | None = None
    try:
        result.dubbed_audio.save(dubbed_audio_path)

        # samefile() also catches aliases (relative vs. absolute spelling, links)
        # that a plain path comparison would miss.
        if output_path.exists() and output_path.samefile(input_path):
            # Keep the destination's suffix on the staging file.
            with tempfile.NamedTemporaryFile(
                dir=output_path.parent, suffix=output_path.suffix, delete=False
            ) as staged:
                staged_output = Path(staged.name)

        replace_audio_stream(
            video_path=input_path,
            audio_path=dubbed_audio_path,
            output_path=output_path if staged_output is None else staged_output,
        )

        if staged_output is not None:
            # Swap the finished file over the source only after ffmpeg succeeded.
            staged_output.replace(output_path)
            staged_output = None
    finally:
        dubbed_audio_path.unlink(missing_ok=True)
        if staged_output is not None:
            staged_output.unlink(missing_ok=True)

    return result
|
|
181
|
+
|
|
92
182
|
def revoice(
|
|
93
183
|
self,
|
|
94
184
|
video: Video,
|
|
@@ -101,7 +191,7 @@ class VideoDubber:
|
|
|
101
191
|
self._init_local_pipeline()
|
|
102
192
|
|
|
103
193
|
return self._local_pipeline.revoice(
|
|
104
|
-
|
|
194
|
+
source_audio=video.audio,
|
|
105
195
|
text=text,
|
|
106
196
|
preserve_background=preserve_background,
|
|
107
197
|
progress_callback=progress_callback,
|
|
@@ -9,18 +9,29 @@ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, Separate
|
|
|
9
9
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
|
-
from videopython.base.
|
|
12
|
+
from videopython.base.audio import Audio
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class LocalDubbingPipeline:
|
|
18
|
-
"""Local pipeline for video dubbing.
|
|
18
|
+
"""Local pipeline for video dubbing.
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
When ``low_memory=True``, each stage's model is unloaded after it runs, so
|
|
21
|
+
only one model is resident at a time. This trades per-run latency (models
|
|
22
|
+
re-load from disk between stages) for peak memory. Recommended for GPUs
|
|
23
|
+
with <=12GB VRAM or hosts with <32GB RAM.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
def __init__(self, device: str | None = None, low_memory: bool = False):
|
|
21
27
|
self.device = device
|
|
28
|
+
self.low_memory = low_memory
|
|
22
29
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
23
|
-
logger.info(
|
|
30
|
+
logger.info(
|
|
31
|
+
"LocalDubbingPipeline initialized with device=%s low_memory=%s",
|
|
32
|
+
requested,
|
|
33
|
+
low_memory,
|
|
34
|
+
)
|
|
24
35
|
|
|
25
36
|
self._transcriber: Any = None
|
|
26
37
|
self._transcriber_diarization: bool | None = None
|
|
@@ -31,6 +42,23 @@ class LocalDubbingPipeline:
|
|
|
31
42
|
self._separator: Any = None
|
|
32
43
|
self._synchronizer: TimingSynchronizer | None = None
|
|
33
44
|
|
|
45
|
+
def _maybe_unload(self, component_name: str) -> None:
    """Call ``unload()`` on a pipeline component when low-memory mode is on.

    Does nothing when ``self.low_memory`` is falsy, when the attribute named
    ``component_name`` is missing or ``None`` (the stage never ran), or when
    the component has no callable ``unload`` attribute.

    Args:
        component_name: Attribute name of the component on this pipeline,
            e.g. ``"_transcriber"``.
    """
    stage = getattr(self, component_name, None) if self.low_memory else None
    release = None if stage is None else getattr(stage, "unload", None)
    if not callable(release):
        return

    # Leading underscores are stripped only for the log message.
    logger.info("low_memory: unloading %s", component_name.lstrip("_"))
    release()
|
|
61
|
+
|
|
34
62
|
def _init_transcriber(self, enable_diarization: bool = False) -> None:
|
|
35
63
|
"""Initialize the transcription model."""
|
|
36
64
|
from videopython.ai.understanding.audio import AudioToText
|
|
@@ -74,7 +102,6 @@ class LocalDubbingPipeline:
|
|
|
74
102
|
max_duration: float = 10.0,
|
|
75
103
|
) -> dict[str, Any]:
|
|
76
104
|
"""Extract voice samples for each speaker from the audio."""
|
|
77
|
-
from videopython.base.audio import Audio
|
|
78
105
|
|
|
79
106
|
voice_samples: dict[str, Audio] = {}
|
|
80
107
|
|
|
@@ -107,7 +134,7 @@ class LocalDubbingPipeline:
|
|
|
107
134
|
|
|
108
135
|
def process(
|
|
109
136
|
self,
|
|
110
|
-
|
|
137
|
+
source_audio: Audio,
|
|
111
138
|
target_lang: str,
|
|
112
139
|
source_lang: str | None = None,
|
|
113
140
|
preserve_background: bool = True,
|
|
@@ -116,22 +143,22 @@ class LocalDubbingPipeline:
|
|
|
116
143
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
117
144
|
transcription: Any | None = None,
|
|
118
145
|
) -> DubbingResult:
|
|
119
|
-
"""
|
|
146
|
+
"""Run the dubbing pipeline against the given source audio.
|
|
120
147
|
|
|
121
148
|
Args:
|
|
149
|
+
source_audio: Source audio track to dub. Callers with a ``Video``
|
|
150
|
+
object should pass ``video.audio``; callers with only a file path
|
|
151
|
+
can use ``Audio.from_path(path)`` to avoid loading video frames.
|
|
122
152
|
transcription: Optional pre-computed Transcription object. When provided,
|
|
123
153
|
the internal Whisper transcription step is skipped (saving time and VRAM).
|
|
124
154
|
Must be a ``videopython.base.text.transcription.Transcription`` instance
|
|
125
155
|
with populated ``segments``.
|
|
126
156
|
"""
|
|
127
|
-
from videopython.base.audio import Audio
|
|
128
157
|
|
|
129
158
|
def report_progress(stage: str, progress: float) -> None:
|
|
130
159
|
if progress_callback:
|
|
131
160
|
progress_callback(stage, progress)
|
|
132
161
|
|
|
133
|
-
source_audio = video.audio
|
|
134
|
-
|
|
135
162
|
if transcription is not None:
|
|
136
163
|
report_progress("Using provided transcription", 0.05)
|
|
137
164
|
else:
|
|
@@ -141,6 +168,7 @@ class LocalDubbingPipeline:
|
|
|
141
168
|
self._transcriber_diarization = enable_diarization
|
|
142
169
|
|
|
143
170
|
transcription = self._transcriber.transcribe(source_audio)
|
|
171
|
+
self._maybe_unload("_transcriber")
|
|
144
172
|
|
|
145
173
|
if not transcription.segments:
|
|
146
174
|
return DubbingResult(
|
|
@@ -162,6 +190,7 @@ class LocalDubbingPipeline:
|
|
|
162
190
|
self._init_separator()
|
|
163
191
|
|
|
164
192
|
separated_audio = self._separator.separate(source_audio)
|
|
193
|
+
self._maybe_unload("_separator")
|
|
165
194
|
vocal_audio = separated_audio.vocals
|
|
166
195
|
|
|
167
196
|
voice_samples: dict[str, Audio] = {}
|
|
@@ -178,6 +207,7 @@ class LocalDubbingPipeline:
|
|
|
178
207
|
target_lang=target_lang,
|
|
179
208
|
source_lang=detected_lang,
|
|
180
209
|
)
|
|
210
|
+
self._maybe_unload("_translator")
|
|
181
211
|
|
|
182
212
|
report_progress("Generating dubbed speech", 0.50)
|
|
183
213
|
if self._tts is None or self._tts_voice_clone != voice_clone or self._tts_language != target_lang:
|
|
@@ -208,6 +238,8 @@ class LocalDubbingPipeline:
|
|
|
208
238
|
target_durations.append(segment.duration)
|
|
209
239
|
start_times.append(segment.start)
|
|
210
240
|
|
|
241
|
+
self._maybe_unload("_tts")
|
|
242
|
+
|
|
211
243
|
report_progress("Synchronizing timing", 0.85)
|
|
212
244
|
if self._synchronizer is None:
|
|
213
245
|
self._init_synchronizer()
|
|
@@ -242,19 +274,23 @@ class LocalDubbingPipeline:
|
|
|
242
274
|
|
|
243
275
|
def revoice(
|
|
244
276
|
self,
|
|
245
|
-
|
|
277
|
+
source_audio: Audio,
|
|
246
278
|
text: str,
|
|
247
279
|
preserve_background: bool = True,
|
|
248
280
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
249
281
|
) -> RevoiceResult:
|
|
250
|
-
"""Replace speech in
|
|
282
|
+
"""Replace speech in audio with new text using voice cloning.
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
source_audio: Source audio track to revoice. Callers with a ``Video``
|
|
286
|
+
object should pass ``video.audio``.
|
|
287
|
+
"""
|
|
251
288
|
from videopython.base.audio import Audio
|
|
252
289
|
|
|
253
290
|
def report_progress(stage: str, progress: float) -> None:
|
|
254
291
|
if progress_callback:
|
|
255
292
|
progress_callback(stage, progress)
|
|
256
293
|
|
|
257
|
-
source_audio = video.audio
|
|
258
294
|
original_duration = source_audio.metadata.duration_seconds
|
|
259
295
|
|
|
260
296
|
report_progress("Analyzing audio", 0.05)
|
|
@@ -263,6 +299,7 @@ class LocalDubbingPipeline:
|
|
|
263
299
|
self._transcriber_diarization = False
|
|
264
300
|
|
|
265
301
|
transcription = self._transcriber.transcribe(source_audio)
|
|
302
|
+
self._maybe_unload("_transcriber")
|
|
266
303
|
|
|
267
304
|
separated_audio: SeparatedAudio | None = None
|
|
268
305
|
vocal_audio = source_audio
|
|
@@ -273,6 +310,7 @@ class LocalDubbingPipeline:
|
|
|
273
310
|
self._init_separator()
|
|
274
311
|
|
|
275
312
|
separated_audio = self._separator.separate(source_audio)
|
|
313
|
+
self._maybe_unload("_separator")
|
|
276
314
|
vocal_audio = separated_audio.vocals
|
|
277
315
|
|
|
278
316
|
report_progress("Extracting voice sample", 0.40)
|
|
@@ -295,6 +333,7 @@ class LocalDubbingPipeline:
|
|
|
295
333
|
|
|
296
334
|
generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
|
|
297
335
|
speech_duration = generated_speech.metadata.duration_seconds
|
|
336
|
+
self._maybe_unload("_tts")
|
|
298
337
|
|
|
299
338
|
report_progress("Assembling audio", 0.85)
|
|
300
339
|
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""ffmpeg helper for replacing a video file's audio track without re-encoding video."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RemuxError(RuntimeError):
    """Raised when ffmpeg fails while replacing a video's audio stream."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def replace_audio_stream(
    video_path: str | Path,
    audio_path: str | Path,
    output_path: str | Path,
    audio_codec: str = "aac",
    audio_bitrate: str = "192k",
) -> None:
    """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.

    Uses ffmpeg stream-copy for video (no re-encode) and encodes the audio with
    ``audio_codec``. ``-shortest`` trims to the shorter of the two streams so
    the output duration matches the source video when the dubbed audio is
    slightly longer.

    If ``output_path`` names the same file as one of the inputs, ffmpeg writes
    to a temporary sibling file that is moved over ``output_path`` only after
    a successful run. ffmpeg cannot safely write to a file it is still reading.

    Args:
        video_path: Source video file (video stream is copied unchanged).
        audio_path: Audio file to use as the new audio track.
        output_path: Destination file. Overwritten if it exists.
        audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).

    Raises:
        FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
        RemuxError: If ffmpeg returns a non-zero exit code.
    """
    import tempfile

    video_path = Path(video_path)
    audio_path = Path(audio_path)
    output_path = Path(output_path)

    if not video_path.exists():
        raise FileNotFoundError(f"Video file not found: {video_path}")
    if not audio_path.exists():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    # samefile() also catches aliases (relative vs. absolute spelling, links)
    # that a plain path comparison would miss.
    clobbers_input = output_path.exists() and (
        output_path.samefile(video_path) or output_path.samefile(audio_path)
    )
    if clobbers_input:
        # Keep the destination's suffix on the staging file.
        with tempfile.NamedTemporaryFile(
            dir=output_path.parent, suffix=output_path.suffix, delete=False
        ) as staged:
            target_path = Path(staged.name)
    else:
        target_path = output_path

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(video_path),
        "-i",
        str(audio_path),
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-c:v",
        "copy",
        "-c:a",
        audio_codec,
        "-b:a",
        audio_bitrate,
        "-shortest",
        str(target_path),
    ]

    logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
    try:
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
        if clobbers_input:
            # Only swap the finished file into place after ffmpeg succeeded.
            target_path.replace(output_path)
    finally:
        if clobbers_input:
            # No-op after a successful replace; removes the staging file on failure.
            target_path.unlink(missing_ok=True)
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.base.audio import Audio, AudioMetadata
|
|
9
9
|
|
|
10
10
|
|
|
@@ -151,6 +151,16 @@ class TextToSpeech:
|
|
|
151
151
|
|
|
152
152
|
return self._generate_local(text, effective_voice)
|
|
153
153
|
|
|
154
|
+
def unload(self) -> None:
    """Drop this object's references to the loaded TTS model(s).

    Clears ``_model``, ``_processor`` and ``_chatterbox_model``, then releases
    cached device memory for ``self.device``. Used by low-memory dubbing to
    free VRAM between pipeline stages; the models are expected to be
    re-initialized by the next ``generate_audio()`` call (reload path not
    shown here).
    """
    for attr_name in ("_model", "_processor", "_chatterbox_model"):
        setattr(self, attr_name, None)
    release_device_memory(self.device)
|
|
163
|
+
|
|
154
164
|
|
|
155
165
|
class TextToMusic:
|
|
156
166
|
"""Generates music from text descriptions using MusicGen."""
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.ai.dubbing.models import TranslatedSegment
|
|
9
9
|
from videopython.base.text.transcription import TranscriptionSegment
|
|
10
10
|
|
|
@@ -180,6 +180,16 @@ class TextTranslator:
|
|
|
180
180
|
|
|
181
181
|
return translated_segments
|
|
182
182
|
|
|
183
|
+
def unload(self) -> None:
    """Drop this object's references to the translation model and tokenizer.

    Also resets ``_current_lang_pair`` and releases cached device memory for
    ``self.device``. Used by low-memory dubbing to free VRAM between pipeline
    stages; the model is expected to be re-initialized by the next
    ``translate()`` call (reload path not shown here).
    """
    self._model = self._tokenizer = self._current_lang_pair = None
    release_device_memory(self.device)
|
|
192
|
+
|
|
183
193
|
@staticmethod
|
|
184
194
|
def get_supported_languages() -> dict[str, str]:
|
|
185
195
|
return LANGUAGE_NAMES.copy()
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any, Literal
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.base.audio import Audio
|
|
9
9
|
from videopython.base.description import AudioClassification, AudioEvent
|
|
10
10
|
from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
|
|
@@ -51,6 +51,15 @@ class AudioToText:
|
|
|
51
51
|
self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
|
|
52
52
|
self._diarization_pipeline.to(torch.device(self.device))
|
|
53
53
|
|
|
54
|
+
def unload(self) -> None:
    """Drop this object's references to the Whisper and diarization models.

    Clears ``_model`` and ``_diarization_pipeline``, then releases cached
    device memory for ``self.device``. Used by low-memory dubbing to free VRAM
    between pipeline stages; the models are expected to be re-initialized on
    the next call (reload path not shown here).
    """
    for attr_name in ("_model", "_diarization_pipeline"):
        setattr(self, attr_name, None)
    release_device_memory(self.device)
|
|
62
|
+
|
|
54
63
|
def _process_transcription_result(self, transcription_result: dict) -> Transcription:
|
|
55
64
|
"""Process raw transcription result into a Transcription object."""
|
|
56
65
|
transcription_segments = []
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
|
-
from videopython.ai._device import log_device_initialization, select_device
|
|
7
|
+
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.ai.dubbing.models import SeparatedAudio
|
|
9
9
|
from videopython.base.audio import Audio, AudioMetadata
|
|
10
10
|
|
|
@@ -134,3 +134,11 @@ class AudioSeparator:
|
|
|
134
134
|
def extract_background(self, audio: Audio) -> Audio:
|
|
135
135
|
"""Convenience method to extract only background from audio."""
|
|
136
136
|
return self.separate(audio).background
|
|
137
|
+
|
|
138
|
+
def unload(self) -> None:
    """Drop this object's reference to the Demucs model.

    Clears ``_model`` and releases cached device memory for ``self.device``.
    Used by low-memory dubbing to free VRAM between pipeline stages; the model
    is expected to be re-initialized by the next ``separate()`` call (reload
    path not shown here).
    """
    setattr(self, "_model", None)
    release_device_memory(self.device)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|