videopython 0.26.0__tar.gz → 0.26.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {videopython-0.26.0 → videopython-0.26.2}/PKG-INFO +1 -1
  2. {videopython-0.26.0 → videopython-0.26.2}/pyproject.toml +1 -1
  3. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/_device.py +27 -0
  4. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/dubber.py +27 -6
  5. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/pipeline.py +58 -10
  6. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/audio.py +11 -1
  7. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/translation.py +23 -1
  8. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/audio.py +10 -1
  9. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/separation.py +9 -1
  10. {videopython-0.26.0 → videopython-0.26.2}/.gitignore +0 -0
  11. {videopython-0.26.0 → videopython-0.26.2}/LICENSE +0 -0
  12. {videopython-0.26.0 → videopython-0.26.2}/README.md +0 -0
  13. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/__init__.py +0 -0
  14. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/__init__.py +0 -0
  15. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/__init__.py +0 -0
  16. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/models.py +0 -0
  17. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/timing.py +0 -0
  18. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/__init__.py +0 -0
  19. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/image.py +0 -0
  30. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/temporal.py +0 -0
  31. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/video_analysis.py +0 -0
  32. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/__init__.py +0 -0
  33. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/audio/__init__.py +0 -0
  34. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/audio/analysis.py +0 -0
  35. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/audio/audio.py +0 -0
  36. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/combine.py +0 -0
  37. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/description.py +0 -0
  38. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/effects.py +0 -0
  39. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/exceptions.py +0 -0
  40. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/progress.py +0 -0
  41. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/registry.py +0 -0
  42. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/scene.py +0 -0
  43. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/streaming.py +0 -0
  44. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/text/__init__.py +0 -0
  45. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/text/overlay.py +0 -0
  46. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/text/transcription.py +0 -0
  47. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/transforms.py +0 -0
  48. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/transitions.py +0 -0
  49. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/utils.py +0 -0
  50. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/base/video.py +0 -0
  51. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/__init__.py +0 -0
  52. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/multicam.py +0 -0
  53. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/premiere_xml.py +0 -0
  54. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/editing/video_edit.py +0 -0
  55. {videopython-0.26.0 → videopython-0.26.2}/src/videopython/py.typed +0 -0
{videopython-0.26.0 → videopython-0.26.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.0
+Version: 0.26.2
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
{videopython-0.26.0 → videopython-0.26.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.0"
+version = "0.26.2"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
{videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/_device.py
@@ -25,6 +25,33 @@ def log_device_initialization(
     )
 
 
+def release_device_memory(device: str | None) -> None:
+    """Release cached allocator memory for the given device.
+
+    Safe to call when torch is not importable or the device is CPU/None.
+    """
+    try:
+        import torch
+    except ImportError:
+        return
+
+    import gc
+
+    gc.collect()
+
+    if device == "cuda" and torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        return
+
+    if device == "mps":
+        mps_backend = getattr(torch.backends, "mps", None)
+        if mps_backend is not None and mps_backend.is_available():
+            mps_mod = getattr(torch, "mps", None)
+            empty_cache = getattr(mps_mod, "empty_cache", None) if mps_mod is not None else None
+            if callable(empty_cache):
+                empty_cache()
+
+
 def select_device(
     device: str | None,
     *,
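
The components touched later in this diff pair this helper with dropping their model references. A minimal sketch of that pattern, assuming nothing beyond the release_device_memory helper added above (ExampleWrapper is a hypothetical stand-in, not a class from the package):

    from videopython.ai._device import release_device_memory

    class ExampleWrapper:
        """Hypothetical stand-in for TextToSpeech, TextTranslator, AudioSeparator, etc."""

        def __init__(self, device: str | None = None):
            self.device = device
            self._model = None  # loaded lazily on first use

        def unload(self) -> None:
            # Drop the last strong reference first, then release the cached
            # allocator memory (gc.collect + cuda/mps empty_cache).
            self._model = None
            release_device_memory(self.device)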
{videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/dubber.py
@@ -14,18 +14,28 @@ logger = logging.getLogger(__name__)
 
 
 class VideoDubber:
-    """Dubs videos into different languages using the local pipeline."""
+    """Dubs videos into different languages using the local pipeline.
 
-    def __init__(self, device: str | None = None):
+    Args:
+        device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
+        low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
+            Chatterbox TTS) is unloaded from memory after it runs, so only one
+            model is resident at a time. Trades per-run latency (~10-30s of
+            extra model loads) for a much lower memory ceiling. Recommended for
+            GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
+    """
+
+    def __init__(self, device: str | None = None, low_memory: bool = False):
         self.device = device
+        self.low_memory = low_memory
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info("VideoDubber initialized with device=%s", requested)
+        logger.info("VideoDubber initialized with device=%s low_memory=%s", requested, low_memory)
 
     def _init_local_pipeline(self) -> None:
        from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
 
-        self._local_pipeline = LocalDubbingPipeline(device=self.device)
+        self._local_pipeline = LocalDubbingPipeline(device=self.device, low_memory=self.low_memory)
 
     def dub(
         self,
@@ -36,12 +46,15 @@ class VideoDubber:
         voice_clone: bool = True,
         enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
+        transcription: Any = None,
     ) -> DubbingResult:
         """Dub a video into a target language.
 
         Args:
             enable_diarization: Enable speaker diarization to clone each speaker's
                 voice separately. Requires additional VRAM for the diarization model.
+            transcription: Optional pre-computed Transcription object. When provided,
+                the internal Whisper transcription step is skipped.
         """
         if self._local_pipeline is None:
             self._init_local_pipeline()
@@ -54,6 +67,7 @@ class VideoDubber:
             voice_clone=voice_clone,
             enable_diarization=enable_diarization,
             progress_callback=progress_callback,
+            transcription=transcription,
         )
 
     def dub_and_replace(
@@ -65,8 +79,14 @@ class VideoDubber:
         voice_clone: bool = True,
         enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
+        transcription: Any = None,
     ) -> Video:
-        """Dub a video and return a new video with the dubbed audio."""
+        """Dub a video and return a new video with the dubbed audio.
+
+        Args:
+            transcription: Optional pre-computed Transcription object. When provided,
+                the internal Whisper transcription step is skipped.
+        """
         result = self.dub(
             video=video,
             target_lang=target_lang,
@@ -75,6 +95,7 @@ class VideoDubber:
             voice_clone=voice_clone,
             enable_diarization=enable_diarization,
             progress_callback=progress_callback,
+            transcription=transcription,
         )
         return video.add_audio(result.dubbed_audio, overlay=False)
 
{videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/dubbing/pipeline.py
@@ -15,12 +15,23 @@ logger = logging.getLogger(__name__)
 
 
 class LocalDubbingPipeline:
-    """Local pipeline for video dubbing."""
+    """Local pipeline for video dubbing.
 
-    def __init__(self, device: str | None = None):
+    When ``low_memory=True``, each stage's model is unloaded after it runs, so
+    only one model is resident at a time. This trades per-run latency (models
+    re-load from disk between stages) for peak memory. Recommended for GPUs
+    with <=12GB VRAM or hosts with <32GB RAM.
+    """
+
+    def __init__(self, device: str | None = None, low_memory: bool = False):
         self.device = device
+        self.low_memory = low_memory
         requested = device.lower() if isinstance(device, str) else "auto"
-        logger.info("LocalDubbingPipeline initialized with device=%s", requested)
+        logger.info(
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s",
+            requested,
+            low_memory,
+        )
 
         self._transcriber: Any = None
         self._transcriber_diarization: bool | None = None
@@ -31,6 +42,23 @@ class LocalDubbingPipeline:
         self._separator: Any = None
         self._synchronizer: TimingSynchronizer | None = None
 
+    def _maybe_unload(self, component_name: str) -> None:
+        """Unload a stage's model when low_memory mode is enabled.
+
+        No-op when low_memory=False or the component was never initialized
+        (e.g. caller supplied a pre-computed transcription so the transcriber
+        was skipped).
+        """
+        if not self.low_memory:
+            return
+        component = getattr(self, component_name, None)
+        if component is None:
+            return
+        unload = getattr(component, "unload", None)
+        if callable(unload):
+            logger.info("low_memory: unloading %s", component_name.lstrip("_"))
+            unload()
+
     def _init_transcriber(self, enable_diarization: bool = False) -> None:
         """Initialize the transcription model."""
         from videopython.ai.understanding.audio import AudioToText
@@ -114,21 +142,34 @@
         voice_clone: bool = True,
         enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
+        transcription: Any | None = None,
     ) -> DubbingResult:
-        """Process a video through the local dubbing pipeline."""
+        """Process a video through the local dubbing pipeline.
+
+        Args:
+            transcription: Optional pre-computed Transcription object. When provided,
+                the internal Whisper transcription step is skipped (saving time and VRAM).
+                Must be a ``videopython.base.text.transcription.Transcription`` instance
+                with populated ``segments``.
+        """
         from videopython.base.audio import Audio
 
         def report_progress(stage: str, progress: float) -> None:
             if progress_callback:
                 progress_callback(stage, progress)
 
-        report_progress("Transcribing audio", 0.05)
-        if self._transcriber is None or self._transcriber_diarization != enable_diarization:
-            self._init_transcriber(enable_diarization=enable_diarization)
-            self._transcriber_diarization = enable_diarization
-
         source_audio = video.audio
-        transcription = self._transcriber.transcribe(source_audio)
+
+        if transcription is not None:
+            report_progress("Using provided transcription", 0.05)
+        else:
+            report_progress("Transcribing audio", 0.05)
+            if self._transcriber is None or self._transcriber_diarization != enable_diarization:
+                self._init_transcriber(enable_diarization=enable_diarization)
+                self._transcriber_diarization = enable_diarization
+
+            transcription = self._transcriber.transcribe(source_audio)
+            self._maybe_unload("_transcriber")
 
         if not transcription.segments:
             return DubbingResult(
@@ -150,6 +191,7 @@
             self._init_separator()
 
         separated_audio = self._separator.separate(source_audio)
+        self._maybe_unload("_separator")
         vocal_audio = separated_audio.vocals
 
         voice_samples: dict[str, Audio] = {}
@@ -166,6 +208,7 @@
                 target_lang=target_lang,
                 source_lang=detected_lang,
             )
+        self._maybe_unload("_translator")
 
         report_progress("Generating dubbed speech", 0.50)
         if self._tts is None or self._tts_voice_clone != voice_clone or self._tts_language != target_lang:
@@ -196,6 +239,8 @@
             target_durations.append(segment.duration)
             start_times.append(segment.start)
 
+        self._maybe_unload("_tts")
+
         report_progress("Synchronizing timing", 0.85)
         if self._synchronizer is None:
             self._init_synchronizer()
@@ -251,6 +296,7 @@
            self._transcriber_diarization = False
 
        transcription = self._transcriber.transcribe(source_audio)
+       self._maybe_unload("_transcriber")
 
        separated_audio: SeparatedAudio | None = None
        vocal_audio = source_audio
@@ -261,6 +307,7 @@
            self._init_separator()
 
        separated_audio = self._separator.separate(source_audio)
+       self._maybe_unload("_separator")
        vocal_audio = separated_audio.vocals
 
        report_progress("Extracting voice sample", 0.40)
@@ -283,6 +330,7 @@
 
        generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
        speech_duration = generated_speech.metadata.duration_seconds
+       self._maybe_unload("_tts")
 
        report_progress("Assembling audio", 0.85)
 
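The transcription parameter makes fan-out dubbing cheaper: transcribe once, then reuse the result across several target languages so Whisper never re-runs. A sketch continuing from the VideoDubber example above; transcribe() and unload() appear in these diffs, but the AudioToText constructor arguments are assumed:

    from videopython.ai.understanding.audio import AudioToText

    transcriber = AudioToText(device="cuda")  # constructor signature assumed
    transcription = transcriber.transcribe(video.audio)
    transcriber.unload()  # free Whisper before the dubbing stages load

    for lang in ("de", "fr", "es"):
        # The pipeline skips its internal Whisper step on every iteration;
        # only translation, TTS, and timing synchronization run per language.
        result = dubber.dub(video=video, target_lang=lang, transcription=transcription)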
{videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/audio.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.audio import Audio, AudioMetadata
 
 
@@ -151,6 +151,16 @@ class TextToSpeech:
 
         return self._generate_local(text, effective_voice)
 
+    def unload(self) -> None:
+        """Release the TTS model(s) so the next generate_audio() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        self._processor = None
+        self._chatterbox_model = None
+        release_device_memory(self.device)
+
 
 class TextToMusic:
     """Generates music from text descriptions using MusicGen."""
{videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/generation/translation.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.ai.dubbing.models import TranslatedSegment
 from videopython.base.text.transcription import TranscriptionSegment
 
@@ -48,6 +48,15 @@ LANGUAGE_NAMES = {
 class TextTranslator:
     """Translates text between languages using local seq2seq models."""
 
+    # Languages without a direct opus-mt-{src}-{tgt} model. Maps (source, target)
+    # to an alternative HuggingFace model identifier.
+    _MODEL_OVERRIDES: dict[tuple[str, str], str] = {
+        ("en", "pt"): "Helsinki-NLP/opus-mt-tc-big-en-pt",
+        ("en", "ko"): "Helsinki-NLP/opus-mt-tc-big-en-ko",
+        ("en", "ja"): "Helsinki-NLP/opus-mt-en-jap",
+        ("en", "pl"): "Helsinki-NLP/opus-mt-en-zlw",
+    }
+
     def __init__(self, model_name: str | None = None, device: str | None = None):
         self.model_name = model_name
         self.device = device
@@ -58,6 +67,9 @@ class TextTranslator:
     def _get_local_model_name(self, source_lang: str, target_lang: str) -> str:
         if self.model_name:
             return self.model_name
+        override = self._MODEL_OVERRIDES.get((source_lang, target_lang))
+        if override:
+            return override
         return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
 
     def _init_local(self, source_lang: str, target_lang: str) -> None:
@@ -168,6 +180,16 @@ class TextTranslator:
 
         return translated_segments
 
+    def unload(self) -> None:
+        """Release the translation model so the next translate() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        self._tokenizer = None
+        self._current_lang_pair = None
+        release_device_memory(self.device)
+
     @staticmethod
     def get_supported_languages() -> dict[str, str]:
         return LANGUAGE_NAMES.copy()
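
The override table changes which checkpoint _get_local_model_name resolves for these four pairs; every other pair still falls through to the default opus-mt pattern. A standalone sketch of the resolution order (resolve_model and OVERRIDES are illustrative names, not package API):

    # Explicit model_name wins, then the override table, then the default pattern.
    OVERRIDES = {
        ("en", "ja"): "Helsinki-NLP/opus-mt-en-jap",
        ("en", "pl"): "Helsinki-NLP/opus-mt-en-zlw",
    }

    def resolve_model(src: str, tgt: str, explicit: str | None = None) -> str:
        if explicit:
            return explicit
        return OVERRIDES.get((src, tgt), f"Helsinki-NLP/opus-mt-{src}-{tgt}")

    assert resolve_model("en", "ja") == "Helsinki-NLP/opus-mt-en-jap"
    assert resolve_model("en", "de") == "Helsinki-NLP/opus-mt-en-de"  # no override needed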
{videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/audio.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any, Literal
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.audio import Audio
 from videopython.base.description import AudioClassification, AudioEvent
 from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
@@ -51,6 +51,15 @@ class AudioToText:
         self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
         self._diarization_pipeline.to(torch.device(self.device))
 
+    def unload(self) -> None:
+        """Release the Whisper and diarization models so the next call re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        self._diarization_pipeline = None
+        release_device_memory(self.device)
+
     def _process_transcription_result(self, transcription_result: dict) -> Transcription:
         """Process raw transcription result into a Transcription object."""
         transcription_segments = []
{videopython-0.26.0 → videopython-0.26.2}/src/videopython/ai/understanding/separation.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any
 
-from videopython.ai._device import log_device_initialization, select_device
+from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.ai.dubbing.models import SeparatedAudio
 from videopython.base.audio import Audio, AudioMetadata
 
@@ -134,3 +134,11 @@ class AudioSeparator:
     def extract_background(self, audio: Audio) -> Audio:
         """Convenience method to extract only background from audio."""
         return self.separate(audio).background
+
+    def unload(self) -> None:
+        """Release the Demucs model so the next separate() re-initializes.
+
+        Used by low-memory dubbing to free VRAM between pipeline stages.
+        """
+        self._model = None
+        release_device_memory(self.device)