videopython 0.29.1__tar.gz → 0.30.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.29.1 → videopython-0.30.0}/PKG-INFO +2 -3
- {videopython-0.29.1 → videopython-0.30.0}/README.md +1 -2
- {videopython-0.29.1 → videopython-0.30.0}/pyproject.toml +1 -1
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/__init__.py +0 -3
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/__init__.py +0 -3
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/dubber.py +1 -12
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/models.py +1 -2
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/pipeline.py +16 -148
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/transforms.py +10 -8
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/temporal.py +1 -17
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/video_analysis.py +12 -12
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/__init__.py +0 -5
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/effects.py +17 -14
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/registry.py +0 -9
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/streaming.py +7 -4
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/text/overlay.py +6 -3
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/transforms.py +15 -12
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/video.py +2 -2
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/editing/__init__.py +0 -2
- videopython-0.29.1/src/videopython/ai/dubbing/cache.py +0 -325
- videopython-0.29.1/src/videopython/ai/swapping/__init__.py +0 -46
- videopython-0.29.1/src/videopython/ai/swapping/inpainter.py +0 -264
- videopython-0.29.1/src/videopython/ai/swapping/models.py +0 -221
- videopython-0.29.1/src/videopython/ai/swapping/segmenter.py +0 -577
- videopython-0.29.1/src/videopython/ai/swapping/swapper.py +0 -524
- videopython-0.29.1/src/videopython/base/combine.py +0 -61
- videopython-0.29.1/src/videopython/base/progress.py +0 -63
- videopython-0.29.1/src/videopython/base/utils.py +0 -6
- videopython-0.29.1/src/videopython/editing/premiere_xml.py +0 -313
- {videopython-0.29.1 → videopython-0.30.0}/.gitignore +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/LICENSE +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/__init__.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/registry.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/description.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/transitions.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.29.1 → videopython-0.30.0}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.29.1
|
|
3
|
+
Version: 0.30.0
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -201,9 +201,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
|
|
|
201
201
|
| **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
|
|
202
202
|
| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
|
|
203
203
|
| **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
|
|
204
|
-
| **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
|
|
205
204
|
|
|
206
|
-
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
205
|
+
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
207
206
|
|
|
208
207
|
## Examples
|
|
209
208
|
|
|
@@ -152,9 +152,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
|
|
|
152
152
|
| **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
|
|
153
153
|
| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
|
|
154
154
|
| **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
|
|
155
|
-
| **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
|
|
156
155
|
|
|
157
|
-
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
156
|
+
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
158
157
|
|
|
159
158
|
## Examples
|
|
160
159
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from videopython.ai import registry as _ai_registry # noqa: F401
|
|
2
2
|
|
|
3
3
|
from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
|
|
4
|
-
from .swapping import ObjectSwapper
|
|
5
4
|
from .transforms import FaceTrackingCrop, SplitScreenComposite
|
|
6
5
|
from .understanding import (
|
|
7
6
|
AudioClassifier,
|
|
@@ -28,8 +27,6 @@ __all__ = [
|
|
|
28
27
|
# Transforms (AI-powered)
|
|
29
28
|
"FaceTrackingCrop",
|
|
30
29
|
"SplitScreenComposite",
|
|
31
|
-
# Swapping
|
|
32
|
-
"ObjectSwapper",
|
|
33
30
|
# Video analysis
|
|
34
31
|
"VideoAnalysis",
|
|
35
32
|
"VideoAnalysisConfig",
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
"""Local video dubbing functionality."""
|
|
2
2
|
|
|
3
|
-
from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
|
|
4
3
|
from videopython.ai.dubbing.dubber import VideoDubber
|
|
5
4
|
from videopython.ai.dubbing.models import (
|
|
6
5
|
DubbingResult,
|
|
@@ -26,7 +25,5 @@ __all__ = [
|
|
|
26
25
|
"TranscriptQuality",
|
|
27
26
|
"assess_transcript",
|
|
28
27
|
"UnsupportedLanguageError",
|
|
29
|
-
"DubCache",
|
|
30
|
-
"dub_cache_clear",
|
|
31
28
|
"Expressiveness",
|
|
32
29
|
]
|
|
@@ -55,13 +55,6 @@ class VideoDubber:
|
|
|
55
55
|
See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
|
|
56
56
|
for tradeoffs (Qwen3 is slower on CPU but produces
|
|
57
57
|
context-aware, length-budgeted output).
|
|
58
|
-
cache_dir: When set, persist transcription, translated segments,
|
|
59
|
-
and per-segment TTS WAVs under this directory and skip stages
|
|
60
|
-
whose inputs already match a cache entry. Use to resume crashed
|
|
61
|
-
long runs or to iterate on dub configuration without paying
|
|
62
|
-
transcription cost each time. ``None`` (default) disables
|
|
63
|
-
caching. Cache grows unbounded; clear via
|
|
64
|
-
:func:`videopython.ai.dubbing.cache.dub_cache_clear`.
|
|
65
58
|
"""
|
|
66
59
|
|
|
67
60
|
def __init__(
|
|
@@ -75,7 +68,6 @@ class VideoDubber:
|
|
|
75
68
|
vocabulary: list[str] | None = None,
|
|
76
69
|
strict_quality: bool = False,
|
|
77
70
|
translator: TranslatorChoice = "auto",
|
|
78
|
-
cache_dir: str | Path | None = None,
|
|
79
71
|
):
|
|
80
72
|
self.device = device
|
|
81
73
|
self.low_memory = low_memory
|
|
@@ -86,16 +78,14 @@ class VideoDubber:
|
|
|
86
78
|
self.vocabulary = vocabulary
|
|
87
79
|
self.strict_quality = strict_quality
|
|
88
80
|
self.translator = translator
|
|
89
|
-
self.cache_dir = cache_dir
|
|
90
81
|
self._local_pipeline: Any = None
|
|
91
82
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
92
83
|
logger.info(
|
|
93
|
-
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
|
|
84
|
+
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
94
85
|
requested,
|
|
95
86
|
low_memory,
|
|
96
87
|
whisper_model,
|
|
97
88
|
translator,
|
|
98
|
-
cache_dir,
|
|
99
89
|
)
|
|
100
90
|
|
|
101
91
|
def _init_local_pipeline(self) -> None:
|
|
@@ -111,7 +101,6 @@ class VideoDubber:
|
|
|
111
101
|
vocabulary=self.vocabulary,
|
|
112
102
|
strict_quality=self.strict_quality,
|
|
113
103
|
translator=self.translator,
|
|
114
|
-
cache_dir=self.cache_dir,
|
|
115
104
|
)
|
|
116
105
|
|
|
117
106
|
def dub(
|
|
@@ -41,8 +41,7 @@ class Expressiveness:
|
|
|
41
41
|
def as_kwargs(self) -> dict[str, float]:
|
|
42
42
|
"""Knobs as a dict, dropping ``None`` entries.
|
|
43
43
|
|
|
44
|
-
Suitable for ``**``-expansion into Chatterbox
|
|
45
|
-
:meth:`DubCache.tts_key`.
|
|
44
|
+
Suitable for ``**``-expansion into Chatterbox.
|
|
46
45
|
"""
|
|
47
46
|
return {
|
|
48
47
|
name: value
|
|
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
12
|
from videopython.ai._device import select_device
|
|
13
|
-
from videopython.ai.dubbing.cache import DubCache
|
|
14
13
|
from videopython.ai.dubbing.models import DubbingResult, Expressiveness, RevoiceResult, SeparatedAudio, TimingSummary
|
|
15
14
|
from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
|
|
16
15
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
@@ -173,7 +172,6 @@ class LocalDubbingPipeline:
|
|
|
173
172
|
vocabulary: list[str] | None = None,
|
|
174
173
|
strict_quality: bool = False,
|
|
175
174
|
translator: TranslatorChoice = "auto",
|
|
176
|
-
cache_dir: str | Path | None = None,
|
|
177
175
|
):
|
|
178
176
|
self.device = device
|
|
179
177
|
self.low_memory = low_memory
|
|
@@ -184,15 +182,13 @@ class LocalDubbingPipeline:
|
|
|
184
182
|
self.vocabulary = vocabulary
|
|
185
183
|
self.strict_quality = strict_quality
|
|
186
184
|
self.translator = translator
|
|
187
|
-
self.cache_dir = Path(cache_dir) if cache_dir is not None else None
|
|
188
185
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
189
186
|
logger.info(
|
|
190
|
-
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
|
|
187
|
+
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
191
188
|
requested,
|
|
192
189
|
low_memory,
|
|
193
190
|
whisper_model,
|
|
194
191
|
translator,
|
|
195
|
-
self.cache_dir,
|
|
196
192
|
)
|
|
197
193
|
|
|
198
194
|
self._transcriber: Any = None
|
|
@@ -202,7 +198,6 @@ class LocalDubbingPipeline:
|
|
|
202
198
|
self._tts_language: str | None = None
|
|
203
199
|
self._separator: Any = None
|
|
204
200
|
self._synchronizer: TimingSynchronizer | None = None
|
|
205
|
-
self._cache: DubCache | None = DubCache(self.cache_dir) if self.cache_dir is not None else None
|
|
206
201
|
|
|
207
202
|
def _maybe_unload(self, component_name: str) -> None:
|
|
208
203
|
"""Unload a stage's model when low_memory mode is enabled.
|
|
@@ -221,89 +216,42 @@ class LocalDubbingPipeline:
|
|
|
221
216
|
logger.info("low_memory: unloading %s", component_name.lstrip("_"))
|
|
222
217
|
unload()
|
|
223
218
|
|
|
224
|
-
def
|
|
219
|
+
def _transcribe(
|
|
225
220
|
self,
|
|
226
221
|
source_audio: Audio,
|
|
227
222
|
enable_diarization: bool,
|
|
228
223
|
) -> Transcription:
|
|
229
|
-
"""
|
|
230
|
-
|
|
231
|
-
Cache miss: lazy-init the transcriber, transcribe, store the
|
|
232
|
-
result (including all hashed kwargs in metadata.json so future
|
|
233
|
-
invalidators have provenance).
|
|
234
|
-
Cache hit: return the deserialized :class:`Transcription` without
|
|
235
|
-
touching Whisper/diarization at all.
|
|
236
|
-
"""
|
|
237
|
-
src_hash, kwargs_hash = self._transcription_cache_keys(source_audio, enable_diarization)
|
|
238
|
-
if self._cache is not None:
|
|
239
|
-
cached = self._cache.get_transcription(src_hash, kwargs_hash)
|
|
240
|
-
if cached is not None:
|
|
241
|
-
return cached
|
|
242
|
-
|
|
224
|
+
"""Lazy-init the transcriber and run it on ``source_audio``."""
|
|
243
225
|
if self._transcriber is None or self._transcriber_diarization != enable_diarization:
|
|
244
226
|
self._init_transcriber(enable_diarization=enable_diarization)
|
|
245
227
|
self._transcriber_diarization = enable_diarization
|
|
246
228
|
|
|
247
229
|
transcription = self._transcriber.transcribe(source_audio)
|
|
248
230
|
self._maybe_unload("_transcriber")
|
|
249
|
-
|
|
250
|
-
if self._cache is not None:
|
|
251
|
-
self._cache.put_transcription(
|
|
252
|
-
src_hash,
|
|
253
|
-
kwargs_hash,
|
|
254
|
-
transcription,
|
|
255
|
-
hash_inputs={
|
|
256
|
-
"whisper_model": self.whisper_model,
|
|
257
|
-
"enable_diarization": enable_diarization,
|
|
258
|
-
"condition_on_previous_text": self.condition_on_previous_text,
|
|
259
|
-
"no_speech_threshold": self.no_speech_threshold,
|
|
260
|
-
"logprob_threshold": self.logprob_threshold,
|
|
261
|
-
"vocabulary": self.vocabulary,
|
|
262
|
-
},
|
|
263
|
-
)
|
|
264
231
|
return transcription
|
|
265
232
|
|
|
266
233
|
def _tts_segment_audio(
|
|
267
234
|
self,
|
|
268
235
|
segment: TranslatedSegment,
|
|
269
236
|
speaker: str,
|
|
270
|
-
speaker_bytes: bytes | None,
|
|
271
237
|
target_lang: str,
|
|
272
238
|
voice_clone: bool,
|
|
273
239
|
voice_samples: dict[str, Audio],
|
|
274
240
|
speaker_wav_paths: dict[str, Path],
|
|
275
|
-
src_hash_for_tts: str,
|
|
276
241
|
expressiveness: Expressiveness = Expressiveness(),
|
|
277
242
|
) -> Audio | None:
|
|
278
|
-
"""Produce the TTS audio for a single segment
|
|
243
|
+
"""Produce the TTS audio for a single segment.
|
|
279
244
|
|
|
280
245
|
Returns the synthesized :class:`Audio`, or ``None`` if Chatterbox
|
|
281
|
-
crashed on the segment (the caller skips it).
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
so a fully-cached run never loads Chatterbox.
|
|
246
|
+
crashed on the segment (the caller skips it). The TTS model is
|
|
247
|
+
lazy-initialized and per-speaker temp WAVs are materialized once
|
|
248
|
+
across the loop.
|
|
285
249
|
|
|
286
250
|
``expressiveness`` carries the M4 Chatterbox knobs derived from
|
|
287
251
|
the source segment's prosody. Default is the no-knobs profile —
|
|
288
252
|
lets Chatterbox use its own defaults — so callers that don't yet
|
|
289
253
|
derive prosody (e.g. ``revoice``) keep pre-M4 behaviour.
|
|
290
254
|
"""
|
|
291
|
-
from videopython.base.audio import Audio as _Audio
|
|
292
|
-
|
|
293
|
-
tts_cache_key: str | None = None
|
|
294
|
-
if self._cache is not None:
|
|
295
|
-
tts_cache_key = DubCache.tts_key(
|
|
296
|
-
translated_text=segment.translated_text,
|
|
297
|
-
voice_sample_bytes=speaker_bytes,
|
|
298
|
-
language=target_lang,
|
|
299
|
-
**expressiveness.as_kwargs(),
|
|
300
|
-
)
|
|
301
|
-
cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
|
|
302
|
-
if cached_path is not None:
|
|
303
|
-
return _Audio.from_path(cached_path)
|
|
304
|
-
|
|
305
|
-
# Cache miss: pay for TTS init + voice-sample WAV exactly once
|
|
306
|
-
# across the loop. Both are wasted work when every segment hits.
|
|
307
255
|
if self._tts is None or self._tts_language != target_lang:
|
|
308
256
|
self._init_tts(language=target_lang)
|
|
309
257
|
self._tts_language = target_lang
|
|
@@ -314,7 +262,7 @@ class LocalDubbingPipeline:
|
|
|
314
262
|
|
|
315
263
|
wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
|
|
316
264
|
try:
|
|
317
|
-
|
|
265
|
+
return self._tts.generate_audio(
|
|
318
266
|
segment.translated_text,
|
|
319
267
|
voice_sample_path=wav_path,
|
|
320
268
|
**expressiveness.as_kwargs(),
|
|
@@ -332,39 +280,19 @@ class LocalDubbingPipeline:
|
|
|
332
280
|
)
|
|
333
281
|
return None
|
|
334
282
|
|
|
335
|
-
|
|
336
|
-
dubbed_audio.save(self._cache.reserve_tts_path(src_hash_for_tts, tts_cache_key))
|
|
337
|
-
return dubbed_audio
|
|
338
|
-
|
|
339
|
-
def _translate_with_cache(
|
|
283
|
+
def _translate(
|
|
340
284
|
self,
|
|
341
285
|
transcription: Transcription,
|
|
342
|
-
source_audio: Audio,
|
|
343
286
|
source_lang: str,
|
|
344
287
|
target_lang: str,
|
|
345
288
|
report_progress: Callable[[str, float], None],
|
|
346
289
|
) -> tuple[list[TranslatedSegment], list[int]]:
|
|
347
|
-
"""
|
|
290
|
+
"""Translate the transcription's segments into ``target_lang``.
|
|
348
291
|
|
|
349
|
-
Returns ``(translated_segments, translation_failures)``.
|
|
350
|
-
fully-successful translations are cached — partial Qwen failures
|
|
351
|
-
would otherwise lock in an incomplete dub across runs. The
|
|
292
|
+
Returns ``(translated_segments, translation_failures)``. The
|
|
352
293
|
progress callback maps the backend's [0, 1] fraction onto the
|
|
353
294
|
pipeline's translation window (0.35 → 0.50).
|
|
354
295
|
"""
|
|
355
|
-
from videopython.ai.dubbing.models import TranslatedSegment
|
|
356
|
-
|
|
357
|
-
cache_key: str | None = None
|
|
358
|
-
if self._cache is not None:
|
|
359
|
-
cache_key = DubCache.translation_key(
|
|
360
|
-
source_lang=source_lang,
|
|
361
|
-
target_lang=target_lang,
|
|
362
|
-
translator_class=self._resolved_translator_class_name(source_lang, target_lang),
|
|
363
|
-
)
|
|
364
|
-
cached = self._cache.get_translation(DubCache.source_key(source_audio), cache_key)
|
|
365
|
-
if cached is not None:
|
|
366
|
-
return [TranslatedSegment.from_dict(d) for d in cached], []
|
|
367
|
-
|
|
368
296
|
if self._translator is None:
|
|
369
297
|
self._init_translator(source_lang=source_lang, target_lang=target_lang)
|
|
370
298
|
|
|
@@ -387,32 +315,8 @@ class LocalDubbingPipeline:
|
|
|
387
315
|
translation_failures = list(self._translator.translation_failures)
|
|
388
316
|
self._maybe_unload("_translator")
|
|
389
317
|
|
|
390
|
-
if self._cache is not None and cache_key is not None and not translation_failures:
|
|
391
|
-
self._cache.put_translation(
|
|
392
|
-
DubCache.source_key(source_audio),
|
|
393
|
-
cache_key,
|
|
394
|
-
[s.to_dict() for s in translated_segments],
|
|
395
|
-
)
|
|
396
|
-
|
|
397
318
|
return translated_segments, translation_failures
|
|
398
319
|
|
|
399
|
-
def _transcription_cache_keys(self, source_audio: Audio, enable_diarization: bool = False) -> tuple[str, str]:
|
|
400
|
-
"""Return ``(src_hash, kwargs_hash)`` for the current transcription config.
|
|
401
|
-
|
|
402
|
-
Centralizes the kwarg list so the cache lookup, the put, and any
|
|
403
|
-
future invalidator agree on what's hashed.
|
|
404
|
-
"""
|
|
405
|
-
src_hash = DubCache.source_key(source_audio)
|
|
406
|
-
kwargs_hash = DubCache.transcription_kwargs_hash(
|
|
407
|
-
whisper_model=self.whisper_model,
|
|
408
|
-
enable_diarization=enable_diarization,
|
|
409
|
-
condition_on_previous_text=self.condition_on_previous_text,
|
|
410
|
-
no_speech_threshold=self.no_speech_threshold,
|
|
411
|
-
logprob_threshold=self.logprob_threshold,
|
|
412
|
-
vocabulary=self.vocabulary,
|
|
413
|
-
)
|
|
414
|
-
return src_hash, kwargs_hash
|
|
415
|
-
|
|
416
320
|
def _init_transcriber(self, enable_diarization: bool = False) -> None:
|
|
417
321
|
"""Initialize the transcription model."""
|
|
418
322
|
from videopython.ai.understanding.audio import AudioToText
|
|
@@ -444,31 +348,6 @@ class LocalDubbingPipeline:
|
|
|
444
348
|
else: # "auto"
|
|
445
349
|
self._translator = self._resolve_translator_auto(source_lang, target_lang)
|
|
446
350
|
|
|
447
|
-
def _resolved_translator_class_name(self, source_lang: str, target_lang: str) -> str:
|
|
448
|
-
"""Return the *class name* of the translator that ``_init_translator``
|
|
449
|
-
would pick — without constructing one.
|
|
450
|
-
|
|
451
|
-
Used by the cache to key translations on the resolved backend rather
|
|
452
|
-
than the user-supplied ``"auto"``: a CPU run that resolves to Marian
|
|
453
|
-
must not collide with a GPU run that resolves to Qwen.
|
|
454
|
-
"""
|
|
455
|
-
if self.translator == "marian":
|
|
456
|
-
return "MarianTranslator"
|
|
457
|
-
if self.translator == "qwen3":
|
|
458
|
-
return "Qwen3Translator"
|
|
459
|
-
# auto — mirror _resolve_translator_auto's branching, no construction.
|
|
460
|
-
device = select_device(self.device, mps_allowed=True)
|
|
461
|
-
has_gpu = device in ("cuda", "mps")
|
|
462
|
-
if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
|
|
463
|
-
return "Qwen3Translator"
|
|
464
|
-
if MarianTranslator.has_model_for(source_lang, target_lang):
|
|
465
|
-
return "MarianTranslator"
|
|
466
|
-
if Qwen3Translator.supports(source_lang, target_lang):
|
|
467
|
-
return "Qwen3Translator"
|
|
468
|
-
# No backend supports the pair — _init_translator will raise. We
|
|
469
|
-
# return a sentinel; the cache miss path will pay that cost.
|
|
470
|
-
return "Unsupported"
|
|
471
|
-
|
|
472
351
|
def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
|
|
473
352
|
"""Pick a backend based on language coverage AND device.
|
|
474
353
|
|
|
@@ -728,7 +607,7 @@ class LocalDubbingPipeline:
|
|
|
728
607
|
)
|
|
729
608
|
else:
|
|
730
609
|
report_progress("Transcribing audio", 0.05)
|
|
731
|
-
transcription = self.
|
|
610
|
+
transcription = self._transcribe(source_audio, enable_diarization)
|
|
732
611
|
|
|
733
612
|
if not transcription.segments:
|
|
734
613
|
return DubbingResult(
|
|
@@ -796,8 +675,8 @@ class LocalDubbingPipeline:
|
|
|
796
675
|
voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
|
|
797
676
|
|
|
798
677
|
report_progress("Translating text", 0.35)
|
|
799
|
-
translated_segments, translation_failures = self.
|
|
800
|
-
transcription,
|
|
678
|
+
translated_segments, translation_failures = self._translate(
|
|
679
|
+
transcription, detected_lang, target_lang, report_progress
|
|
801
680
|
)
|
|
802
681
|
|
|
803
682
|
# Per-segment expressiveness derived from source vocals RMS.
|
|
@@ -823,21 +702,12 @@ class LocalDubbingPipeline:
|
|
|
823
702
|
|
|
824
703
|
report_progress("Generating dubbed speech", 0.50)
|
|
825
704
|
|
|
826
|
-
# Per-speaker voice-sample bytes for TTS cache key. Empty when
|
|
827
|
-
# voice_clone=False — the cache key still differentiates "no voice
|
|
828
|
-
# sample" from "specific clone" via the None path.
|
|
829
|
-
voice_sample_bytes: dict[str, bytes] = (
|
|
830
|
-
{speaker: sample.data.tobytes() for speaker, sample in voice_samples.items()} if voice_clone else {}
|
|
831
|
-
)
|
|
832
|
-
src_hash_for_tts = DubCache.source_key(source_audio) if self._cache is not None else ""
|
|
833
|
-
|
|
834
705
|
dubbed_segments: list[Audio] = []
|
|
835
706
|
target_durations: list[float] = []
|
|
836
707
|
start_times: list[float] = []
|
|
837
708
|
|
|
838
|
-
# Per-speaker temp WAVs are materialized lazily by _tts_segment_audio
|
|
839
|
-
#
|
|
840
|
-
# state so the finally block can clean up regardless of cache outcome.
|
|
709
|
+
# Per-speaker temp WAVs are materialized lazily by _tts_segment_audio.
|
|
710
|
+
# The dict is loop-scoped state so the finally block can clean up.
|
|
841
711
|
speaker_wav_paths: dict[str, Path] = {}
|
|
842
712
|
try:
|
|
843
713
|
for i, segment in enumerate(translated_segments):
|
|
@@ -857,12 +727,10 @@ class LocalDubbingPipeline:
|
|
|
857
727
|
dubbed_audio = self._tts_segment_audio(
|
|
858
728
|
segment=segment,
|
|
859
729
|
speaker=speaker,
|
|
860
|
-
speaker_bytes=voice_sample_bytes.get(speaker),
|
|
861
730
|
target_lang=target_lang,
|
|
862
731
|
voice_clone=voice_clone,
|
|
863
732
|
voice_samples=voice_samples,
|
|
864
733
|
speaker_wav_paths=speaker_wav_paths,
|
|
865
|
-
src_hash_for_tts=src_hash_for_tts,
|
|
866
734
|
expressiveness=expressiveness_per_segment[i],
|
|
867
735
|
)
|
|
868
736
|
if dubbed_audio is None:
|
|
@@ -54,7 +54,6 @@ class FaceTrackingCrop(Transformation):
|
|
|
54
54
|
vertical_offset: float = -0.1,
|
|
55
55
|
framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = "offset",
|
|
56
56
|
headroom: float = 0.15,
|
|
57
|
-
lead_room: float = 0.1,
|
|
58
57
|
smoothing: float = 0.8,
|
|
59
58
|
max_speed: float | None = None,
|
|
60
59
|
fallback: Literal["center", "last_position", "full_frame"] = "last_position",
|
|
@@ -77,7 +76,6 @@ class FaceTrackingCrop(Transformation):
|
|
|
77
76
|
- "thirds": Place face near the upper-third line.
|
|
78
77
|
- "dynamic": Currently same as "headroom".
|
|
79
78
|
headroom: Headroom amount for framing rules that use it.
|
|
80
|
-
lead_room: Reserved for future motion/look-direction framing.
|
|
81
79
|
smoothing: Position smoothing factor (0-1, higher = smoother).
|
|
82
80
|
max_speed: Optional max camera movement per frame (normalized).
|
|
83
81
|
fallback: Behavior when no face detected.
|
|
@@ -92,7 +90,6 @@ class FaceTrackingCrop(Transformation):
|
|
|
92
90
|
self.vertical_offset = vertical_offset
|
|
93
91
|
self.framing_rule = framing_rule
|
|
94
92
|
self.headroom = headroom
|
|
95
|
-
self.lead_room = lead_room
|
|
96
93
|
self.smoothing = smoothing
|
|
97
94
|
self.max_speed = max_speed
|
|
98
95
|
self.fallback = fallback
|
|
@@ -238,10 +235,15 @@ class FaceTrackingCrop(Transformation):
|
|
|
238
235
|
current_position = (0.5, 0.5)
|
|
239
236
|
|
|
240
237
|
framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
|
|
241
|
-
|
|
242
|
-
"Face tracking crop: "
|
|
243
|
-
|
|
244
|
-
|
|
238
|
+
logger.info(
|
|
239
|
+
"Face tracking crop: %dx%d -> %dx%d (%d:%d, framing=%s)",
|
|
240
|
+
w,
|
|
241
|
+
h,
|
|
242
|
+
out_w,
|
|
243
|
+
out_h,
|
|
244
|
+
self.target_aspect[0],
|
|
245
|
+
self.target_aspect[1],
|
|
246
|
+
framing_label,
|
|
245
247
|
)
|
|
246
248
|
|
|
247
249
|
new_frames = []
|
|
@@ -448,7 +450,7 @@ class SplitScreenComposite(Transformation):
|
|
|
448
450
|
for _ in range(len(cell_rects))
|
|
449
451
|
]
|
|
450
452
|
|
|
451
|
-
|
|
453
|
+
logger.info("Creating %s split screen: %dx%d", self.layout, out_w, out_h)
|
|
452
454
|
|
|
453
455
|
new_frames = []
|
|
454
456
|
for i in tqdm(range(n_frames), desc="Split screen composite"):
|
|
@@ -102,12 +102,7 @@ class SemanticSceneDetector:
|
|
|
102
102
|
video.save(tmp.name)
|
|
103
103
|
return self.detect_streaming(tmp.name)
|
|
104
104
|
|
|
105
|
-
def detect_streaming(
|
|
106
|
-
self,
|
|
107
|
-
path: str | Path,
|
|
108
|
-
start_second: float | None = None,
|
|
109
|
-
end_second: float | None = None,
|
|
110
|
-
) -> list[SceneBoundary]:
|
|
105
|
+
def detect_streaming(self, path: str | Path) -> list[SceneBoundary]:
|
|
111
106
|
"""Detect scenes from a video file.
|
|
112
107
|
|
|
113
108
|
Uses TransNetV2 with pretrained weights for accurate shot boundary
|
|
@@ -115,21 +110,10 @@ class SemanticSceneDetector:
|
|
|
115
110
|
|
|
116
111
|
Args:
|
|
117
112
|
path: Path to video file.
|
|
118
|
-
start_second: Optional start time for analysis (not yet supported).
|
|
119
|
-
end_second: Optional end time for analysis (not yet supported).
|
|
120
113
|
|
|
121
114
|
Returns:
|
|
122
115
|
List of SceneBoundary objects representing detected scenes.
|
|
123
116
|
"""
|
|
124
|
-
if start_second is not None or end_second is not None:
|
|
125
|
-
import warnings
|
|
126
|
-
|
|
127
|
-
warnings.warn(
|
|
128
|
-
"start_second and end_second are not yet supported by SemanticSceneDetector. Processing entire video.",
|
|
129
|
-
UserWarning,
|
|
130
|
-
stacklevel=2,
|
|
131
|
-
)
|
|
132
|
-
|
|
133
117
|
self._load_model()
|
|
134
118
|
|
|
135
119
|
# Use TransNetV2's detect_scenes which handles everything internally
|
|
@@ -559,7 +559,7 @@ class VideoAnalyzer:
|
|
|
559
559
|
return AudioToText(**self.config.get_params(AUDIO_TO_TEXT)).transcribe(
|
|
560
560
|
Audio.from_path(source_path) if source_path is not None else _require_video(video)
|
|
561
561
|
)
|
|
562
|
-
except
|
|
562
|
+
except (ImportError, OSError, RuntimeError, ValueError):
|
|
563
563
|
logger.warning("AudioToText failed, skipping transcription", exc_info=True)
|
|
564
564
|
return None
|
|
565
565
|
|
|
@@ -571,7 +571,7 @@ class VideoAnalyzer:
|
|
|
571
571
|
if source_path is not None
|
|
572
572
|
else scene_detector.detect(_require_video(video))
|
|
573
573
|
)
|
|
574
|
-
except
|
|
574
|
+
except (ImportError, OSError, RuntimeError, ValueError):
|
|
575
575
|
logger.warning("SemanticSceneDetector failed, using default scene boundaries", exc_info=True)
|
|
576
576
|
return None
|
|
577
577
|
|
|
@@ -644,7 +644,7 @@ class VideoAnalyzer:
|
|
|
644
644
|
else:
|
|
645
645
|
try:
|
|
646
646
|
scene_vlm = SceneVLM(**self.config.get_params(SCENE_VLM)) if SCENE_VLM in enabled else None
|
|
647
|
-
except
|
|
647
|
+
except (ImportError, OSError, RuntimeError, ValueError):
|
|
648
648
|
logger.warning("Failed to initialize SceneVLM, skipping visual understanding", exc_info=True)
|
|
649
649
|
scene_vlm = None
|
|
650
650
|
|
|
@@ -652,7 +652,7 @@ class VideoAnalyzer:
|
|
|
652
652
|
audio_classifier = (
|
|
653
653
|
AudioClassifier(**self.config.get_params(AUDIO_CLASSIFIER)) if AUDIO_CLASSIFIER in enabled else None
|
|
654
654
|
)
|
|
655
|
-
except
|
|
655
|
+
except (ImportError, OSError, RuntimeError, ValueError):
|
|
656
656
|
logger.warning("Failed to initialize AudioClassifier, skipping audio classification", exc_info=True)
|
|
657
657
|
audio_classifier = None
|
|
658
658
|
|
|
@@ -660,7 +660,7 @@ class VideoAnalyzer:
|
|
|
660
660
|
if FACE_TRACKER in enabled:
|
|
661
661
|
try:
|
|
662
662
|
face_tracker = FaceTracker(**self.config.get_params(FACE_TRACKER))
|
|
663
|
-
except
|
|
663
|
+
except (ImportError, OSError, RuntimeError, ValueError):
|
|
664
664
|
logger.warning("Failed to initialize FaceTracker, skipping face tracks", exc_info=True)
|
|
665
665
|
face_tracker = None
|
|
666
666
|
|
|
@@ -668,7 +668,7 @@ class VideoAnalyzer:
|
|
|
668
668
|
if audio_classifier is not None and source_path is not None:
|
|
669
669
|
try:
|
|
670
670
|
path_audio = Audio.from_path(source_path)
|
|
671
|
-
except
|
|
671
|
+
except (OSError, RuntimeError, ValueError):
|
|
672
672
|
logger.warning(
|
|
673
673
|
"Failed to load audio from path, audio classification will use clip fallback", exc_info=True
|
|
674
674
|
)
|
|
@@ -686,7 +686,7 @@ class VideoAnalyzer:
|
|
|
686
686
|
metadata=metadata,
|
|
687
687
|
scenes=scenes,
|
|
688
688
|
)
|
|
689
|
-
except
|
|
689
|
+
except (IndexError, OSError, RuntimeError, ValueError):
|
|
690
690
|
logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
|
|
691
691
|
|
|
692
692
|
samples: list[SceneAnalysisSample] = []
|
|
@@ -714,7 +714,7 @@ class VideoAnalyzer:
|
|
|
714
714
|
start_second=scene.start,
|
|
715
715
|
end_second=scene.end,
|
|
716
716
|
)
|
|
717
|
-
except
|
|
717
|
+
except (OSError, RuntimeError, ValueError):
|
|
718
718
|
scene_clip = None
|
|
719
719
|
sample.audio_classification = self._run_scene_audio_classification(
|
|
720
720
|
audio_classifier=audio_classifier,
|
|
@@ -723,7 +723,7 @@ class VideoAnalyzer:
|
|
|
723
723
|
scene_start=scene.start,
|
|
724
724
|
scene_end=scene.end,
|
|
725
725
|
)
|
|
726
|
-
except
|
|
726
|
+
except (OSError, RuntimeError, ValueError):
|
|
727
727
|
logger.warning(
|
|
728
728
|
"AudioClassifier failed for scene %d (%.1f-%.1fs)",
|
|
729
729
|
index,
|
|
@@ -741,7 +741,7 @@ class VideoAnalyzer:
|
|
|
741
741
|
metadata=metadata,
|
|
742
742
|
scene=scene,
|
|
743
743
|
)
|
|
744
|
-
except
|
|
744
|
+
except (IndexError, OSError, RuntimeError, ValueError):
|
|
745
745
|
logger.warning(
|
|
746
746
|
"FaceTracker failed for scene %d (%.1f-%.1fs)",
|
|
747
747
|
index,
|
|
@@ -867,7 +867,7 @@ class VideoAnalyzer:
|
|
|
867
867
|
description: SceneDescription | None = None
|
|
868
868
|
try:
|
|
869
869
|
description = scene_vlm.analyze_scene(deduped)
|
|
870
|
-
except
|
|
870
|
+
except (IndexError, OSError, RuntimeError, ValueError):
|
|
871
871
|
logger.warning(
|
|
872
872
|
"SceneVLM failed for scenes %d-%d (%.1f-%.1fs)",
|
|
873
873
|
group[0],
|
|
@@ -1044,7 +1044,7 @@ class VideoAnalyzer:
|
|
|
1044
1044
|
try:
|
|
1045
1045
|
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
1046
1046
|
payload = json.loads(result.stdout)
|
|
1047
|
-
except
|
|
1047
|
+
except (subprocess.CalledProcessError, json.JSONDecodeError, OSError):
|
|
1048
1048
|
return {}
|
|
1049
1049
|
|
|
1050
1050
|
tags: dict[str, str] = {}
|
|
@@ -37,7 +37,6 @@ from .exceptions import (
|
|
|
37
37
|
VideoMetadataError,
|
|
38
38
|
VideoPythonError,
|
|
39
39
|
)
|
|
40
|
-
from .progress import configure, set_progress, set_verbose
|
|
41
40
|
from .registry import (
|
|
42
41
|
OperationCategory,
|
|
43
42
|
OperationSpec,
|
|
@@ -157,8 +156,4 @@ __all__ = [
|
|
|
157
156
|
"get_specs_by_tag",
|
|
158
157
|
"register",
|
|
159
158
|
"spec_from_class",
|
|
160
|
-
# Configuration
|
|
161
|
-
"configure",
|
|
162
|
-
"set_verbose",
|
|
163
|
-
"set_progress",
|
|
164
159
|
]
|