videopython 0.29.0__tar.gz → 0.30.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.29.0 → videopython-0.30.0}/PKG-INFO +2 -3
- {videopython-0.29.0 → videopython-0.30.0}/README.md +1 -2
- {videopython-0.29.0 → videopython-0.30.0}/pyproject.toml +1 -1
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/__init__.py +0 -3
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/__init__.py +0 -3
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/dubber.py +9 -12
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/models.py +1 -2
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/pipeline.py +19 -146
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/transforms.py +10 -8
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/audio.py +97 -11
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/temporal.py +1 -17
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/video_analysis.py +12 -12
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/__init__.py +0 -5
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/effects.py +17 -14
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/registry.py +0 -9
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/streaming.py +7 -4
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/text/overlay.py +6 -3
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/transforms.py +15 -12
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/video.py +2 -2
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/editing/__init__.py +0 -2
- videopython-0.29.0/src/videopython/ai/dubbing/cache.py +0 -309
- videopython-0.29.0/src/videopython/ai/swapping/__init__.py +0 -46
- videopython-0.29.0/src/videopython/ai/swapping/inpainter.py +0 -264
- videopython-0.29.0/src/videopython/ai/swapping/models.py +0 -221
- videopython-0.29.0/src/videopython/ai/swapping/segmenter.py +0 -577
- videopython-0.29.0/src/videopython/ai/swapping/swapper.py +0 -524
- videopython-0.29.0/src/videopython/base/combine.py +0 -61
- videopython-0.29.0/src/videopython/base/progress.py +0 -63
- videopython-0.29.0/src/videopython/base/utils.py +0 -6
- videopython-0.29.0/src/videopython/editing/premiere_xml.py +0 -313
- {videopython-0.29.0 → videopython-0.30.0}/.gitignore +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/LICENSE +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/__init__.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/registry.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/description.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/transitions.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.29.0 → videopython-0.30.0}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.29.0
|
|
3
|
+
Version: 0.30.0
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -201,9 +201,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
|
|
|
201
201
|
| **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
|
|
202
202
|
| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
|
|
203
203
|
| **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
|
|
204
|
-
| **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
|
|
205
204
|
|
|
206
|
-
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
205
|
+
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
207
206
|
|
|
208
207
|
## Examples
|
|
209
208
|
|
|
@@ -152,9 +152,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
|
|
|
152
152
|
| **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
|
|
153
153
|
| **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
|
|
154
154
|
| **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
|
|
155
|
-
| **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
|
|
156
155
|
|
|
157
|
-
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
156
|
+
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
158
157
|
|
|
159
158
|
## Examples
|
|
160
159
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from videopython.ai import registry as _ai_registry # noqa: F401
|
|
2
2
|
|
|
3
3
|
from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
|
|
4
|
-
from .swapping import ObjectSwapper
|
|
5
4
|
from .transforms import FaceTrackingCrop, SplitScreenComposite
|
|
6
5
|
from .understanding import (
|
|
7
6
|
AudioClassifier,
|
|
@@ -28,8 +27,6 @@ __all__ = [
|
|
|
28
27
|
# Transforms (AI-powered)
|
|
29
28
|
"FaceTrackingCrop",
|
|
30
29
|
"SplitScreenComposite",
|
|
31
|
-
# Swapping
|
|
32
|
-
"ObjectSwapper",
|
|
33
30
|
# Video analysis
|
|
34
31
|
"VideoAnalysis",
|
|
35
32
|
"VideoAnalysisConfig",
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
"""Local video dubbing functionality."""
|
|
2
2
|
|
|
3
|
-
from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
|
|
4
3
|
from videopython.ai.dubbing.dubber import VideoDubber
|
|
5
4
|
from videopython.ai.dubbing.models import (
|
|
6
5
|
DubbingResult,
|
|
@@ -26,7 +25,5 @@ __all__ = [
|
|
|
26
25
|
"TranscriptQuality",
|
|
27
26
|
"assess_transcript",
|
|
28
27
|
"UnsupportedLanguageError",
|
|
29
|
-
"DubCache",
|
|
30
|
-
"dub_cache_clear",
|
|
31
28
|
"Expressiveness",
|
|
32
29
|
]
|
|
@@ -37,6 +37,11 @@ class VideoDubber:
|
|
|
37
37
|
gate; raise to drop more low-confidence windows.
|
|
38
38
|
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
39
39
|
log-probability gate.
|
|
40
|
+
vocabulary: Forwarded to ``AudioToText``. Optional list of brand
|
|
41
|
+
names, product names, or proper nouns to bias Whisper's first-
|
|
42
|
+
window decoder via ``initial_prompt``. Recovers near-mishears
|
|
43
|
+
(e.g. Klarna → "carna") on brand-monitoring inputs without new
|
|
44
|
+
model deps.
|
|
40
45
|
strict_quality: When True, the pipeline raises
|
|
41
46
|
:class:`GarbageTranscriptError` before Demucs/translation/TTS run
|
|
42
47
|
if the transcript-quality heuristic returns ``"reject"``. When
|
|
@@ -50,13 +55,6 @@ class VideoDubber:
|
|
|
50
55
|
See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
|
|
51
56
|
for tradeoffs (Qwen3 is slower on CPU but produces
|
|
52
57
|
context-aware, length-budgeted output).
|
|
53
|
-
cache_dir: When set, persist transcription, translated segments,
|
|
54
|
-
and per-segment TTS WAVs under this directory and skip stages
|
|
55
|
-
whose inputs already match a cache entry. Use to resume crashed
|
|
56
|
-
long runs or to iterate on dub configuration without paying
|
|
57
|
-
transcription cost each time. ``None`` (default) disables
|
|
58
|
-
caching. Cache grows unbounded; clear via
|
|
59
|
-
:func:`videopython.ai.dubbing.cache.dub_cache_clear`.
|
|
60
58
|
"""
|
|
61
59
|
|
|
62
60
|
def __init__(
|
|
@@ -67,9 +65,9 @@ class VideoDubber:
|
|
|
67
65
|
condition_on_previous_text: bool = False,
|
|
68
66
|
no_speech_threshold: float = 0.6,
|
|
69
67
|
logprob_threshold: float | None = -1.0,
|
|
68
|
+
vocabulary: list[str] | None = None,
|
|
70
69
|
strict_quality: bool = False,
|
|
71
70
|
translator: TranslatorChoice = "auto",
|
|
72
|
-
cache_dir: str | Path | None = None,
|
|
73
71
|
):
|
|
74
72
|
self.device = device
|
|
75
73
|
self.low_memory = low_memory
|
|
@@ -77,18 +75,17 @@ class VideoDubber:
|
|
|
77
75
|
self.condition_on_previous_text = condition_on_previous_text
|
|
78
76
|
self.no_speech_threshold = no_speech_threshold
|
|
79
77
|
self.logprob_threshold = logprob_threshold
|
|
78
|
+
self.vocabulary = vocabulary
|
|
80
79
|
self.strict_quality = strict_quality
|
|
81
80
|
self.translator = translator
|
|
82
|
-
self.cache_dir = cache_dir
|
|
83
81
|
self._local_pipeline: Any = None
|
|
84
82
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
85
83
|
logger.info(
|
|
86
|
-
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
|
|
84
|
+
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
87
85
|
requested,
|
|
88
86
|
low_memory,
|
|
89
87
|
whisper_model,
|
|
90
88
|
translator,
|
|
91
|
-
cache_dir,
|
|
92
89
|
)
|
|
93
90
|
|
|
94
91
|
def _init_local_pipeline(self) -> None:
|
|
@@ -101,9 +98,9 @@ class VideoDubber:
|
|
|
101
98
|
condition_on_previous_text=self.condition_on_previous_text,
|
|
102
99
|
no_speech_threshold=self.no_speech_threshold,
|
|
103
100
|
logprob_threshold=self.logprob_threshold,
|
|
101
|
+
vocabulary=self.vocabulary,
|
|
104
102
|
strict_quality=self.strict_quality,
|
|
105
103
|
translator=self.translator,
|
|
106
|
-
cache_dir=self.cache_dir,
|
|
107
104
|
)
|
|
108
105
|
|
|
109
106
|
def dub(
|
|
@@ -41,8 +41,7 @@ class Expressiveness:
|
|
|
41
41
|
def as_kwargs(self) -> dict[str, float]:
|
|
42
42
|
"""Knobs as a dict, dropping ``None`` entries.
|
|
43
43
|
|
|
44
|
-
Suitable for ``**``-expansion into Chatterbox
|
|
45
|
-
:meth:`DubCache.tts_key`.
|
|
44
|
+
Suitable for ``**``-expansion into Chatterbox.
|
|
46
45
|
"""
|
|
47
46
|
return {
|
|
48
47
|
name: value
|
|
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
12
|
from videopython.ai._device import select_device
|
|
13
|
-
from videopython.ai.dubbing.cache import DubCache
|
|
14
13
|
from videopython.ai.dubbing.models import DubbingResult, Expressiveness, RevoiceResult, SeparatedAudio, TimingSummary
|
|
15
14
|
from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
|
|
16
15
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
@@ -170,9 +169,9 @@ class LocalDubbingPipeline:
|
|
|
170
169
|
condition_on_previous_text: bool = False,
|
|
171
170
|
no_speech_threshold: float = 0.6,
|
|
172
171
|
logprob_threshold: float | None = -1.0,
|
|
172
|
+
vocabulary: list[str] | None = None,
|
|
173
173
|
strict_quality: bool = False,
|
|
174
174
|
translator: TranslatorChoice = "auto",
|
|
175
|
-
cache_dir: str | Path | None = None,
|
|
176
175
|
):
|
|
177
176
|
self.device = device
|
|
178
177
|
self.low_memory = low_memory
|
|
@@ -180,17 +179,16 @@ class LocalDubbingPipeline:
|
|
|
180
179
|
self.condition_on_previous_text = condition_on_previous_text
|
|
181
180
|
self.no_speech_threshold = no_speech_threshold
|
|
182
181
|
self.logprob_threshold = logprob_threshold
|
|
182
|
+
self.vocabulary = vocabulary
|
|
183
183
|
self.strict_quality = strict_quality
|
|
184
184
|
self.translator = translator
|
|
185
|
-
self.cache_dir = Path(cache_dir) if cache_dir is not None else None
|
|
186
185
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
187
186
|
logger.info(
|
|
188
|
-
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
|
|
187
|
+
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
189
188
|
requested,
|
|
190
189
|
low_memory,
|
|
191
190
|
whisper_model,
|
|
192
191
|
translator,
|
|
193
|
-
self.cache_dir,
|
|
194
192
|
)
|
|
195
193
|
|
|
196
194
|
self._transcriber: Any = None
|
|
@@ -200,7 +198,6 @@ class LocalDubbingPipeline:
|
|
|
200
198
|
self._tts_language: str | None = None
|
|
201
199
|
self._separator: Any = None
|
|
202
200
|
self._synchronizer: TimingSynchronizer | None = None
|
|
203
|
-
self._cache: DubCache | None = DubCache(self.cache_dir) if self.cache_dir is not None else None
|
|
204
201
|
|
|
205
202
|
def _maybe_unload(self, component_name: str) -> None:
|
|
206
203
|
"""Unload a stage's model when low_memory mode is enabled.
|
|
@@ -219,88 +216,42 @@ class LocalDubbingPipeline:
|
|
|
219
216
|
logger.info("low_memory: unloading %s", component_name.lstrip("_"))
|
|
220
217
|
unload()
|
|
221
218
|
|
|
222
|
-
def _transcribe_with_cache(
|
|
219
|
+
def _transcribe(
|
|
223
220
|
self,
|
|
224
221
|
source_audio: Audio,
|
|
225
222
|
enable_diarization: bool,
|
|
226
223
|
) -> Transcription:
|
|
227
|
-
"""
|
|
228
|
-
|
|
229
|
-
Cache miss: lazy-init the transcriber, transcribe, store the
|
|
230
|
-
result (including all hashed kwargs in metadata.json so future
|
|
231
|
-
invalidators have provenance).
|
|
232
|
-
Cache hit: return the deserialized :class:`Transcription` without
|
|
233
|
-
touching Whisper/diarization at all.
|
|
234
|
-
"""
|
|
235
|
-
src_hash, kwargs_hash = self._transcription_cache_keys(source_audio, enable_diarization)
|
|
236
|
-
if self._cache is not None:
|
|
237
|
-
cached = self._cache.get_transcription(src_hash, kwargs_hash)
|
|
238
|
-
if cached is not None:
|
|
239
|
-
return cached
|
|
240
|
-
|
|
224
|
+
"""Lazy-init the transcriber and run it on ``source_audio``."""
|
|
241
225
|
if self._transcriber is None or self._transcriber_diarization != enable_diarization:
|
|
242
226
|
self._init_transcriber(enable_diarization=enable_diarization)
|
|
243
227
|
self._transcriber_diarization = enable_diarization
|
|
244
228
|
|
|
245
229
|
transcription = self._transcriber.transcribe(source_audio)
|
|
246
230
|
self._maybe_unload("_transcriber")
|
|
247
|
-
|
|
248
|
-
if self._cache is not None:
|
|
249
|
-
self._cache.put_transcription(
|
|
250
|
-
src_hash,
|
|
251
|
-
kwargs_hash,
|
|
252
|
-
transcription,
|
|
253
|
-
hash_inputs={
|
|
254
|
-
"whisper_model": self.whisper_model,
|
|
255
|
-
"enable_diarization": enable_diarization,
|
|
256
|
-
"condition_on_previous_text": self.condition_on_previous_text,
|
|
257
|
-
"no_speech_threshold": self.no_speech_threshold,
|
|
258
|
-
"logprob_threshold": self.logprob_threshold,
|
|
259
|
-
},
|
|
260
|
-
)
|
|
261
231
|
return transcription
|
|
262
232
|
|
|
263
233
|
def _tts_segment_audio(
|
|
264
234
|
self,
|
|
265
235
|
segment: TranslatedSegment,
|
|
266
236
|
speaker: str,
|
|
267
|
-
speaker_bytes: bytes | None,
|
|
268
237
|
target_lang: str,
|
|
269
238
|
voice_clone: bool,
|
|
270
239
|
voice_samples: dict[str, Audio],
|
|
271
240
|
speaker_wav_paths: dict[str, Path],
|
|
272
|
-
src_hash_for_tts: str,
|
|
273
241
|
expressiveness: Expressiveness = Expressiveness(),
|
|
274
242
|
) -> Audio | None:
|
|
275
|
-
"""Produce the TTS audio for a single segment
|
|
243
|
+
"""Produce the TTS audio for a single segment.
|
|
276
244
|
|
|
277
245
|
Returns the synthesized :class:`Audio`, or ``None`` if Chatterbox
|
|
278
|
-
crashed on the segment (the caller skips it).
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
so a fully-cached run never loads Chatterbox.
|
|
246
|
+
crashed on the segment (the caller skips it). The TTS model is
|
|
247
|
+
lazy-initialized and per-speaker temp WAVs are materialized once
|
|
248
|
+
across the loop.
|
|
282
249
|
|
|
283
250
|
``expressiveness`` carries the M4 Chatterbox knobs derived from
|
|
284
251
|
the source segment's prosody. Default is the no-knobs profile —
|
|
285
252
|
lets Chatterbox use its own defaults — so callers that don't yet
|
|
286
253
|
derive prosody (e.g. ``revoice``) keep pre-M4 behaviour.
|
|
287
254
|
"""
|
|
288
|
-
from videopython.base.audio import Audio as _Audio
|
|
289
|
-
|
|
290
|
-
tts_cache_key: str | None = None
|
|
291
|
-
if self._cache is not None:
|
|
292
|
-
tts_cache_key = DubCache.tts_key(
|
|
293
|
-
translated_text=segment.translated_text,
|
|
294
|
-
voice_sample_bytes=speaker_bytes,
|
|
295
|
-
language=target_lang,
|
|
296
|
-
**expressiveness.as_kwargs(),
|
|
297
|
-
)
|
|
298
|
-
cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
|
|
299
|
-
if cached_path is not None:
|
|
300
|
-
return _Audio.from_path(cached_path)
|
|
301
|
-
|
|
302
|
-
# Cache miss: pay for TTS init + voice-sample WAV exactly once
|
|
303
|
-
# across the loop. Both are wasted work when every segment hits.
|
|
304
255
|
if self._tts is None or self._tts_language != target_lang:
|
|
305
256
|
self._init_tts(language=target_lang)
|
|
306
257
|
self._tts_language = target_lang
|
|
@@ -311,7 +262,7 @@ class LocalDubbingPipeline:
|
|
|
311
262
|
|
|
312
263
|
wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
|
|
313
264
|
try:
|
|
314
|
-
|
|
265
|
+
return self._tts.generate_audio(
|
|
315
266
|
segment.translated_text,
|
|
316
267
|
voice_sample_path=wav_path,
|
|
317
268
|
**expressiveness.as_kwargs(),
|
|
@@ -329,39 +280,19 @@ class LocalDubbingPipeline:
|
|
|
329
280
|
)
|
|
330
281
|
return None
|
|
331
282
|
|
|
332
|
-
|
|
333
|
-
dubbed_audio.save(self._cache.reserve_tts_path(src_hash_for_tts, tts_cache_key))
|
|
334
|
-
return dubbed_audio
|
|
335
|
-
|
|
336
|
-
def _translate_with_cache(
|
|
283
|
+
def _translate(
|
|
337
284
|
self,
|
|
338
285
|
transcription: Transcription,
|
|
339
|
-
source_audio: Audio,
|
|
340
286
|
source_lang: str,
|
|
341
287
|
target_lang: str,
|
|
342
288
|
report_progress: Callable[[str, float], None],
|
|
343
289
|
) -> tuple[list[TranslatedSegment], list[int]]:
|
|
344
|
-
"""
|
|
290
|
+
"""Translate the transcription's segments into ``target_lang``.
|
|
345
291
|
|
|
346
|
-
Returns ``(translated_segments, translation_failures)``.
|
|
347
|
-
fully-successful translations are cached — partial Qwen failures
|
|
348
|
-
would otherwise lock in an incomplete dub across runs. The
|
|
292
|
+
Returns ``(translated_segments, translation_failures)``. The
|
|
349
293
|
progress callback maps the backend's [0, 1] fraction onto the
|
|
350
294
|
pipeline's translation window (0.35 → 0.50).
|
|
351
295
|
"""
|
|
352
|
-
from videopython.ai.dubbing.models import TranslatedSegment
|
|
353
|
-
|
|
354
|
-
cache_key: str | None = None
|
|
355
|
-
if self._cache is not None:
|
|
356
|
-
cache_key = DubCache.translation_key(
|
|
357
|
-
source_lang=source_lang,
|
|
358
|
-
target_lang=target_lang,
|
|
359
|
-
translator_class=self._resolved_translator_class_name(source_lang, target_lang),
|
|
360
|
-
)
|
|
361
|
-
cached = self._cache.get_translation(DubCache.source_key(source_audio), cache_key)
|
|
362
|
-
if cached is not None:
|
|
363
|
-
return [TranslatedSegment.from_dict(d) for d in cached], []
|
|
364
|
-
|
|
365
296
|
if self._translator is None:
|
|
366
297
|
self._init_translator(source_lang=source_lang, target_lang=target_lang)
|
|
367
298
|
|
|
@@ -384,31 +315,8 @@ class LocalDubbingPipeline:
|
|
|
384
315
|
translation_failures = list(self._translator.translation_failures)
|
|
385
316
|
self._maybe_unload("_translator")
|
|
386
317
|
|
|
387
|
-
if self._cache is not None and cache_key is not None and not translation_failures:
|
|
388
|
-
self._cache.put_translation(
|
|
389
|
-
DubCache.source_key(source_audio),
|
|
390
|
-
cache_key,
|
|
391
|
-
[s.to_dict() for s in translated_segments],
|
|
392
|
-
)
|
|
393
|
-
|
|
394
318
|
return translated_segments, translation_failures
|
|
395
319
|
|
|
396
|
-
def _transcription_cache_keys(self, source_audio: Audio, enable_diarization: bool = False) -> tuple[str, str]:
|
|
397
|
-
"""Return ``(src_hash, kwargs_hash)`` for the current transcription config.
|
|
398
|
-
|
|
399
|
-
Centralizes the kwarg list so the cache lookup, the put, and any
|
|
400
|
-
future invalidator agree on what's hashed.
|
|
401
|
-
"""
|
|
402
|
-
src_hash = DubCache.source_key(source_audio)
|
|
403
|
-
kwargs_hash = DubCache.transcription_kwargs_hash(
|
|
404
|
-
whisper_model=self.whisper_model,
|
|
405
|
-
enable_diarization=enable_diarization,
|
|
406
|
-
condition_on_previous_text=self.condition_on_previous_text,
|
|
407
|
-
no_speech_threshold=self.no_speech_threshold,
|
|
408
|
-
logprob_threshold=self.logprob_threshold,
|
|
409
|
-
)
|
|
410
|
-
return src_hash, kwargs_hash
|
|
411
|
-
|
|
412
320
|
def _init_transcriber(self, enable_diarization: bool = False) -> None:
|
|
413
321
|
"""Initialize the transcription model."""
|
|
414
322
|
from videopython.ai.understanding.audio import AudioToText
|
|
@@ -420,6 +328,7 @@ class LocalDubbingPipeline:
|
|
|
420
328
|
condition_on_previous_text=self.condition_on_previous_text,
|
|
421
329
|
no_speech_threshold=self.no_speech_threshold,
|
|
422
330
|
logprob_threshold=self.logprob_threshold,
|
|
331
|
+
vocabulary=self.vocabulary,
|
|
423
332
|
)
|
|
424
333
|
|
|
425
334
|
def _init_translator(self, source_lang: str, target_lang: str) -> None:
|
|
@@ -439,31 +348,6 @@ class LocalDubbingPipeline:
|
|
|
439
348
|
else: # "auto"
|
|
440
349
|
self._translator = self._resolve_translator_auto(source_lang, target_lang)
|
|
441
350
|
|
|
442
|
-
def _resolved_translator_class_name(self, source_lang: str, target_lang: str) -> str:
|
|
443
|
-
"""Return the *class name* of the translator that ``_init_translator``
|
|
444
|
-
would pick — without constructing one.
|
|
445
|
-
|
|
446
|
-
Used by the cache to key translations on the resolved backend rather
|
|
447
|
-
than the user-supplied ``"auto"``: a CPU run that resolves to Marian
|
|
448
|
-
must not collide with a GPU run that resolves to Qwen.
|
|
449
|
-
"""
|
|
450
|
-
if self.translator == "marian":
|
|
451
|
-
return "MarianTranslator"
|
|
452
|
-
if self.translator == "qwen3":
|
|
453
|
-
return "Qwen3Translator"
|
|
454
|
-
# auto — mirror _resolve_translator_auto's branching, no construction.
|
|
455
|
-
device = select_device(self.device, mps_allowed=True)
|
|
456
|
-
has_gpu = device in ("cuda", "mps")
|
|
457
|
-
if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
|
|
458
|
-
return "Qwen3Translator"
|
|
459
|
-
if MarianTranslator.has_model_for(source_lang, target_lang):
|
|
460
|
-
return "MarianTranslator"
|
|
461
|
-
if Qwen3Translator.supports(source_lang, target_lang):
|
|
462
|
-
return "Qwen3Translator"
|
|
463
|
-
# No backend supports the pair — _init_translator will raise. We
|
|
464
|
-
# return a sentinel; the cache miss path will pay that cost.
|
|
465
|
-
return "Unsupported"
|
|
466
|
-
|
|
467
351
|
def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
|
|
468
352
|
"""Pick a backend based on language coverage AND device.
|
|
469
353
|
|
|
@@ -723,7 +607,7 @@ class LocalDubbingPipeline:
|
|
|
723
607
|
)
|
|
724
608
|
else:
|
|
725
609
|
report_progress("Transcribing audio", 0.05)
|
|
726
|
-
transcription = self._transcribe_with_cache(source_audio, enable_diarization)
|
|
610
|
+
transcription = self._transcribe(source_audio, enable_diarization)
|
|
727
611
|
|
|
728
612
|
if not transcription.segments:
|
|
729
613
|
return DubbingResult(
|
|
@@ -791,8 +675,8 @@ class LocalDubbingPipeline:
|
|
|
791
675
|
voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
|
|
792
676
|
|
|
793
677
|
report_progress("Translating text", 0.35)
|
|
794
|
-
translated_segments, translation_failures = self._translate_with_cache(
|
|
795
|
-
transcription,
|
|
678
|
+
translated_segments, translation_failures = self._translate(
|
|
679
|
+
transcription, detected_lang, target_lang, report_progress
|
|
796
680
|
)
|
|
797
681
|
|
|
798
682
|
# Per-segment expressiveness derived from source vocals RMS.
|
|
@@ -818,21 +702,12 @@ class LocalDubbingPipeline:
|
|
|
818
702
|
|
|
819
703
|
report_progress("Generating dubbed speech", 0.50)
|
|
820
704
|
|
|
821
|
-
# Per-speaker voice-sample bytes for TTS cache key. Empty when
|
|
822
|
-
# voice_clone=False — the cache key still differentiates "no voice
|
|
823
|
-
# sample" from "specific clone" via the None path.
|
|
824
|
-
voice_sample_bytes: dict[str, bytes] = (
|
|
825
|
-
{speaker: sample.data.tobytes() for speaker, sample in voice_samples.items()} if voice_clone else {}
|
|
826
|
-
)
|
|
827
|
-
src_hash_for_tts = DubCache.source_key(source_audio) if self._cache is not None else ""
|
|
828
|
-
|
|
829
705
|
dubbed_segments: list[Audio] = []
|
|
830
706
|
target_durations: list[float] = []
|
|
831
707
|
start_times: list[float] = []
|
|
832
708
|
|
|
833
|
-
# Per-speaker temp WAVs are materialized lazily by _tts_segment_audio
|
|
834
|
-
#
|
|
835
|
-
# state so the finally block can clean up regardless of cache outcome.
|
|
709
|
+
# Per-speaker temp WAVs are materialized lazily by _tts_segment_audio.
|
|
710
|
+
# The dict is loop-scoped state so the finally block can clean up.
|
|
836
711
|
speaker_wav_paths: dict[str, Path] = {}
|
|
837
712
|
try:
|
|
838
713
|
for i, segment in enumerate(translated_segments):
|
|
@@ -852,12 +727,10 @@ class LocalDubbingPipeline:
|
|
|
852
727
|
dubbed_audio = self._tts_segment_audio(
|
|
853
728
|
segment=segment,
|
|
854
729
|
speaker=speaker,
|
|
855
|
-
speaker_bytes=voice_sample_bytes.get(speaker),
|
|
856
730
|
target_lang=target_lang,
|
|
857
731
|
voice_clone=voice_clone,
|
|
858
732
|
voice_samples=voice_samples,
|
|
859
733
|
speaker_wav_paths=speaker_wav_paths,
|
|
860
|
-
src_hash_for_tts=src_hash_for_tts,
|
|
861
734
|
expressiveness=expressiveness_per_segment[i],
|
|
862
735
|
)
|
|
863
736
|
if dubbed_audio is None:
|
|
@@ -54,7 +54,6 @@ class FaceTrackingCrop(Transformation):
|
|
|
54
54
|
vertical_offset: float = -0.1,
|
|
55
55
|
framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = "offset",
|
|
56
56
|
headroom: float = 0.15,
|
|
57
|
-
lead_room: float = 0.1,
|
|
58
57
|
smoothing: float = 0.8,
|
|
59
58
|
max_speed: float | None = None,
|
|
60
59
|
fallback: Literal["center", "last_position", "full_frame"] = "last_position",
|
|
@@ -77,7 +76,6 @@ class FaceTrackingCrop(Transformation):
|
|
|
77
76
|
- "thirds": Place face near the upper-third line.
|
|
78
77
|
- "dynamic": Currently same as "headroom".
|
|
79
78
|
headroom: Headroom amount for framing rules that use it.
|
|
80
|
-
lead_room: Reserved for future motion/look-direction framing.
|
|
81
79
|
smoothing: Position smoothing factor (0-1, higher = smoother).
|
|
82
80
|
max_speed: Optional max camera movement per frame (normalized).
|
|
83
81
|
fallback: Behavior when no face detected.
|
|
@@ -92,7 +90,6 @@ class FaceTrackingCrop(Transformation):
|
|
|
92
90
|
self.vertical_offset = vertical_offset
|
|
93
91
|
self.framing_rule = framing_rule
|
|
94
92
|
self.headroom = headroom
|
|
95
|
-
self.lead_room = lead_room
|
|
96
93
|
self.smoothing = smoothing
|
|
97
94
|
self.max_speed = max_speed
|
|
98
95
|
self.fallback = fallback
|
|
@@ -238,10 +235,15 @@ class FaceTrackingCrop(Transformation):
|
|
|
238
235
|
current_position = (0.5, 0.5)
|
|
239
236
|
|
|
240
237
|
framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
|
|
241
|
-
|
|
242
|
-
"Face tracking crop: "
|
|
243
|
-
|
|
244
|
-
|
|
238
|
+
logger.info(
|
|
239
|
+
"Face tracking crop: %dx%d -> %dx%d (%d:%d, framing=%s)",
|
|
240
|
+
w,
|
|
241
|
+
h,
|
|
242
|
+
out_w,
|
|
243
|
+
out_h,
|
|
244
|
+
self.target_aspect[0],
|
|
245
|
+
self.target_aspect[1],
|
|
246
|
+
framing_label,
|
|
245
247
|
)
|
|
246
248
|
|
|
247
249
|
new_frames = []
|
|
@@ -448,7 +450,7 @@ class SplitScreenComposite(Transformation):
|
|
|
448
450
|
for _ in range(len(cell_rects))
|
|
449
451
|
]
|
|
450
452
|
|
|
451
|
-
|
|
453
|
+
logger.info("Creating %s split screen: %dx%d", self.layout, out_w, out_h)
|
|
452
454
|
|
|
453
455
|
new_frames = []
|
|
454
456
|
for i in tqdm(range(n_frames), desc="Split screen composite"):
|