videopython 0.28.1__tar.gz → 0.28.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.28.1 → videopython-0.28.2}/PKG-INFO +2 -1
- {videopython-0.28.1 → videopython-0.28.2}/pyproject.toml +6 -1
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/__init__.py +3 -0
- videopython-0.28.2/src/videopython/ai/dubbing/cache.py +296 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/dubber.py +16 -1
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/models.py +25 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/pipeline.py +302 -73
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/remux.py +37 -15
- {videopython-0.28.1 → videopython-0.28.2}/.gitignore +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/LICENSE +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/README.md +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/_device.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/registry.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/combine.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/description.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/effects.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/progress.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/registry.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/scene.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/streaming.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/transforms.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/transitions.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/utils.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/video.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.28.1 → videopython-0.28.2}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.28.1
|
|
3
|
+
Version: 0.28.2
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -34,6 +34,7 @@ Requires-Dist: numba>=0.61.0; extra == 'ai'
|
|
|
34
34
|
Requires-Dist: ollama>=0.4.5; extra == 'ai'
|
|
35
35
|
Requires-Dist: openai-whisper>=20240930; extra == 'ai'
|
|
36
36
|
Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
|
|
37
|
+
Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
|
|
37
38
|
Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
|
|
38
39
|
Requires-Dist: scipy>=1.10.0; extra == 'ai'
|
|
39
40
|
Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.28.1"
|
|
3
|
+
version = "0.28.2"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -82,6 +82,8 @@ ai = [
|
|
|
82
82
|
"demucs>=4.0.0",
|
|
83
83
|
# Translation backend: Qwen3 GGUF inference (M2)
|
|
84
84
|
"llama-cpp-python>=0.3.0",
|
|
85
|
+
# Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
|
|
86
|
+
"pyloudnorm>=0.1.1",
|
|
85
87
|
]
|
|
86
88
|
|
|
87
89
|
# Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
|
|
@@ -115,6 +117,8 @@ ai = [
|
|
|
115
117
|
"demucs>=4.0.0",
|
|
116
118
|
# Translation backend: Qwen3 GGUF inference (M2)
|
|
117
119
|
"llama-cpp-python>=0.3.0",
|
|
120
|
+
# Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
|
|
121
|
+
"pyloudnorm>=0.1.1",
|
|
118
122
|
]
|
|
119
123
|
|
|
120
124
|
[project.urls]
|
|
@@ -141,6 +145,7 @@ module = [
|
|
|
141
145
|
"silero_vad", "silero_vad.*",
|
|
142
146
|
"cv2", "cv2.*",
|
|
143
147
|
"llama_cpp", "llama_cpp.*",
|
|
148
|
+
"pyloudnorm", "pyloudnorm.*",
|
|
144
149
|
]
|
|
145
150
|
ignore_missing_imports = true
|
|
146
151
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Local video dubbing functionality."""
|
|
2
2
|
|
|
3
|
+
from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
|
|
3
4
|
from videopython.ai.dubbing.dubber import VideoDubber
|
|
4
5
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
|
|
5
6
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
@@ -19,4 +20,6 @@ __all__ = [
|
|
|
19
20
|
"TranscriptQuality",
|
|
20
21
|
"assess_transcript",
|
|
21
22
|
"UnsupportedLanguageError",
|
|
23
|
+
"DubCache",
|
|
24
|
+
"dub_cache_clear",
|
|
22
25
|
]
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Filesystem-backed cache for resumable dubbing runs.
|
|
2
|
+
|
|
3
|
+
Without a cache, a long dub that crashes at TTS segment 312/400 re-runs Whisper,
|
|
4
|
+
Demucs, translation, and the first 311 TTS segments from scratch.
|
|
5
|
+
:class:`DubCache` stores three artifacts so subsequent runs skip stages
|
|
6
|
+
whose inputs match:
|
|
7
|
+
|
|
8
|
+
- ``transcription.json`` — output of ``AudioToText.transcribe``.
|
|
9
|
+
- ``translation_<key>.json`` — output of ``TranslationBackend.translate_segments``.
|
|
10
|
+
- ``tts/<key>.wav`` — per-segment TTS WAV.
|
|
11
|
+
|
|
12
|
+
Cache directories are opt-in via ``VideoDubber(cache_dir=...)`` / ``LocalDubbingPipeline(cache_dir=...)``.
|
|
13
|
+
``cache_dir=None`` (default) is a no-op pass-through.
|
|
14
|
+
|
|
15
|
+
Hash inputs are conservative — false misses (re-run a stage) are cheap;
|
|
16
|
+
false hits (deliver a stale dub) are bugs. Source-audio identity uses a
|
|
17
|
+
sha256 of the raw float32 bytes, not file path, so re-encoding the same
|
|
18
|
+
content invalidates correctly.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import hashlib
|
|
24
|
+
import json
|
|
25
|
+
import logging
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import TYPE_CHECKING, Any
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from videopython.base.audio import Audio
|
|
32
|
+
from videopython.base.text.transcription import Transcription
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Cache schema version. Bump on incompatible changes to any artifact's
|
|
38
|
+
# on-disk format (e.g. TranscriptionSegment field changes that break
|
|
39
|
+
# from_dict). Mismatched cache entries are treated as a miss.
|
|
40
|
+
SCHEMA_VERSION = 1
|
|
41
|
+
|
|
42
|
+
# Reserved for M4.3 per-speaker voice library. M3.2 does not write here;
|
|
43
|
+
# documented so future code knows the path is taken.
|
|
44
|
+
_VOICE_CLONES_SUBDIR = "voice_clones"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
|
|
48
|
+
class _ArtifactPaths:
|
|
49
|
+
"""Resolved paths for a single source's cache directory."""
|
|
50
|
+
|
|
51
|
+
src_dir: Path
|
|
52
|
+
metadata: Path
|
|
53
|
+
transcription: Path
|
|
54
|
+
tts_dir: Path
|
|
55
|
+
|
|
56
|
+
def translation_path(self, lang_key: str) -> Path:
|
|
57
|
+
return self.src_dir / f"translation_{lang_key}.json"
|
|
58
|
+
|
|
59
|
+
def tts_path(self, seg_key: str) -> Path:
|
|
60
|
+
return self.tts_dir / f"{seg_key}.wav"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _stable_hash(*parts: str | int | float | bool | None) -> str:
|
|
64
|
+
"""Short hex digest over a tuple of primitive values.
|
|
65
|
+
|
|
66
|
+
Stable across runs — uses ``repr(part)`` so int/float/bool/None all
|
|
67
|
+
serialize deterministically. 16 hex chars (64 bits) is plenty of
|
|
68
|
+
space for the small cardinality we're hashing into.
|
|
69
|
+
"""
|
|
70
|
+
h = hashlib.sha256()
|
|
71
|
+
for part in parts:
|
|
72
|
+
h.update(repr(part).encode("utf-8"))
|
|
73
|
+
h.update(b"\x00")
|
|
74
|
+
return h.hexdigest()[:16]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _audio_bytes_hash(audio: Audio) -> str:
|
|
78
|
+
"""sha256 over the raw audio data buffer.
|
|
79
|
+
|
|
80
|
+
Used as the per-source cache directory name. Bytes-level so re-encoded
|
|
81
|
+
sources (different container, same content) collide intentionally only
|
|
82
|
+
when the decoded float32 buffer matches.
|
|
83
|
+
"""
|
|
84
|
+
h = hashlib.sha256()
|
|
85
|
+
h.update(audio.data.tobytes())
|
|
86
|
+
return h.hexdigest()[:16]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class DubCache:
|
|
90
|
+
"""Filesystem cache for transcription, translation, and TTS artifacts.
|
|
91
|
+
|
|
92
|
+
Layout under ``root``::
|
|
93
|
+
|
|
94
|
+
<root>/<src_hash>/
|
|
95
|
+
metadata.json # schema version + hash inputs
|
|
96
|
+
transcription.json # populated on transcription cache miss
|
|
97
|
+
translation_<lang_key>.json
|
|
98
|
+
tts/<seg_key>.wav
|
|
99
|
+
voice_clones/ # reserved for M4.3, not written here
|
|
100
|
+
|
|
101
|
+
All getters return ``None`` on miss. Putters are idempotent
|
|
102
|
+
(overwrite). Schema-version mismatch is treated as a miss for every
|
|
103
|
+
artifact under that source.
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
def __init__(self, root: str | Path) -> None:
|
|
107
|
+
self.root = Path(root)
|
|
108
|
+
self.root.mkdir(parents=True, exist_ok=True)
|
|
109
|
+
|
|
110
|
+
# ----- key derivation --------------------------------------------------
|
|
111
|
+
|
|
112
|
+
@staticmethod
|
|
113
|
+
def source_key(audio: Audio) -> str:
|
|
114
|
+
"""Per-source identifier — sha256 of the raw audio buffer.
|
|
115
|
+
|
|
116
|
+
This is the directory name under ``root``; one dir per distinct
|
|
117
|
+
source, regardless of which stage's kwargs vary.
|
|
118
|
+
"""
|
|
119
|
+
return _audio_bytes_hash(audio)
|
|
120
|
+
|
|
121
|
+
@staticmethod
|
|
122
|
+
def transcription_kwargs_hash(
|
|
123
|
+
*,
|
|
124
|
+
whisper_model: str,
|
|
125
|
+
enable_diarization: bool,
|
|
126
|
+
condition_on_previous_text: bool,
|
|
127
|
+
no_speech_threshold: float,
|
|
128
|
+
logprob_threshold: float | None,
|
|
129
|
+
) -> str:
|
|
130
|
+
return _stable_hash(
|
|
131
|
+
whisper_model,
|
|
132
|
+
enable_diarization,
|
|
133
|
+
condition_on_previous_text,
|
|
134
|
+
no_speech_threshold,
|
|
135
|
+
logprob_threshold,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
@staticmethod
|
|
139
|
+
def translation_key(
|
|
140
|
+
*,
|
|
141
|
+
source_lang: str,
|
|
142
|
+
target_lang: str,
|
|
143
|
+
translator_class: str,
|
|
144
|
+
) -> str:
|
|
145
|
+
"""Hash captures the source/target pair + the resolved backend class.
|
|
146
|
+
|
|
147
|
+
``translator_class`` is the *resolved* class name (e.g. ``"MarianTranslator"``),
|
|
148
|
+
not the user-supplied ``"auto"`` — a CPU run that resolves to Marian
|
|
149
|
+
must not collide with a GPU run that resolves to Qwen on the same
|
|
150
|
+
language pair.
|
|
151
|
+
"""
|
|
152
|
+
return _stable_hash(source_lang, target_lang, translator_class)
|
|
153
|
+
|
|
154
|
+
@staticmethod
|
|
155
|
+
def tts_key(
|
|
156
|
+
*,
|
|
157
|
+
translated_text: str,
|
|
158
|
+
voice_sample_bytes: bytes | None,
|
|
159
|
+
language: str,
|
|
160
|
+
) -> str:
|
|
161
|
+
"""Per-segment key over text + voice sample + language."""
|
|
162
|
+
h = hashlib.sha256()
|
|
163
|
+
h.update(translated_text.encode("utf-8"))
|
|
164
|
+
h.update(b"\x00")
|
|
165
|
+
h.update(voice_sample_bytes or b"")
|
|
166
|
+
h.update(b"\x00")
|
|
167
|
+
h.update(language.encode("utf-8"))
|
|
168
|
+
return h.hexdigest()[:16]
|
|
169
|
+
|
|
170
|
+
# ----- path resolution -------------------------------------------------
|
|
171
|
+
|
|
172
|
+
def _paths_for(self, src_hash: str) -> _ArtifactPaths:
|
|
173
|
+
src_dir = self.root / src_hash
|
|
174
|
+
return _ArtifactPaths(
|
|
175
|
+
src_dir=src_dir,
|
|
176
|
+
metadata=src_dir / "metadata.json",
|
|
177
|
+
transcription=src_dir / "transcription.json",
|
|
178
|
+
tts_dir=src_dir / "tts",
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
def _ensure_metadata(self, paths: _ArtifactPaths, hash_inputs: dict[str, Any]) -> None:
|
|
182
|
+
"""Create the source dir + metadata.json if missing.
|
|
183
|
+
|
|
184
|
+
``hash_inputs`` records the kwargs we hashed against so a future
|
|
185
|
+
schema change can audit cache entries. The schema field is
|
|
186
|
+
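load-bearing: mismatched versions invalidate the entire source dir; an existing metadata.json is never rewritten here, so a stale dir keeps missing until it is cleared.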
|
|
187
|
+
"""
|
|
188
|
+
paths.src_dir.mkdir(parents=True, exist_ok=True)
|
|
189
|
+
paths.tts_dir.mkdir(parents=True, exist_ok=True)
|
|
190
|
+
if not paths.metadata.exists():
|
|
191
|
+
paths.metadata.write_text(
|
|
192
|
+
json.dumps(
|
|
193
|
+
{"schema": SCHEMA_VERSION, "hash_inputs": hash_inputs},
|
|
194
|
+
indent=2,
|
|
195
|
+
),
|
|
196
|
+
encoding="utf-8",
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
def _schema_ok(self, paths: _ArtifactPaths) -> bool:
|
|
200
|
+
if not paths.metadata.exists():
|
|
201
|
+
return True # fresh dir; we'll write metadata on first put.
|
|
202
|
+
try:
|
|
203
|
+
data = json.loads(paths.metadata.read_text(encoding="utf-8"))
|
|
204
|
+
except (OSError, json.JSONDecodeError):
|
|
205
|
+
return False
|
|
206
|
+
return data.get("schema") == SCHEMA_VERSION
|
|
207
|
+
|
|
208
|
+
# ----- transcription ---------------------------------------------------
|
|
209
|
+
|
|
210
|
+
def get_transcription(self, src_hash: str, kwargs_hash: str) -> Transcription | None:
|
|
211
|
+
from videopython.base.text.transcription import Transcription
|
|
212
|
+
|
|
213
|
+
paths = self._paths_for(src_hash)
|
|
214
|
+
if not paths.transcription.exists() or not self._schema_ok(paths):
|
|
215
|
+
return None
|
|
216
|
+
try:
|
|
217
|
+
data = json.loads(paths.transcription.read_text(encoding="utf-8"))
|
|
218
|
+
except (OSError, json.JSONDecodeError):
|
|
219
|
+
return None
|
|
220
|
+
if data.get("kwargs_hash") != kwargs_hash:
|
|
221
|
+
return None
|
|
222
|
+
logger.info("cache hit: transcription (%s)", src_hash)
|
|
223
|
+
return Transcription.from_dict(data["transcription"])
|
|
224
|
+
|
|
225
|
+
def put_transcription(
|
|
226
|
+
self,
|
|
227
|
+
src_hash: str,
|
|
228
|
+
kwargs_hash: str,
|
|
229
|
+
transcription: Transcription,
|
|
230
|
+
hash_inputs: dict[str, Any],
|
|
231
|
+
) -> None:
|
|
232
|
+
paths = self._paths_for(src_hash)
|
|
233
|
+
self._ensure_metadata(paths, hash_inputs)
|
|
234
|
+
paths.transcription.write_text(
|
|
235
|
+
json.dumps(
|
|
236
|
+
{"kwargs_hash": kwargs_hash, "transcription": transcription.to_dict()},
|
|
237
|
+
ensure_ascii=False,
|
|
238
|
+
),
|
|
239
|
+
encoding="utf-8",
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
# ----- translation -----------------------------------------------------
|
|
243
|
+
|
|
244
|
+
def get_translation(self, src_hash: str, lang_key: str) -> list[dict[str, Any]] | None:
|
|
245
|
+
paths = self._paths_for(src_hash)
|
|
246
|
+
if not self._schema_ok(paths):
|
|
247
|
+
return None
|
|
248
|
+
path = paths.translation_path(lang_key)
|
|
249
|
+
if not path.exists():
|
|
250
|
+
return None
|
|
251
|
+
try:
|
|
252
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
253
|
+
except (OSError, json.JSONDecodeError):
|
|
254
|
+
return None
|
|
255
|
+
logger.info("cache hit: translation (%s/%s)", src_hash, lang_key)
|
|
256
|
+
return data["segments"]
|
|
257
|
+
|
|
258
|
+
def put_translation(self, src_hash: str, lang_key: str, segments_dict: list[dict[str, Any]]) -> None:
|
|
259
|
+
paths = self._paths_for(src_hash)
|
|
260
|
+
self._ensure_metadata(paths, {})
|
|
261
|
+
paths.translation_path(lang_key).write_text(
|
|
262
|
+
json.dumps({"segments": segments_dict}, ensure_ascii=False),
|
|
263
|
+
encoding="utf-8",
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
# ----- tts -------------------------------------------------------------
|
|
267
|
+
|
|
268
|
+
def get_tts_path(self, src_hash: str, seg_key: str) -> Path | None:
|
|
269
|
+
paths = self._paths_for(src_hash)
|
|
270
|
+
if not self._schema_ok(paths):
|
|
271
|
+
return None
|
|
272
|
+
path = paths.tts_path(seg_key)
|
|
273
|
+
return path if path.exists() else None
|
|
274
|
+
|
|
275
|
+
def reserve_tts_path(self, src_hash: str, seg_key: str) -> Path:
|
|
276
|
+
"""Return the path TTS output should be written to. Caller is
|
|
277
|
+
responsible for the actual write (Audio.save)."""
|
|
278
|
+
paths = self._paths_for(src_hash)
|
|
279
|
+
self._ensure_metadata(paths, {})
|
|
280
|
+
return paths.tts_path(seg_key)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def dub_cache_clear(cache_dir: str | Path, src_hash: str | None = None) -> None:
|
|
284
|
+
"""Delete cache entries for a specific source or the whole cache root.
|
|
285
|
+
|
|
286
|
+
No auto-eviction in M3.2 — call this to reclaim disk space when a
|
|
287
|
+
cache directory has grown unwieldy. Safe no-op if ``cache_dir`` or
|
|
288
|
+
``cache_dir/<src_hash>`` does not exist.
|
|
289
|
+
"""
|
|
290
|
+
import shutil
|
|
291
|
+
|
|
292
|
+
root = Path(cache_dir)
|
|
293
|
+
target = root / src_hash if src_hash else root
|
|
294
|
+
if target.exists():
|
|
295
|
+
shutil.rmtree(target)
|
|
296
|
+
logger.info("dub_cache_clear: removed %s", target)
|
|
@@ -50,6 +50,13 @@ class VideoDubber:
|
|
|
50
50
|
See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
|
|
51
51
|
for tradeoffs (Qwen3 is slower on CPU but produces
|
|
52
52
|
context-aware, length-budgeted output).
|
|
53
|
+
cache_dir: When set, persist transcription, translated segments,
|
|
54
|
+
and per-segment TTS WAVs under this directory and skip stages
|
|
55
|
+
whose inputs already match a cache entry. Use to resume crashed
|
|
56
|
+
long runs or to iterate on dub configuration without paying
|
|
57
|
+
transcription cost each time. ``None`` (default) disables
|
|
58
|
+
caching. Cache grows unbounded; clear via
|
|
59
|
+
:func:`videopython.ai.dubbing.cache.dub_cache_clear`.
|
|
53
60
|
"""
|
|
54
61
|
|
|
55
62
|
def __init__(
|
|
@@ -62,6 +69,7 @@ class VideoDubber:
|
|
|
62
69
|
logprob_threshold: float | None = -1.0,
|
|
63
70
|
strict_quality: bool = False,
|
|
64
71
|
translator: TranslatorChoice = "auto",
|
|
72
|
+
cache_dir: str | Path | None = None,
|
|
65
73
|
):
|
|
66
74
|
self.device = device
|
|
67
75
|
self.low_memory = low_memory
|
|
@@ -71,14 +79,16 @@ class VideoDubber:
|
|
|
71
79
|
self.logprob_threshold = logprob_threshold
|
|
72
80
|
self.strict_quality = strict_quality
|
|
73
81
|
self.translator = translator
|
|
82
|
+
self.cache_dir = cache_dir
|
|
74
83
|
self._local_pipeline: Any = None
|
|
75
84
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
76
85
|
logger.info(
|
|
77
|
-
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
86
|
+
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
|
|
78
87
|
requested,
|
|
79
88
|
low_memory,
|
|
80
89
|
whisper_model,
|
|
81
90
|
translator,
|
|
91
|
+
cache_dir,
|
|
82
92
|
)
|
|
83
93
|
|
|
84
94
|
def _init_local_pipeline(self) -> None:
|
|
@@ -93,6 +103,7 @@ class VideoDubber:
|
|
|
93
103
|
logprob_threshold=self.logprob_threshold,
|
|
94
104
|
strict_quality=self.strict_quality,
|
|
95
105
|
translator=self.translator,
|
|
106
|
+
cache_dir=self.cache_dir,
|
|
96
107
|
)
|
|
97
108
|
|
|
98
109
|
def dub(
|
|
@@ -175,6 +186,7 @@ class VideoDubber:
|
|
|
175
186
|
enable_diarization: bool = False,
|
|
176
187
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
177
188
|
transcription: Any = None,
|
|
189
|
+
keep_original_audio: bool = False,
|
|
178
190
|
) -> DubbingResult:
|
|
179
191
|
"""Dub a video file in place on disk without loading video frames into memory.
|
|
180
192
|
|
|
@@ -201,6 +213,8 @@ class VideoDubber:
|
|
|
201
213
|
step. Speaker labels on the supplied transcription drive per-speaker
|
|
202
214
|
voice cloning. If it has no speakers, pass ``enable_diarization=True``
|
|
203
215
|
to add them via pyannote (requires word-level timings).
|
|
216
|
+
keep_original_audio: If True, retain the source audio in the output
|
|
217
|
+
as a secondary track behind the dubbed one (editorial A/B).
|
|
204
218
|
|
|
205
219
|
Returns:
|
|
206
220
|
``DubbingResult`` with the dubbed audio, translated segments, and
|
|
@@ -239,6 +253,7 @@ class VideoDubber:
|
|
|
239
253
|
video_path=input_path,
|
|
240
254
|
audio=result.dubbed_audio,
|
|
241
255
|
output_path=output_path,
|
|
256
|
+
keep_original_audio=keep_original_audio,
|
|
242
257
|
)
|
|
243
258
|
|
|
244
259
|
return result
|
|
@@ -59,6 +59,31 @@ class TranslatedSegment:
|
|
|
59
59
|
"""Duration of the segment in seconds."""
|
|
60
60
|
return self.end - self.start
|
|
61
61
|
|
|
62
|
+
def to_dict(self) -> dict[str, Any]:
|
|
63
|
+
"""Convert to dictionary for JSON serialization (used by the dub cache)."""
|
|
64
|
+
return {
|
|
65
|
+
"original_segment": self.original_segment.to_dict(),
|
|
66
|
+
"translated_text": self.translated_text,
|
|
67
|
+
"source_lang": self.source_lang,
|
|
68
|
+
"target_lang": self.target_lang,
|
|
69
|
+
"speaker": self.speaker,
|
|
70
|
+
"start": self.start,
|
|
71
|
+
"end": self.end,
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
@classmethod
|
|
75
|
+
def from_dict(cls, data: dict[str, Any]) -> TranslatedSegment:
|
|
76
|
+
"""Reconstruct from a dict produced by :meth:`to_dict`."""
|
|
77
|
+
return cls(
|
|
78
|
+
original_segment=TranscriptionSegment.from_dict(data["original_segment"]),
|
|
79
|
+
translated_text=data["translated_text"],
|
|
80
|
+
source_lang=data["source_lang"],
|
|
81
|
+
target_lang=data["target_lang"],
|
|
82
|
+
speaker=data.get("speaker"),
|
|
83
|
+
start=data.get("start", 0.0),
|
|
84
|
+
end=data.get("end", 0.0),
|
|
85
|
+
)
|
|
86
|
+
|
|
62
87
|
|
|
63
88
|
@dataclass
|
|
64
89
|
class SeparatedAudio:
|
|
@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
12
|
from videopython.ai._device import select_device
|
|
13
|
+
from videopython.ai.dubbing.cache import DubCache
|
|
13
14
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
|
|
14
15
|
from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
|
|
15
16
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
@@ -21,22 +22,25 @@ from videopython.ai.generation.translation import (
|
|
|
21
22
|
)
|
|
22
23
|
|
|
23
24
|
if TYPE_CHECKING:
|
|
25
|
+
from videopython.ai.dubbing.models import TranslatedSegment
|
|
24
26
|
from videopython.base.audio import Audio
|
|
27
|
+
from videopython.base.text.transcription import Transcription
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
TranslatorChoice = Literal["auto", "marian", "qwen3"]
|
|
28
31
|
|
|
29
32
|
|
|
33
|
+
# BS.1770 integrated-loudness measurement requires at least 400 ms of audio
|
|
34
|
+
# (one gating block). Below this, fall back to peak match — pyloudnorm
|
|
35
|
+
# returns -inf or warns, neither of which gives a usable gain.
|
|
36
|
+
_LUFS_MIN_DURATION_SECONDS = 0.4
|
|
37
|
+
|
|
38
|
+
|
|
30
39
|
def _peak_match(target: Audio, reference: Audio) -> Audio:
|
|
31
40
|
"""Scale ``target`` so its peak amplitude matches ``reference``.
|
|
32
41
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
typically lands quieter than the source — perceptually "thinner."
|
|
36
|
-
A single peak match recovers most of that drift without LUFS deps.
|
|
37
|
-
|
|
38
|
-
No-op when either side has zero peak (silent input or all-silent dub).
|
|
39
|
-
The new ``Audio`` shares no buffer with ``target``.
|
|
42
|
+
Used as the fallback when LUFS measurement isn't viable (clip < 0.4s
|
|
43
|
+
or silent input). The new ``Audio`` shares no buffer with ``target``.
|
|
40
44
|
"""
|
|
41
45
|
from videopython.base.audio import Audio as _Audio
|
|
42
46
|
|
|
@@ -53,6 +57,55 @@ def _peak_match(target: Audio, reference: Audio) -> Audio:
|
|
|
53
57
|
return _Audio(target.data * scale, target.metadata)
|
|
54
58
|
|
|
55
59
|
|
|
60
|
+
def _loudness_match(target: Audio, reference: Audio) -> Audio:
|
|
61
|
+
"""Scale ``target`` so its integrated loudness (BS.1770 / LUFS) matches ``reference``.
|
|
62
|
+
|
|
63
|
+
Demucs background normalization and the timing-assembler peak guard
|
|
64
|
+
each clamp at 1.0 instead of restoring perceived loudness, so a
|
|
65
|
+
dubbed mix lands perceptually "thinner" than the source even after
|
|
66
|
+
peak match. LUFS captures the ear-weighted envelope that peak ratio
|
|
67
|
+
misses on dialogue-heavy material.
|
|
68
|
+
|
|
69
|
+
Falls back to :func:`_peak_match` when either clip is shorter than
|
|
70
|
+
the BS.1770 gating block (400 ms) or when measurement returns -inf
|
|
71
|
+
(silent or near-silent gated content). After gain is applied, peaks
|
|
72
|
+
are limited to 0.99 by rescaling the whole clip — BS.1770 has no peak ceiling and a sufficiently
|
|
73
|
+
quiet source can demand gain that would otherwise clip.
|
|
74
|
+
"""
|
|
75
|
+
from videopython.base.audio import Audio as _Audio
|
|
76
|
+
|
|
77
|
+
target_dur = target.metadata.duration_seconds
|
|
78
|
+
ref_dur = reference.metadata.duration_seconds
|
|
79
|
+
if target_dur < _LUFS_MIN_DURATION_SECONDS or ref_dur < _LUFS_MIN_DURATION_SECONDS:
|
|
80
|
+
return _peak_match(target, reference)
|
|
81
|
+
|
|
82
|
+
if not target.data.size or not reference.data.size:
|
|
83
|
+
return target
|
|
84
|
+
|
|
85
|
+
import pyloudnorm
|
|
86
|
+
|
|
87
|
+
target_lufs = pyloudnorm.Meter(target.metadata.sample_rate).integrated_loudness(target.data)
|
|
88
|
+
reference_lufs = pyloudnorm.Meter(reference.metadata.sample_rate).integrated_loudness(reference.data)
|
|
89
|
+
|
|
90
|
+
# Either clip's gated content was below -70 LUFS (effectively silent
|
|
91
|
+
# under BS.1770). Gain would be undefined — fall back to peak match,
|
|
92
|
+
# which has its own silent-input no-op.
|
|
93
|
+
if not np.isfinite(target_lufs) or not np.isfinite(reference_lufs):
|
|
94
|
+
return _peak_match(target, reference)
|
|
95
|
+
|
|
96
|
+
gain_db = reference_lufs - target_lufs
|
|
97
|
+
if abs(gain_db) < 0.1:
|
|
98
|
+
return target
|
|
99
|
+
scale = float(10 ** (gain_db / 20.0))
|
|
100
|
+
|
|
101
|
+
scaled = target.data * scale
|
|
102
|
+
peak = float(np.max(np.abs(scaled)))
|
|
103
|
+
if peak > 0.99:
|
|
104
|
+
scaled = scaled * (0.99 / peak)
|
|
105
|
+
|
|
106
|
+
return _Audio(scaled, target.metadata)
|
|
107
|
+
|
|
108
|
+
|
|
56
109
|
WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
|
|
57
110
|
|
|
58
111
|
logger = logging.getLogger(__name__)
|
|
@@ -85,6 +138,7 @@ class LocalDubbingPipeline:
|
|
|
85
138
|
logprob_threshold: float | None = -1.0,
|
|
86
139
|
strict_quality: bool = False,
|
|
87
140
|
translator: TranslatorChoice = "auto",
|
|
141
|
+
cache_dir: str | Path | None = None,
|
|
88
142
|
):
|
|
89
143
|
self.device = device
|
|
90
144
|
self.low_memory = low_memory
|
|
@@ -94,13 +148,15 @@ class LocalDubbingPipeline:
|
|
|
94
148
|
self.logprob_threshold = logprob_threshold
|
|
95
149
|
self.strict_quality = strict_quality
|
|
96
150
|
self.translator = translator
|
|
151
|
+
self.cache_dir = Path(cache_dir) if cache_dir is not None else None
|
|
97
152
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
98
153
|
logger.info(
|
|
99
|
-
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
154
|
+
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
|
|
100
155
|
requested,
|
|
101
156
|
low_memory,
|
|
102
157
|
whisper_model,
|
|
103
158
|
translator,
|
|
159
|
+
self.cache_dir,
|
|
104
160
|
)
|
|
105
161
|
|
|
106
162
|
self._transcriber: Any = None
|
|
@@ -110,6 +166,7 @@ class LocalDubbingPipeline:
|
|
|
110
166
|
self._tts_language: str | None = None
|
|
111
167
|
self._separator: Any = None
|
|
112
168
|
self._synchronizer: TimingSynchronizer | None = None
|
|
169
|
+
self._cache: DubCache | None = DubCache(self.cache_dir) if self.cache_dir is not None else None
|
|
113
170
|
|
|
114
171
|
def _maybe_unload(self, component_name: str) -> None:
|
|
115
172
|
"""Unload a stage's model when low_memory mode is enabled.
|
|
@@ -128,6 +185,188 @@ class LocalDubbingPipeline:
|
|
|
128
185
|
logger.info("low_memory: unloading %s", component_name.lstrip("_"))
|
|
129
186
|
unload()
|
|
130
187
|
|
|
188
|
+
def _transcribe_with_cache(
|
|
189
|
+
self,
|
|
190
|
+
source_audio: Audio,
|
|
191
|
+
enable_diarization: bool,
|
|
192
|
+
) -> Transcription:
|
|
193
|
+
"""Run transcription with cache-around-the-call.
|
|
194
|
+
|
|
195
|
+
Cache miss: lazy-init the transcriber, transcribe, store the
|
|
196
|
+
result (including all hashed kwargs in metadata.json so future
|
|
197
|
+
invalidators have provenance).
|
|
198
|
+
Cache hit: return the deserialized :class:`Transcription` without
|
|
199
|
+
touching Whisper/diarization at all.
|
|
200
|
+
"""
|
|
201
|
+
src_hash, kwargs_hash = self._transcription_cache_keys(source_audio, enable_diarization)
|
|
202
|
+
if self._cache is not None:
|
|
203
|
+
cached = self._cache.get_transcription(src_hash, kwargs_hash)
|
|
204
|
+
if cached is not None:
|
|
205
|
+
return cached
|
|
206
|
+
|
|
207
|
+
if self._transcriber is None or self._transcriber_diarization != enable_diarization:
|
|
208
|
+
self._init_transcriber(enable_diarization=enable_diarization)
|
|
209
|
+
self._transcriber_diarization = enable_diarization
|
|
210
|
+
|
|
211
|
+
transcription = self._transcriber.transcribe(source_audio)
|
|
212
|
+
self._maybe_unload("_transcriber")
|
|
213
|
+
|
|
214
|
+
if self._cache is not None:
|
|
215
|
+
self._cache.put_transcription(
|
|
216
|
+
src_hash,
|
|
217
|
+
kwargs_hash,
|
|
218
|
+
transcription,
|
|
219
|
+
hash_inputs={
|
|
220
|
+
"whisper_model": self.whisper_model,
|
|
221
|
+
"enable_diarization": enable_diarization,
|
|
222
|
+
"condition_on_previous_text": self.condition_on_previous_text,
|
|
223
|
+
"no_speech_threshold": self.no_speech_threshold,
|
|
224
|
+
"logprob_threshold": self.logprob_threshold,
|
|
225
|
+
},
|
|
226
|
+
)
|
|
227
|
+
return transcription
|
|
228
|
+
|
|
229
|
+
def _tts_segment_audio(
|
|
230
|
+
self,
|
|
231
|
+
segment: TranslatedSegment,
|
|
232
|
+
speaker: str,
|
|
233
|
+
speaker_bytes: bytes | None,
|
|
234
|
+
target_lang: str,
|
|
235
|
+
voice_clone: bool,
|
|
236
|
+
voice_samples: dict[str, Audio],
|
|
237
|
+
speaker_wav_paths: dict[str, Path],
|
|
238
|
+
src_hash_for_tts: str,
|
|
239
|
+
) -> Audio | None:
|
|
240
|
+
"""Produce the TTS audio for a single segment, with cache-around-the-call.
|
|
241
|
+
|
|
242
|
+
Returns the synthesized :class:`Audio`, or ``None`` if Chatterbox
|
|
243
|
+
crashed on the segment (the caller skips it). On cache miss the
|
|
244
|
+
TTS model is lazy-initialized and the per-speaker temp WAV is
|
|
245
|
+
materialized before generation; on cache hit none of that runs,
|
|
246
|
+
so a fully-cached run never loads Chatterbox.
|
|
247
|
+
"""
|
|
248
|
+
from videopython.base.audio import Audio as _Audio
|
|
249
|
+
|
|
250
|
+
tts_cache_key: str | None = None
|
|
251
|
+
if self._cache is not None:
|
|
252
|
+
tts_cache_key = DubCache.tts_key(
|
|
253
|
+
translated_text=segment.translated_text,
|
|
254
|
+
voice_sample_bytes=speaker_bytes,
|
|
255
|
+
language=target_lang,
|
|
256
|
+
)
|
|
257
|
+
cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
|
|
258
|
+
if cached_path is not None:
|
|
259
|
+
return _Audio.from_path(cached_path)
|
|
260
|
+
|
|
261
|
+
# Cache miss: pay for TTS init + voice-sample WAV exactly once
|
|
262
|
+
# across the loop. Both are wasted work when every segment hits.
|
|
263
|
+
if self._tts is None or self._tts_language != target_lang:
|
|
264
|
+
self._init_tts(language=target_lang)
|
|
265
|
+
self._tts_language = target_lang
|
|
266
|
+
if voice_clone and speaker not in speaker_wav_paths and speaker in voice_samples:
|
|
267
|
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
|
268
|
+
voice_samples[speaker].save(f.name)
|
|
269
|
+
speaker_wav_paths[speaker] = Path(f.name)
|
|
270
|
+
|
|
271
|
+
wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
|
|
272
|
+
try:
|
|
273
|
+
if wav_path is not None:
|
|
274
|
+
dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=wav_path)
|
|
275
|
+
else:
|
|
276
|
+
dubbed_audio = self._tts.generate_audio(segment.translated_text)
|
|
277
|
+
except Exception as exc:
|
|
278
|
+
# Chatterbox occasionally crashes on short translated text
|
|
279
|
+
# (alignment_stream_analyzer indexing on tensors with <=5
|
|
280
|
+
# speech tokens). One bad segment shouldn't lose a long
|
|
281
|
+
# multi-hour run — log and let the caller skip.
|
|
282
|
+
logger.warning(
|
|
283
|
+
"TTS failed for segment (speaker=%s, text=%r): %s — skipping",
|
|
284
|
+
speaker,
|
|
285
|
+
segment.translated_text,
|
|
286
|
+
exc,
|
|
287
|
+
)
|
|
288
|
+
return None
|
|
289
|
+
|
|
290
|
+
if self._cache is not None and tts_cache_key is not None:
|
|
291
|
+
dubbed_audio.save(self._cache.reserve_tts_path(src_hash_for_tts, tts_cache_key))
|
|
292
|
+
return dubbed_audio
|
|
293
|
+
|
|
294
|
+
def _translate_with_cache(
|
|
295
|
+
self,
|
|
296
|
+
transcription: Transcription,
|
|
297
|
+
source_audio: Audio,
|
|
298
|
+
source_lang: str,
|
|
299
|
+
target_lang: str,
|
|
300
|
+
report_progress: Callable[[str, float], None],
|
|
301
|
+
) -> tuple[list[TranslatedSegment], list[int]]:
|
|
302
|
+
"""Run translation with cache-around-the-call.
|
|
303
|
+
|
|
304
|
+
Returns ``(translated_segments, translation_failures)``. Only
|
|
305
|
+
fully-successful translations are cached — partial Qwen failures
|
|
306
|
+
would otherwise lock in an incomplete dub across runs. The
|
|
307
|
+
progress callback maps the backend's [0, 1] fraction onto the
|
|
308
|
+
pipeline's translation window (0.35 → 0.50).
|
|
309
|
+
"""
|
|
310
|
+
from videopython.ai.dubbing.models import TranslatedSegment
|
|
311
|
+
|
|
312
|
+
cache_key: str | None = None
|
|
313
|
+
if self._cache is not None:
|
|
314
|
+
cache_key = DubCache.translation_key(
|
|
315
|
+
source_lang=source_lang,
|
|
316
|
+
target_lang=target_lang,
|
|
317
|
+
translator_class=self._resolved_translator_class_name(source_lang, target_lang),
|
|
318
|
+
)
|
|
319
|
+
cached = self._cache.get_translation(DubCache.source_key(source_audio), cache_key)
|
|
320
|
+
if cached is not None:
|
|
321
|
+
return [TranslatedSegment.from_dict(d) for d in cached], []
|
|
322
|
+
|
|
323
|
+
if self._translator is None:
|
|
324
|
+
self._init_translator(source_lang=source_lang, target_lang=target_lang)
|
|
325
|
+
|
|
326
|
+
# Translation stage spans 0.35 → 0.50 of overall pipeline progress.
|
|
327
|
+
# MarianMT runs sequentially over 8-segment batches; on a 15-min
|
|
328
|
+
# source that's minutes of silent dwell on 0.35 without per-batch
|
|
329
|
+
# ticks. Map the [0,1] translation fraction onto that 15% window.
|
|
330
|
+
def _on_translation_progress(fraction: float) -> None:
|
|
331
|
+
clamped = max(0.0, min(1.0, fraction))
|
|
332
|
+
report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
|
|
333
|
+
|
|
334
|
+
translated_segments = self._translator.translate_segments(
|
|
335
|
+
segments=transcription.segments,
|
|
336
|
+
target_lang=target_lang,
|
|
337
|
+
source_lang=source_lang,
|
|
338
|
+
progress_callback=_on_translation_progress,
|
|
339
|
+
)
|
|
340
|
+
# Capture per-segment failures (always empty for Marian) before
|
|
341
|
+
# _maybe_unload nukes the backend in low_memory mode.
|
|
342
|
+
translation_failures = list(self._translator.translation_failures)
|
|
343
|
+
self._maybe_unload("_translator")
|
|
344
|
+
|
|
345
|
+
if self._cache is not None and cache_key is not None and not translation_failures:
|
|
346
|
+
self._cache.put_translation(
|
|
347
|
+
DubCache.source_key(source_audio),
|
|
348
|
+
cache_key,
|
|
349
|
+
[s.to_dict() for s in translated_segments],
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
return translated_segments, translation_failures
|
|
353
|
+
|
|
354
|
+
def _transcription_cache_keys(self, source_audio: Audio, enable_diarization: bool = False) -> tuple[str, str]:
|
|
355
|
+
"""Return ``(src_hash, kwargs_hash)`` for the current transcription config.
|
|
356
|
+
|
|
357
|
+
Centralizes the kwarg list so the cache lookup, the put, and any
|
|
358
|
+
future invalidator agree on what's hashed.
|
|
359
|
+
"""
|
|
360
|
+
src_hash = DubCache.source_key(source_audio)
|
|
361
|
+
kwargs_hash = DubCache.transcription_kwargs_hash(
|
|
362
|
+
whisper_model=self.whisper_model,
|
|
363
|
+
enable_diarization=enable_diarization,
|
|
364
|
+
condition_on_previous_text=self.condition_on_previous_text,
|
|
365
|
+
no_speech_threshold=self.no_speech_threshold,
|
|
366
|
+
logprob_threshold=self.logprob_threshold,
|
|
367
|
+
)
|
|
368
|
+
return src_hash, kwargs_hash
|
|
369
|
+
|
|
131
370
|
def _init_transcriber(self, enable_diarization: bool = False) -> None:
|
|
132
371
|
"""Initialize the transcription model."""
|
|
133
372
|
from videopython.ai.understanding.audio import AudioToText
|
|
@@ -158,6 +397,31 @@ class LocalDubbingPipeline:
|
|
|
158
397
|
else: # "auto"
|
|
159
398
|
self._translator = self._resolve_translator_auto(source_lang, target_lang)
|
|
160
399
|
|
|
400
|
+
def _resolved_translator_class_name(self, source_lang: str, target_lang: str) -> str:
|
|
401
|
+
"""Return the *class name* of the translator that ``_init_translator``
|
|
402
|
+
would pick — without constructing one.
|
|
403
|
+
|
|
404
|
+
Used by the cache to key translations on the resolved backend rather
|
|
405
|
+
than the user-supplied ``"auto"``: a CPU run that resolves to Marian
|
|
406
|
+
must not collide with a GPU run that resolves to Qwen.
|
|
407
|
+
"""
|
|
408
|
+
if self.translator == "marian":
|
|
409
|
+
return "MarianTranslator"
|
|
410
|
+
if self.translator == "qwen3":
|
|
411
|
+
return "Qwen3Translator"
|
|
412
|
+
# auto — mirror _resolve_translator_auto's branching, no construction.
|
|
413
|
+
device = select_device(self.device, mps_allowed=True)
|
|
414
|
+
has_gpu = device in ("cuda", "mps")
|
|
415
|
+
if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
|
|
416
|
+
return "Qwen3Translator"
|
|
417
|
+
if MarianTranslator.has_model_for(source_lang, target_lang):
|
|
418
|
+
return "MarianTranslator"
|
|
419
|
+
if Qwen3Translator.supports(source_lang, target_lang):
|
|
420
|
+
return "Qwen3Translator"
|
|
421
|
+
# No backend supports the pair — _init_translator will raise. We
|
|
422
|
+
# return a sentinel; the cache miss path will pay that cost.
|
|
423
|
+
return "Unsupported"
|
|
424
|
+
|
|
161
425
|
def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
|
|
162
426
|
"""Pick a backend based on language coverage AND device.
|
|
163
427
|
|
|
@@ -417,12 +681,7 @@ class LocalDubbingPipeline:
|
|
|
417
681
|
)
|
|
418
682
|
else:
|
|
419
683
|
report_progress("Transcribing audio", 0.05)
|
|
420
|
-
|
|
421
|
-
self._init_transcriber(enable_diarization=enable_diarization)
|
|
422
|
-
self._transcriber_diarization = enable_diarization
|
|
423
|
-
|
|
424
|
-
transcription = self._transcriber.transcribe(source_audio)
|
|
425
|
-
self._maybe_unload("_transcriber")
|
|
684
|
+
transcription = self._transcribe_with_cache(source_audio, enable_diarization)
|
|
426
685
|
|
|
427
686
|
if not transcription.segments:
|
|
428
687
|
return DubbingResult(
|
|
@@ -495,50 +754,29 @@ class LocalDubbingPipeline:
|
|
|
495
754
|
del vocal_audio
|
|
496
755
|
|
|
497
756
|
report_progress("Translating text", 0.35)
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
# Translation stage spans 0.35 → 0.50 of overall pipeline progress.
|
|
502
|
-
# MarianMT runs sequentially over 8-segment batches; on a 15-min
|
|
503
|
-
# source that's minutes of silent dwell on 0.35 without per-batch
|
|
504
|
-
# ticks. Map the [0,1] translation fraction onto that 15% window.
|
|
505
|
-
def _on_translation_progress(fraction: float) -> None:
|
|
506
|
-
clamped = max(0.0, min(1.0, fraction))
|
|
507
|
-
report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
|
|
508
|
-
|
|
509
|
-
translated_segments = self._translator.translate_segments(
|
|
510
|
-
segments=transcription.segments,
|
|
511
|
-
target_lang=target_lang,
|
|
512
|
-
source_lang=detected_lang,
|
|
513
|
-
progress_callback=_on_translation_progress,
|
|
757
|
+
translated_segments, translation_failures = self._translate_with_cache(
|
|
758
|
+
transcription, source_audio, detected_lang, target_lang, report_progress
|
|
514
759
|
)
|
|
515
|
-
# Capture per-segment failures (always empty for Marian) before
|
|
516
|
-
# _maybe_unload nukes the backend in low_memory mode.
|
|
517
|
-
translation_failures = list(self._translator.translation_failures)
|
|
518
|
-
self._maybe_unload("_translator")
|
|
519
760
|
|
|
520
761
|
report_progress("Generating dubbed speech", 0.50)
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
762
|
+
|
|
763
|
+
# Per-speaker voice-sample bytes for TTS cache key. Empty when
|
|
764
|
+
# voice_clone=False — the cache key still differentiates "no voice
|
|
765
|
+
# sample" from "specific clone" via the None path.
|
|
766
|
+
voice_sample_bytes: dict[str, bytes] = (
|
|
767
|
+
{speaker: sample.data.tobytes() for speaker, sample in voice_samples.items()} if voice_clone else {}
|
|
768
|
+
)
|
|
769
|
+
src_hash_for_tts = DubCache.source_key(source_audio) if self._cache is not None else ""
|
|
524
770
|
|
|
525
771
|
dubbed_segments: list[Audio] = []
|
|
526
772
|
target_durations: list[float] = []
|
|
527
773
|
start_times: list[float] = []
|
|
528
774
|
|
|
529
|
-
#
|
|
530
|
-
#
|
|
531
|
-
#
|
|
532
|
-
# on every call (one temp WAV write + delete per segment), which is
|
|
533
|
-
# pure overhead for long dubs with many segments per speaker.
|
|
775
|
+
# Per-speaker temp WAVs are materialized lazily by _tts_segment_audio
|
|
776
|
+
# so a fully-cached run never writes one. The dict is loop-scoped
|
|
777
|
+
# state so the finally block can clean up regardless of cache outcome.
|
|
534
778
|
speaker_wav_paths: dict[str, Path] = {}
|
|
535
779
|
try:
|
|
536
|
-
if voice_clone:
|
|
537
|
-
for speaker, sample in voice_samples.items():
|
|
538
|
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
|
539
|
-
sample.save(f.name)
|
|
540
|
-
speaker_wav_paths[speaker] = Path(f.name)
|
|
541
|
-
|
|
542
780
|
for i, segment in enumerate(translated_segments):
|
|
543
781
|
if segment.duration < 0.1:
|
|
544
782
|
continue
|
|
@@ -553,26 +791,17 @@ class LocalDubbingPipeline:
|
|
|
553
791
|
report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
|
|
554
792
|
|
|
555
793
|
speaker = segment.speaker or "speaker_0"
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
# multi-hour run — log and skip so the rest proceeds.
|
|
568
|
-
logger.warning(
|
|
569
|
-
"TTS failed for segment %d/%d (speaker=%s, text=%r): %s — skipping",
|
|
570
|
-
i + 1,
|
|
571
|
-
len(translated_segments),
|
|
572
|
-
speaker,
|
|
573
|
-
segment.translated_text,
|
|
574
|
-
e,
|
|
575
|
-
)
|
|
794
|
+
dubbed_audio = self._tts_segment_audio(
|
|
795
|
+
segment=segment,
|
|
796
|
+
speaker=speaker,
|
|
797
|
+
speaker_bytes=voice_sample_bytes.get(speaker),
|
|
798
|
+
target_lang=target_lang,
|
|
799
|
+
voice_clone=voice_clone,
|
|
800
|
+
voice_samples=voice_samples,
|
|
801
|
+
speaker_wav_paths=speaker_wav_paths,
|
|
802
|
+
src_hash_for_tts=src_hash_for_tts,
|
|
803
|
+
)
|
|
804
|
+
if dubbed_audio is None:
|
|
576
805
|
continue
|
|
577
806
|
|
|
578
807
|
dubbed_segments.append(dubbed_audio)
|
|
@@ -611,10 +840,10 @@ class LocalDubbingPipeline:
|
|
|
611
840
|
else:
|
|
612
841
|
final_audio = dubbed_speech
|
|
613
842
|
|
|
614
|
-
#
|
|
615
|
-
# than the original. Done last so it captures
|
|
616
|
-
# mixes and speech-only outputs uniformly.
|
|
617
|
-
final_audio =
|
|
843
|
+
# Loudness-match against the source so the dub doesn't land
|
|
844
|
+
# perceptually thinner than the original. Done last so it captures
|
|
845
|
+
# both vocals+background mixes and speech-only outputs uniformly.
|
|
846
|
+
final_audio = _loudness_match(final_audio, source_audio)
|
|
618
847
|
|
|
619
848
|
report_progress("Complete", 1.0)
|
|
620
849
|
|
|
@@ -733,7 +962,7 @@ class LocalDubbingPipeline:
|
|
|
733
962
|
else:
|
|
734
963
|
final_audio = generated_speech
|
|
735
964
|
|
|
736
|
-
final_audio =
|
|
965
|
+
final_audio = _loudness_match(final_audio, source_audio)
|
|
737
966
|
|
|
738
967
|
report_progress("Complete", 1.0)
|
|
739
968
|
|
|
@@ -21,25 +21,45 @@ class RemuxError(RuntimeError):
|
|
|
21
21
|
"""ffmpeg failed while replacing an audio stream."""
|
|
22
22
|
|
|
23
23
|
|
|
24
|
+
def _build_stream_maps(keep_original_audio: bool) -> list[str]:
|
|
25
|
+
"""ffmpeg ``-map`` flags for the video + audio + subtitle streams.
|
|
26
|
+
|
|
27
|
+
Convention: dubbed audio (input 1) is the *first* audio track so default
|
|
28
|
+
playback uses it; original audio (input 0) tags onto the back when
|
|
29
|
+
``keep_original_audio=True`` for editorial A/B. Subtitles from input 0
|
|
30
|
+
are carried with ``?`` so sources without subs don't fail the mux.
|
|
31
|
+
"""
|
|
32
|
+
maps = ["-map", "0:v:0", "-map", "1:a:0"]
|
|
33
|
+
if keep_original_audio:
|
|
34
|
+
maps += ["-map", "0:a?"]
|
|
35
|
+
maps += ["-map", "0:s?"]
|
|
36
|
+
return maps
|
|
37
|
+
|
|
38
|
+
|
|
24
39
|
def replace_audio_stream(
|
|
25
40
|
video_path: str | Path,
|
|
26
41
|
audio_path: str | Path,
|
|
27
42
|
output_path: str | Path,
|
|
28
43
|
audio_codec: str = "aac",
|
|
29
44
|
audio_bitrate: str = "192k",
|
|
45
|
+
keep_original_audio: bool = False,
|
|
30
46
|
) -> None:
|
|
31
47
|
"""Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
|
|
32
48
|
|
|
33
49
|
Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
|
|
34
|
-
|
|
35
|
-
|
|
50
|
+
Subtitle streams from ``video_path`` are carried through unchanged
|
|
51
|
+
(stream-copy). ``-shortest`` trims to the shorter of the two streams so
|
|
52
|
+
the output duration matches the source video when the dubbed audio is
|
|
53
|
+
slightly longer.
|
|
36
54
|
|
|
37
55
|
Args:
|
|
38
|
-
video_path: Source video file (video
|
|
39
|
-
audio_path: Audio file to use as the new audio track.
|
|
56
|
+
video_path: Source video file (video + subtitle streams are copied unchanged).
|
|
57
|
+
audio_path: Audio file to use as the new (default) audio track.
|
|
40
58
|
output_path: Destination file. Overwritten if it exists.
|
|
41
59
|
audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
|
|
42
60
|
audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
|
|
61
|
+
keep_original_audio: If True, retain the source audio as a secondary
|
|
62
|
+
track behind the dubbed one. Useful for editorial A/B.
|
|
43
63
|
|
|
44
64
|
Raises:
|
|
45
65
|
FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
|
|
@@ -61,16 +81,15 @@ def replace_audio_stream(
|
|
|
61
81
|
str(video_path),
|
|
62
82
|
"-i",
|
|
63
83
|
str(audio_path),
|
|
64
|
-
|
|
65
|
-
"0:v:0",
|
|
66
|
-
"-map",
|
|
67
|
-
"1:a:0",
|
|
84
|
+
*_build_stream_maps(keep_original_audio),
|
|
68
85
|
"-c:v",
|
|
69
86
|
"copy",
|
|
70
87
|
"-c:a",
|
|
71
88
|
audio_codec,
|
|
72
89
|
"-b:a",
|
|
73
90
|
audio_bitrate,
|
|
91
|
+
"-c:s",
|
|
92
|
+
"copy",
|
|
74
93
|
"-shortest",
|
|
75
94
|
str(output_path),
|
|
76
95
|
]
|
|
@@ -87,20 +106,24 @@ def replace_audio_stream_from_audio(
|
|
|
87
106
|
output_path: str | Path,
|
|
88
107
|
audio_codec: str = "aac",
|
|
89
108
|
audio_bitrate: str = "192k",
|
|
109
|
+
keep_original_audio: bool = False,
|
|
90
110
|
) -> None:
|
|
91
111
|
"""Like ``replace_audio_stream`` but takes an in-memory ``Audio`` and pipes WAV to ffmpeg.
|
|
92
112
|
|
|
93
113
|
Avoids the ``Audio.save -> read-from-disk -> ffmpeg`` round-trip used by
|
|
94
114
|
the path-based variant: we serialize the WAV in memory and feed it to
|
|
95
115
|
ffmpeg via stdin. For long dubs this saves a full WAV write+read of the
|
|
96
|
-
output audio (~10 GB for a 2h source).
|
|
116
|
+
output audio (~10 GB for a 2h source). Subtitle streams from
|
|
117
|
+
``video_path`` are carried through unchanged (stream-copy).
|
|
97
118
|
|
|
98
119
|
Args:
|
|
99
|
-
video_path: Source video file (video
|
|
100
|
-
audio: ``Audio`` instance to mux in as the new audio track.
|
|
120
|
+
video_path: Source video file (video + subtitle streams are copied unchanged).
|
|
121
|
+
audio: ``Audio`` instance to mux in as the new (default) audio track.
|
|
101
122
|
output_path: Destination file. Overwritten if it exists.
|
|
102
123
|
audio_codec: ffmpeg audio codec name. Defaults to ``aac``.
|
|
103
124
|
audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
|
|
125
|
+
keep_original_audio: If True, retain the source audio as a secondary
|
|
126
|
+
track behind the dubbed one. Useful for editorial A/B.
|
|
104
127
|
|
|
105
128
|
Raises:
|
|
106
129
|
FileNotFoundError: If ``video_path`` does not exist.
|
|
@@ -133,16 +156,15 @@ def replace_audio_stream_from_audio(
|
|
|
133
156
|
"wav",
|
|
134
157
|
"-i",
|
|
135
158
|
"-",
|
|
136
|
-
|
|
137
|
-
"0:v:0",
|
|
138
|
-
"-map",
|
|
139
|
-
"1:a:0",
|
|
159
|
+
*_build_stream_maps(keep_original_audio),
|
|
140
160
|
"-c:v",
|
|
141
161
|
"copy",
|
|
142
162
|
"-c:a",
|
|
143
163
|
audio_codec,
|
|
144
164
|
"-b:a",
|
|
145
165
|
audio_bitrate,
|
|
166
|
+
"-c:s",
|
|
167
|
+
"copy",
|
|
146
168
|
"-shortest",
|
|
147
169
|
str(output_path),
|
|
148
170
|
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|