videopython 0.28.1__tar.gz → 0.28.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {videopython-0.28.1 → videopython-0.28.2}/PKG-INFO +2 -1
  2. {videopython-0.28.1 → videopython-0.28.2}/pyproject.toml +6 -1
  3. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/__init__.py +3 -0
  4. videopython-0.28.2/src/videopython/ai/dubbing/cache.py +296 -0
  5. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/dubber.py +16 -1
  6. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/models.py +25 -0
  7. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/pipeline.py +302 -73
  8. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/remux.py +37 -15
  9. {videopython-0.28.1 → videopython-0.28.2}/.gitignore +0 -0
  10. {videopython-0.28.1 → videopython-0.28.2}/LICENSE +0 -0
  11. {videopython-0.28.1 → videopython-0.28.2}/README.md +0 -0
  12. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/__init__.py +0 -0
  13. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/__init__.py +0 -0
  14. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/_device.py +0 -0
  15. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/quality.py +0 -0
  16. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/dubbing/timing.py +0 -0
  17. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/__init__.py +0 -0
  18. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/audio.py +0 -0
  19. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/qwen3.py +0 -0
  21. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/translation.py +0 -0
  22. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/generation/video.py +0 -0
  23. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/registry.py +0 -0
  24. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/__init__.py +0 -0
  25. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/inpainter.py +0 -0
  26. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/models.py +0 -0
  27. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/segmenter.py +0 -0
  28. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/swapping/swapper.py +0 -0
  29. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/transforms.py +0 -0
  30. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/__init__.py +0 -0
  31. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/audio.py +0 -0
  32. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/image.py +0 -0
  33. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/separation.py +0 -0
  34. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/understanding/temporal.py +0 -0
  35. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/ai/video_analysis.py +0 -0
  36. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/__init__.py +0 -0
  37. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/audio/__init__.py +0 -0
  38. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/audio/analysis.py +0 -0
  39. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/audio/audio.py +0 -0
  40. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/combine.py +0 -0
  41. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/description.py +0 -0
  42. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/effects.py +0 -0
  43. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/exceptions.py +0 -0
  44. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/progress.py +0 -0
  45. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/registry.py +0 -0
  46. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/scene.py +0 -0
  47. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/streaming.py +0 -0
  48. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/text/__init__.py +0 -0
  49. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/text/overlay.py +0 -0
  50. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/text/transcription.py +0 -0
  51. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/transforms.py +0 -0
  52. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/transitions.py +0 -0
  53. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/utils.py +0 -0
  54. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/base/video.py +0 -0
  55. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/__init__.py +0 -0
  56. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/multicam.py +0 -0
  57. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/premiere_xml.py +0 -0
  58. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/editing/video_edit.py +0 -0
  59. {videopython-0.28.1 → videopython-0.28.2}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.28.1
3
+ Version: 0.28.2
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -34,6 +34,7 @@ Requires-Dist: numba>=0.61.0; extra == 'ai'
34
34
  Requires-Dist: ollama>=0.4.5; extra == 'ai'
35
35
  Requires-Dist: openai-whisper>=20240930; extra == 'ai'
36
36
  Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
37
+ Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
37
38
  Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
38
39
  Requires-Dist: scipy>=1.10.0; extra == 'ai'
39
40
  Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.28.1"
3
+ version = "0.28.2"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -82,6 +82,8 @@ ai = [
82
82
  "demucs>=4.0.0",
83
83
  # Translation backend: Qwen3 GGUF inference (M2)
84
84
  "llama-cpp-python>=0.3.0",
85
+ # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
86
+ "pyloudnorm>=0.1.1",
85
87
  ]
86
88
 
87
89
  # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -115,6 +117,8 @@ ai = [
115
117
  "demucs>=4.0.0",
116
118
  # Translation backend: Qwen3 GGUF inference (M2)
117
119
  "llama-cpp-python>=0.3.0",
120
+ # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
121
+ "pyloudnorm>=0.1.1",
118
122
  ]
119
123
 
120
124
  [project.urls]
@@ -141,6 +145,7 @@ module = [
141
145
  "silero_vad", "silero_vad.*",
142
146
  "cv2", "cv2.*",
143
147
  "llama_cpp", "llama_cpp.*",
148
+ "pyloudnorm", "pyloudnorm.*",
144
149
  ]
145
150
  ignore_missing_imports = true
146
151
 
@@ -1,5 +1,6 @@
1
1
  """Local video dubbing functionality."""
2
2
 
3
+ from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
3
4
  from videopython.ai.dubbing.dubber import VideoDubber
4
5
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
5
6
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
@@ -19,4 +20,6 @@ __all__ = [
19
20
  "TranscriptQuality",
20
21
  "assess_transcript",
21
22
  "UnsupportedLanguageError",
23
+ "DubCache",
24
+ "dub_cache_clear",
22
25
  ]
@@ -0,0 +1,296 @@
1
+ """Filesystem-backed cache for resumable dubbing runs.
2
+
3
+ A long dub crashes at TTS segment 312/400 today and re-runs Whisper,
4
+ Demucs, translation, and the first 311 TTS segments from scratch.
5
+ :class:`DubCache` stores three artifacts so subsequent runs skip stages
6
+ whose inputs match:
7
+
8
+ - ``transcription.json`` — output of ``AudioToText.transcribe``.
9
+ - ``translation_<key>.json`` — output of ``TranslationBackend.translate_segments``.
10
+ - ``tts/<key>.wav`` — per-segment TTS WAV.
11
+
12
+ Cache directories are opt-in via ``VideoDubber(cache_dir=...)`` / ``LocalDubbingPipeline(cache_dir=...)``.
13
+ ``cache_dir=None`` (default) is a no-op pass-through.
14
+
15
+ Hash inputs are conservative — false misses (re-run a stage) are cheap;
16
+ false hits (deliver a stale dub) are bugs. Source-audio identity uses a
17
+ sha256 of the raw float32 bytes, not file path, so re-encoding the same
18
+ content invalidates correctly.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ from dataclasses import dataclass
27
+ from pathlib import Path
28
+ from typing import TYPE_CHECKING, Any
29
+
30
+ if TYPE_CHECKING:
31
+ from videopython.base.audio import Audio
32
+ from videopython.base.text.transcription import Transcription
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ # Cache schema version. Bump on incompatible changes to any artifact's
38
+ # on-disk format (e.g. TranscriptionSegment field changes that break
39
+ # from_dict). Mismatched cache entries are treated as a miss.
40
+ SCHEMA_VERSION = 1
41
+
42
+ # Reserved for M4.3 per-speaker voice library. M3.2 does not write here;
43
+ # documented so future code knows the path is taken.
44
+ _VOICE_CLONES_SUBDIR = "voice_clones"
45
+
46
+
47
+ @dataclass(frozen=True)
48
+ class _ArtifactPaths:
49
+ """Resolved paths for a single source's cache directory."""
50
+
51
+ src_dir: Path
52
+ metadata: Path
53
+ transcription: Path
54
+ tts_dir: Path
55
+
56
+ def translation_path(self, lang_key: str) -> Path:
57
+ return self.src_dir / f"translation_{lang_key}.json"
58
+
59
+ def tts_path(self, seg_key: str) -> Path:
60
+ return self.tts_dir / f"{seg_key}.wav"
61
+
62
+
63
+ def _stable_hash(*parts: str | int | float | bool | None) -> str:
64
+ """Short hex digest over a tuple of primitive values.
65
+
66
+ Stable across runs — uses ``str(part)`` so int/float/bool/None all
67
+ serialize deterministically. 16 hex chars (64 bits) is plenty of
68
+ space for the small cardinality we're hashing into.
69
+ """
70
+ h = hashlib.sha256()
71
+ for part in parts:
72
+ h.update(repr(part).encode("utf-8"))
73
+ h.update(b"\x00")
74
+ return h.hexdigest()[:16]
75
+
76
+
77
+ def _audio_bytes_hash(audio: Audio) -> str:
78
+ """sha256 over the raw audio data buffer.
79
+
80
+ Used as the per-source cache directory name. Bytes-level so re-encoded
81
+ sources (different container, same content) collide intentionally only
82
+ when the decoded float32 buffer matches.
83
+ """
84
+ h = hashlib.sha256()
85
+ h.update(audio.data.tobytes())
86
+ return h.hexdigest()[:16]
87
+
88
+
89
+ class DubCache:
90
+ """Filesystem cache for transcription, translation, and TTS artifacts.
91
+
92
+ Layout under ``root``::
93
+
94
+ <root>/<src_hash>/
95
+ metadata.json # schema version + hash inputs
96
+ transcription.json # populated on transcription cache miss
97
+ translation_<lang_key>.json
98
+ tts/<seg_key>.wav
99
+ voice_clones/ # reserved for M4.3, not written here
100
+
101
+ All getters return ``None`` on miss. Putters are idempotent
102
+ (overwrite). Schema-version mismatch is treated as a miss for every
103
+ artifact under that source.
104
+ """
105
+
106
+ def __init__(self, root: str | Path) -> None:
107
+ self.root = Path(root)
108
+ self.root.mkdir(parents=True, exist_ok=True)
109
+
110
+ # ----- key derivation --------------------------------------------------
111
+
112
+ @staticmethod
113
+ def source_key(audio: Audio) -> str:
114
+ """Per-source identifier — sha256 of the raw audio buffer.
115
+
116
+ This is the directory name under ``root``; one dir per distinct
117
+ source, regardless of which stage's kwargs vary.
118
+ """
119
+ return _audio_bytes_hash(audio)
120
+
121
+ @staticmethod
122
+ def transcription_kwargs_hash(
123
+ *,
124
+ whisper_model: str,
125
+ enable_diarization: bool,
126
+ condition_on_previous_text: bool,
127
+ no_speech_threshold: float,
128
+ logprob_threshold: float | None,
129
+ ) -> str:
130
+ return _stable_hash(
131
+ whisper_model,
132
+ enable_diarization,
133
+ condition_on_previous_text,
134
+ no_speech_threshold,
135
+ logprob_threshold,
136
+ )
137
+
138
+ @staticmethod
139
+ def translation_key(
140
+ *,
141
+ source_lang: str,
142
+ target_lang: str,
143
+ translator_class: str,
144
+ ) -> str:
145
+ """Hash captures the source/target pair + the resolved backend class.
146
+
147
+ ``translator_class`` is the *resolved* class name (e.g. ``"MarianTranslator"``),
148
+ not the user-supplied ``"auto"`` — a CPU run that resolves to Marian
149
+ must not collide with a GPU run that resolves to Qwen on the same
150
+ language pair.
151
+ """
152
+ return _stable_hash(source_lang, target_lang, translator_class)
153
+
154
+ @staticmethod
155
+ def tts_key(
156
+ *,
157
+ translated_text: str,
158
+ voice_sample_bytes: bytes | None,
159
+ language: str,
160
+ ) -> str:
161
+ """Per-segment key over text + voice sample + language."""
162
+ h = hashlib.sha256()
163
+ h.update(translated_text.encode("utf-8"))
164
+ h.update(b"\x00")
165
+ h.update(voice_sample_bytes or b"")
166
+ h.update(b"\x00")
167
+ h.update(language.encode("utf-8"))
168
+ return h.hexdigest()[:16]
169
+
170
+ # ----- path resolution -------------------------------------------------
171
+
172
+ def _paths_for(self, src_hash: str) -> _ArtifactPaths:
173
+ src_dir = self.root / src_hash
174
+ return _ArtifactPaths(
175
+ src_dir=src_dir,
176
+ metadata=src_dir / "metadata.json",
177
+ transcription=src_dir / "transcription.json",
178
+ tts_dir=src_dir / "tts",
179
+ )
180
+
181
+ def _ensure_metadata(self, paths: _ArtifactPaths, hash_inputs: dict[str, Any]) -> None:
182
+ """Create the source dir + metadata.json if missing.
183
+
184
+ ``hash_inputs`` records the kwargs we hashed against so a future
185
+ schema change can audit cache entries. The schema field is
186
+ load-bearing: mismatched versions invalidate the entire source dir.
187
+ """
188
+ paths.src_dir.mkdir(parents=True, exist_ok=True)
189
+ paths.tts_dir.mkdir(parents=True, exist_ok=True)
190
+ if not paths.metadata.exists():
191
+ paths.metadata.write_text(
192
+ json.dumps(
193
+ {"schema": SCHEMA_VERSION, "hash_inputs": hash_inputs},
194
+ indent=2,
195
+ ),
196
+ encoding="utf-8",
197
+ )
198
+
199
+ def _schema_ok(self, paths: _ArtifactPaths) -> bool:
200
+ if not paths.metadata.exists():
201
+ return True # fresh dir; we'll write metadata on first put.
202
+ try:
203
+ data = json.loads(paths.metadata.read_text(encoding="utf-8"))
204
+ except (OSError, json.JSONDecodeError):
205
+ return False
206
+ return data.get("schema") == SCHEMA_VERSION
207
+
208
+ # ----- transcription ---------------------------------------------------
209
+
210
+ def get_transcription(self, src_hash: str, kwargs_hash: str) -> Transcription | None:
211
+ from videopython.base.text.transcription import Transcription
212
+
213
+ paths = self._paths_for(src_hash)
214
+ if not paths.transcription.exists() or not self._schema_ok(paths):
215
+ return None
216
+ try:
217
+ data = json.loads(paths.transcription.read_text(encoding="utf-8"))
218
+ except (OSError, json.JSONDecodeError):
219
+ return None
220
+ if data.get("kwargs_hash") != kwargs_hash:
221
+ return None
222
+ logger.info("cache hit: transcription (%s)", src_hash)
223
+ return Transcription.from_dict(data["transcription"])
224
+
225
+ def put_transcription(
226
+ self,
227
+ src_hash: str,
228
+ kwargs_hash: str,
229
+ transcription: Transcription,
230
+ hash_inputs: dict[str, Any],
231
+ ) -> None:
232
+ paths = self._paths_for(src_hash)
233
+ self._ensure_metadata(paths, hash_inputs)
234
+ paths.transcription.write_text(
235
+ json.dumps(
236
+ {"kwargs_hash": kwargs_hash, "transcription": transcription.to_dict()},
237
+ ensure_ascii=False,
238
+ ),
239
+ encoding="utf-8",
240
+ )
241
+
242
+ # ----- translation -----------------------------------------------------
243
+
244
+ def get_translation(self, src_hash: str, lang_key: str) -> list[dict[str, Any]] | None:
245
+ paths = self._paths_for(src_hash)
246
+ if not self._schema_ok(paths):
247
+ return None
248
+ path = paths.translation_path(lang_key)
249
+ if not path.exists():
250
+ return None
251
+ try:
252
+ data = json.loads(path.read_text(encoding="utf-8"))
253
+ except (OSError, json.JSONDecodeError):
254
+ return None
255
+ logger.info("cache hit: translation (%s/%s)", src_hash, lang_key)
256
+ return data["segments"]
257
+
258
+ def put_translation(self, src_hash: str, lang_key: str, segments_dict: list[dict[str, Any]]) -> None:
259
+ paths = self._paths_for(src_hash)
260
+ self._ensure_metadata(paths, {})
261
+ paths.translation_path(lang_key).write_text(
262
+ json.dumps({"segments": segments_dict}, ensure_ascii=False),
263
+ encoding="utf-8",
264
+ )
265
+
266
+ # ----- tts -------------------------------------------------------------
267
+
268
+ def get_tts_path(self, src_hash: str, seg_key: str) -> Path | None:
269
+ paths = self._paths_for(src_hash)
270
+ if not self._schema_ok(paths):
271
+ return None
272
+ path = paths.tts_path(seg_key)
273
+ return path if path.exists() else None
274
+
275
+ def reserve_tts_path(self, src_hash: str, seg_key: str) -> Path:
276
+ """Return the path TTS output should be written to. Caller is
277
+ responsible for the actual write (Audio.save)."""
278
+ paths = self._paths_for(src_hash)
279
+ self._ensure_metadata(paths, {})
280
+ return paths.tts_path(seg_key)
281
+
282
+
283
+ def dub_cache_clear(cache_dir: str | Path, src_hash: str | None = None) -> None:
284
+ """Delete cache entries for a specific source or the whole cache root.
285
+
286
+ No auto-eviction in M3.2 — call this to reclaim disk space when a
287
+ cache directory has grown unwieldy. Safe no-op if ``cache_dir`` or
288
+ ``cache_dir/<src_hash>`` does not exist.
289
+ """
290
+ import shutil
291
+
292
+ root = Path(cache_dir)
293
+ target = root / src_hash if src_hash else root
294
+ if target.exists():
295
+ shutil.rmtree(target)
296
+ logger.info("dub_cache_clear: removed %s", target)
@@ -50,6 +50,13 @@ class VideoDubber:
50
50
  See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
51
51
  for tradeoffs (Qwen3 is slower on CPU but produces
52
52
  context-aware, length-budgeted output).
53
+ cache_dir: When set, persist transcription, translated segments,
54
+ and per-segment TTS WAVs under this directory and skip stages
55
+ whose inputs already match a cache entry. Use to resume crashed
56
+ long runs or to iterate on dub configuration without paying
57
+ transcription cost each time. ``None`` (default) disables
58
+ caching. Cache grows unbounded; clear via
59
+ :func:`videopython.ai.dubbing.cache.dub_cache_clear`.
53
60
  """
54
61
 
55
62
  def __init__(
@@ -62,6 +69,7 @@ class VideoDubber:
62
69
  logprob_threshold: float | None = -1.0,
63
70
  strict_quality: bool = False,
64
71
  translator: TranslatorChoice = "auto",
72
+ cache_dir: str | Path | None = None,
65
73
  ):
66
74
  self.device = device
67
75
  self.low_memory = low_memory
@@ -71,14 +79,16 @@ class VideoDubber:
71
79
  self.logprob_threshold = logprob_threshold
72
80
  self.strict_quality = strict_quality
73
81
  self.translator = translator
82
+ self.cache_dir = cache_dir
74
83
  self._local_pipeline: Any = None
75
84
  requested = device.lower() if isinstance(device, str) else "auto"
76
85
  logger.info(
77
- "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
86
+ "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
78
87
  requested,
79
88
  low_memory,
80
89
  whisper_model,
81
90
  translator,
91
+ cache_dir,
82
92
  )
83
93
 
84
94
  def _init_local_pipeline(self) -> None:
@@ -93,6 +103,7 @@ class VideoDubber:
93
103
  logprob_threshold=self.logprob_threshold,
94
104
  strict_quality=self.strict_quality,
95
105
  translator=self.translator,
106
+ cache_dir=self.cache_dir,
96
107
  )
97
108
 
98
109
  def dub(
@@ -175,6 +186,7 @@ class VideoDubber:
175
186
  enable_diarization: bool = False,
176
187
  progress_callback: Callable[[str, float], None] | None = None,
177
188
  transcription: Any = None,
189
+ keep_original_audio: bool = False,
178
190
  ) -> DubbingResult:
179
191
  """Dub a video file in place on disk without loading video frames into memory.
180
192
 
@@ -201,6 +213,8 @@ class VideoDubber:
201
213
  step. Speaker labels on the supplied transcription drive per-speaker
202
214
  voice cloning. If it has no speakers, pass ``enable_diarization=True``
203
215
  to add them via pyannote (requires word-level timings).
216
+ keep_original_audio: If True, retain the source audio in the output
217
+ as a secondary track behind the dubbed one (editorial A/B).
204
218
 
205
219
  Returns:
206
220
  ``DubbingResult`` with the dubbed audio, translated segments, and
@@ -239,6 +253,7 @@ class VideoDubber:
239
253
  video_path=input_path,
240
254
  audio=result.dubbed_audio,
241
255
  output_path=output_path,
256
+ keep_original_audio=keep_original_audio,
242
257
  )
243
258
 
244
259
  return result
@@ -59,6 +59,31 @@ class TranslatedSegment:
59
59
  """Duration of the segment in seconds."""
60
60
  return self.end - self.start
61
61
 
62
+ def to_dict(self) -> dict[str, Any]:
63
+ """Convert to dictionary for JSON serialization (used by the dub cache)."""
64
+ return {
65
+ "original_segment": self.original_segment.to_dict(),
66
+ "translated_text": self.translated_text,
67
+ "source_lang": self.source_lang,
68
+ "target_lang": self.target_lang,
69
+ "speaker": self.speaker,
70
+ "start": self.start,
71
+ "end": self.end,
72
+ }
73
+
74
+ @classmethod
75
+ def from_dict(cls, data: dict[str, Any]) -> TranslatedSegment:
76
+ """Reconstruct from a dict produced by :meth:`to_dict`."""
77
+ return cls(
78
+ original_segment=TranscriptionSegment.from_dict(data["original_segment"]),
79
+ translated_text=data["translated_text"],
80
+ source_lang=data["source_lang"],
81
+ target_lang=data["target_lang"],
82
+ speaker=data.get("speaker"),
83
+ start=data.get("start", 0.0),
84
+ end=data.get("end", 0.0),
85
+ )
86
+
62
87
 
63
88
  @dataclass
64
89
  class SeparatedAudio:
@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
10
10
  import numpy as np
11
11
 
12
12
  from videopython.ai._device import select_device
13
+ from videopython.ai.dubbing.cache import DubCache
13
14
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
14
15
  from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
15
16
  from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -21,22 +22,25 @@ from videopython.ai.generation.translation import (
21
22
  )
22
23
 
23
24
  if TYPE_CHECKING:
25
+ from videopython.ai.dubbing.models import TranslatedSegment
24
26
  from videopython.base.audio import Audio
27
+ from videopython.base.text.transcription import Transcription
25
28
 
26
29
 
27
30
  TranslatorChoice = Literal["auto", "marian", "qwen3"]
28
31
 
29
32
 
33
+ # BS.1770 integrated-loudness measurement requires at least 400 ms of audio
34
+ # (one gating block). Below this, fall back to peak match — pyloudnorm
35
+ # returns -inf or warns, neither of which gives a usable gain.
36
+ _LUFS_MIN_DURATION_SECONDS = 0.4
37
+
38
+
30
39
  def _peak_match(target: Audio, reference: Audio) -> Audio:
31
40
  """Scale ``target`` so its peak amplitude matches ``reference``.
32
41
 
33
- Demucs background normalization and the timing-assembler peak guard
34
- each clamp at 1.0 instead of restoring headroom, so a dubbed mix
35
- typically lands quieter than the source — perceptually "thinner."
36
- A single peak match recovers most of that drift without LUFS deps.
37
-
38
- No-op when either side has zero peak (silent input or all-silent dub).
39
- The new ``Audio`` shares no buffer with ``target``.
42
+ Used as the fallback when LUFS measurement isn't viable (clip < 0.4s
43
+ or silent input). The new ``Audio`` shares no buffer with ``target``.
40
44
  """
41
45
  from videopython.base.audio import Audio as _Audio
42
46
 
@@ -53,6 +57,55 @@ def _peak_match(target: Audio, reference: Audio) -> Audio:
53
57
  return _Audio(target.data * scale, target.metadata)
54
58
 
55
59
 
60
+ def _loudness_match(target: Audio, reference: Audio) -> Audio:
61
+ """Scale ``target`` so its integrated loudness (BS.1770 / LUFS) matches ``reference``.
62
+
63
+ Demucs background normalization and the timing-assembler peak guard
64
+ each clamp at 1.0 instead of restoring perceived loudness, so a
65
+ dubbed mix lands perceptually "thinner" than the source even after
66
+ peak match. LUFS captures the ear-weighted envelope that peak ratio
67
+ misses on dialogue-heavy material.
68
+
69
+ Falls back to :func:`_peak_match` when either clip is shorter than
70
+ the BS.1770 gating block (400 ms) or when measurement returns -inf
71
+ (silent or near-silent gated content). After gain is applied, peaks
72
+ are clamped to 0.99 — BS.1770 has no peak ceiling and a sufficiently
73
+ quiet source can demand gain that would otherwise clip.
74
+ """
75
+ from videopython.base.audio import Audio as _Audio
76
+
77
+ target_dur = target.metadata.duration_seconds
78
+ ref_dur = reference.metadata.duration_seconds
79
+ if target_dur < _LUFS_MIN_DURATION_SECONDS or ref_dur < _LUFS_MIN_DURATION_SECONDS:
80
+ return _peak_match(target, reference)
81
+
82
+ if not target.data.size or not reference.data.size:
83
+ return target
84
+
85
+ import pyloudnorm
86
+
87
+ target_lufs = pyloudnorm.Meter(target.metadata.sample_rate).integrated_loudness(target.data)
88
+ reference_lufs = pyloudnorm.Meter(reference.metadata.sample_rate).integrated_loudness(reference.data)
89
+
90
+ # Either clip's gated content was below -70 LUFS (effectively silent
91
+ # under BS.1770). Gain would be undefined — fall back to peak match,
92
+ # which has its own silent-input no-op.
93
+ if not np.isfinite(target_lufs) or not np.isfinite(reference_lufs):
94
+ return _peak_match(target, reference)
95
+
96
+ gain_db = reference_lufs - target_lufs
97
+ if abs(gain_db) < 0.1:
98
+ return target
99
+ scale = float(10 ** (gain_db / 20.0))
100
+
101
+ scaled = target.data * scale
102
+ peak = float(np.max(np.abs(scaled)))
103
+ if peak > 0.99:
104
+ scaled = scaled * (0.99 / peak)
105
+
106
+ return _Audio(scaled, target.metadata)
107
+
108
+
56
109
  WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
57
110
 
58
111
  logger = logging.getLogger(__name__)
@@ -85,6 +138,7 @@ class LocalDubbingPipeline:
85
138
  logprob_threshold: float | None = -1.0,
86
139
  strict_quality: bool = False,
87
140
  translator: TranslatorChoice = "auto",
141
+ cache_dir: str | Path | None = None,
88
142
  ):
89
143
  self.device = device
90
144
  self.low_memory = low_memory
@@ -94,13 +148,15 @@ class LocalDubbingPipeline:
94
148
  self.logprob_threshold = logprob_threshold
95
149
  self.strict_quality = strict_quality
96
150
  self.translator = translator
151
+ self.cache_dir = Path(cache_dir) if cache_dir is not None else None
97
152
  requested = device.lower() if isinstance(device, str) else "auto"
98
153
  logger.info(
99
- "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
154
+ "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
100
155
  requested,
101
156
  low_memory,
102
157
  whisper_model,
103
158
  translator,
159
+ self.cache_dir,
104
160
  )
105
161
 
106
162
  self._transcriber: Any = None
@@ -110,6 +166,7 @@ class LocalDubbingPipeline:
110
166
  self._tts_language: str | None = None
111
167
  self._separator: Any = None
112
168
  self._synchronizer: TimingSynchronizer | None = None
169
+ self._cache: DubCache | None = DubCache(self.cache_dir) if self.cache_dir is not None else None
113
170
 
114
171
  def _maybe_unload(self, component_name: str) -> None:
115
172
  """Unload a stage's model when low_memory mode is enabled.
@@ -128,6 +185,188 @@ class LocalDubbingPipeline:
128
185
  logger.info("low_memory: unloading %s", component_name.lstrip("_"))
129
186
  unload()
130
187
 
188
+ def _transcribe_with_cache(
189
+ self,
190
+ source_audio: Audio,
191
+ enable_diarization: bool,
192
+ ) -> Transcription:
193
+ """Run transcription with cache-around-the-call.
194
+
195
+ Cache miss: lazy-init the transcriber, transcribe, store the
196
+ result (including all hashed kwargs in metadata.json so future
197
+ invalidators have provenance).
198
+ Cache hit: return the deserialized :class:`Transcription` without
199
+ touching Whisper/diarization at all.
200
+ """
201
+ src_hash, kwargs_hash = self._transcription_cache_keys(source_audio, enable_diarization)
202
+ if self._cache is not None:
203
+ cached = self._cache.get_transcription(src_hash, kwargs_hash)
204
+ if cached is not None:
205
+ return cached
206
+
207
+ if self._transcriber is None or self._transcriber_diarization != enable_diarization:
208
+ self._init_transcriber(enable_diarization=enable_diarization)
209
+ self._transcriber_diarization = enable_diarization
210
+
211
+ transcription = self._transcriber.transcribe(source_audio)
212
+ self._maybe_unload("_transcriber")
213
+
214
+ if self._cache is not None:
215
+ self._cache.put_transcription(
216
+ src_hash,
217
+ kwargs_hash,
218
+ transcription,
219
+ hash_inputs={
220
+ "whisper_model": self.whisper_model,
221
+ "enable_diarization": enable_diarization,
222
+ "condition_on_previous_text": self.condition_on_previous_text,
223
+ "no_speech_threshold": self.no_speech_threshold,
224
+ "logprob_threshold": self.logprob_threshold,
225
+ },
226
+ )
227
+ return transcription
228
+
229
+ def _tts_segment_audio(
230
+ self,
231
+ segment: TranslatedSegment,
232
+ speaker: str,
233
+ speaker_bytes: bytes | None,
234
+ target_lang: str,
235
+ voice_clone: bool,
236
+ voice_samples: dict[str, Audio],
237
+ speaker_wav_paths: dict[str, Path],
238
+ src_hash_for_tts: str,
239
+ ) -> Audio | None:
240
+ """Produce the TTS audio for a single segment, with cache-around-the-call.
241
+
242
+ Returns the synthesized :class:`Audio`, or ``None`` if Chatterbox
243
+ crashed on the segment (the caller skips it). On cache miss the
244
+ TTS model is lazy-initialized and the per-speaker temp WAV is
245
+ materialized before generation; on cache hit none of that runs,
246
+ so a fully-cached run never loads Chatterbox.
247
+ """
248
+ from videopython.base.audio import Audio as _Audio
249
+
250
+ tts_cache_key: str | None = None
251
+ if self._cache is not None:
252
+ tts_cache_key = DubCache.tts_key(
253
+ translated_text=segment.translated_text,
254
+ voice_sample_bytes=speaker_bytes,
255
+ language=target_lang,
256
+ )
257
+ cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
258
+ if cached_path is not None:
259
+ return _Audio.from_path(cached_path)
260
+
261
+ # Cache miss: pay for TTS init + voice-sample WAV exactly once
262
+ # across the loop. Both are wasted work when every segment hits.
263
+ if self._tts is None or self._tts_language != target_lang:
264
+ self._init_tts(language=target_lang)
265
+ self._tts_language = target_lang
266
+ if voice_clone and speaker not in speaker_wav_paths and speaker in voice_samples:
267
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
268
+ voice_samples[speaker].save(f.name)
269
+ speaker_wav_paths[speaker] = Path(f.name)
270
+
271
+ wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
272
+ try:
273
+ if wav_path is not None:
274
+ dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=wav_path)
275
+ else:
276
+ dubbed_audio = self._tts.generate_audio(segment.translated_text)
277
+ except Exception as exc:
278
+ # Chatterbox occasionally crashes on short translated text
279
+ # (alignment_stream_analyzer indexing on tensors with <=5
280
+ # speech tokens). One bad segment shouldn't lose a long
281
+ # multi-hour run — log and let the caller skip.
282
+ logger.warning(
283
+ "TTS failed for segment (speaker=%s, text=%r): %s — skipping",
284
+ speaker,
285
+ segment.translated_text,
286
+ exc,
287
+ )
288
+ return None
289
+
290
+ if self._cache is not None and tts_cache_key is not None:
291
+ dubbed_audio.save(self._cache.reserve_tts_path(src_hash_for_tts, tts_cache_key))
292
+ return dubbed_audio
293
+
294
+ def _translate_with_cache(
295
+ self,
296
+ transcription: Transcription,
297
+ source_audio: Audio,
298
+ source_lang: str,
299
+ target_lang: str,
300
+ report_progress: Callable[[str, float], None],
301
+ ) -> tuple[list[TranslatedSegment], list[int]]:
302
+ """Run translation with cache-around-the-call.
303
+
304
+ Returns ``(translated_segments, translation_failures)``. Only
305
+ fully-successful translations are cached — partial Qwen failures
306
+ would otherwise lock in an incomplete dub across runs. The
307
+ progress callback maps the backend's [0, 1] fraction onto the
308
+ pipeline's translation window (0.35 → 0.50).
309
+ """
310
+ from videopython.ai.dubbing.models import TranslatedSegment
311
+
312
+ cache_key: str | None = None
313
+ if self._cache is not None:
314
+ cache_key = DubCache.translation_key(
315
+ source_lang=source_lang,
316
+ target_lang=target_lang,
317
+ translator_class=self._resolved_translator_class_name(source_lang, target_lang),
318
+ )
319
+ cached = self._cache.get_translation(DubCache.source_key(source_audio), cache_key)
320
+ if cached is not None:
321
+ return [TranslatedSegment.from_dict(d) for d in cached], []
322
+
323
+ if self._translator is None:
324
+ self._init_translator(source_lang=source_lang, target_lang=target_lang)
325
+
326
+ # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
327
+ # MarianMT runs sequentially over 8-segment batches; on a 15-min
328
+ # source that's minutes of silent dwell on 0.35 without per-batch
329
+ # ticks. Map the [0,1] translation fraction onto that 15% window.
330
+ def _on_translation_progress(fraction: float) -> None:
331
+ clamped = max(0.0, min(1.0, fraction))
332
+ report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
333
+
334
+ translated_segments = self._translator.translate_segments(
335
+ segments=transcription.segments,
336
+ target_lang=target_lang,
337
+ source_lang=source_lang,
338
+ progress_callback=_on_translation_progress,
339
+ )
340
+ # Capture per-segment failures (always empty for Marian) before
341
+ # _maybe_unload nukes the backend in low_memory mode.
342
+ translation_failures = list(self._translator.translation_failures)
343
+ self._maybe_unload("_translator")
344
+
345
+ if self._cache is not None and cache_key is not None and not translation_failures:
346
+ self._cache.put_translation(
347
+ DubCache.source_key(source_audio),
348
+ cache_key,
349
+ [s.to_dict() for s in translated_segments],
350
+ )
351
+
352
+ return translated_segments, translation_failures
353
+
354
+ def _transcription_cache_keys(self, source_audio: Audio, enable_diarization: bool = False) -> tuple[str, str]:
355
+ """Return ``(src_hash, kwargs_hash)`` for the current transcription config.
356
+
357
+ Centralizes the kwarg list so the cache lookup, the put, and any
358
+ future invalidator agree on what's hashed.
359
+ """
360
+ src_hash = DubCache.source_key(source_audio)
361
+ kwargs_hash = DubCache.transcription_kwargs_hash(
362
+ whisper_model=self.whisper_model,
363
+ enable_diarization=enable_diarization,
364
+ condition_on_previous_text=self.condition_on_previous_text,
365
+ no_speech_threshold=self.no_speech_threshold,
366
+ logprob_threshold=self.logprob_threshold,
367
+ )
368
+ return src_hash, kwargs_hash
369
+
131
370
  def _init_transcriber(self, enable_diarization: bool = False) -> None:
132
371
  """Initialize the transcription model."""
133
372
  from videopython.ai.understanding.audio import AudioToText
@@ -158,6 +397,31 @@ class LocalDubbingPipeline:
158
397
  else: # "auto"
159
398
  self._translator = self._resolve_translator_auto(source_lang, target_lang)
160
399
 
400
+ def _resolved_translator_class_name(self, source_lang: str, target_lang: str) -> str:
401
+ """Return the *class name* of the translator that ``_init_translator``
402
+ would pick — without constructing one.
403
+
404
+ Used by the cache to key translations on the resolved backend rather
405
+ than the user-supplied ``"auto"``: a CPU run that resolves to Marian
406
+ must not collide with a GPU run that resolves to Qwen.
407
+ """
408
+ if self.translator == "marian":
409
+ return "MarianTranslator"
410
+ if self.translator == "qwen3":
411
+ return "Qwen3Translator"
412
+ # auto — mirror _resolve_translator_auto's branching, no construction.
413
+ device = select_device(self.device, mps_allowed=True)
414
+ has_gpu = device in ("cuda", "mps")
415
+ if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
416
+ return "Qwen3Translator"
417
+ if MarianTranslator.has_model_for(source_lang, target_lang):
418
+ return "MarianTranslator"
419
+ if Qwen3Translator.supports(source_lang, target_lang):
420
+ return "Qwen3Translator"
421
+ # No backend supports the pair — _init_translator will raise. We
422
+ # return a sentinel; the cache miss path will pay that cost.
423
+ return "Unsupported"
424
+
161
425
  def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
162
426
  """Pick a backend based on language coverage AND device.
163
427
 
@@ -417,12 +681,7 @@ class LocalDubbingPipeline:
417
681
  )
418
682
  else:
419
683
  report_progress("Transcribing audio", 0.05)
420
- if self._transcriber is None or self._transcriber_diarization != enable_diarization:
421
- self._init_transcriber(enable_diarization=enable_diarization)
422
- self._transcriber_diarization = enable_diarization
423
-
424
- transcription = self._transcriber.transcribe(source_audio)
425
- self._maybe_unload("_transcriber")
684
+ transcription = self._transcribe_with_cache(source_audio, enable_diarization)
426
685
 
427
686
  if not transcription.segments:
428
687
  return DubbingResult(
@@ -495,50 +754,29 @@ class LocalDubbingPipeline:
495
754
  del vocal_audio
496
755
 
497
756
  report_progress("Translating text", 0.35)
498
- if self._translator is None:
499
- self._init_translator(source_lang=detected_lang, target_lang=target_lang)
500
-
501
- # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
502
- # MarianMT runs sequentially over 8-segment batches; on a 15-min
503
- # source that's minutes of silent dwell on 0.35 without per-batch
504
- # ticks. Map the [0,1] translation fraction onto that 15% window.
505
- def _on_translation_progress(fraction: float) -> None:
506
- clamped = max(0.0, min(1.0, fraction))
507
- report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
508
-
509
- translated_segments = self._translator.translate_segments(
510
- segments=transcription.segments,
511
- target_lang=target_lang,
512
- source_lang=detected_lang,
513
- progress_callback=_on_translation_progress,
757
+ translated_segments, translation_failures = self._translate_with_cache(
758
+ transcription, source_audio, detected_lang, target_lang, report_progress
514
759
  )
515
- # Capture per-segment failures (always empty for Marian) before
516
- # _maybe_unload nukes the backend in low_memory mode.
517
- translation_failures = list(self._translator.translation_failures)
518
- self._maybe_unload("_translator")
519
760
 
520
761
  report_progress("Generating dubbed speech", 0.50)
521
- if self._tts is None or self._tts_language != target_lang:
522
- self._init_tts(language=target_lang)
523
- self._tts_language = target_lang
762
+
763
+ # Per-speaker voice-sample bytes for TTS cache key. Empty when
764
+ # voice_clone=False — the cache key still differentiates "no voice
765
+ # sample" from "specific clone" via the None path.
766
+ voice_sample_bytes: dict[str, bytes] = (
767
+ {speaker: sample.data.tobytes() for speaker, sample in voice_samples.items()} if voice_clone else {}
768
+ )
769
+ src_hash_for_tts = DubCache.source_key(source_audio) if self._cache is not None else ""
524
770
 
525
771
  dubbed_segments: list[Audio] = []
526
772
  target_durations: list[float] = []
527
773
  start_times: list[float] = []
528
774
 
529
- # Encode each speaker's voice sample to a temp WAV exactly once and
530
- # reuse the path across every segment for that speaker. Without this
531
- # cache, TextToSpeech.generate_audio re-encodes the same voice sample
532
- # on every call (one temp WAV write + delete per segment), which is
533
- # pure overhead for long dubs with many segments per speaker.
775
+ # Per-speaker temp WAVs are materialized lazily by _tts_segment_audio
776
+ # so a fully-cached run never writes one. The dict is loop-scoped
777
+ # state so the finally block can clean up regardless of cache outcome.
534
778
  speaker_wav_paths: dict[str, Path] = {}
535
779
  try:
536
- if voice_clone:
537
- for speaker, sample in voice_samples.items():
538
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
539
- sample.save(f.name)
540
- speaker_wav_paths[speaker] = Path(f.name)
541
-
542
780
  for i, segment in enumerate(translated_segments):
543
781
  if segment.duration < 0.1:
544
782
  continue
@@ -553,26 +791,17 @@ class LocalDubbingPipeline:
553
791
  report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
554
792
 
555
793
  speaker = segment.speaker or "speaker_0"
556
- cached_path = speaker_wav_paths.get(speaker) if voice_clone else None
557
-
558
- try:
559
- if cached_path is not None:
560
- dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
561
- else:
562
- dubbed_audio = self._tts.generate_audio(segment.translated_text)
563
- except Exception as e:
564
- # Chatterbox occasionally crashes on short translated text
565
- # (alignment_stream_analyzer indexing on tensors with <=5
566
- # speech tokens). One bad segment shouldn't lose a long
567
- # multi-hour run — log and skip so the rest proceeds.
568
- logger.warning(
569
- "TTS failed for segment %d/%d (speaker=%s, text=%r): %s — skipping",
570
- i + 1,
571
- len(translated_segments),
572
- speaker,
573
- segment.translated_text,
574
- e,
575
- )
794
+ dubbed_audio = self._tts_segment_audio(
795
+ segment=segment,
796
+ speaker=speaker,
797
+ speaker_bytes=voice_sample_bytes.get(speaker),
798
+ target_lang=target_lang,
799
+ voice_clone=voice_clone,
800
+ voice_samples=voice_samples,
801
+ speaker_wav_paths=speaker_wav_paths,
802
+ src_hash_for_tts=src_hash_for_tts,
803
+ )
804
+ if dubbed_audio is None:
576
805
  continue
577
806
 
578
807
  dubbed_segments.append(dubbed_audio)
@@ -611,10 +840,10 @@ class LocalDubbingPipeline:
611
840
  else:
612
841
  final_audio = dubbed_speech
613
842
 
614
- # Peak-match against the source so the dub doesn't land quieter
615
- # than the original. Done last so it captures both vocals+background
616
- # mixes and speech-only outputs uniformly.
617
- final_audio = _peak_match(final_audio, source_audio)
843
+ # Loudness-match against the source so the dub doesn't land
844
+ # perceptually thinner than the original. Done last so it captures
845
+ # both vocals+background mixes and speech-only outputs uniformly.
846
+ final_audio = _loudness_match(final_audio, source_audio)
618
847
 
619
848
  report_progress("Complete", 1.0)
620
849
 
@@ -733,7 +962,7 @@ class LocalDubbingPipeline:
733
962
  else:
734
963
  final_audio = generated_speech
735
964
 
736
- final_audio = _peak_match(final_audio, source_audio)
965
+ final_audio = _loudness_match(final_audio, source_audio)
737
966
 
738
967
  report_progress("Complete", 1.0)
739
968
 
@@ -21,25 +21,45 @@ class RemuxError(RuntimeError):
21
21
  """ffmpeg failed while replacing an audio stream."""
22
22
 
23
23
 
24
+ def _build_stream_maps(keep_original_audio: bool) -> list[str]:
25
+ """ffmpeg ``-map`` flags for the video + audio + subtitle streams.
26
+
27
+ Convention: dubbed audio (input 1) is the *first* audio track so default
 28
+ playback uses it; original audio (input 0) is appended as a trailing track
 29
+ when ``keep_original_audio=True`` for editorial A/B. Subtitles from input 0
30
+ are carried with ``?`` so sources without subs don't fail the mux.
31
+ """
32
+ maps = ["-map", "0:v:0", "-map", "1:a:0"]
33
+ if keep_original_audio:
34
+ maps += ["-map", "0:a?"]
35
+ maps += ["-map", "0:s?"]
36
+ return maps
37
+
38
+
24
39
  def replace_audio_stream(
25
40
  video_path: str | Path,
26
41
  audio_path: str | Path,
27
42
  output_path: str | Path,
28
43
  audio_codec: str = "aac",
29
44
  audio_bitrate: str = "192k",
45
+ keep_original_audio: bool = False,
30
46
  ) -> None:
31
47
  """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
32
48
 
33
49
  Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
34
- ``-shortest`` trims to the shorter of the two streams so the output duration
35
- matches the source video when the dubbed audio is slightly longer.
50
+ Subtitle streams from ``video_path`` are carried through unchanged
51
+ (stream-copy). ``-shortest`` trims to the shorter of the two streams so
52
+ the output duration matches the source video when the dubbed audio is
53
+ slightly longer.
36
54
 
37
55
  Args:
38
- video_path: Source video file (video stream is copied unchanged).
39
- audio_path: Audio file to use as the new audio track.
56
+ video_path: Source video file (video + subtitle streams are copied unchanged).
57
+ audio_path: Audio file to use as the new (default) audio track.
40
58
  output_path: Destination file. Overwritten if it exists.
41
59
  audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
42
60
  audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
61
+ keep_original_audio: If True, retain the source audio as a secondary
62
+ track behind the dubbed one. Useful for editorial A/B.
43
63
 
44
64
  Raises:
45
65
  FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
@@ -61,16 +81,15 @@ def replace_audio_stream(
61
81
  str(video_path),
62
82
  "-i",
63
83
  str(audio_path),
64
- "-map",
65
- "0:v:0",
66
- "-map",
67
- "1:a:0",
84
+ *_build_stream_maps(keep_original_audio),
68
85
  "-c:v",
69
86
  "copy",
70
87
  "-c:a",
71
88
  audio_codec,
72
89
  "-b:a",
73
90
  audio_bitrate,
91
+ "-c:s",
92
+ "copy",
74
93
  "-shortest",
75
94
  str(output_path),
76
95
  ]
@@ -87,20 +106,24 @@ def replace_audio_stream_from_audio(
87
106
  output_path: str | Path,
88
107
  audio_codec: str = "aac",
89
108
  audio_bitrate: str = "192k",
109
+ keep_original_audio: bool = False,
90
110
  ) -> None:
91
111
  """Like ``replace_audio_stream`` but takes an in-memory ``Audio`` and pipes WAV to ffmpeg.
92
112
 
93
113
  Avoids the ``Audio.save -> read-from-disk -> ffmpeg`` round-trip used by
94
114
  the path-based variant: we serialize the WAV in memory and feed it to
95
115
  ffmpeg via stdin. For long dubs this saves a full WAV write+read of the
96
- output audio (~10 GB for a 2h source).
116
+ output audio (~10 GB for a 2h source). Subtitle streams from
117
+ ``video_path`` are carried through unchanged (stream-copy).
97
118
 
98
119
  Args:
99
- video_path: Source video file (video stream is copied unchanged).
100
- audio: ``Audio`` instance to mux in as the new audio track.
120
+ video_path: Source video file (video + subtitle streams are copied unchanged).
121
+ audio: ``Audio`` instance to mux in as the new (default) audio track.
101
122
  output_path: Destination file. Overwritten if it exists.
102
123
  audio_codec: ffmpeg audio codec name. Defaults to ``aac``.
103
124
  audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
125
+ keep_original_audio: If True, retain the source audio as a secondary
126
+ track behind the dubbed one. Useful for editorial A/B.
104
127
 
105
128
  Raises:
106
129
  FileNotFoundError: If ``video_path`` does not exist.
@@ -133,16 +156,15 @@ def replace_audio_stream_from_audio(
133
156
  "wav",
134
157
  "-i",
135
158
  "-",
136
- "-map",
137
- "0:v:0",
138
- "-map",
139
- "1:a:0",
159
+ *_build_stream_maps(keep_original_audio),
140
160
  "-c:v",
141
161
  "copy",
142
162
  "-c:a",
143
163
  audio_codec,
144
164
  "-b:a",
145
165
  audio_bitrate,
166
+ "-c:s",
167
+ "copy",
146
168
  "-shortest",
147
169
  str(output_path),
148
170
  ]
File without changes
File without changes
File without changes