videopython 0.28.0__tar.gz → 0.28.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {videopython-0.28.0 → videopython-0.28.2}/PKG-INFO +3 -1
  2. {videopython-0.28.0 → videopython-0.28.2}/pyproject.toml +11 -1
  3. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/dubbing/__init__.py +5 -0
  4. videopython-0.28.2/src/videopython/ai/dubbing/cache.py +296 -0
  5. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/dubbing/dubber.py +27 -2
  6. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/dubbing/models.py +31 -0
  7. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/dubbing/pipeline.py +373 -74
  8. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/dubbing/remux.py +37 -15
  9. videopython-0.28.2/src/videopython/ai/generation/qwen3.py +394 -0
  10. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/generation/translation.py +109 -5
  11. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/understanding/audio.py +40 -1
  12. {videopython-0.28.0 → videopython-0.28.2}/.gitignore +0 -0
  13. {videopython-0.28.0 → videopython-0.28.2}/LICENSE +0 -0
  14. {videopython-0.28.0 → videopython-0.28.2}/README.md +0 -0
  15. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/__init__.py +0 -0
  16. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/__init__.py +0 -0
  17. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/_device.py +0 -0
  18. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/dubbing/quality.py +0 -0
  19. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/dubbing/timing.py +0 -0
  20. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/generation/__init__.py +0 -0
  21. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/generation/audio.py +0 -0
  22. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/generation/image.py +0 -0
  23. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/generation/video.py +0 -0
  24. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/registry.py +0 -0
  25. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/swapping/__init__.py +0 -0
  26. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/swapping/inpainter.py +0 -0
  27. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/swapping/models.py +0 -0
  28. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/swapping/segmenter.py +0 -0
  29. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/swapping/swapper.py +0 -0
  30. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/transforms.py +0 -0
  31. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/understanding/__init__.py +0 -0
  32. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/understanding/image.py +0 -0
  33. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/understanding/separation.py +0 -0
  34. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/understanding/temporal.py +0 -0
  35. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/ai/video_analysis.py +0 -0
  36. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/__init__.py +0 -0
  37. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/audio/__init__.py +0 -0
  38. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/audio/analysis.py +0 -0
  39. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/audio/audio.py +0 -0
  40. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/combine.py +0 -0
  41. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/description.py +0 -0
  42. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/effects.py +0 -0
  43. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/exceptions.py +0 -0
  44. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/progress.py +0 -0
  45. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/registry.py +0 -0
  46. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/scene.py +0 -0
  47. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/streaming.py +0 -0
  48. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/text/__init__.py +0 -0
  49. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/text/overlay.py +0 -0
  50. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/text/transcription.py +0 -0
  51. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/transforms.py +0 -0
  52. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/transitions.py +0 -0
  53. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/utils.py +0 -0
  54. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/base/video.py +0 -0
  55. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/editing/__init__.py +0 -0
  56. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/editing/multicam.py +0 -0
  57. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/editing/premiere_xml.py +0 -0
  58. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/editing/video_edit.py +0 -0
  59. {videopython-0.28.0 → videopython-0.28.2}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.28.0
3
+ Version: 0.28.2
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -29,10 +29,12 @@ Requires-Dist: demucs>=4.0.0; extra == 'ai'
29
29
  Requires-Dist: diffusers>=0.30.0; extra == 'ai'
30
30
  Requires-Dist: easyocr>=1.7.0; extra == 'ai'
31
31
  Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
32
+ Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
32
33
  Requires-Dist: numba>=0.61.0; extra == 'ai'
33
34
  Requires-Dist: ollama>=0.4.5; extra == 'ai'
34
35
  Requires-Dist: openai-whisper>=20240930; extra == 'ai'
35
36
  Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
37
+ Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
36
38
  Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
37
39
  Requires-Dist: scipy>=1.10.0; extra == 'ai'
38
40
  Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.28.0"
3
+ version = "0.28.2"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -80,6 +80,10 @@ ai = [
80
80
  "sentencepiece>=0.1.99",
81
81
  # Audio source separation
82
82
  "demucs>=4.0.0",
83
+ # Translation backend: Qwen3 GGUF inference (M2)
84
+ "llama-cpp-python>=0.3.0",
85
+ # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
86
+ "pyloudnorm>=0.1.1",
83
87
  ]
84
88
 
85
89
  # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -111,6 +115,10 @@ ai = [
111
115
  "sentencepiece>=0.1.99",
112
116
  # Audio source separation
113
117
  "demucs>=4.0.0",
118
+ # Translation backend: Qwen3 GGUF inference (M2)
119
+ "llama-cpp-python>=0.3.0",
120
+ # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
121
+ "pyloudnorm>=0.1.1",
114
122
  ]
115
123
 
116
124
  [project.urls]
@@ -136,6 +144,8 @@ module = [
136
144
  "pyannote", "pyannote.*",
137
145
  "silero_vad", "silero_vad.*",
138
146
  "cv2", "cv2.*",
147
+ "llama_cpp", "llama_cpp.*",
148
+ "pyloudnorm", "pyloudnorm.*",
139
149
  ]
140
150
  ignore_missing_imports = true
141
151
 
@@ -1,10 +1,12 @@
1
1
  """Local video dubbing functionality."""
2
2
 
3
+ from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
3
4
  from videopython.ai.dubbing.dubber import VideoDubber
4
5
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
5
6
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
6
7
  from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
7
8
  from videopython.ai.dubbing.timing import TimingSynchronizer
9
+ from videopython.ai.generation.translation import UnsupportedLanguageError
8
10
 
9
11
  __all__ = [
10
12
  "VideoDubber",
@@ -17,4 +19,7 @@ __all__ = [
17
19
  "GarbageTranscriptError",
18
20
  "TranscriptQuality",
19
21
  "assess_transcript",
22
+ "UnsupportedLanguageError",
23
+ "DubCache",
24
+ "dub_cache_clear",
20
25
  ]
@@ -0,0 +1,296 @@
1
+ """Filesystem-backed cache for resumable dubbing runs.
2
+
3
+ Without a cache, a long dub that crashes at TTS segment 312/400 re-runs Whisper,
4
+ Demucs, translation, and the first 311 TTS segments from scratch.
5
+ :class:`DubCache` stores three artifacts so subsequent runs skip stages
6
+ whose inputs match:
7
+
8
+ - ``transcription.json`` — output of ``AudioToText.transcribe``.
9
+ - ``translation_<key>.json`` — output of ``TranslationBackend.translate_segments``.
10
+ - ``tts/<key>.wav`` — per-segment TTS WAV.
11
+
12
+ Cache directories are opt-in via ``VideoDubber(cache_dir=...)`` / ``LocalDubbingPipeline(cache_dir=...)``.
13
+ ``cache_dir=None`` (default) is a no-op pass-through.
14
+
15
+ Hash inputs are conservative — false misses (re-run a stage) are cheap;
16
+ false hits (deliver a stale dub) are bugs. Source-audio identity uses a
17
+ sha256 of the raw float32 bytes, not file path, so re-encoding the same
18
+ content invalidates correctly.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ from dataclasses import dataclass
27
+ from pathlib import Path
28
+ from typing import TYPE_CHECKING, Any
29
+
30
+ if TYPE_CHECKING:
31
+ from videopython.base.audio import Audio
32
+ from videopython.base.text.transcription import Transcription
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ # Cache schema version. Bump on incompatible changes to any artifact's
38
+ # on-disk format (e.g. TranscriptionSegment field changes that break
39
+ # from_dict). Mismatched cache entries are treated as a miss.
40
+ SCHEMA_VERSION = 1
41
+
42
+ # Reserved for M4.3 per-speaker voice library. M3.2 does not write here;
43
+ # documented so future code knows the path is taken.
44
+ _VOICE_CLONES_SUBDIR = "voice_clones"
45
+
46
+
47
+ @dataclass(frozen=True)
48
+ class _ArtifactPaths:
49
+ """Resolved paths for a single source's cache directory."""
50
+
51
+ src_dir: Path
52
+ metadata: Path
53
+ transcription: Path
54
+ tts_dir: Path
55
+
56
+ def translation_path(self, lang_key: str) -> Path:
57
+ return self.src_dir / f"translation_{lang_key}.json"
58
+
59
+ def tts_path(self, seg_key: str) -> Path:
60
+ return self.tts_dir / f"{seg_key}.wav"
61
+
62
+
63
+ def _stable_hash(*parts: str | int | float | bool | None) -> str:
64
+ """Short hex digest over a tuple of primitive values.
65
+
66
+ Stable across runs — uses ``repr(part)`` so int/float/bool/None all
67
+ serialize deterministically. 16 hex chars (64 bits) is plenty of
68
+ space for the small cardinality we're hashing into.
69
+ """
70
+ h = hashlib.sha256()
71
+ for part in parts:
72
+ h.update(repr(part).encode("utf-8"))
73
+ h.update(b"\x00")
74
+ return h.hexdigest()[:16]
75
+
76
+
77
+ def _audio_bytes_hash(audio: Audio) -> str:
78
+ """sha256 over the raw audio data buffer.
79
+
80
+ Used as the per-source cache directory name. Bytes-level so re-encoded
81
+ sources (different container, same content) collide intentionally only
82
+ when the decoded float32 buffer matches.
83
+ """
84
+ h = hashlib.sha256()
85
+ h.update(audio.data.tobytes())
86
+ return h.hexdigest()[:16]
87
+
88
+
89
+ class DubCache:
90
+ """Filesystem cache for transcription, translation, and TTS artifacts.
91
+
92
+ Layout under ``root``::
93
+
94
+ <root>/<src_hash>/
95
+ metadata.json # schema version + hash inputs
96
+ transcription.json # populated on transcription cache miss
97
+ translation_<lang_key>.json
98
+ tts/<seg_key>.wav
99
+ voice_clones/ # reserved for M4.3, not written here
100
+
101
+ All getters return ``None`` on miss. Putters are idempotent
102
+ (overwrite). Schema-version mismatch is treated as a miss for every
103
+ artifact under that source.
104
+ """
105
+
106
+ def __init__(self, root: str | Path) -> None:
107
+ self.root = Path(root)
108
+ self.root.mkdir(parents=True, exist_ok=True)
109
+
110
+ # ----- key derivation --------------------------------------------------
111
+
112
+ @staticmethod
113
+ def source_key(audio: Audio) -> str:
114
+ """Per-source identifier — sha256 of the raw audio buffer.
115
+
116
+ This is the directory name under ``root``; one dir per distinct
117
+ source, regardless of which stage's kwargs vary.
118
+ """
119
+ return _audio_bytes_hash(audio)
120
+
121
+ @staticmethod
122
+ def transcription_kwargs_hash(
123
+ *,
124
+ whisper_model: str,
125
+ enable_diarization: bool,
126
+ condition_on_previous_text: bool,
127
+ no_speech_threshold: float,
128
+ logprob_threshold: float | None,
129
+ ) -> str:
130
+ return _stable_hash(
131
+ whisper_model,
132
+ enable_diarization,
133
+ condition_on_previous_text,
134
+ no_speech_threshold,
135
+ logprob_threshold,
136
+ )
137
+
138
+ @staticmethod
139
+ def translation_key(
140
+ *,
141
+ source_lang: str,
142
+ target_lang: str,
143
+ translator_class: str,
144
+ ) -> str:
145
+ """Hash captures the source/target pair + the resolved backend class.
146
+
147
+ ``translator_class`` is the *resolved* class name (e.g. ``"MarianTranslator"``),
148
+ not the user-supplied ``"auto"`` — a CPU run that resolves to Marian
149
+ must not collide with a GPU run that resolves to Qwen on the same
150
+ language pair.
151
+ """
152
+ return _stable_hash(source_lang, target_lang, translator_class)
153
+
154
+ @staticmethod
155
+ def tts_key(
156
+ *,
157
+ translated_text: str,
158
+ voice_sample_bytes: bytes | None,
159
+ language: str,
160
+ ) -> str:
161
+ """Per-segment key over text + voice sample + language."""
162
+ h = hashlib.sha256()
163
+ h.update(translated_text.encode("utf-8"))
164
+ h.update(b"\x00")
165
+ h.update(voice_sample_bytes or b"")
166
+ h.update(b"\x00")
167
+ h.update(language.encode("utf-8"))
168
+ return h.hexdigest()[:16]
169
+
170
+ # ----- path resolution -------------------------------------------------
171
+
172
+ def _paths_for(self, src_hash: str) -> _ArtifactPaths:
173
+ src_dir = self.root / src_hash
174
+ return _ArtifactPaths(
175
+ src_dir=src_dir,
176
+ metadata=src_dir / "metadata.json",
177
+ transcription=src_dir / "transcription.json",
178
+ tts_dir=src_dir / "tts",
179
+ )
180
+
181
+ def _ensure_metadata(self, paths: _ArtifactPaths, hash_inputs: dict[str, Any]) -> None:
182
+ """Create the source dir + metadata.json if missing.
183
+
184
+ ``hash_inputs`` records the kwargs we hashed against so a future
185
+ schema change can audit cache entries. The schema field is
186
+ load-bearing: mismatched versions invalidate the entire source dir.
187
+ """
188
+ paths.src_dir.mkdir(parents=True, exist_ok=True)
189
+ paths.tts_dir.mkdir(parents=True, exist_ok=True)
190
+ if not paths.metadata.exists():
191
+ paths.metadata.write_text(
192
+ json.dumps(
193
+ {"schema": SCHEMA_VERSION, "hash_inputs": hash_inputs},
194
+ indent=2,
195
+ ),
196
+ encoding="utf-8",
197
+ )
198
+
199
+ def _schema_ok(self, paths: _ArtifactPaths) -> bool:
200
+ if not paths.metadata.exists():
201
+ return True # fresh dir; we'll write metadata on first put.
202
+ try:
203
+ data = json.loads(paths.metadata.read_text(encoding="utf-8"))
204
+ except (OSError, json.JSONDecodeError):
205
+ return False
206
+ return data.get("schema") == SCHEMA_VERSION
207
+
208
+ # ----- transcription ---------------------------------------------------
209
+
210
+ def get_transcription(self, src_hash: str, kwargs_hash: str) -> Transcription | None:
211
+ from videopython.base.text.transcription import Transcription
212
+
213
+ paths = self._paths_for(src_hash)
214
+ if not paths.transcription.exists() or not self._schema_ok(paths):
215
+ return None
216
+ try:
217
+ data = json.loads(paths.transcription.read_text(encoding="utf-8"))
218
+ except (OSError, json.JSONDecodeError):
219
+ return None
220
+ if data.get("kwargs_hash") != kwargs_hash:
221
+ return None
222
+ logger.info("cache hit: transcription (%s)", src_hash)
223
+ return Transcription.from_dict(data["transcription"])
224
+
225
+ def put_transcription(
226
+ self,
227
+ src_hash: str,
228
+ kwargs_hash: str,
229
+ transcription: Transcription,
230
+ hash_inputs: dict[str, Any],
231
+ ) -> None:
232
+ paths = self._paths_for(src_hash)
233
+ self._ensure_metadata(paths, hash_inputs)
234
+ paths.transcription.write_text(
235
+ json.dumps(
236
+ {"kwargs_hash": kwargs_hash, "transcription": transcription.to_dict()},
237
+ ensure_ascii=False,
238
+ ),
239
+ encoding="utf-8",
240
+ )
241
+
242
+ # ----- translation -----------------------------------------------------
243
+
244
+ def get_translation(self, src_hash: str, lang_key: str) -> list[dict[str, Any]] | None:
245
+ paths = self._paths_for(src_hash)
246
+ if not self._schema_ok(paths):
247
+ return None
248
+ path = paths.translation_path(lang_key)
249
+ if not path.exists():
250
+ return None
251
+ try:
252
+ data = json.loads(path.read_text(encoding="utf-8"))
253
+ except (OSError, json.JSONDecodeError):
254
+ return None
255
+ logger.info("cache hit: translation (%s/%s)", src_hash, lang_key)
256
+ return data["segments"]
257
+
258
+ def put_translation(self, src_hash: str, lang_key: str, segments_dict: list[dict[str, Any]]) -> None:
259
+ paths = self._paths_for(src_hash)
260
+ self._ensure_metadata(paths, {})
261
+ paths.translation_path(lang_key).write_text(
262
+ json.dumps({"segments": segments_dict}, ensure_ascii=False),
263
+ encoding="utf-8",
264
+ )
265
+
266
+ # ----- tts -------------------------------------------------------------
267
+
268
+ def get_tts_path(self, src_hash: str, seg_key: str) -> Path | None:
269
+ paths = self._paths_for(src_hash)
270
+ if not self._schema_ok(paths):
271
+ return None
272
+ path = paths.tts_path(seg_key)
273
+ return path if path.exists() else None
274
+
275
+ def reserve_tts_path(self, src_hash: str, seg_key: str) -> Path:
276
+ """Return the path TTS output should be written to. Caller is
277
+ responsible for the actual write (Audio.save)."""
278
+ paths = self._paths_for(src_hash)
279
+ self._ensure_metadata(paths, {})
280
+ return paths.tts_path(seg_key)
281
+
282
+
283
+ def dub_cache_clear(cache_dir: str | Path, src_hash: str | None = None) -> None:
284
+ """Delete cache entries for a specific source or the whole cache root.
285
+
286
+ No auto-eviction in M3.2 — call this to reclaim disk space when a
287
+ cache directory has grown unwieldy. Safe no-op if ``cache_dir`` or
288
+ ``cache_dir/<src_hash>`` does not exist.
289
+ """
290
+ import shutil
291
+
292
+ root = Path(cache_dir)
293
+ target = root / src_hash if src_hash else root
294
+ if target.exists():
295
+ shutil.rmtree(target)
296
+ logger.info("dub_cache_clear: removed %s", target)
@@ -7,7 +7,7 @@ from pathlib import Path
7
7
  from typing import TYPE_CHECKING, Any, Callable
8
8
 
9
9
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
10
- from videopython.ai.dubbing.pipeline import WhisperModel
10
+ from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
11
11
 
12
12
  if TYPE_CHECKING:
13
13
  from videopython.base.video import Video
@@ -44,6 +44,19 @@ class VideoDubber:
44
44
  but processing continues. Either way the
45
45
  :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
46
46
  inspection.
47
+ translator: Translation backend to use. ``"auto"`` (default)
48
+ picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
49
+ ``"qwen3"`` force the named backend regardless of device.
50
+ See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
51
+ for tradeoffs (Qwen3 is slower on CPU but produces
52
+ context-aware, length-budgeted output).
53
+ cache_dir: When set, persist transcription, translated segments,
54
+ and per-segment TTS WAVs under this directory and skip stages
55
+ whose inputs already match a cache entry. Use to resume crashed
56
+ long runs or to iterate on dub configuration without paying
57
+ transcription cost each time. ``None`` (default) disables
58
+ caching. Cache grows unbounded; clear via
59
+ :func:`videopython.ai.dubbing.cache.dub_cache_clear`.
47
60
  """
48
61
 
49
62
  def __init__(
@@ -55,6 +68,8 @@ class VideoDubber:
55
68
  no_speech_threshold: float = 0.6,
56
69
  logprob_threshold: float | None = -1.0,
57
70
  strict_quality: bool = False,
71
+ translator: TranslatorChoice = "auto",
72
+ cache_dir: str | Path | None = None,
58
73
  ):
59
74
  self.device = device
60
75
  self.low_memory = low_memory
@@ -63,13 +78,17 @@ class VideoDubber:
63
78
  self.no_speech_threshold = no_speech_threshold
64
79
  self.logprob_threshold = logprob_threshold
65
80
  self.strict_quality = strict_quality
81
+ self.translator = translator
82
+ self.cache_dir = cache_dir
66
83
  self._local_pipeline: Any = None
67
84
  requested = device.lower() if isinstance(device, str) else "auto"
68
85
  logger.info(
69
- "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
86
+ "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
70
87
  requested,
71
88
  low_memory,
72
89
  whisper_model,
90
+ translator,
91
+ cache_dir,
73
92
  )
74
93
 
75
94
  def _init_local_pipeline(self) -> None:
@@ -83,6 +102,8 @@ class VideoDubber:
83
102
  no_speech_threshold=self.no_speech_threshold,
84
103
  logprob_threshold=self.logprob_threshold,
85
104
  strict_quality=self.strict_quality,
105
+ translator=self.translator,
106
+ cache_dir=self.cache_dir,
86
107
  )
87
108
 
88
109
  def dub(
@@ -165,6 +186,7 @@ class VideoDubber:
165
186
  enable_diarization: bool = False,
166
187
  progress_callback: Callable[[str, float], None] | None = None,
167
188
  transcription: Any = None,
189
+ keep_original_audio: bool = False,
168
190
  ) -> DubbingResult:
169
191
  """Dub a video file in place on disk without loading video frames into memory.
170
192
 
@@ -191,6 +213,8 @@ class VideoDubber:
191
213
  step. Speaker labels on the supplied transcription drive per-speaker
192
214
  voice cloning. If it has no speakers, pass ``enable_diarization=True``
193
215
  to add them via pyannote (requires word-level timings).
216
+ keep_original_audio: If True, retain the source audio in the output
217
+ as a secondary track behind the dubbed one (editorial A/B).
194
218
 
195
219
  Returns:
196
220
  ``DubbingResult`` with the dubbed audio, translated segments, and
@@ -229,6 +253,7 @@ class VideoDubber:
229
253
  video_path=input_path,
230
254
  audio=result.dubbed_audio,
231
255
  output_path=output_path,
256
+ keep_original_audio=keep_original_audio,
232
257
  )
233
258
 
234
259
  return result
@@ -59,6 +59,31 @@ class TranslatedSegment:
59
59
  """Duration of the segment in seconds."""
60
60
  return self.end - self.start
61
61
 
62
+ def to_dict(self) -> dict[str, Any]:
63
+ """Convert to dictionary for JSON serialization (used by the dub cache)."""
64
+ return {
65
+ "original_segment": self.original_segment.to_dict(),
66
+ "translated_text": self.translated_text,
67
+ "source_lang": self.source_lang,
68
+ "target_lang": self.target_lang,
69
+ "speaker": self.speaker,
70
+ "start": self.start,
71
+ "end": self.end,
72
+ }
73
+
74
+ @classmethod
75
+ def from_dict(cls, data: dict[str, Any]) -> TranslatedSegment:
76
+ """Reconstruct from a dict produced by :meth:`to_dict`."""
77
+ return cls(
78
+ original_segment=TranscriptionSegment.from_dict(data["original_segment"]),
79
+ translated_text=data["translated_text"],
80
+ source_lang=data["source_lang"],
81
+ target_lang=data["target_lang"],
82
+ speaker=data.get("speaker"),
83
+ start=data.get("start", 0.0),
84
+ end=data.get("end", 0.0),
85
+ )
86
+
62
87
 
63
88
  @dataclass
64
89
  class SeparatedAudio:
@@ -180,6 +205,11 @@ class DubbingResult:
180
205
  timing_summary: Aggregate stats over per-segment timing adjustments.
181
206
  transcript_quality: Heuristic quality assessment of the transcription
182
207
  (None when the pipeline returned early on an empty transcription).
208
+ translation_failures: Indices of segments where translation failed
209
+ entirely. Populated by Qwen3Translator when both the primary call and
210
+ the per-segment Marian fallback fail; those segments are dubbed
211
+ with empty text. Empty list under MarianTranslator (Marian has
212
+ no failure mode that drops segments).
183
213
  """
184
214
 
185
215
  dubbed_audio: Audio
@@ -191,6 +221,7 @@ class DubbingResult:
191
221
  voice_samples: dict[str, Audio] = field(default_factory=dict)
192
222
  timing_summary: TimingSummary | None = None
193
223
  transcript_quality: TranscriptQuality | None = None
224
+ translation_failures: list[int] = field(default_factory=list)
194
225
 
195
226
  @property
196
227
  def num_segments(self) -> int: