videopython 0.28.1__tar.gz → 0.28.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.28.1 → videopython-0.28.3}/PKG-INFO +2 -1
- {videopython-0.28.1 → videopython-0.28.3}/pyproject.toml +6 -1
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/dubbing/__init__.py +11 -1
- videopython-0.28.3/src/videopython/ai/dubbing/cache.py +309 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/dubbing/dubber.py +16 -1
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/dubbing/models.py +61 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/dubbing/pipeline.py +364 -76
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/dubbing/remux.py +37 -15
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/generation/audio.py +24 -0
- {videopython-0.28.1 → videopython-0.28.3}/.gitignore +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/LICENSE +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/README.md +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/_device.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/registry.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/combine.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/description.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/effects.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/progress.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/registry.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/scene.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/streaming.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/transforms.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/transitions.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/utils.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/base/video.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.28.1 → videopython-0.28.3}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.28.1
|
|
3
|
+
Version: 0.28.3
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -34,6 +34,7 @@ Requires-Dist: numba>=0.61.0; extra == 'ai'
|
|
|
34
34
|
Requires-Dist: ollama>=0.4.5; extra == 'ai'
|
|
35
35
|
Requires-Dist: openai-whisper>=20240930; extra == 'ai'
|
|
36
36
|
Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
|
|
37
|
+
Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
|
|
37
38
|
Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
|
|
38
39
|
Requires-Dist: scipy>=1.10.0; extra == 'ai'
|
|
39
40
|
Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.28.1"
|
|
3
|
+
version = "0.28.3"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -82,6 +82,8 @@ ai = [
|
|
|
82
82
|
"demucs>=4.0.0",
|
|
83
83
|
# Translation backend: Qwen3 GGUF inference (M2)
|
|
84
84
|
"llama-cpp-python>=0.3.0",
|
|
85
|
+
# Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
|
|
86
|
+
"pyloudnorm>=0.1.1",
|
|
85
87
|
]
|
|
86
88
|
|
|
87
89
|
# Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
|
|
@@ -115,6 +117,8 @@ ai = [
|
|
|
115
117
|
"demucs>=4.0.0",
|
|
116
118
|
# Translation backend: Qwen3 GGUF inference (M2)
|
|
117
119
|
"llama-cpp-python>=0.3.0",
|
|
120
|
+
# Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
|
|
121
|
+
"pyloudnorm>=0.1.1",
|
|
118
122
|
]
|
|
119
123
|
|
|
120
124
|
[project.urls]
|
|
@@ -141,6 +145,7 @@ module = [
|
|
|
141
145
|
"silero_vad", "silero_vad.*",
|
|
142
146
|
"cv2", "cv2.*",
|
|
143
147
|
"llama_cpp", "llama_cpp.*",
|
|
148
|
+
"pyloudnorm", "pyloudnorm.*",
|
|
144
149
|
]
|
|
145
150
|
ignore_missing_imports = true
|
|
146
151
|
|
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
"""Local video dubbing functionality."""
|
|
2
2
|
|
|
3
|
+
from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
|
|
3
4
|
from videopython.ai.dubbing.dubber import VideoDubber
|
|
4
|
-
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
|
|
5
|
+
from videopython.ai.dubbing.models import (
|
|
6
|
+
DubbingResult,
|
|
7
|
+
Expressiveness,
|
|
8
|
+
RevoiceResult,
|
|
9
|
+
SeparatedAudio,
|
|
10
|
+
TranslatedSegment,
|
|
11
|
+
)
|
|
5
12
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
6
13
|
from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
|
|
7
14
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
@@ -19,4 +26,7 @@ __all__ = [
|
|
|
19
26
|
"TranscriptQuality",
|
|
20
27
|
"assess_transcript",
|
|
21
28
|
"UnsupportedLanguageError",
|
|
29
|
+
"DubCache",
|
|
30
|
+
"dub_cache_clear",
|
|
31
|
+
"Expressiveness",
|
|
22
32
|
]
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
"""Filesystem-backed cache for resumable dubbing runs.
|
|
2
|
+
|
|
3
|
+
A long dub crashes at TTS segment 312/400 today and re-runs Whisper,
|
|
4
|
+
Demucs, translation, and the first 311 TTS segments from scratch.
|
|
5
|
+
:class:`DubCache` stores three artifacts so subsequent runs skip stages
|
|
6
|
+
whose inputs match:
|
|
7
|
+
|
|
8
|
+
- ``transcription.json`` — output of ``AudioToText.transcribe``.
|
|
9
|
+
- ``translation_<key>.json`` — output of ``TranslationBackend.translate_segments``.
|
|
10
|
+
- ``tts/<key>.wav`` — per-segment TTS WAV.
|
|
11
|
+
|
|
12
|
+
Cache directories are opt-in via ``VideoDubber(cache_dir=...)`` / ``LocalDubbingPipeline(cache_dir=...)``.
|
|
13
|
+
``cache_dir=None`` (default) is a no-op pass-through.
|
|
14
|
+
|
|
15
|
+
Hash inputs are conservative — false misses (re-run a stage) are cheap;
|
|
16
|
+
false hits (deliver a stale dub) are bugs. Source-audio identity uses a
|
|
17
|
+
sha256 of the raw float32 bytes, not file path, so re-encoding the same
|
|
18
|
+
content invalidates correctly.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import hashlib
|
|
24
|
+
import json
|
|
25
|
+
import logging
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import TYPE_CHECKING, Any
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from videopython.base.audio import Audio
|
|
32
|
+
from videopython.base.text.transcription import Transcription
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Cache schema version. Bump on incompatible changes to any artifact's
|
|
38
|
+
# on-disk format (e.g. TranscriptionSegment field changes that break
|
|
39
|
+
# from_dict). Mismatched cache entries are treated as a miss.
|
|
40
|
+
SCHEMA_VERSION = 1
|
|
41
|
+
|
|
42
|
+
# Reserved for M4.3 per-speaker voice library. M3.2 does not write here;
|
|
43
|
+
# documented so future code knows the path is taken.
|
|
44
|
+
_VOICE_CLONES_SUBDIR = "voice_clones"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
|
|
48
|
+
class _ArtifactPaths:
|
|
49
|
+
"""Resolved paths for a single source's cache directory."""
|
|
50
|
+
|
|
51
|
+
src_dir: Path
|
|
52
|
+
metadata: Path
|
|
53
|
+
transcription: Path
|
|
54
|
+
tts_dir: Path
|
|
55
|
+
|
|
56
|
+
def translation_path(self, lang_key: str) -> Path:
|
|
57
|
+
return self.src_dir / f"translation_{lang_key}.json"
|
|
58
|
+
|
|
59
|
+
def tts_path(self, seg_key: str) -> Path:
|
|
60
|
+
return self.tts_dir / f"{seg_key}.wav"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _stable_hash(*parts: str | int | float | bool | None) -> str:
|
|
64
|
+
"""Short hex digest over a tuple of primitive values.
|
|
65
|
+
|
|
66
|
+
Stable across runs — uses ``repr(part)`` so int/float/bool/None all
|
|
67
|
+
serialize deterministically. 16 hex chars (64 bits) is plenty of
|
|
68
|
+
space for the small cardinality we're hashing into.
|
|
69
|
+
"""
|
|
70
|
+
h = hashlib.sha256()
|
|
71
|
+
for part in parts:
|
|
72
|
+
h.update(repr(part).encode("utf-8"))
|
|
73
|
+
h.update(b"\x00")
|
|
74
|
+
return h.hexdigest()[:16]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _audio_bytes_hash(audio: Audio) -> str:
|
|
78
|
+
"""sha256 over the raw audio data buffer.
|
|
79
|
+
|
|
80
|
+
Used as the per-source cache directory name. Bytes-level so re-encoded
|
|
81
|
+
sources (different container, same content) collide intentionally only
|
|
82
|
+
when the decoded float32 buffer matches.
|
|
83
|
+
"""
|
|
84
|
+
h = hashlib.sha256()
|
|
85
|
+
h.update(audio.data.tobytes())
|
|
86
|
+
return h.hexdigest()[:16]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class DubCache:
|
|
90
|
+
"""Filesystem cache for transcription, translation, and TTS artifacts.
|
|
91
|
+
|
|
92
|
+
Layout under ``root``::
|
|
93
|
+
|
|
94
|
+
<root>/<src_hash>/
|
|
95
|
+
metadata.json # schema version + hash inputs
|
|
96
|
+
transcription.json # populated on transcription cache miss
|
|
97
|
+
translation_<lang_key>.json
|
|
98
|
+
tts/<seg_key>.wav
|
|
99
|
+
voice_clones/ # reserved for M4.3, not written here
|
|
100
|
+
|
|
101
|
+
All getters return ``None`` on miss. Putters are idempotent
|
|
102
|
+
(overwrite). Schema-version mismatch is treated as a miss for every
|
|
103
|
+
artifact under that source.
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
def __init__(self, root: str | Path) -> None:
|
|
107
|
+
self.root = Path(root)
|
|
108
|
+
self.root.mkdir(parents=True, exist_ok=True)
|
|
109
|
+
|
|
110
|
+
# ----- key derivation --------------------------------------------------
|
|
111
|
+
|
|
112
|
+
@staticmethod
|
|
113
|
+
def source_key(audio: Audio) -> str:
|
|
114
|
+
"""Per-source identifier — sha256 of the raw audio buffer.
|
|
115
|
+
|
|
116
|
+
This is the directory name under ``root``; one dir per distinct
|
|
117
|
+
source, regardless of which stage's kwargs vary.
|
|
118
|
+
"""
|
|
119
|
+
return _audio_bytes_hash(audio)
|
|
120
|
+
|
|
121
|
+
@staticmethod
|
|
122
|
+
def transcription_kwargs_hash(
|
|
123
|
+
*,
|
|
124
|
+
whisper_model: str,
|
|
125
|
+
enable_diarization: bool,
|
|
126
|
+
condition_on_previous_text: bool,
|
|
127
|
+
no_speech_threshold: float,
|
|
128
|
+
logprob_threshold: float | None,
|
|
129
|
+
) -> str:
|
|
130
|
+
return _stable_hash(
|
|
131
|
+
whisper_model,
|
|
132
|
+
enable_diarization,
|
|
133
|
+
condition_on_previous_text,
|
|
134
|
+
no_speech_threshold,
|
|
135
|
+
logprob_threshold,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
@staticmethod
|
|
139
|
+
def translation_key(
|
|
140
|
+
*,
|
|
141
|
+
source_lang: str,
|
|
142
|
+
target_lang: str,
|
|
143
|
+
translator_class: str,
|
|
144
|
+
) -> str:
|
|
145
|
+
"""Hash captures the source/target pair + the resolved backend class.
|
|
146
|
+
|
|
147
|
+
``translator_class`` is the *resolved* class name (e.g. ``"MarianTranslator"``),
|
|
148
|
+
not the user-supplied ``"auto"`` — a CPU run that resolves to Marian
|
|
149
|
+
must not collide with a GPU run that resolves to Qwen on the same
|
|
150
|
+
language pair.
|
|
151
|
+
"""
|
|
152
|
+
return _stable_hash(source_lang, target_lang, translator_class)
|
|
153
|
+
|
|
154
|
+
@staticmethod
|
|
155
|
+
def tts_key(
|
|
156
|
+
*,
|
|
157
|
+
translated_text: str,
|
|
158
|
+
voice_sample_bytes: bytes | None,
|
|
159
|
+
language: str,
|
|
160
|
+
exaggeration: float | None = None,
|
|
161
|
+
cfg_weight: float | None = None,
|
|
162
|
+
temperature: float | None = None,
|
|
163
|
+
) -> str:
|
|
164
|
+
"""Per-segment key over text + voice sample + language + expressiveness.
|
|
165
|
+
|
|
166
|
+
``exaggeration`` / ``cfg_weight`` / ``temperature`` are the M4
|
|
167
|
+
Chatterbox knobs. Defaulting to ``None`` keeps pre-M4 callers that
|
|
168
|
+
omit them hashing the same way (no-knob profile collides with
|
|
169
|
+
absent kwargs), so cache invalidation is driven by *passing
|
|
170
|
+
non-None values*, not by the M4 code path being present.
|
|
171
|
+
"""
|
|
172
|
+
h = hashlib.sha256()
|
|
173
|
+
h.update(translated_text.encode("utf-8"))
|
|
174
|
+
h.update(b"\x00")
|
|
175
|
+
h.update(voice_sample_bytes or b"")
|
|
176
|
+
h.update(b"\x00")
|
|
177
|
+
h.update(language.encode("utf-8"))
|
|
178
|
+
for knob in (exaggeration, cfg_weight, temperature):
|
|
179
|
+
h.update(b"\x00")
|
|
180
|
+
h.update(repr(knob).encode("utf-8"))
|
|
181
|
+
return h.hexdigest()[:16]
|
|
182
|
+
|
|
183
|
+
# ----- path resolution -------------------------------------------------
|
|
184
|
+
|
|
185
|
+
def _paths_for(self, src_hash: str) -> _ArtifactPaths:
|
|
186
|
+
src_dir = self.root / src_hash
|
|
187
|
+
return _ArtifactPaths(
|
|
188
|
+
src_dir=src_dir,
|
|
189
|
+
metadata=src_dir / "metadata.json",
|
|
190
|
+
transcription=src_dir / "transcription.json",
|
|
191
|
+
tts_dir=src_dir / "tts",
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
def _ensure_metadata(self, paths: _ArtifactPaths, hash_inputs: dict[str, Any]) -> None:
|
|
195
|
+
"""Create the source dir + metadata.json if missing.
|
|
196
|
+
|
|
197
|
+
``hash_inputs`` records the kwargs we hashed against so a future
|
|
198
|
+
schema change can audit cache entries. The schema field is
|
|
199
|
+
load-bearing: mismatched versions invalidate the entire source dir.
|
|
200
|
+
"""
|
|
201
|
+
paths.src_dir.mkdir(parents=True, exist_ok=True)
|
|
202
|
+
paths.tts_dir.mkdir(parents=True, exist_ok=True)
|
|
203
|
+
if not paths.metadata.exists():
|
|
204
|
+
paths.metadata.write_text(
|
|
205
|
+
json.dumps(
|
|
206
|
+
{"schema": SCHEMA_VERSION, "hash_inputs": hash_inputs},
|
|
207
|
+
indent=2,
|
|
208
|
+
),
|
|
209
|
+
encoding="utf-8",
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
def _schema_ok(self, paths: _ArtifactPaths) -> bool:
|
|
213
|
+
if not paths.metadata.exists():
|
|
214
|
+
return True # fresh dir; we'll write metadata on first put.
|
|
215
|
+
try:
|
|
216
|
+
data = json.loads(paths.metadata.read_text(encoding="utf-8"))
|
|
217
|
+
except (OSError, json.JSONDecodeError):
|
|
218
|
+
return False
|
|
219
|
+
return data.get("schema") == SCHEMA_VERSION
|
|
220
|
+
|
|
221
|
+
# ----- transcription ---------------------------------------------------
|
|
222
|
+
|
|
223
|
+
def get_transcription(self, src_hash: str, kwargs_hash: str) -> Transcription | None:
|
|
224
|
+
from videopython.base.text.transcription import Transcription
|
|
225
|
+
|
|
226
|
+
paths = self._paths_for(src_hash)
|
|
227
|
+
if not paths.transcription.exists() or not self._schema_ok(paths):
|
|
228
|
+
return None
|
|
229
|
+
try:
|
|
230
|
+
data = json.loads(paths.transcription.read_text(encoding="utf-8"))
|
|
231
|
+
except (OSError, json.JSONDecodeError):
|
|
232
|
+
return None
|
|
233
|
+
if data.get("kwargs_hash") != kwargs_hash:
|
|
234
|
+
return None
|
|
235
|
+
logger.info("cache hit: transcription (%s)", src_hash)
|
|
236
|
+
return Transcription.from_dict(data["transcription"])
|
|
237
|
+
|
|
238
|
+
def put_transcription(
|
|
239
|
+
self,
|
|
240
|
+
src_hash: str,
|
|
241
|
+
kwargs_hash: str,
|
|
242
|
+
transcription: Transcription,
|
|
243
|
+
hash_inputs: dict[str, Any],
|
|
244
|
+
) -> None:
|
|
245
|
+
paths = self._paths_for(src_hash)
|
|
246
|
+
self._ensure_metadata(paths, hash_inputs)
|
|
247
|
+
paths.transcription.write_text(
|
|
248
|
+
json.dumps(
|
|
249
|
+
{"kwargs_hash": kwargs_hash, "transcription": transcription.to_dict()},
|
|
250
|
+
ensure_ascii=False,
|
|
251
|
+
),
|
|
252
|
+
encoding="utf-8",
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# ----- translation -----------------------------------------------------
|
|
256
|
+
|
|
257
|
+
def get_translation(self, src_hash: str, lang_key: str) -> list[dict[str, Any]] | None:
|
|
258
|
+
paths = self._paths_for(src_hash)
|
|
259
|
+
if not self._schema_ok(paths):
|
|
260
|
+
return None
|
|
261
|
+
path = paths.translation_path(lang_key)
|
|
262
|
+
if not path.exists():
|
|
263
|
+
return None
|
|
264
|
+
try:
|
|
265
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
266
|
+
except (OSError, json.JSONDecodeError):
|
|
267
|
+
return None
|
|
268
|
+
logger.info("cache hit: translation (%s/%s)", src_hash, lang_key)
|
|
269
|
+
return data["segments"]
|
|
270
|
+
|
|
271
|
+
def put_translation(self, src_hash: str, lang_key: str, segments_dict: list[dict[str, Any]]) -> None:
|
|
272
|
+
paths = self._paths_for(src_hash)
|
|
273
|
+
self._ensure_metadata(paths, {})
|
|
274
|
+
paths.translation_path(lang_key).write_text(
|
|
275
|
+
json.dumps({"segments": segments_dict}, ensure_ascii=False),
|
|
276
|
+
encoding="utf-8",
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
# ----- tts -------------------------------------------------------------
|
|
280
|
+
|
|
281
|
+
def get_tts_path(self, src_hash: str, seg_key: str) -> Path | None:
|
|
282
|
+
paths = self._paths_for(src_hash)
|
|
283
|
+
if not self._schema_ok(paths):
|
|
284
|
+
return None
|
|
285
|
+
path = paths.tts_path(seg_key)
|
|
286
|
+
return path if path.exists() else None
|
|
287
|
+
|
|
288
|
+
def reserve_tts_path(self, src_hash: str, seg_key: str) -> Path:
|
|
289
|
+
"""Return the path TTS output should be written to. Caller is
|
|
290
|
+
responsible for the actual write (Audio.save)."""
|
|
291
|
+
paths = self._paths_for(src_hash)
|
|
292
|
+
self._ensure_metadata(paths, {})
|
|
293
|
+
return paths.tts_path(seg_key)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def dub_cache_clear(cache_dir: str | Path, src_hash: str | None = None) -> None:
|
|
297
|
+
"""Delete cache entries for a specific source or the whole cache root.
|
|
298
|
+
|
|
299
|
+
No auto-eviction in M3.2 — call this to reclaim disk space when a
|
|
300
|
+
cache directory has grown unwieldy. Safe no-op if ``cache_dir`` or
|
|
301
|
+
``cache_dir/<src_hash>`` does not exist.
|
|
302
|
+
"""
|
|
303
|
+
import shutil
|
|
304
|
+
|
|
305
|
+
root = Path(cache_dir)
|
|
306
|
+
target = root / src_hash if src_hash else root
|
|
307
|
+
if target.exists():
|
|
308
|
+
shutil.rmtree(target)
|
|
309
|
+
logger.info("dub_cache_clear: removed %s", target)
|
|
@@ -50,6 +50,13 @@ class VideoDubber:
|
|
|
50
50
|
See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
|
|
51
51
|
for tradeoffs (Qwen3 is slower on CPU but produces
|
|
52
52
|
context-aware, length-budgeted output).
|
|
53
|
+
cache_dir: When set, persist transcription, translated segments,
|
|
54
|
+
and per-segment TTS WAVs under this directory and skip stages
|
|
55
|
+
whose inputs already match a cache entry. Use to resume crashed
|
|
56
|
+
long runs or to iterate on dub configuration without paying
|
|
57
|
+
transcription cost each time. ``None`` (default) disables
|
|
58
|
+
caching. Cache grows unbounded; clear via
|
|
59
|
+
:func:`videopython.ai.dubbing.cache.dub_cache_clear`.
|
|
53
60
|
"""
|
|
54
61
|
|
|
55
62
|
def __init__(
|
|
@@ -62,6 +69,7 @@ class VideoDubber:
|
|
|
62
69
|
logprob_threshold: float | None = -1.0,
|
|
63
70
|
strict_quality: bool = False,
|
|
64
71
|
translator: TranslatorChoice = "auto",
|
|
72
|
+
cache_dir: str | Path | None = None,
|
|
65
73
|
):
|
|
66
74
|
self.device = device
|
|
67
75
|
self.low_memory = low_memory
|
|
@@ -71,14 +79,16 @@ class VideoDubber:
|
|
|
71
79
|
self.logprob_threshold = logprob_threshold
|
|
72
80
|
self.strict_quality = strict_quality
|
|
73
81
|
self.translator = translator
|
|
82
|
+
self.cache_dir = cache_dir
|
|
74
83
|
self._local_pipeline: Any = None
|
|
75
84
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
76
85
|
logger.info(
|
|
77
|
-
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
86
|
+
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
|
|
78
87
|
requested,
|
|
79
88
|
low_memory,
|
|
80
89
|
whisper_model,
|
|
81
90
|
translator,
|
|
91
|
+
cache_dir,
|
|
82
92
|
)
|
|
83
93
|
|
|
84
94
|
def _init_local_pipeline(self) -> None:
|
|
@@ -93,6 +103,7 @@ class VideoDubber:
|
|
|
93
103
|
logprob_threshold=self.logprob_threshold,
|
|
94
104
|
strict_quality=self.strict_quality,
|
|
95
105
|
translator=self.translator,
|
|
106
|
+
cache_dir=self.cache_dir,
|
|
96
107
|
)
|
|
97
108
|
|
|
98
109
|
def dub(
|
|
@@ -175,6 +186,7 @@ class VideoDubber:
|
|
|
175
186
|
enable_diarization: bool = False,
|
|
176
187
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
177
188
|
transcription: Any = None,
|
|
189
|
+
keep_original_audio: bool = False,
|
|
178
190
|
) -> DubbingResult:
|
|
179
191
|
"""Dub a video file in place on disk without loading video frames into memory.
|
|
180
192
|
|
|
@@ -201,6 +213,8 @@ class VideoDubber:
|
|
|
201
213
|
step. Speaker labels on the supplied transcription drive per-speaker
|
|
202
214
|
voice cloning. If it has no speakers, pass ``enable_diarization=True``
|
|
203
215
|
to add them via pyannote (requires word-level timings).
|
|
216
|
+
keep_original_audio: If True, retain the source audio in the output
|
|
217
|
+
as a secondary track behind the dubbed one (editorial A/B).
|
|
204
218
|
|
|
205
219
|
Returns:
|
|
206
220
|
``DubbingResult`` with the dubbed audio, translated segments, and
|
|
@@ -239,6 +253,7 @@ class VideoDubber:
|
|
|
239
253
|
video_path=input_path,
|
|
240
254
|
audio=result.dubbed_audio,
|
|
241
255
|
output_path=output_path,
|
|
256
|
+
keep_original_audio=keep_original_audio,
|
|
242
257
|
)
|
|
243
258
|
|
|
244
259
|
return result
|
|
@@ -19,6 +19,42 @@ if TYPE_CHECKING:
|
|
|
19
19
|
CLEAN_SPEED_TOLERANCE = 0.01
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
@dataclass(frozen=True)
|
|
23
|
+
class Expressiveness:
|
|
24
|
+
"""Chatterbox ``generate()`` knobs derived from source-segment prosody.
|
|
25
|
+
|
|
26
|
+
``None`` on any field means "let Chatterbox use its own default" —
|
|
27
|
+
avoids pinning the dub against future Chatterbox default changes.
|
|
28
|
+
|
|
29
|
+
Attributes:
|
|
30
|
+
exaggeration: Emotional intensity. Chatterbox default ``0.5``;
|
|
31
|
+
``0.7+`` produces dramatic output.
|
|
32
|
+
cfg_weight: Classifier-free guidance weight. Chatterbox default
|
|
33
|
+
``0.5``; lower values (~``0.3``) slow pacing.
|
|
34
|
+
temperature: Sampling temperature. Chatterbox default ``0.8``.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
exaggeration: float | None = None
|
|
38
|
+
cfg_weight: float | None = None
|
|
39
|
+
temperature: float | None = None
|
|
40
|
+
|
|
41
|
+
def as_kwargs(self) -> dict[str, float]:
|
|
42
|
+
"""Knobs as a dict, dropping ``None`` entries.
|
|
43
|
+
|
|
44
|
+
Suitable for ``**``-expansion into Chatterbox or
|
|
45
|
+
:meth:`DubCache.tts_key`.
|
|
46
|
+
"""
|
|
47
|
+
return {
|
|
48
|
+
name: value
|
|
49
|
+
for name, value in (
|
|
50
|
+
("exaggeration", self.exaggeration),
|
|
51
|
+
("cfg_weight", self.cfg_weight),
|
|
52
|
+
("temperature", self.temperature),
|
|
53
|
+
)
|
|
54
|
+
if value is not None
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
|
|
22
58
|
@dataclass
|
|
23
59
|
class TranslatedSegment:
|
|
24
60
|
"""A segment of translated text with timing information.
|
|
@@ -59,6 +95,31 @@ class TranslatedSegment:
|
|
|
59
95
|
"""Duration of the segment in seconds."""
|
|
60
96
|
return self.end - self.start
|
|
61
97
|
|
|
98
|
+
def to_dict(self) -> dict[str, Any]:
|
|
99
|
+
"""Convert to dictionary for JSON serialization (used by the dub cache)."""
|
|
100
|
+
return {
|
|
101
|
+
"original_segment": self.original_segment.to_dict(),
|
|
102
|
+
"translated_text": self.translated_text,
|
|
103
|
+
"source_lang": self.source_lang,
|
|
104
|
+
"target_lang": self.target_lang,
|
|
105
|
+
"speaker": self.speaker,
|
|
106
|
+
"start": self.start,
|
|
107
|
+
"end": self.end,
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
@classmethod
|
|
111
|
+
def from_dict(cls, data: dict[str, Any]) -> TranslatedSegment:
|
|
112
|
+
"""Reconstruct from a dict produced by :meth:`to_dict`."""
|
|
113
|
+
return cls(
|
|
114
|
+
original_segment=TranscriptionSegment.from_dict(data["original_segment"]),
|
|
115
|
+
translated_text=data["translated_text"],
|
|
116
|
+
source_lang=data["source_lang"],
|
|
117
|
+
target_lang=data["target_lang"],
|
|
118
|
+
speaker=data.get("speaker"),
|
|
119
|
+
start=data.get("start", 0.0),
|
|
120
|
+
end=data.get("end", 0.0),
|
|
121
|
+
)
|
|
122
|
+
|
|
62
123
|
|
|
63
124
|
@dataclass
|
|
64
125
|
class SeparatedAudio:
|