videopython 0.32.0__tar.gz → 0.33.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.32.0 → videopython-0.33.0}/PKG-INFO +1 -1
- {videopython-0.32.0 → videopython-0.33.0}/pyproject.toml +1 -1
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/__init__.py +2 -0
- videopython-0.33.0/src/videopython/ai/dubbing/config.py +80 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/dubber.py +12 -76
- videopython-0.33.0/src/videopython/ai/dubbing/expressiveness.py +47 -0
- videopython-0.33.0/src/videopython/ai/dubbing/loudness.py +86 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/models.py +50 -69
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/pipeline.py +69 -339
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/quality.py +5 -26
- videopython-0.33.0/src/videopython/ai/dubbing/voice_sample.py +152 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/__init__.py +39 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/analyzer.py +490 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/models.py +228 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/sampling.py +113 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/stages.py +354 -0
- videopython-0.32.0/src/videopython/ai/video_analysis.py +0 -1181
- {videopython-0.32.0 → videopython-0.33.0}/.gitignore +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/LICENSE +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/README.md +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/__init__.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/audio/audio.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/_ffmpeg.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/_video_io.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/description.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/image_text.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/transcription.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/video.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/effects.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/operation.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/streaming.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/transcription_overlay.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/transforms.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.32.0 → videopython-0.33.0}/src/videopython/py.typed +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Local video dubbing functionality."""
|
|
2
2
|
|
|
3
|
+
from videopython.ai.dubbing.config import DubbingConfig
|
|
3
4
|
from videopython.ai.dubbing.dubber import VideoDubber
|
|
4
5
|
from videopython.ai.dubbing.models import (
|
|
5
6
|
DubbingResult,
|
|
@@ -15,6 +16,7 @@ from videopython.ai.generation.translation import UnsupportedLanguageError
|
|
|
15
16
|
|
|
16
17
|
__all__ = [
|
|
17
18
|
"VideoDubber",
|
|
19
|
+
"DubbingConfig",
|
|
18
20
|
"DubbingResult",
|
|
19
21
|
"RevoiceResult",
|
|
20
22
|
"TranslatedSegment",
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Configuration model for the dubbing pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict
|
|
8
|
+
|
|
9
|
+
TranslatorChoice = Literal["auto", "marian", "qwen3"]
|
|
10
|
+
WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DubbingConfig(BaseModel):
|
|
14
|
+
"""Knobs shared by :class:`VideoDubber` and :class:`LocalDubbingPipeline`.
|
|
15
|
+
|
|
16
|
+
Accepted as either ``config=DubbingConfig(...)`` or flat kwargs on the
|
|
17
|
+
two constructors; the flat path builds a ``DubbingConfig`` internally.
|
|
18
|
+
|
|
19
|
+
Attributes:
|
|
20
|
+
device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
|
|
21
|
+
low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
|
|
22
|
+
Chatterbox TTS) is unloaded from memory after it runs, so only one
|
|
23
|
+
model is resident at a time. Trades per-run latency (~10-30s of
|
|
24
|
+
extra model loads) for a much lower memory ceiling. Recommended
|
|
25
|
+
for GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
|
|
26
|
+
whisper_model: Whisper model size used for transcription. Larger
|
|
27
|
+
models give better accuracy at the cost of VRAM and latency. One
|
|
28
|
+
of ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
|
|
29
|
+
Default ``turbo``.
|
|
30
|
+
condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
|
|
31
|
+
``False`` (Whisper's own default is ``True``). With conditioning
|
|
32
|
+
on, a single hallucinated filler phrase cascades through the rest
|
|
33
|
+
of the file. See ``AudioToText`` for the full rationale.
|
|
34
|
+
no_speech_threshold: Forwarded to ``AudioToText``. Whisper's
|
|
35
|
+
no-speech gate; raise to drop more low-confidence windows.
|
|
36
|
+
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
37
|
+
log-probability gate.
|
|
38
|
+
vocabulary: Forwarded to ``AudioToText``. Optional list of brand
|
|
39
|
+
names, product names, or proper nouns to bias Whisper's
|
|
40
|
+
first-window decoder via ``initial_prompt``. Recovers
|
|
41
|
+
near-mishears (e.g. Klarna -> "carna") on brand-monitoring
|
|
42
|
+
inputs without new model deps.
|
|
43
|
+
strict_quality: When True, the pipeline raises
|
|
44
|
+
:class:`GarbageTranscriptError` before Demucs/translation/TTS
|
|
45
|
+
run if the transcript-quality heuristic returns ``"reject"``.
|
|
46
|
+
When False (default), low-quality transcripts are logged at
|
|
47
|
+
WARNING but processing continues. Either way the
|
|
48
|
+
:class:`TranscriptQuality` is exposed on ``DubbingResult`` for
|
|
49
|
+
inspection.
|
|
50
|
+
translator: Translation backend to use. ``"auto"`` (default) picks
|
|
51
|
+
Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and ``"qwen3"`` force
|
|
52
|
+
the named backend regardless of device. See
|
|
53
|
+
:class:`videopython.ai.generation.qwen3.Qwen3Translator` for
|
|
54
|
+
tradeoffs (Qwen3 is slower on CPU but produces context-aware,
|
|
55
|
+
length-budgeted output).
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
model_config = ConfigDict(frozen=True)
|
|
59
|
+
|
|
60
|
+
device: str | None = None
|
|
61
|
+
low_memory: bool = False
|
|
62
|
+
whisper_model: WhisperModel = "turbo"
|
|
63
|
+
condition_on_previous_text: bool = False
|
|
64
|
+
no_speech_threshold: float = 0.6
|
|
65
|
+
logprob_threshold: float | None = -1.0
|
|
66
|
+
vocabulary: list[str] | None = None
|
|
67
|
+
strict_quality: bool = False
|
|
68
|
+
translator: TranslatorChoice = "auto"
|
|
69
|
+
|
|
70
|
+
def init_log_fields(self) -> dict[str, object]:
|
|
71
|
+
"""Subset of fields surfaced in the init-log line.
|
|
72
|
+
|
|
73
|
+
Hand-picked so log noise stays bounded as the config grows.
|
|
74
|
+
"""
|
|
75
|
+
return {
|
|
76
|
+
"device": self.device.lower() if isinstance(self.device, str) else "auto",
|
|
77
|
+
"low_memory": self.low_memory,
|
|
78
|
+
"whisper_model": self.whisper_model,
|
|
79
|
+
"translator": self.translator,
|
|
80
|
+
}
|
|
@@ -6,8 +6,8 @@ import logging
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import TYPE_CHECKING, Any, Callable
|
|
8
8
|
|
|
9
|
+
from videopython.ai.dubbing.config import DubbingConfig
|
|
9
10
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
|
|
10
|
-
from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
13
|
from videopython.base.video import Video
|
|
@@ -18,90 +18,26 @@ logger = logging.getLogger(__name__)
|
|
|
18
18
|
class VideoDubber:
|
|
19
19
|
"""Dubs videos into different languages using the local pipeline.
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
model is resident at a time. Trades per-run latency (~10-30s of
|
|
26
|
-
extra model loads) for a much lower memory ceiling. Recommended for
|
|
27
|
-
GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
|
|
28
|
-
whisper_model: Whisper model size used for transcription. Larger models
|
|
29
|
-
give better accuracy at the cost of VRAM and latency. One of
|
|
30
|
-
``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
|
|
31
|
-
Default ``turbo``.
|
|
32
|
-
condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
|
|
33
|
-
``False`` (Whisper's own default is ``True``). With conditioning on,
|
|
34
|
-
a single hallucinated filler phrase cascades through the rest of
|
|
35
|
-
the file. See ``AudioToText`` for the full rationale.
|
|
36
|
-
no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
|
|
37
|
-
gate; raise to drop more low-confidence windows.
|
|
38
|
-
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
39
|
-
log-probability gate.
|
|
40
|
-
vocabulary: Forwarded to ``AudioToText``. Optional list of brand
|
|
41
|
-
names, product names, or proper nouns to bias Whisper's first-
|
|
42
|
-
window decoder via ``initial_prompt``. Recovers near-mishears
|
|
43
|
-
(e.g. Klarna → "carna") on brand-monitoring inputs without new
|
|
44
|
-
model deps.
|
|
45
|
-
strict_quality: When True, the pipeline raises
|
|
46
|
-
:class:`GarbageTranscriptError` before Demucs/translation/TTS run
|
|
47
|
-
if the transcript-quality heuristic returns ``"reject"``. When
|
|
48
|
-
False (default), low-quality transcripts are logged at WARNING
|
|
49
|
-
but processing continues. Either way the
|
|
50
|
-
:class:`TranscriptQuality` is exposed on ``DubbingResult`` for
|
|
51
|
-
inspection.
|
|
52
|
-
translator: Translation backend to use. ``"auto"`` (default)
|
|
53
|
-
picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
|
|
54
|
-
``"qwen3"`` force the named backend regardless of device.
|
|
55
|
-
See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
|
|
56
|
-
for tradeoffs (Qwen3 is slower on CPU but produces
|
|
57
|
-
context-aware, length-budgeted output).
|
|
21
|
+
Accepts either a :class:`DubbingConfig` or the same knobs as flat kwargs
|
|
22
|
+
(``device``, ``low_memory``, ``whisper_model``, ``translator``, etc.) --
|
|
23
|
+
the flat path builds a ``DubbingConfig`` internally. See
|
|
24
|
+
:class:`DubbingConfig` for the full knob list and defaults.
|
|
58
25
|
"""
|
|
59
26
|
|
|
60
|
-
def __init__(
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
whisper_model: WhisperModel = "turbo",
|
|
65
|
-
condition_on_previous_text: bool = False,
|
|
66
|
-
no_speech_threshold: float = 0.6,
|
|
67
|
-
logprob_threshold: float | None = -1.0,
|
|
68
|
-
vocabulary: list[str] | None = None,
|
|
69
|
-
strict_quality: bool = False,
|
|
70
|
-
translator: TranslatorChoice = "auto",
|
|
71
|
-
):
|
|
72
|
-
self.device = device
|
|
73
|
-
self.low_memory = low_memory
|
|
74
|
-
self.whisper_model = whisper_model
|
|
75
|
-
self.condition_on_previous_text = condition_on_previous_text
|
|
76
|
-
self.no_speech_threshold = no_speech_threshold
|
|
77
|
-
self.logprob_threshold = logprob_threshold
|
|
78
|
-
self.vocabulary = vocabulary
|
|
79
|
-
self.strict_quality = strict_quality
|
|
80
|
-
self.translator = translator
|
|
27
|
+
def __init__(self, config: DubbingConfig | None = None, **kwargs: Any):
|
|
28
|
+
if config is not None and kwargs:
|
|
29
|
+
raise TypeError("Pass either `config=` or knob kwargs, not both")
|
|
30
|
+
self.config = config or DubbingConfig(**kwargs)
|
|
81
31
|
self._local_pipeline: Any = None
|
|
82
|
-
requested = device.lower() if isinstance(device, str) else "auto"
|
|
83
32
|
logger.info(
|
|
84
|
-
"VideoDubber initialized with
|
|
85
|
-
|
|
86
|
-
low_memory,
|
|
87
|
-
whisper_model,
|
|
88
|
-
translator,
|
|
33
|
+
"VideoDubber initialized with %s",
|
|
34
|
+
" ".join(f"{k}={v}" for k, v in self.config.init_log_fields().items()),
|
|
89
35
|
)
|
|
90
36
|
|
|
91
37
|
def _init_local_pipeline(self) -> None:
|
|
92
38
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
93
39
|
|
|
94
|
-
self._local_pipeline = LocalDubbingPipeline(
|
|
95
|
-
device=self.device,
|
|
96
|
-
low_memory=self.low_memory,
|
|
97
|
-
whisper_model=self.whisper_model,
|
|
98
|
-
condition_on_previous_text=self.condition_on_previous_text,
|
|
99
|
-
no_speech_threshold=self.no_speech_threshold,
|
|
100
|
-
logprob_threshold=self.logprob_threshold,
|
|
101
|
-
vocabulary=self.vocabulary,
|
|
102
|
-
strict_quality=self.strict_quality,
|
|
103
|
-
translator=self.translator,
|
|
104
|
-
)
|
|
40
|
+
self._local_pipeline = LocalDubbingPipeline(config=self.config)
|
|
105
41
|
|
|
106
42
|
def dub(
|
|
107
43
|
self,
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Source-prosody-driven expressiveness knobs for Chatterbox TTS."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from videopython.ai.dubbing.models import Expressiveness
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from videopython.audio import Audio
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Prosody-conditioning thresholds. Source-segment RMS / whole-vocals RMS
|
|
16
|
+
# below CALM lands in the calm bucket; above DRAMATIC in the dramatic
|
|
17
|
+
# bucket; in between gets Chatterbox's defaults. Knob values picked
|
|
18
|
+
# by-ear on cam1_1min.mp4 -- see RELEASE_NOTES 0.29.0.
|
|
19
|
+
CALM_RATIO_THRESHOLD = 0.7
|
|
20
|
+
DRAMATIC_RATIO_THRESHOLD = 1.3
|
|
21
|
+
_CALM = Expressiveness(exaggeration=0.3, cfg_weight=0.7)
|
|
22
|
+
_DRAMATIC = Expressiveness(exaggeration=0.85, cfg_weight=0.35)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def rms(data: np.ndarray) -> float:
|
|
26
|
+
"""RMS over samples; ``0.0`` for empty input. float64 reduction so a
|
|
27
|
+
long slice can't overflow the squared accumulator."""
|
|
28
|
+
if data.size == 0:
|
|
29
|
+
return 0.0
|
|
30
|
+
return float(np.sqrt(np.mean(np.square(data, dtype=np.float64))))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def expressiveness_for(source_slice: Audio, baseline_rms: float) -> Expressiveness:
|
|
34
|
+
"""Map a source vocals slice to a Chatterbox expressiveness profile
|
|
35
|
+
by RMS ratio. Falls back to the no-knobs default for empty or silent
|
|
36
|
+
inputs."""
|
|
37
|
+
if baseline_rms <= 0.0:
|
|
38
|
+
return Expressiveness()
|
|
39
|
+
segment_rms = rms(source_slice.data)
|
|
40
|
+
if segment_rms <= 0.0:
|
|
41
|
+
return Expressiveness()
|
|
42
|
+
ratio = segment_rms / baseline_rms
|
|
43
|
+
if ratio < CALM_RATIO_THRESHOLD:
|
|
44
|
+
return _CALM
|
|
45
|
+
if ratio > DRAMATIC_RATIO_THRESHOLD:
|
|
46
|
+
return _DRAMATIC
|
|
47
|
+
return Expressiveness()
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""LUFS / peak loudness matching for dubbed audio."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from videopython.audio import Audio
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# BS.1770 integrated-loudness measurement requires at least 400 ms of audio
|
|
14
|
+
# (one gating block). Below this, fall back to peak match -- pyloudnorm
|
|
15
|
+
# returns -inf or warns, neither of which gives a usable gain.
|
|
16
|
+
_LUFS_MIN_DURATION_SECONDS = 0.4
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def peak_match(target: Audio, reference: Audio) -> Audio:
|
|
20
|
+
"""Scale ``target`` so its peak amplitude matches ``reference``.
|
|
21
|
+
|
|
22
|
+
Used as the fallback when LUFS measurement isn't viable (clip < 0.4s
|
|
23
|
+
or silent input). The new ``Audio`` shares no buffer with ``target``.
|
|
24
|
+
"""
|
|
25
|
+
from videopython.audio import Audio as _Audio
|
|
26
|
+
|
|
27
|
+
target_peak = float(np.max(np.abs(target.data))) if target.data.size else 0.0
|
|
28
|
+
reference_peak = float(np.max(np.abs(reference.data))) if reference.data.size else 0.0
|
|
29
|
+
|
|
30
|
+
if target_peak <= 0.0 or reference_peak <= 0.0:
|
|
31
|
+
return target
|
|
32
|
+
|
|
33
|
+
scale = reference_peak / target_peak
|
|
34
|
+
if abs(scale - 1.0) < 1e-3:
|
|
35
|
+
return target
|
|
36
|
+
|
|
37
|
+
return _Audio(target.data * scale, target.metadata)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def loudness_match(target: Audio, reference: Audio) -> Audio:
|
|
41
|
+
"""Scale ``target`` so its integrated loudness (BS.1770 / LUFS) matches ``reference``.
|
|
42
|
+
|
|
43
|
+
Demucs background normalization and the timing-assembler peak guard
|
|
44
|
+
each clamp at 1.0 instead of restoring perceived loudness, so a
|
|
45
|
+
dubbed mix lands perceptually "thinner" than the source even after
|
|
46
|
+
peak match. LUFS captures the ear-weighted envelope that peak ratio
|
|
47
|
+
misses on dialogue-heavy material.
|
|
48
|
+
|
|
49
|
+
Falls back to :func:`peak_match` when either clip is shorter than
|
|
50
|
+
the BS.1770 gating block (400 ms) or when measurement returns -inf
|
|
51
|
+
(silent or near-silent gated content). After gain is applied, peaks
|
|
52
|
+
are clamped to 0.99 -- BS.1770 has no peak ceiling and a sufficiently
|
|
53
|
+
quiet source can demand gain that would otherwise clip.
|
|
54
|
+
"""
|
|
55
|
+
from videopython.audio import Audio as _Audio
|
|
56
|
+
|
|
57
|
+
target_dur = target.metadata.duration_seconds
|
|
58
|
+
ref_dur = reference.metadata.duration_seconds
|
|
59
|
+
if target_dur < _LUFS_MIN_DURATION_SECONDS or ref_dur < _LUFS_MIN_DURATION_SECONDS:
|
|
60
|
+
return peak_match(target, reference)
|
|
61
|
+
|
|
62
|
+
if not target.data.size or not reference.data.size:
|
|
63
|
+
return target
|
|
64
|
+
|
|
65
|
+
import pyloudnorm
|
|
66
|
+
|
|
67
|
+
target_lufs = pyloudnorm.Meter(target.metadata.sample_rate).integrated_loudness(target.data)
|
|
68
|
+
reference_lufs = pyloudnorm.Meter(reference.metadata.sample_rate).integrated_loudness(reference.data)
|
|
69
|
+
|
|
70
|
+
# Either clip's gated content was below -70 LUFS (effectively silent
|
|
71
|
+
# under BS.1770). Gain would be undefined -- fall back to peak match,
|
|
72
|
+
# which has its own silent-input no-op.
|
|
73
|
+
if not np.isfinite(target_lufs) or not np.isfinite(reference_lufs):
|
|
74
|
+
return peak_match(target, reference)
|
|
75
|
+
|
|
76
|
+
gain_db = reference_lufs - target_lufs
|
|
77
|
+
if abs(gain_db) < 0.1:
|
|
78
|
+
return target
|
|
79
|
+
scale = float(10 ** (gain_db / 20.0))
|
|
80
|
+
|
|
81
|
+
scaled = target.data * scale
|
|
82
|
+
peak = float(np.max(np.abs(scaled)))
|
|
83
|
+
if peak > 0.99:
|
|
84
|
+
scaled = scaled * (0.99 / peak)
|
|
85
|
+
|
|
86
|
+
return _Audio(scaled, target.metadata)
|
|
@@ -2,14 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from
|
|
6
|
-
from typing import TYPE_CHECKING, Any
|
|
5
|
+
from typing import TYPE_CHECKING, Annotated, Any
|
|
7
6
|
|
|
7
|
+
from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, model_validator
|
|
8
|
+
|
|
9
|
+
from videopython.ai.dubbing.quality import TranscriptQuality
|
|
8
10
|
from videopython.audio import Audio
|
|
9
11
|
from videopython.base.transcription import Transcription, TranscriptionSegment
|
|
10
12
|
|
|
11
13
|
if TYPE_CHECKING:
|
|
12
|
-
from videopython.ai.dubbing.quality import TranscriptQuality
|
|
13
14
|
from videopython.ai.dubbing.timing import TimingAdjustment
|
|
14
15
|
|
|
15
16
|
|
|
@@ -19,11 +20,30 @@ if TYPE_CHECKING:
|
|
|
19
20
|
CLEAN_SPEED_TOLERANCE = 0.01
|
|
20
21
|
|
|
21
22
|
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
# TranscriptionSegment and Transcription still live in videopython.base as
|
|
24
|
+
# plain dataclasses with hand-rolled to_dict/from_dict. Bridge them at
|
|
25
|
+
# the field boundary so the dubbing cache wire format stays identical.
|
|
26
|
+
def _validate_transcription_segment(value: Any) -> Any:
|
|
27
|
+
if value is None or isinstance(value, TranscriptionSegment):
|
|
28
|
+
return value
|
|
29
|
+
return TranscriptionSegment.from_dict(value)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _serialize_with_to_dict(value: Any) -> Any:
|
|
33
|
+
return value.to_dict() if value is not None else None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_TranscriptionSegmentField = Annotated[
|
|
37
|
+
TranscriptionSegment,
|
|
38
|
+
BeforeValidator(_validate_transcription_segment),
|
|
39
|
+
PlainSerializer(_serialize_with_to_dict, return_type=dict, when_used="always"),
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Expressiveness(BaseModel):
|
|
24
44
|
"""Chatterbox ``generate()`` knobs derived from source-segment prosody.
|
|
25
45
|
|
|
26
|
-
``None`` on any field means "let Chatterbox use its own default"
|
|
46
|
+
``None`` on any field means "let Chatterbox use its own default" --
|
|
27
47
|
avoids pinning the dub against future Chatterbox default changes.
|
|
28
48
|
|
|
29
49
|
Attributes:
|
|
@@ -34,6 +54,8 @@ class Expressiveness:
|
|
|
34
54
|
temperature: Sampling temperature. Chatterbox default ``0.8``.
|
|
35
55
|
"""
|
|
36
56
|
|
|
57
|
+
model_config = ConfigDict(frozen=True)
|
|
58
|
+
|
|
37
59
|
exaggeration: float | None = None
|
|
38
60
|
cfg_weight: float | None = None
|
|
39
61
|
temperature: float | None = None
|
|
@@ -54,8 +76,7 @@ class Expressiveness:
|
|
|
54
76
|
}
|
|
55
77
|
|
|
56
78
|
|
|
57
|
-
|
|
58
|
-
class TranslatedSegment:
|
|
79
|
+
class TranslatedSegment(BaseModel):
|
|
59
80
|
"""A segment of translated text with timing information.
|
|
60
81
|
|
|
61
82
|
Attributes:
|
|
@@ -68,7 +89,9 @@ class TranslatedSegment:
|
|
|
68
89
|
end: End time in seconds.
|
|
69
90
|
"""
|
|
70
91
|
|
|
71
|
-
|
|
92
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
93
|
+
|
|
94
|
+
original_segment: _TranscriptionSegmentField
|
|
72
95
|
translated_text: str
|
|
73
96
|
source_lang: str
|
|
74
97
|
target_lang: str
|
|
@@ -76,13 +99,17 @@ class TranslatedSegment:
|
|
|
76
99
|
start: float = 0.0
|
|
77
100
|
end: float = 0.0
|
|
78
101
|
|
|
79
|
-
|
|
80
|
-
|
|
102
|
+
@model_validator(mode="after")
|
|
103
|
+
def _default_timing_from_segment(self) -> TranslatedSegment:
|
|
104
|
+
# ``start == end == 0.0`` is the dataclass-era sentinel for "use the
|
|
105
|
+
# original segment's timing." Preserved so legacy callers (and the
|
|
106
|
+
# dub cache wire format) keep working.
|
|
81
107
|
if self.start == 0.0 and self.end == 0.0:
|
|
82
108
|
self.start = self.original_segment.start
|
|
83
109
|
self.end = self.original_segment.end
|
|
84
110
|
if self.speaker is None:
|
|
85
111
|
self.speaker = self.original_segment.speaker
|
|
112
|
+
return self
|
|
86
113
|
|
|
87
114
|
@property
|
|
88
115
|
def original_text(self) -> str:
|
|
@@ -94,34 +121,8 @@ class TranslatedSegment:
|
|
|
94
121
|
"""Duration of the segment in seconds."""
|
|
95
122
|
return self.end - self.start
|
|
96
123
|
|
|
97
|
-
def to_dict(self) -> dict[str, Any]:
|
|
98
|
-
"""Convert to dictionary for JSON serialization (used by the dub cache)."""
|
|
99
|
-
return {
|
|
100
|
-
"original_segment": self.original_segment.to_dict(),
|
|
101
|
-
"translated_text": self.translated_text,
|
|
102
|
-
"source_lang": self.source_lang,
|
|
103
|
-
"target_lang": self.target_lang,
|
|
104
|
-
"speaker": self.speaker,
|
|
105
|
-
"start": self.start,
|
|
106
|
-
"end": self.end,
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
@classmethod
|
|
110
|
-
def from_dict(cls, data: dict[str, Any]) -> TranslatedSegment:
|
|
111
|
-
"""Reconstruct from a dict produced by :meth:`to_dict`."""
|
|
112
|
-
return cls(
|
|
113
|
-
original_segment=TranscriptionSegment.from_dict(data["original_segment"]),
|
|
114
|
-
translated_text=data["translated_text"],
|
|
115
|
-
source_lang=data["source_lang"],
|
|
116
|
-
target_lang=data["target_lang"],
|
|
117
|
-
speaker=data.get("speaker"),
|
|
118
|
-
start=data.get("start", 0.0),
|
|
119
|
-
end=data.get("end", 0.0),
|
|
120
|
-
)
|
|
121
|
-
|
|
122
124
|
|
|
123
|
-
|
|
124
|
-
class SeparatedAudio:
|
|
125
|
+
class SeparatedAudio(BaseModel):
|
|
125
126
|
"""Audio separated into different components.
|
|
126
127
|
|
|
127
128
|
Attributes:
|
|
@@ -132,6 +133,8 @@ class SeparatedAudio:
|
|
|
132
133
|
original: The original unseparated audio.
|
|
133
134
|
"""
|
|
134
135
|
|
|
136
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
137
|
+
|
|
135
138
|
vocals: Audio
|
|
136
139
|
background: Audio
|
|
137
140
|
original: Audio
|
|
@@ -144,8 +147,7 @@ class SeparatedAudio:
|
|
|
144
147
|
return self.music is not None and self.effects is not None
|
|
145
148
|
|
|
146
149
|
|
|
147
|
-
|
|
148
|
-
class TimingSummary:
|
|
150
|
+
class TimingSummary(BaseModel):
|
|
149
151
|
"""Aggregate stats over per-segment timing adjustments.
|
|
150
152
|
|
|
151
153
|
Surfaces how aggressively the timing synchronizer had to compress or
|
|
@@ -201,32 +203,8 @@ class TimingSummary:
|
|
|
201
203
|
max_truncation_seconds=max_truncation,
|
|
202
204
|
)
|
|
203
205
|
|
|
204
|
-
def to_dict(self) -> dict[str, Any]:
|
|
205
|
-
"""Convert to dictionary for JSON serialization."""
|
|
206
|
-
return {
|
|
207
|
-
"total_segments": self.total_segments,
|
|
208
|
-
"clean_count": self.clean_count,
|
|
209
|
-
"stretched_count": self.stretched_count,
|
|
210
|
-
"truncated_count": self.truncated_count,
|
|
211
|
-
"mean_speed_factor": self.mean_speed_factor,
|
|
212
|
-
"max_truncation_seconds": self.max_truncation_seconds,
|
|
213
|
-
}
|
|
214
206
|
|
|
215
|
-
|
|
216
|
-
def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
|
|
217
|
-
"""Create TimingSummary from dictionary."""
|
|
218
|
-
return cls(
|
|
219
|
-
total_segments=data["total_segments"],
|
|
220
|
-
clean_count=data["clean_count"],
|
|
221
|
-
stretched_count=data["stretched_count"],
|
|
222
|
-
truncated_count=data["truncated_count"],
|
|
223
|
-
mean_speed_factor=data["mean_speed_factor"],
|
|
224
|
-
max_truncation_seconds=data["max_truncation_seconds"],
|
|
225
|
-
)
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
@dataclass
|
|
229
|
-
class DubbingResult:
|
|
207
|
+
class DubbingResult(BaseModel):
|
|
230
208
|
"""Result of a video dubbing operation.
|
|
231
209
|
|
|
232
210
|
Attributes:
|
|
@@ -247,16 +225,18 @@ class DubbingResult:
|
|
|
247
225
|
no failure mode that drops segments).
|
|
248
226
|
"""
|
|
249
227
|
|
|
228
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
229
|
+
|
|
250
230
|
dubbed_audio: Audio
|
|
251
231
|
translated_segments: list[TranslatedSegment]
|
|
252
232
|
source_transcription: Transcription
|
|
253
233
|
source_lang: str
|
|
254
234
|
target_lang: str
|
|
255
235
|
separated_audio: SeparatedAudio | None = None
|
|
256
|
-
voice_samples: dict[str, Audio] =
|
|
236
|
+
voice_samples: dict[str, Audio] = Field(default_factory=dict)
|
|
257
237
|
timing_summary: TimingSummary | None = None
|
|
258
238
|
transcript_quality: TranscriptQuality | None = None
|
|
259
|
-
translation_failures: list[int] =
|
|
239
|
+
translation_failures: list[int] = Field(default_factory=list)
|
|
260
240
|
|
|
261
241
|
@property
|
|
262
242
|
def num_segments(self) -> int:
|
|
@@ -283,8 +263,7 @@ class DubbingResult:
|
|
|
283
263
|
return segments_by_speaker
|
|
284
264
|
|
|
285
265
|
|
|
286
|
-
|
|
287
|
-
class RevoiceResult:
|
|
266
|
+
class RevoiceResult(BaseModel):
|
|
288
267
|
"""Result of a voice replacement operation.
|
|
289
268
|
|
|
290
269
|
Attributes:
|
|
@@ -296,6 +275,8 @@ class RevoiceResult:
|
|
|
296
275
|
speech_duration: Duration of the generated speech.
|
|
297
276
|
"""
|
|
298
277
|
|
|
278
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
279
|
+
|
|
299
280
|
revoiced_audio: Audio
|
|
300
281
|
text: str
|
|
301
282
|
separated_audio: SeparatedAudio | None = None
|