videopython 0.27.2__tar.gz → 0.28.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.27.2 → videopython-0.28.1}/PKG-INFO +2 -1
- {videopython-0.27.2 → videopython-0.28.1}/pyproject.toml +6 -1
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/__init__.py +6 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/dubber.py +22 -2
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/models.py +103 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/pipeline.py +235 -32
- videopython-0.28.1/src/videopython/ai/dubbing/quality.py +178 -0
- videopython-0.28.1/src/videopython/ai/generation/qwen3.py +394 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/translation.py +130 -8
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/audio.py +43 -1
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/transcription.py +9 -0
- {videopython-0.27.2 → videopython-0.28.1}/.gitignore +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/LICENSE +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/README.md +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/_device.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/registry.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/combine.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/description.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/effects.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/progress.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/registry.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/scene.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/streaming.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/transforms.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/transitions.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/utils.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/video.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.27.2 → videopython-0.28.1}/src/videopython/py.typed +0 -0
PKG-INFO:

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.27.2
+Version: 0.28.1
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -29,6 +29,7 @@ Requires-Dist: demucs>=4.0.0; extra == 'ai'
 Requires-Dist: diffusers>=0.30.0; extra == 'ai'
 Requires-Dist: easyocr>=1.7.0; extra == 'ai'
 Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
+Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
 Requires-Dist: numba>=0.61.0; extra == 'ai'
 Requires-Dist: ollama>=0.4.5; extra == 'ai'
 Requires-Dist: openai-whisper>=20240930; extra == 'ai'

pyproject.toml:

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.27.2"
+version = "0.28.1"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -80,6 +80,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]

 # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -111,6 +113,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]

 [project.urls]
@@ -136,6 +140,7 @@ module = [
     "pyannote", "pyannote.*",
     "silero_vad", "silero_vad.*",
     "cv2", "cv2.*",
+    "llama_cpp", "llama_cpp.*",
 ]
 ignore_missing_imports = true

src/videopython/ai/dubbing/__init__.py:

@@ -3,7 +3,9 @@
 from videopython.ai.dubbing.dubber import VideoDubber
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
 from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
+from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.translation import UnsupportedLanguageError

 __all__ = [
     "VideoDubber",
@@ -13,4 +15,8 @@ __all__ = [
     "SeparatedAudio",
     "LocalDubbingPipeline",
     "TimingSynchronizer",
+    "GarbageTranscriptError",
+    "TranscriptQuality",
+    "assess_transcript",
+    "UnsupportedLanguageError",
 ]
src/videopython/ai/dubbing/dubber.py:

@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable

 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
-from videopython.ai.dubbing.pipeline import WhisperModel
+from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel

 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -37,6 +37,19 @@ class VideoDubber:
             gate; raise to drop more low-confidence windows.
         logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
             log-probability gate.
+        strict_quality: When True, the pipeline raises
+            :class:`GarbageTranscriptError` before Demucs/translation/TTS run
+            if the transcript-quality heuristic returns ``"reject"``. When
+            False (default), low-quality transcripts are logged at WARNING
+            but processing continues. Either way the
+            :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
+            inspection.
+        translator: Translation backend to use. ``"auto"`` (default)
+            picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
+            ``"qwen3"`` force the named backend regardless of device.
+            See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
+            for tradeoffs (Qwen3 is slower on CPU but produces
+            context-aware, length-budgeted output).
     """

     def __init__(
@@ -47,6 +60,8 @@
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -54,13 +69,16 @@
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
+        self.translator = translator
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )

     def _init_local_pipeline(self) -> None:
@@ -73,6 +91,8 @@
             condition_on_previous_text=self.condition_on_previous_text,
             no_speech_threshold=self.no_speech_threshold,
             logprob_threshold=self.logprob_threshold,
+            strict_quality=self.strict_quality,
+            translator=self.translator,
         )

     def dub(
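For orientation before the models/pipeline hunks below, a minimal usage sketch of the two new knobs. The constructor keywords and exported exception types come from this diff; `dub()`'s exact call shape is not shown here, so the `target_lang` keyword is an assumption:

```python
from videopython.ai.dubbing import (
    DubbingResult,
    GarbageTranscriptError,
    UnsupportedLanguageError,
    VideoDubber,
)
from videopython.base.video import Video


def dub_strictly(video: Video) -> DubbingResult | None:
    dubber = VideoDubber(
        strict_quality=True,  # raise instead of warn on a "reject" transcript
        translator="auto",    # new in 0.28.1: qwen3 on GPU, marian on CPU
    )
    try:
        return dubber.dub(video, target_lang="de")  # keyword name assumed
    except GarbageTranscriptError as err:
        print(f"refused before Demucs/translation/TTS ran: {err}")
    except UnsupportedLanguageError:
        print("neither Marian nor Qwen3 covers the requested language pair")
    return None
```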
src/videopython/ai/dubbing/models.py:

@@ -3,10 +3,21 @@
 from __future__ import annotations

 from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any

 from videopython.base.audio import Audio
 from videopython.base.text.transcription import Transcription, TranscriptionSegment

+if TYPE_CHECKING:
+    from videopython.ai.dubbing.quality import TranscriptQuality
+    from videopython.ai.dubbing.timing import TimingAdjustment
+
+
+# Speed factors within this band of 1.0 are treated as a "clean" timing
+# adjustment (no perceptible compression/stretch). Heuristic threshold for
+# the TimingSummary classification only.
+CLEAN_SPEED_TOLERANCE = 0.01
+

 @dataclass
 class TranslatedSegment:
@@ -73,6 +84,87 @@ class SeparatedAudio:
         return self.music is not None and self.effects is not None


+@dataclass
+class TimingSummary:
+    """Aggregate stats over per-segment timing adjustments.
+
+    Surfaces how aggressively the timing synchronizer had to compress or
+    truncate dubbed segments to fit the source's spoken regions. High
+    truncation rates indicate translation produced text too long for the
+    source duration.
+    """
+
+    total_segments: int
+    clean_count: int
+    stretched_count: int
+    truncated_count: int
+    mean_speed_factor: float
+    max_truncation_seconds: float
+
+    @classmethod
+    def from_adjustments(cls, adjustments: list[TimingAdjustment]) -> TimingSummary:
+        """Aggregate a list of TimingAdjustments into a TimingSummary."""
+        total = len(adjustments)
+        if total == 0:
+            return cls(
+                total_segments=0,
+                clean_count=0,
+                stretched_count=0,
+                truncated_count=0,
+                mean_speed_factor=1.0,
+                max_truncation_seconds=0.0,
+            )
+
+        clean = 0
+        stretched = 0
+        truncated = 0
+        speed_sum = 0.0
+        max_truncation = 0.0
+        for adj in adjustments:
+            speed_sum += adj.speed_factor
+            if adj.was_truncated:
+                truncated += 1
+                truncation = adj.original_duration - adj.actual_duration
+                if truncation > max_truncation:
+                    max_truncation = truncation
+            elif abs(adj.speed_factor - 1.0) <= CLEAN_SPEED_TOLERANCE:
+                clean += 1
+            else:
+                stretched += 1
+
+        return cls(
+            total_segments=total,
+            clean_count=clean,
+            stretched_count=stretched,
+            truncated_count=truncated,
+            mean_speed_factor=speed_sum / total,
+            max_truncation_seconds=max_truncation,
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "total_segments": self.total_segments,
+            "clean_count": self.clean_count,
+            "stretched_count": self.stretched_count,
+            "truncated_count": self.truncated_count,
+            "mean_speed_factor": self.mean_speed_factor,
+            "max_truncation_seconds": self.max_truncation_seconds,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
+        """Create TimingSummary from dictionary."""
+        return cls(
+            total_segments=data["total_segments"],
+            clean_count=data["clean_count"],
+            stretched_count=data["stretched_count"],
+            truncated_count=data["truncated_count"],
+            mean_speed_factor=data["mean_speed_factor"],
+            max_truncation_seconds=data["max_truncation_seconds"],
+        )
+
+
 @dataclass
 class DubbingResult:
     """Result of a video dubbing operation.
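To make the classification buckets concrete, a small worked example. `Adj` is a duck-typed stand-in for `TimingAdjustment` (defined in `timing.py`, which is unchanged and not shown in this diff), carrying only the four fields `from_adjustments` reads; running it requires videopython 0.28.1:

```python
from dataclasses import dataclass

from videopython.ai.dubbing.models import TimingSummary


@dataclass
class Adj:  # stand-in for videopython.ai.dubbing.timing.TimingAdjustment
    speed_factor: float
    was_truncated: bool
    original_duration: float
    actual_duration: float


summary = TimingSummary.from_adjustments([
    Adj(1.0, False, 2.0, 2.0),  # clean: |speed - 1.0| <= CLEAN_SPEED_TOLERANCE
    Adj(1.2, False, 3.0, 3.0),  # stretched: 20% compression, no truncation
    Adj(1.3, True, 4.0, 3.1),   # truncated: was_truncated wins over the speed check
])
assert (summary.clean_count, summary.stretched_count, summary.truncated_count) == (1, 1, 1)
assert summary.max_truncation_seconds == 4.0 - 3.1  # 0.9s of speech cut
print(summary.to_dict())
```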
src/videopython/ai/dubbing/models.py (continued):

@@ -85,6 +177,14 @@ class DubbingResult:
         target_lang: Target language for dubbing.
         separated_audio: Separated audio components (if preserve_background=True).
         voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
+        timing_summary: Aggregate stats over per-segment timing adjustments.
+        transcript_quality: Heuristic quality assessment of the transcription
+            (None when the pipeline returned early on an empty transcription).
+        translation_failures: Indices of segments where translation failed
+            entirely. Used by Qwen3Translator when both the primary call and
+            the per-segment Marian fallback fail; those segments are dubbed
+            with empty text. Empty list under MarianTranslator (Marian has
+            no failure mode that drops segments).
     """

     dubbed_audio: Audio
@@ -94,6 +194,9 @@
     target_lang: str
     separated_audio: SeparatedAudio | None = None
     voice_samples: dict[str, Audio] = field(default_factory=dict)
+    timing_summary: TimingSummary | None = None
+    transcript_quality: TranscriptQuality | None = None
+    translation_failures: list[int] = field(default_factory=list)

     @property
     def num_segments(self) -> int:
src/videopython/ai/dubbing/pipeline.py:

@@ -9,13 +9,24 @@ from typing import TYPE_CHECKING, Any, Callable, Literal

 import numpy as np

-from videopython.ai.
+from videopython.ai._device import select_device
+from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
+from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.qwen3 import Qwen3Translator
+from videopython.ai.generation.translation import (
+    MarianTranslator,
+    TranslationBackend,
+    UnsupportedLanguageError,
+)

 if TYPE_CHECKING:
     from videopython.base.audio import Audio


+TranslatorChoice = Literal["auto", "marian", "qwen3"]
+
+
 def _peak_match(target: Audio, reference: Audio) -> Audio:
     """Scale ``target`` so its peak amplitude matches ``reference``.

@@ -46,6 +57,14 @@ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]

 logger = logging.getLogger(__name__)

+# Voice-sample quality gating thresholds. Tuned conservatively to favor
+# accepting real-world dialogue over rejecting it; failures fall back to
+# the longest segment with a WARNING log so we can re-tune from production
+# data instead of guessing.
+PEAK_CLIP_THRESHOLD = 0.99
+MIN_VOCAL_BG_RMS_RATIO = 1.5
+VOICE_SAMPLE_TARGET_DURATION = 6.0
+

 class LocalDubbingPipeline:
     """Local pipeline for video dubbing.

@@ -64,6 +83,8 @@
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -71,12 +92,15 @@
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
+        self.translator = translator
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )

         self._transcriber: Any = None
@@ -117,11 +141,64 @@
             logprob_threshold=self.logprob_threshold,
         )

-    def _init_translator(self) -> None:
-        """Initialize the translation
-
+    def _init_translator(self, source_lang: str, target_lang: str) -> None:
+        """Initialize the translation backend.
+
+        Resolves the configured ``self.translator`` choice into a concrete
+        backend. ``"auto"`` uses :meth:`_resolve_translator_auto`; explicit
+        choices instantiate the named backend directly. Re-initialization
+        is a no-op when ``self._translator`` is already a matching instance
+        for the same language pair (handled at call sites via the existing
+        ``self._translator is None`` gate).
+        """
+        if self.translator == "marian":
+            self._translator = MarianTranslator(device=self.device)
+        elif self.translator == "qwen3":
+            self._translator = Qwen3Translator(device=self.device)
+        else:  # "auto"
+            self._translator = self._resolve_translator_auto(source_lang, target_lang)
+
+    def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
+        """Pick a backend based on language coverage AND device.
+
+        Qwen3-4B Q4_K_M on CPU is roughly 10-15x slower than MarianMT (M2.1
+        spike on dreams_15min.mp4). The resolver picks Marian on CPU
+        whenever it covers the language pair and only escalates to Qwen
+        when a GPU is available or Marian doesn't cover the pair.
+        """
+        device = select_device(self.device, mps_allowed=True)
+        has_gpu = device in ("cuda", "mps")
+
+        # 1. GPU + Qwen covers the pair → Qwen wins (best quality).
+        if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
+            logger.info(
+                "translator: auto-selected qwen3 (device=%s, supports %s->%s)",
+                device,
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)
+
+        # 2. Marian covers the pair → Marian (fast).
+        if MarianTranslator.has_model_for(source_lang, target_lang):
+            if has_gpu:
+                reason = f"Qwen does not cover {source_lang}->{target_lang}"
+            else:
+                reason = f"device={device} (Qwen would be ~10-15x slower; pass translator='qwen3' to override)"
+            logger.info("translator: auto-selected marian (%s)", reason)
+            return MarianTranslator(device=self.device)
+
+        # 3. CPU + only Qwen covers it: warn loudly and use Qwen anyway.
+        if Qwen3Translator.supports(source_lang, target_lang):
+            logger.warning(
+                "translator: auto-selected qwen3 on CPU (%s->%s not in Marian); "
+                "translation will be slow (~10-15x MarianMT). Consider GPU.",
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)

-
+        raise UnsupportedLanguageError(source_lang, target_lang)

     def _init_tts(self, language: str = "en") -> None:
         """Initialize the text-to-speech model."""
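Read as a decision table, the resolver's precedence is: GPU with Qwen coverage, then Marian coverage, then CPU Qwen as a last resort, else error. A stand-alone paraphrase for skimming, not the pipeline code itself; `has_gpu`, `marian_covers`, and `qwen_covers` stand in for `select_device`, `MarianTranslator.has_model_for`, and `Qwen3Translator.supports`:

```python
def pick_backend(has_gpu: bool, marian_covers: bool, qwen_covers: bool) -> str:
    """Paraphrase of _resolve_translator_auto's branch order."""
    if has_gpu and qwen_covers:
        return "qwen3"       # 1. best quality; a GPU makes it affordable
    if marian_covers:
        return "marian"      # 2. fast path on CPU, or Qwen lacks the pair
    if qwen_covers:
        return "qwen3-slow"  # 3. only remaining option; logged as a WARNING
    # The real code raises UnsupportedLanguageError(source_lang, target_lang).
    raise LookupError("no backend covers this language pair")


assert pick_backend(has_gpu=True, marian_covers=True, qwen_covers=True) == "qwen3"
assert pick_backend(has_gpu=False, marian_covers=True, qwen_covers=True) == "marian"
assert pick_backend(has_gpu=False, marian_covers=False, qwen_covers=True) == "qwen3-slow"
```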
src/videopython/ai/dubbing/pipeline.py (continued):

@@ -141,12 +218,25 @@

     def _extract_voice_samples(
         self,
-
+        vocal_audio: Any,
+        background_audio: Any | None,
         transcription: Any,
         min_duration: float = 3.0,
         max_duration: float = 10.0,
     ) -> dict[str, Any]:
-        """Extract
+        """Extract a per-speaker voice sample with quality gating.
+
+        Picks the highest-scored segment per speaker after rejecting clipped
+        slices (peak >= ``PEAK_CLIP_THRESHOLD``) and slices where Demucs left
+        the background louder than the vocals
+        (``vocal_rms / bg_rms < MIN_VOCAL_BG_RMS_RATIO``). When the
+        background track isn't available (e.g. ``revoice`` after
+        ``low_memory`` dropped it), the RMS check is skipped silently.
+
+        Falls back to the longest available segment with a WARNING log when
+        every candidate is rejected, so the dub continues with the best
+        sample we have rather than silently dropping the speaker.
+        """
         from videopython.base.audio import Audio

         voice_samples: dict[str, Audio] = {}
@@ -159,29 +249,106 @@
             segments_by_speaker[speaker].append(segment)

         for speaker, segments in segments_by_speaker.items():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            chosen, fallback_reason = self._pick_voice_segment(
+                speaker, segments, vocal_audio, background_audio, min_duration
+            )
+
+            if chosen is None:
+                logger.warning("No usable voice-sample segment for speaker %r (no candidates)", speaker)
+                continue
+
+            if fallback_reason is not None:
+                logger.warning(
+                    "Voice-sample quality fallback for speaker %r (%d candidates): %s — using longest segment",
+                    speaker,
+                    len(segments),
+                    fallback_reason,
+                )
+
+            start = chosen.start
+            end = min(chosen.end, start + max_duration)
+            sliced = vocal_audio.slice(start, end)
+            # Audio.slice returns a numpy view into the source. Copy so the
+            # short voice sample doesn't keep the full vocals array (~1.3 GB
+            # for 2h sources) alive across translate + TTS.
+            voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)

         return voice_samples

+    def _pick_voice_segment(
+        self,
+        speaker: str,
+        segments: list[Any],
+        vocal_audio: Any,
+        background_audio: Any | None,
+        min_duration: float,
+    ) -> tuple[Any | None, str | None]:
+        """Score eligible segments and pick the best one for ``speaker``.
+
+        Returns ``(segment, fallback_reason)``. ``fallback_reason`` is None
+        when scoring picked a segment cleanly; non-None when every candidate
+        was rejected and the longest segment was used instead.
+        """
+        if not segments:
+            return None, None
+
+        eligible = [s for s in segments if (s.end - s.start) >= min_duration]
+
+        rejection_reasons: list[str] = []
+        scored: list[tuple[float, Any]] = []
+        for segment in eligible:
+            score, reason = self._score_voice_segment(segment, vocal_audio, background_audio)
+            if score is None:
+                rejection_reasons.append(reason or "rejected")
+            else:
+                scored.append((score, segment))
+
+        if scored:
+            scored.sort(key=lambda item: item[0], reverse=True)
+            return scored[0][1], None
+
+        # All eligible segments rejected (or none met the min duration).
+        # Fall back to the longest segment overall so the speaker still
+        # gets a clone reference.
+        longest = max(segments, key=lambda s: s.end - s.start)
+        if eligible:
+            reason = ", ".join(sorted(set(rejection_reasons)))
+        else:
+            reason = f"no segment >= {min_duration:.1f}s"
+        return longest, reason
+
+    def _score_voice_segment(
+        self,
+        segment: Any,
+        vocal_audio: Any,
+        background_audio: Any | None,
+    ) -> tuple[float | None, str | None]:
+        """Return ``(score, reason)`` for a candidate segment.
+
+        ``score`` is ``None`` when the segment is rejected; ``reason`` carries
+        the rejection cause so the fallback logger can summarize.
+        """
+        vocal_slice = vocal_audio.slice(segment.start, segment.end)
+        if vocal_slice.data.size == 0:
+            return None, "empty slice"
+
+        peak = float(np.max(np.abs(vocal_slice.data)))
+        if peak >= PEAK_CLIP_THRESHOLD:
+            return None, "clipped"
+
+        vocal_rms = float(np.sqrt(np.mean(vocal_slice.data**2)))
+
+        if background_audio is not None:
+            bg_slice = background_audio.slice(segment.start, segment.end)
+            if bg_slice.data.size > 0:
+                bg_rms = float(np.sqrt(np.mean(bg_slice.data**2)))
+                if bg_rms > 0 and (vocal_rms / bg_rms) < MIN_VOCAL_BG_RMS_RATIO:
+                    return None, "background-dominated"
+
+        duration = segment.end - segment.start
+        duration_penalty = abs(duration - VOICE_SAMPLE_TARGET_DURATION)
+        return vocal_rms - 0.05 * duration_penalty, None

     def process(
         self,
         source_audio: Audio,
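The two gates and the score are plain signal math. Here is the arithmetic on synthetic 48 kHz sine slices, using the module constants from this diff; a stand-alone sketch, not an import of the pipeline:

```python
import numpy as np

# Module constants from the diff above, repeated so the sketch runs standalone.
PEAK_CLIP_THRESHOLD = 0.99
MIN_VOCAL_BG_RMS_RATIO = 1.5
VOICE_SAMPLE_TARGET_DURATION = 6.0

sr = 48_000
t = np.arange(sr * 5) / sr                     # a 5-second candidate slice
vocal = 0.5 * np.sin(2 * np.pi * 220 * t)      # "voice": peak 0.5, RMS ~0.354
background = 0.1 * np.sin(2 * np.pi * 80 * t)  # Demucs residue: RMS ~0.071

peak = float(np.max(np.abs(vocal)))
assert peak < PEAK_CLIP_THRESHOLD              # gate 1: reject clipped slices

vocal_rms = float(np.sqrt(np.mean(vocal**2)))
bg_rms = float(np.sqrt(np.mean(background**2)))
assert vocal_rms / bg_rms >= MIN_VOCAL_BG_RMS_RATIO  # gate 2: vocals must dominate

duration = 5.0                                 # seconds, segment.end - segment.start
score = vocal_rms - 0.05 * abs(duration - VOICE_SAMPLE_TARGET_DURATION)
print(round(score, 3))  # ~0.304: louder wins, with a mild pull toward 6s samples
```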
src/videopython/ai/dubbing/pipeline.py (continued):

@@ -266,6 +433,23 @@
             target_lang=target_lang,
         )

+        # Cheap heuristic gate before the expensive Demucs/translation/TTS
+        # stages. Lets strict_quality callers refuse-and-refund without
+        # running the rest of the pipeline; non-strict runs continue but
+        # surface the assessment on DubbingResult.
+        transcript_quality = assess_transcript(transcription, source_audio.metadata.duration_seconds)
+        if transcript_quality.recommendation == "reject" and self.strict_quality:
+            raise GarbageTranscriptError(
+                f"Refusing to dub: {', '.join(transcript_quality.flags)}",
+                transcript_quality,
+            )
+        if transcript_quality.recommendation in ("warn", "reject"):
+            logger.warning(
+                "Transcript quality flags raised: %s (recommendation=%s)",
+                ", ".join(transcript_quality.flags),
+                transcript_quality.recommendation,
+            )
+
         detected_lang = source_lang or transcription.language or "en"

         separated_audio: SeparatedAudio | None = None
@@ -303,7 +487,7 @@
         voice_samples: dict[str, Audio] = {}
         if voice_clone:
             report_progress("Extracting voice samples", 0.25)
-            voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+            voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)

         # vocals is no longer needed; voice_samples are independent copies.
         # In low_memory mode this is the only ref keeping the buffer alive
@@ -312,13 +496,25 @@

         report_progress("Translating text", 0.35)
         if self._translator is None:
-            self._init_translator()
+            self._init_translator(source_lang=detected_lang, target_lang=target_lang)
+
+        # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
+        # MarianMT runs sequentially over 8-segment batches; on a 15-min
+        # source that's minutes of silent dwell on 0.35 without per-batch
+        # ticks. Map the [0,1] translation fraction onto that 15% window.
+        def _on_translation_progress(fraction: float) -> None:
+            clamped = max(0.0, min(1.0, fraction))
+            report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)

         translated_segments = self._translator.translate_segments(
             segments=transcription.segments,
             target_lang=target_lang,
             source_lang=detected_lang,
+            progress_callback=_on_translation_progress,
         )
+        # Capture per-segment failures (always empty for Marian) before
+        # _maybe_unload nukes the backend in low_memory mode.
+        translation_failures = list(self._translator.translation_failures)
         self._maybe_unload("_translator")

         report_progress("Generating dubbed speech", 0.50)
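The mapping in that comment is a clamp followed by an affine transform; a stand-alone sketch of the arithmetic (the helper itself is local to `process` in the real code):

```python
from math import isclose


def overall_progress(translation_fraction: float) -> float:
    """Clamp the stage-local fraction, then map [0, 1] onto [0.35, 0.50]."""
    clamped = max(0.0, min(1.0, translation_fraction))
    return 0.35 + 0.15 * clamped


assert isclose(overall_progress(0.0), 0.35)   # first batch: stage start
assert isclose(overall_progress(0.5), 0.425)  # halfway through the batches
assert isclose(overall_progress(1.0), 0.50)   # hand-off to the TTS stage
assert isclose(overall_progress(7.0), 0.50)   # out-of-range callbacks are clamped
```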
src/videopython/ai/dubbing/pipeline.py (continued):

@@ -393,7 +589,8 @@
         self._init_synchronizer()
         assert self._synchronizer is not None

-        synchronized_segments,
+        synchronized_segments, adjustments = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        timing_summary = TimingSummary.from_adjustments(adjustments)
         del dubbed_segments

         report_progress("Assembling final audio", 0.90)
@@ -429,6 +626,9 @@
             target_lang=target_lang,
             separated_audio=separated_audio,
             voice_samples=voice_samples,
+            timing_summary=timing_summary,
+            transcript_quality=transcript_quality,
+            translation_failures=translation_failures,
         )

     def revoice(
@@ -486,7 +686,10 @@
         voice_sample: Audio | None = None

         if transcription.segments:
-
+            # revoice doesn't track the background after the low_memory drop,
+            # so quality gating degrades to "no RMS check" here. Clipping is
+            # still rejected.
+            voice_samples = self._extract_voice_samples(vocal_audio, None, transcription)
             if voice_samples:
                 voice_sample = next(iter(voice_samples.values()))
