videopython 0.27.2__tar.gz → 0.28.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {videopython-0.27.2 → videopython-0.28.1}/PKG-INFO +2 -1
  2. {videopython-0.27.2 → videopython-0.28.1}/pyproject.toml +6 -1
  3. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/__init__.py +6 -0
  4. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/dubber.py +22 -2
  5. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/models.py +103 -0
  6. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/pipeline.py +235 -32
  7. videopython-0.28.1/src/videopython/ai/dubbing/quality.py +178 -0
  8. videopython-0.28.1/src/videopython/ai/generation/qwen3.py +394 -0
  9. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/translation.py +130 -8
  10. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/audio.py +43 -1
  11. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/transcription.py +9 -0
  12. {videopython-0.27.2 → videopython-0.28.1}/.gitignore +0 -0
  13. {videopython-0.27.2 → videopython-0.28.1}/LICENSE +0 -0
  14. {videopython-0.27.2 → videopython-0.28.1}/README.md +0 -0
  15. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/__init__.py +0 -0
  16. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/__init__.py +0 -0
  17. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/_device.py +0 -0
  18. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/remux.py +0 -0
  19. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/dubbing/timing.py +0 -0
  20. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/__init__.py +0 -0
  21. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/audio.py +0 -0
  22. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/image.py +0 -0
  23. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/generation/video.py +0 -0
  24. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/registry.py +0 -0
  25. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/__init__.py +0 -0
  26. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/inpainter.py +0 -0
  27. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/models.py +0 -0
  28. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/segmenter.py +0 -0
  29. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/swapping/swapper.py +0 -0
  30. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/transforms.py +0 -0
  31. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/__init__.py +0 -0
  32. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/image.py +0 -0
  33. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/separation.py +0 -0
  34. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/understanding/temporal.py +0 -0
  35. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/ai/video_analysis.py +0 -0
  36. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/__init__.py +0 -0
  37. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/__init__.py +0 -0
  38. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/analysis.py +0 -0
  39. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/audio/audio.py +0 -0
  40. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/combine.py +0 -0
  41. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/description.py +0 -0
  42. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/effects.py +0 -0
  43. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/exceptions.py +0 -0
  44. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/progress.py +0 -0
  45. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/registry.py +0 -0
  46. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/scene.py +0 -0
  47. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/streaming.py +0 -0
  48. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/__init__.py +0 -0
  49. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/text/overlay.py +0 -0
  50. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/transforms.py +0 -0
  51. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/transitions.py +0 -0
  52. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/utils.py +0 -0
  53. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/base/video.py +0 -0
  54. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/__init__.py +0 -0
  55. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/multicam.py +0 -0
  56. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/premiere_xml.py +0 -0
  57. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/editing/video_edit.py +0 -0
  58. {videopython-0.27.2 → videopython-0.28.1}/src/videopython/py.typed +0 -0
```diff
--- videopython-0.27.2/PKG-INFO
+++ videopython-0.28.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.27.2
+Version: 0.28.1
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -29,6 +29,7 @@ Requires-Dist: demucs>=4.0.0; extra == 'ai'
 Requires-Dist: diffusers>=0.30.0; extra == 'ai'
 Requires-Dist: easyocr>=1.7.0; extra == 'ai'
 Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
+Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
 Requires-Dist: numba>=0.61.0; extra == 'ai'
 Requires-Dist: ollama>=0.4.5; extra == 'ai'
 Requires-Dist: openai-whisper>=20240930; extra == 'ai'
```
```diff
--- videopython-0.27.2/pyproject.toml
+++ videopython-0.28.1/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.27.2"
+version = "0.28.1"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -80,6 +80,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]
 
 # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -111,6 +113,8 @@ ai = [
     "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
+    # Translation backend: Qwen3 GGUF inference (M2)
+    "llama-cpp-python>=0.3.0",
 ]
 
 [project.urls]
@@ -136,6 +140,7 @@ module = [
     "pyannote", "pyannote.*",
     "silero_vad", "silero_vad.*",
     "cv2", "cv2.*",
+    "llama_cpp", "llama_cpp.*",
 ]
 ignore_missing_imports = true
 
```
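The new backend rides on the existing `ai` extra, so base installs are unaffected. A minimal availability probe, as a sketch (the `llama_cpp` module name is what llama-cpp-python installs; `HAS_QWEN3_BACKEND` is a hypothetical flag, not part of videopython):

```python
# Sketch: detect whether the optional Qwen3 GGUF backend is installed.
try:
    import llama_cpp  # module provided by llama-cpp-python>=0.3.0
    HAS_QWEN3_BACKEND = True
except ImportError:
    HAS_QWEN3_BACKEND = False

print("Qwen3 GGUF backend available:", HAS_QWEN3_BACKEND)
```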
```diff
--- videopython-0.27.2/src/videopython/ai/dubbing/__init__.py
+++ videopython-0.28.1/src/videopython/ai/dubbing/__init__.py
@@ -3,7 +3,9 @@
 from videopython.ai.dubbing.dubber import VideoDubber
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
 from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
+from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.translation import UnsupportedLanguageError
 
 __all__ = [
     "VideoDubber",
@@ -13,4 +15,8 @@ __all__ = [
     "SeparatedAudio",
     "LocalDubbingPipeline",
     "TimingSynchronizer",
+    "GarbageTranscriptError",
+    "TranscriptQuality",
+    "assess_transcript",
+    "UnsupportedLanguageError",
 ]
```
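The practical effect of this hunk is that the quality gate and the translation error type become importable from the dubbing package root:

```python
from videopython.ai.dubbing import (
    GarbageTranscriptError,
    TranscriptQuality,
    UnsupportedLanguageError,
    assess_transcript,
)
```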
```diff
--- videopython-0.27.2/src/videopython/ai/dubbing/dubber.py
+++ videopython-0.28.1/src/videopython/ai/dubbing/dubber.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
 
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
-from videopython.ai.dubbing.pipeline import WhisperModel
+from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
 
 if TYPE_CHECKING:
     from videopython.base.video import Video
@@ -37,6 +37,19 @@ class VideoDubber:
            gate; raise to drop more low-confidence windows.
        logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
            log-probability gate.
+        strict_quality: When True, the pipeline raises
+            :class:`GarbageTranscriptError` before Demucs/translation/TTS run
+            if the transcript-quality heuristic returns ``"reject"``. When
+            False (default), low-quality transcripts are logged at WARNING
+            but processing continues. Either way the
+            :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
+            inspection.
+        translator: Translation backend to use. ``"auto"`` (default)
+            picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
+            ``"qwen3"`` force the named backend regardless of device.
+            See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
+            for tradeoffs (Qwen3 is slower on CPU but produces
+            context-aware, length-budgeted output).
    """
 
    def __init__(
@@ -47,6 +60,8 @@
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -54,13 +69,16 @@
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
+        self.translator = translator
         self._local_pipeline: Any = None
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
+            "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )
 
    def _init_local_pipeline(self) -> None:
@@ -73,6 +91,8 @@
             condition_on_previous_text=self.condition_on_previous_text,
             no_speech_threshold=self.no_speech_threshold,
             logprob_threshold=self.logprob_threshold,
+            strict_quality=self.strict_quality,
+            translator=self.translator,
         )
 
    def dub(
```
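In use, the two new constructor knobs combine like this; a sketch only, assuming `dub()` keeps its existing video-plus-target-language call shape:

```python
from videopython.ai.dubbing import GarbageTranscriptError, VideoDubber

dubber = VideoDubber(
    strict_quality=True,  # abort on a "reject" transcript instead of warning
    translator="auto",    # Qwen3 on GPU, MarianMT on CPU
)
try:
    result = dubber.dub(video, target_lang="de")  # `video` assumed loaded elsewhere
except GarbageTranscriptError as err:
    # Strict mode fails before Demucs/translation/TTS spend any compute.
    print("dubbing refused:", err)
```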
```diff
--- videopython-0.27.2/src/videopython/ai/dubbing/models.py
+++ videopython-0.28.1/src/videopython/ai/dubbing/models.py
@@ -3,10 +3,21 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
 
 from videopython.base.audio import Audio
 from videopython.base.text.transcription import Transcription, TranscriptionSegment
 
+if TYPE_CHECKING:
+    from videopython.ai.dubbing.quality import TranscriptQuality
+    from videopython.ai.dubbing.timing import TimingAdjustment
+
+
+# Speed factors within this band of 1.0 are treated as a "clean" timing
+# adjustment (no perceptible compression/stretch). Heuristic threshold for
+# the TimingSummary classification only.
+CLEAN_SPEED_TOLERANCE = 0.01
+
 
 @dataclass
 class TranslatedSegment:
@@ -73,6 +84,87 @@ class SeparatedAudio:
         return self.music is not None and self.effects is not None
 
 
+@dataclass
+class TimingSummary:
+    """Aggregate stats over per-segment timing adjustments.
+
+    Surfaces how aggressively the timing synchronizer had to compress or
+    truncate dubbed segments to fit the source's spoken regions. High
+    truncation rates indicate translation produced text too long for the
+    source duration.
+    """
+
+    total_segments: int
+    clean_count: int
+    stretched_count: int
+    truncated_count: int
+    mean_speed_factor: float
+    max_truncation_seconds: float
+
+    @classmethod
+    def from_adjustments(cls, adjustments: list[TimingAdjustment]) -> TimingSummary:
+        """Aggregate a list of TimingAdjustments into a TimingSummary."""
+        total = len(adjustments)
+        if total == 0:
+            return cls(
+                total_segments=0,
+                clean_count=0,
+                stretched_count=0,
+                truncated_count=0,
+                mean_speed_factor=1.0,
+                max_truncation_seconds=0.0,
+            )
+
+        clean = 0
+        stretched = 0
+        truncated = 0
+        speed_sum = 0.0
+        max_truncation = 0.0
+        for adj in adjustments:
+            speed_sum += adj.speed_factor
+            if adj.was_truncated:
+                truncated += 1
+                truncation = adj.original_duration - adj.actual_duration
+                if truncation > max_truncation:
+                    max_truncation = truncation
+            elif abs(adj.speed_factor - 1.0) <= CLEAN_SPEED_TOLERANCE:
+                clean += 1
+            else:
+                stretched += 1
+
+        return cls(
+            total_segments=total,
+            clean_count=clean,
+            stretched_count=stretched,
+            truncated_count=truncated,
+            mean_speed_factor=speed_sum / total,
+            max_truncation_seconds=max_truncation,
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "total_segments": self.total_segments,
+            "clean_count": self.clean_count,
+            "stretched_count": self.stretched_count,
+            "truncated_count": self.truncated_count,
+            "mean_speed_factor": self.mean_speed_factor,
+            "max_truncation_seconds": self.max_truncation_seconds,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
+        """Create TimingSummary from dictionary."""
+        return cls(
+            total_segments=data["total_segments"],
+            clean_count=data["clean_count"],
+            stretched_count=data["stretched_count"],
+            truncated_count=data["truncated_count"],
+            mean_speed_factor=data["mean_speed_factor"],
+            max_truncation_seconds=data["max_truncation_seconds"],
+        )
+
+
 @dataclass
 class DubbingResult:
     """Result of a video dubbing operation.
```
```diff
@@ -85,6 +177,14 @@ class DubbingResult:
         target_lang: Target language for dubbing.
         separated_audio: Separated audio components (if preserve_background=True).
         voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
+        timing_summary: Aggregate stats over per-segment timing adjustments.
+        transcript_quality: Heuristic quality assessment of the transcription
+            (None when the pipeline returned early on an empty transcription).
+        translation_failures: Indices of segments where translation failed
+            entirely. Used by Qwen3Translator when both the primary call and
+            the per-segment Marian fallback fail; those segments are dubbed
+            with empty text. Empty list under MarianTranslator (Marian has
+            no failure mode that drops segments).
     """
 
     dubbed_audio: Audio
@@ -94,6 +194,9 @@
     target_lang: str
     separated_audio: SeparatedAudio | None = None
     voice_samples: dict[str, Audio] = field(default_factory=dict)
+    timing_summary: TimingSummary | None = None
+    transcript_quality: TranscriptQuality | None = None
+    translation_failures: list[int] = field(default_factory=list)
 
     @property
     def num_segments(self) -> int:
```
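Downstream, the three new fields make a dub auditable after the fact; a sketch assuming `result` is a `DubbingResult` returned by a pipeline run:

```python
def audit(result) -> None:
    """Hypothetical post-run report over the new DubbingResult fields."""
    if result.transcript_quality is not None:
        print("transcript recommendation:", result.transcript_quality.recommendation)
    if result.timing_summary is not None:
        print("timing:", result.timing_summary.to_dict())
    for idx in result.translation_failures:  # always empty under MarianTranslator
        print(f"segment {idx}: translation failed, dubbed with empty text")
```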
```diff
--- videopython-0.27.2/src/videopython/ai/dubbing/pipeline.py
+++ videopython-0.28.1/src/videopython/ai/dubbing/pipeline.py
@@ -9,13 +9,24 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
 
 import numpy as np
 
-from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
+from videopython.ai._device import select_device
+from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
+from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
 from videopython.ai.dubbing.timing import TimingSynchronizer
+from videopython.ai.generation.qwen3 import Qwen3Translator
+from videopython.ai.generation.translation import (
+    MarianTranslator,
+    TranslationBackend,
+    UnsupportedLanguageError,
+)
 
 if TYPE_CHECKING:
     from videopython.base.audio import Audio
 
 
+TranslatorChoice = Literal["auto", "marian", "qwen3"]
+
+
 def _peak_match(target: Audio, reference: Audio) -> Audio:
     """Scale ``target`` so its peak amplitude matches ``reference``.
 
@@ -46,6 +57,14 @@ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
 
 logger = logging.getLogger(__name__)
 
+# Voice-sample quality gating thresholds. Tuned conservatively to favor
+# accepting real-world dialogue over rejecting it; failures fall back to
+# the longest segment with a WARNING log so we can re-tune from production
+# data instead of guessing.
+PEAK_CLIP_THRESHOLD = 0.99
+MIN_VOCAL_BG_RMS_RATIO = 1.5
+VOICE_SAMPLE_TARGET_DURATION = 6.0
+
 
 class LocalDubbingPipeline:
     """Local pipeline for video dubbing.
@@ -64,6 +83,8 @@ class LocalDubbingPipeline:
         condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.6,
         logprob_threshold: float | None = -1.0,
+        strict_quality: bool = False,
+        translator: TranslatorChoice = "auto",
     ):
         self.device = device
         self.low_memory = low_memory
@@ -71,12 +92,15 @@
         self.condition_on_previous_text = condition_on_previous_text
         self.no_speech_threshold = no_speech_threshold
         self.logprob_threshold = logprob_threshold
+        self.strict_quality = strict_quality
+        self.translator = translator
         requested = device.lower() if isinstance(device, str) else "auto"
         logger.info(
-            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
+            "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
             requested,
             low_memory,
             whisper_model,
+            translator,
         )
 
         self._transcriber: Any = None
@@ -117,11 +141,64 @@
             logprob_threshold=self.logprob_threshold,
         )
 
-    def _init_translator(self) -> None:
-        """Initialize the translation model."""
-        from videopython.ai.generation.translation import TextTranslator
+    def _init_translator(self, source_lang: str, target_lang: str) -> None:
+        """Initialize the translation backend.
+
+        Resolves the configured ``self.translator`` choice into a concrete
+        backend. ``"auto"`` uses :meth:`_resolve_translator_auto`; explicit
+        choices instantiate the named backend directly. Re-initialization
+        is a no-op when ``self._translator`` is already a matching instance
+        for the same language pair (handled at call sites via the existing
+        ``self._translator is None`` gate).
+        """
+        if self.translator == "marian":
+            self._translator = MarianTranslator(device=self.device)
+        elif self.translator == "qwen3":
+            self._translator = Qwen3Translator(device=self.device)
+        else:  # "auto"
+            self._translator = self._resolve_translator_auto(source_lang, target_lang)
+
+    def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
+        """Pick a backend based on language coverage AND device.
+
+        Qwen3-4B Q4_K_M on CPU is roughly 10-15x slower than MarianMT (M2.1
+        spike on dreams_15min.mp4). The resolver picks Marian on CPU
+        whenever it covers the language pair and only escalates to Qwen
+        when a GPU is available or Marian doesn't cover the pair.
+        """
+        device = select_device(self.device, mps_allowed=True)
+        has_gpu = device in ("cuda", "mps")
+
+        # 1. GPU + Qwen covers the pair → Qwen wins (best quality).
+        if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
+            logger.info(
+                "translator: auto-selected qwen3 (device=%s, supports %s->%s)",
+                device,
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)
+
+        # 2. Marian covers the pair → Marian (fast).
+        if MarianTranslator.has_model_for(source_lang, target_lang):
+            if has_gpu:
+                reason = f"Qwen does not cover {source_lang}->{target_lang}"
+            else:
+                reason = f"device={device} (Qwen would be ~10-15x slower; pass translator='qwen3' to override)"
+            logger.info("translator: auto-selected marian (%s)", reason)
+            return MarianTranslator(device=self.device)
+
+        # 3. CPU + only Qwen covers it: warn loudly and use Qwen anyway.
+        if Qwen3Translator.supports(source_lang, target_lang):
+            logger.warning(
+                "translator: auto-selected qwen3 on CPU (%s->%s not in Marian); "
+                "translation will be slow (~10-15x MarianMT). Consider GPU.",
+                source_lang,
+                target_lang,
+            )
+            return Qwen3Translator(device=self.device)
 
-        self._translator = TextTranslator(device=self.device)
+        raise UnsupportedLanguageError(source_lang, target_lang)
 
     def _init_tts(self, language: str = "en") -> None:
         """Initialize the text-to-speech model."""
```
```diff
@@ -141,12 +218,25 @@
 
     def _extract_voice_samples(
         self,
-        audio: Any,
+        vocal_audio: Any,
+        background_audio: Any | None,
         transcription: Any,
         min_duration: float = 3.0,
         max_duration: float = 10.0,
     ) -> dict[str, Any]:
-        """Extract voice samples for each speaker from the audio."""
+        """Extract a per-speaker voice sample with quality gating.
+
+        Picks the highest-scored segment per speaker after rejecting clipped
+        slices (peak >= ``PEAK_CLIP_THRESHOLD``) and slices where Demucs left
+        the background louder than the vocals
+        (``vocal_rms / bg_rms < MIN_VOCAL_BG_RMS_RATIO``). When the
+        background track isn't available (e.g. ``revoice`` after
+        ``low_memory`` dropped it), the RMS check is skipped silently.
+
+        Falls back to the longest available segment with a WARNING log when
+        every candidate is rejected, so the dub continues with the best
+        sample we have rather than silently dropping the speaker.
+        """
         from videopython.base.audio import Audio
 
         voice_samples: dict[str, Audio] = {}
@@ -159,29 +249,106 @@
             segments_by_speaker[speaker].append(segment)
 
         for speaker, segments in segments_by_speaker.items():
-            target_duration = 6.0
-            best_segment = None
-            best_diff = float("inf")
-
-            for segment in segments:
-                duration = segment.end - segment.start
-                if duration >= min_duration:
-                    diff = abs(duration - target_duration)
-                    if diff < best_diff:
-                        best_diff = diff
-                        best_segment = segment
-
-            if best_segment is not None:
-                start = best_segment.start
-                end = min(best_segment.end, start + max_duration)
-                sliced = audio.slice(start, end)
-                # Audio.slice returns a numpy view into the source. Copy so the
-                # short voice sample doesn't keep the full vocals array (~1.3 GB
-                # for 2h sources) alive across translate + TTS.
-                voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
+            chosen, fallback_reason = self._pick_voice_segment(
+                speaker, segments, vocal_audio, background_audio, min_duration
+            )
+
+            if chosen is None:
+                logger.warning("No usable voice-sample segment for speaker %r (no candidates)", speaker)
+                continue
+
+            if fallback_reason is not None:
+                logger.warning(
+                    "Voice-sample quality fallback for speaker %r (%d candidates): %s — using longest segment",
+                    speaker,
+                    len(segments),
+                    fallback_reason,
+                )
+
+            start = chosen.start
+            end = min(chosen.end, start + max_duration)
+            sliced = vocal_audio.slice(start, end)
+            # Audio.slice returns a numpy view into the source. Copy so the
+            # short voice sample doesn't keep the full vocals array (~1.3 GB
+            # for 2h sources) alive across translate + TTS.
+            voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
 
         return voice_samples
 
+    def _pick_voice_segment(
+        self,
+        speaker: str,
+        segments: list[Any],
+        vocal_audio: Any,
+        background_audio: Any | None,
+        min_duration: float,
+    ) -> tuple[Any | None, str | None]:
+        """Score eligible segments and pick the best one for ``speaker``.
+
+        Returns ``(segment, fallback_reason)``. ``fallback_reason`` is None
+        when scoring picked a segment cleanly; non-None when every candidate
+        was rejected and the longest segment was used instead.
+        """
+        if not segments:
+            return None, None
+
+        eligible = [s for s in segments if (s.end - s.start) >= min_duration]
+
+        rejection_reasons: list[str] = []
+        scored: list[tuple[float, Any]] = []
+        for segment in eligible:
+            score, reason = self._score_voice_segment(segment, vocal_audio, background_audio)
+            if score is None:
+                rejection_reasons.append(reason or "rejected")
+            else:
+                scored.append((score, segment))
+
+        if scored:
+            scored.sort(key=lambda item: item[0], reverse=True)
+            return scored[0][1], None
+
+        # All eligible segments rejected (or none met the min duration).
+        # Fall back to the longest segment overall so the speaker still
+        # gets a clone reference.
+        longest = max(segments, key=lambda s: s.end - s.start)
+        if eligible:
+            reason = ", ".join(sorted(set(rejection_reasons)))
+        else:
+            reason = f"no segment >= {min_duration:.1f}s"
+        return longest, reason
+
+    def _score_voice_segment(
+        self,
+        segment: Any,
+        vocal_audio: Any,
+        background_audio: Any | None,
+    ) -> tuple[float | None, str | None]:
+        """Return ``(score, reason)`` for a candidate segment.
+
+        ``score`` is ``None`` when the segment is rejected; ``reason`` carries
+        the rejection cause so the fallback logger can summarize.
+        """
+        vocal_slice = vocal_audio.slice(segment.start, segment.end)
+        if vocal_slice.data.size == 0:
+            return None, "empty slice"
+
+        peak = float(np.max(np.abs(vocal_slice.data)))
+        if peak >= PEAK_CLIP_THRESHOLD:
+            return None, "clipped"
+
+        vocal_rms = float(np.sqrt(np.mean(vocal_slice.data**2)))
+
+        if background_audio is not None:
+            bg_slice = background_audio.slice(segment.start, segment.end)
+            if bg_slice.data.size > 0:
+                bg_rms = float(np.sqrt(np.mean(bg_slice.data**2)))
+                if bg_rms > 0 and (vocal_rms / bg_rms) < MIN_VOCAL_BG_RMS_RATIO:
+                    return None, "background-dominated"
+
+        duration = segment.end - segment.start
+        duration_penalty = abs(duration - VOICE_SAMPLE_TARGET_DURATION)
+        return vocal_rms - 0.05 * duration_penalty, None
+
     def process(
         self,
         source_audio: Audio,
```
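In isolation, the two gates reduce to a peak check and an RMS ratio. A synthetic numpy sketch (the 0.99 and 1.5 literals mirror `PEAK_CLIP_THRESHOLD` and `MIN_VOCAL_BG_RMS_RATIO`; the arrays are stand-ins, not real Demucs stems):

```python
import numpy as np

rng = np.random.default_rng(0)
vocals = 0.1 * rng.standard_normal(16_000)       # stand-in 1 s vocal slice
background = 0.05 * rng.standard_normal(16_000)  # stand-in residual background

peak = float(np.max(np.abs(vocals)))
vocal_rms = float(np.sqrt(np.mean(vocals**2)))
bg_rms = float(np.sqrt(np.mean(background**2)))

not_clipped = peak < 0.99                                   # PEAK_CLIP_THRESHOLD
vocals_dominate = bg_rms == 0 or vocal_rms / bg_rms >= 1.5  # MIN_VOCAL_BG_RMS_RATIO
print("accepted:", not_clipped and vocals_dominate)         # True for this synthetic slice
```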
```diff
@@ -266,6 +433,23 @@
             target_lang=target_lang,
         )
 
+        # Cheap heuristic gate before the expensive Demucs/translation/TTS
+        # stages. Lets strict_quality callers refuse-and-refund without
+        # running the rest of the pipeline; non-strict runs continue but
+        # surface the assessment on DubbingResult.
+        transcript_quality = assess_transcript(transcription, source_audio.metadata.duration_seconds)
+        if transcript_quality.recommendation == "reject" and self.strict_quality:
+            raise GarbageTranscriptError(
+                f"Refusing to dub: {', '.join(transcript_quality.flags)}",
+                transcript_quality,
+            )
+        if transcript_quality.recommendation in ("warn", "reject"):
+            logger.warning(
+                "Transcript quality flags raised: %s (recommendation=%s)",
+                ", ".join(transcript_quality.flags),
+                transcript_quality.recommendation,
+            )
+
         detected_lang = source_lang or transcription.language or "en"
 
         separated_audio: SeparatedAudio | None = None
```
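The same gate is available standalone for callers who want to pre-screen before paying for a pipeline run; a sketch using the newly exported helpers (a `Transcription` and the source duration in seconds are assumed to be in scope):

```python
from videopython.ai.dubbing import assess_transcript

quality = assess_transcript(transcription, duration_seconds)
if quality.recommendation == "reject":
    print("would be refused under strict_quality:", ", ".join(quality.flags))
elif quality.recommendation == "warn":
    print("proceeding, but flagged:", ", ".join(quality.flags))
```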
```diff
@@ -303,7 +487,7 @@
         voice_samples: dict[str, Audio] = {}
         if voice_clone:
             report_progress("Extracting voice samples", 0.25)
-            voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+            voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
 
         # vocals is no longer needed; voice_samples are independent copies.
         # In low_memory mode this is the only ref keeping the buffer alive
@@ -312,13 +496,25 @@
 
         report_progress("Translating text", 0.35)
         if self._translator is None:
-            self._init_translator()
+            self._init_translator(source_lang=detected_lang, target_lang=target_lang)
+
+        # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
+        # MarianMT runs sequentially over 8-segment batches; on a 15-min
+        # source that's minutes of silent dwell on 0.35 without per-batch
+        # ticks. Map the [0,1] translation fraction onto that 15% window.
+        def _on_translation_progress(fraction: float) -> None:
+            clamped = max(0.0, min(1.0, fraction))
+            report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
 
         translated_segments = self._translator.translate_segments(
             segments=transcription.segments,
             target_lang=target_lang,
             source_lang=detected_lang,
+            progress_callback=_on_translation_progress,
         )
+        # Capture per-segment failures (always empty for Marian) before
+        # _maybe_unload nukes the backend in low_memory mode.
+        translation_failures = list(self._translator.translation_failures)
         self._maybe_unload("_translator")
 
         report_progress("Generating dubbed speech", 0.50)
```
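The windowing arithmetic generalizes to any stage; a hypothetical helper showing the same mapping the closure above hard-codes for the 0.35 to 0.50 span:

```python
def stage_progress(fraction: float, start: float, end: float) -> float:
    """Map a [0, 1] stage fraction onto a [start, end] global progress window."""
    clamped = max(0.0, min(1.0, fraction))
    return start + (end - start) * clamped

assert abs(stage_progress(0.5, 0.35, 0.50) - 0.425) < 1e-9  # halfway through translation
assert stage_progress(1.2, 0.35, 0.50) == 0.50              # out-of-range input clamps
```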
```diff
@@ -393,7 +589,8 @@
             self._init_synchronizer()
         assert self._synchronizer is not None
 
-        synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        synchronized_segments, adjustments = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+        timing_summary = TimingSummary.from_adjustments(adjustments)
         del dubbed_segments
 
         report_progress("Assembling final audio", 0.90)
@@ -429,6 +626,9 @@
             target_lang=target_lang,
             separated_audio=separated_audio,
             voice_samples=voice_samples,
+            timing_summary=timing_summary,
+            transcript_quality=transcript_quality,
+            translation_failures=translation_failures,
         )
 
     def revoice(
@@ -486,7 +686,10 @@
         voice_sample: Audio | None = None
 
         if transcription.segments:
-            voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+            # revoice doesn't track the background after the low_memory drop,
+            # so quality gating degrades to "no RMS check" here. Clipping is
+            # still rejected.
+            voice_samples = self._extract_voice_samples(vocal_audio, None, transcription)
             if voice_samples:
                 voice_sample = next(iter(voice_samples.values()))
```