videopython 0.27.1__tar.gz → 0.28.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {videopython-0.27.1 → videopython-0.28.0}/PKG-INFO +1 -1
  2. {videopython-0.27.1 → videopython-0.28.0}/pyproject.toml +1 -1
  3. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/__init__.py +4 -0
  4. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/dubber.py +27 -0
  5. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/models.py +97 -0
  6. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/pipeline.py +168 -26
  7. videopython-0.28.0/src/videopython/ai/dubbing/quality.py +178 -0
  8. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/generation/translation.py +22 -4
  9. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/understanding/audio.py +35 -2
  10. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/text/transcription.py +9 -0
  11. {videopython-0.27.1 → videopython-0.28.0}/.gitignore +0 -0
  12. {videopython-0.27.1 → videopython-0.28.0}/LICENSE +0 -0
  13. {videopython-0.27.1 → videopython-0.28.0}/README.md +0 -0
  14. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/__init__.py +0 -0
  15. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/__init__.py +0 -0
  16. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/_device.py +0 -0
  17. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/remux.py +0 -0
  18. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/dubbing/timing.py +0 -0
  19. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/generation/__init__.py +0 -0
  20. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/generation/audio.py +0 -0
  21. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/generation/image.py +0 -0
  22. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/generation/video.py +0 -0
  23. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/registry.py +0 -0
  24. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/swapping/__init__.py +0 -0
  25. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/swapping/inpainter.py +0 -0
  26. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/swapping/models.py +0 -0
  27. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/swapping/segmenter.py +0 -0
  28. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/swapping/swapper.py +0 -0
  29. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/transforms.py +0 -0
  30. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/understanding/__init__.py +0 -0
  31. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/understanding/image.py +0 -0
  32. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/understanding/separation.py +0 -0
  33. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/understanding/temporal.py +0 -0
  34. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/ai/video_analysis.py +0 -0
  35. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/__init__.py +0 -0
  36. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/audio/__init__.py +0 -0
  37. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/audio/analysis.py +0 -0
  38. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/audio/audio.py +0 -0
  39. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/combine.py +0 -0
  40. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/description.py +0 -0
  41. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/effects.py +0 -0
  42. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/exceptions.py +0 -0
  43. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/progress.py +0 -0
  44. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/registry.py +0 -0
  45. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/scene.py +0 -0
  46. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/streaming.py +0 -0
  47. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/text/__init__.py +0 -0
  48. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/text/overlay.py +0 -0
  49. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/transforms.py +0 -0
  50. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/transitions.py +0 -0
  51. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/utils.py +0 -0
  52. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/base/video.py +0 -0
  53. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/editing/__init__.py +0 -0
  54. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/editing/multicam.py +0 -0
  55. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/editing/premiere_xml.py +0 -0
  56. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/editing/video_edit.py +0 -0
  57. {videopython-0.27.1 → videopython-0.28.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.27.1
3
+ Version: 0.28.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.27.1"
3
+ version = "0.28.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -3,6 +3,7 @@
3
3
  from videopython.ai.dubbing.dubber import VideoDubber
4
4
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
5
5
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
6
+ from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
6
7
  from videopython.ai.dubbing.timing import TimingSynchronizer
7
8
 
8
9
  __all__ = [
@@ -13,4 +14,7 @@ __all__ = [
13
14
  "SeparatedAudio",
14
15
  "LocalDubbingPipeline",
15
16
  "TimingSynchronizer",
17
+ "GarbageTranscriptError",
18
+ "TranscriptQuality",
19
+ "assess_transcript",
16
20
  ]
@@ -29,6 +29,21 @@ class VideoDubber:
29
29
  give better accuracy at the cost of VRAM and latency. One of
30
30
  ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
31
31
  Default ``turbo``.
32
+ condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
33
+ ``False`` (Whisper's own default is ``True``). With conditioning on,
34
+ a single hallucinated filler phrase cascades through the rest of
35
+ the file. See ``AudioToText`` for the full rationale.
36
+ no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
37
+ gate; raise to drop more low-confidence windows.
38
+ logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
39
+ log-probability gate.
40
+ strict_quality: When True, the pipeline raises
41
+ :class:`GarbageTranscriptError` before Demucs/translation/TTS run
42
+ if the transcript-quality heuristic returns ``"reject"``. When
43
+ False (default), low-quality transcripts are logged at WARNING
44
+ but processing continues. Either way the
45
+ :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
46
+ inspection.
32
47
  """
33
48
 
34
49
  def __init__(
@@ -36,10 +51,18 @@ class VideoDubber:
36
51
  device: str | None = None,
37
52
  low_memory: bool = False,
38
53
  whisper_model: WhisperModel = "turbo",
54
+ condition_on_previous_text: bool = False,
55
+ no_speech_threshold: float = 0.6,
56
+ logprob_threshold: float | None = -1.0,
57
+ strict_quality: bool = False,
39
58
  ):
40
59
  self.device = device
41
60
  self.low_memory = low_memory
42
61
  self.whisper_model = whisper_model
62
+ self.condition_on_previous_text = condition_on_previous_text
63
+ self.no_speech_threshold = no_speech_threshold
64
+ self.logprob_threshold = logprob_threshold
65
+ self.strict_quality = strict_quality
43
66
  self._local_pipeline: Any = None
44
67
  requested = device.lower() if isinstance(device, str) else "auto"
45
68
  logger.info(
@@ -56,6 +79,10 @@ class VideoDubber:
56
79
  device=self.device,
57
80
  low_memory=self.low_memory,
58
81
  whisper_model=self.whisper_model,
82
+ condition_on_previous_text=self.condition_on_previous_text,
83
+ no_speech_threshold=self.no_speech_threshold,
84
+ logprob_threshold=self.logprob_threshold,
85
+ strict_quality=self.strict_quality,
59
86
  )
60
87
 
61
88
  def dub(
@@ -3,10 +3,21 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any
6
7
 
7
8
  from videopython.base.audio import Audio
8
9
  from videopython.base.text.transcription import Transcription, TranscriptionSegment
9
10
 
11
+ if TYPE_CHECKING:
12
+ from videopython.ai.dubbing.quality import TranscriptQuality
13
+ from videopython.ai.dubbing.timing import TimingAdjustment
14
+
15
+
16
+ # Speed factors within this band of 1.0 are treated as a "clean" timing
17
+ # adjustment (no perceptible compression/stretch). Heuristic threshold for
18
+ # the TimingSummary classification only.
19
+ CLEAN_SPEED_TOLERANCE = 0.01
20
+
10
21
 
11
22
  @dataclass
12
23
  class TranslatedSegment:
@@ -73,6 +84,87 @@ class SeparatedAudio:
73
84
  return self.music is not None and self.effects is not None
74
85
 
75
86
 
87
@dataclass
class TimingSummary:
    """Roll-up of the per-segment timing adjustments of one dubbing run.

    Every adjustment lands in exactly one bucket: *truncated* (its
    ``was_truncated`` flag is set), *clean* (speed factor within
    ``CLEAN_SPEED_TOLERANCE`` of 1.0) or *stretched* (everything else).
    A large truncated share suggests the translated text was too long for
    the time available in the source.
    """

    total_segments: int
    clean_count: int
    stretched_count: int
    truncated_count: int
    mean_speed_factor: float
    max_truncation_seconds: float

    @classmethod
    def from_adjustments(cls, adjustments: list[TimingAdjustment]) -> TimingSummary:
        """Build a summary from a list of timing adjustments.

        An empty list produces an all-zero summary with a neutral mean
        speed factor of 1.0.
        """
        tally = {"clean": 0, "stretched": 0, "truncated": 0}
        speed_total = 0.0
        worst_truncation = 0.0

        for adjustment in adjustments:
            speed_total += adjustment.speed_factor
            if adjustment.was_truncated:
                # Truncation takes precedence over the speed-based buckets.
                tally["truncated"] += 1
                lost = adjustment.original_duration - adjustment.actual_duration
                worst_truncation = max(worst_truncation, lost)
            elif abs(adjustment.speed_factor - 1.0) <= CLEAN_SPEED_TOLERANCE:
                tally["clean"] += 1
            else:
                tally["stretched"] += 1

        count = len(adjustments)
        return cls(
            total_segments=count,
            clean_count=tally["clean"],
            stretched_count=tally["stretched"],
            truncated_count=tally["truncated"],
            # Avoid dividing by zero when there was nothing to adjust.
            mean_speed_factor=speed_total / count if count else 1.0,
            max_truncation_seconds=worst_truncation,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the summary as a plain, JSON-serializable dictionary."""
        keys = (
            "total_segments",
            "clean_count",
            "stretched_count",
            "truncated_count",
            "mean_speed_factor",
            "max_truncation_seconds",
        )
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
        """Rebuild a summary from ``to_dict`` output.

        Raises:
            KeyError: If any of the six summary keys is missing.
        """
        keys = (
            "total_segments",
            "clean_count",
            "stretched_count",
            "truncated_count",
            "mean_speed_factor",
            "max_truncation_seconds",
        )
        return cls(**{key: data[key] for key in keys})
166
+
167
+
76
168
  @dataclass
77
169
  class DubbingResult:
78
170
  """Result of a video dubbing operation.
@@ -85,6 +177,9 @@ class DubbingResult:
85
177
  target_lang: Target language for dubbing.
86
178
  separated_audio: Separated audio components (if preserve_background=True).
87
179
  voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
180
+ timing_summary: Aggregate stats over per-segment timing adjustments.
181
+ transcript_quality: Heuristic quality assessment of the transcription
182
+ (None when the pipeline returned early on an empty transcription).
88
183
  """
89
184
 
90
185
  dubbed_audio: Audio
@@ -94,6 +189,8 @@ class DubbingResult:
94
189
  target_lang: str
95
190
  separated_audio: SeparatedAudio | None = None
96
191
  voice_samples: dict[str, Audio] = field(default_factory=dict)
192
+ timing_summary: TimingSummary | None = None
193
+ transcript_quality: TranscriptQuality | None = None
97
194
 
98
195
  @property
99
196
  def num_segments(self) -> int:
@@ -9,7 +9,8 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
9
9
 
10
10
  import numpy as np
11
11
 
12
- from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
12
+ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
13
+ from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
13
14
  from videopython.ai.dubbing.timing import TimingSynchronizer
14
15
 
15
16
  if TYPE_CHECKING:
@@ -46,6 +47,14 @@ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
46
47
 
47
48
  logger = logging.getLogger(__name__)
48
49
 
50
+ # Voice-sample quality gating thresholds. Tuned conservatively to favor
51
+ # accepting real-world dialogue over rejecting it; failures fall back to
52
+ # the longest segment with a WARNING log so we can re-tune from production
53
+ # data instead of guessing.
54
+ PEAK_CLIP_THRESHOLD = 0.99
55
+ MIN_VOCAL_BG_RMS_RATIO = 1.5
56
+ VOICE_SAMPLE_TARGET_DURATION = 6.0
57
+
49
58
 
50
59
  class LocalDubbingPipeline:
51
60
  """Local pipeline for video dubbing.
@@ -61,10 +70,18 @@ class LocalDubbingPipeline:
61
70
  device: str | None = None,
62
71
  low_memory: bool = False,
63
72
  whisper_model: WhisperModel = "turbo",
73
+ condition_on_previous_text: bool = False,
74
+ no_speech_threshold: float = 0.6,
75
+ logprob_threshold: float | None = -1.0,
76
+ strict_quality: bool = False,
64
77
  ):
65
78
  self.device = device
66
79
  self.low_memory = low_memory
67
80
  self.whisper_model = whisper_model
81
+ self.condition_on_previous_text = condition_on_previous_text
82
+ self.no_speech_threshold = no_speech_threshold
83
+ self.logprob_threshold = logprob_threshold
84
+ self.strict_quality = strict_quality
68
85
  requested = device.lower() if isinstance(device, str) else "auto"
69
86
  logger.info(
70
87
  "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
@@ -106,6 +123,9 @@ class LocalDubbingPipeline:
106
123
  model_name=self.whisper_model,
107
124
  device=self.device,
108
125
  enable_diarization=enable_diarization,
126
+ condition_on_previous_text=self.condition_on_previous_text,
127
+ no_speech_threshold=self.no_speech_threshold,
128
+ logprob_threshold=self.logprob_threshold,
109
129
  )
110
130
 
111
131
  def _init_translator(self) -> None:
@@ -132,12 +152,25 @@ class LocalDubbingPipeline:
132
152
 
133
153
  def _extract_voice_samples(
134
154
  self,
135
- audio: Any,
155
+ vocal_audio: Any,
156
+ background_audio: Any | None,
136
157
  transcription: Any,
137
158
  min_duration: float = 3.0,
138
159
  max_duration: float = 10.0,
139
160
  ) -> dict[str, Any]:
140
- """Extract voice samples for each speaker from the audio."""
161
+ """Extract a per-speaker voice sample with quality gating.
162
+
163
+ Picks the highest-scored segment per speaker after rejecting clipped
164
+ slices (peak >= ``PEAK_CLIP_THRESHOLD``) and slices where Demucs left
165
+ the background louder than the vocals
166
+ (``vocal_rms / bg_rms < MIN_VOCAL_BG_RMS_RATIO``). When the
167
+ background track isn't available (e.g. ``revoice`` after
168
+ ``low_memory`` dropped it), the RMS check is skipped silently.
169
+
170
+ Falls back to the longest available segment with a WARNING log when
171
+ every candidate is rejected, so the dub continues with the best
172
+ sample we have rather than silently dropping the speaker.
173
+ """
141
174
  from videopython.base.audio import Audio
142
175
 
143
176
  voice_samples: dict[str, Audio] = {}
@@ -150,29 +183,106 @@ class LocalDubbingPipeline:
150
183
  segments_by_speaker[speaker].append(segment)
151
184
 
152
185
  for speaker, segments in segments_by_speaker.items():
153
- target_duration = 6.0
154
- best_segment = None
155
- best_diff = float("inf")
156
-
157
- for segment in segments:
158
- duration = segment.end - segment.start
159
- if duration >= min_duration:
160
- diff = abs(duration - target_duration)
161
- if diff < best_diff:
162
- best_diff = diff
163
- best_segment = segment
164
-
165
- if best_segment is not None:
166
- start = best_segment.start
167
- end = min(best_segment.end, start + max_duration)
168
- sliced = audio.slice(start, end)
169
- # Audio.slice returns a numpy view into the source. Copy so the
170
- # short voice sample doesn't keep the full vocals array (~1.3 GB
171
- # for 2h sources) alive across translate + TTS.
172
- voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
186
+ chosen, fallback_reason = self._pick_voice_segment(
187
+ speaker, segments, vocal_audio, background_audio, min_duration
188
+ )
189
+
190
+ if chosen is None:
191
+ logger.warning("No usable voice-sample segment for speaker %r (no candidates)", speaker)
192
+ continue
193
+
194
+ if fallback_reason is not None:
195
+ logger.warning(
196
+ "Voice-sample quality fallback for speaker %r (%d candidates): %s — using longest segment",
197
+ speaker,
198
+ len(segments),
199
+ fallback_reason,
200
+ )
201
+
202
+ start = chosen.start
203
+ end = min(chosen.end, start + max_duration)
204
+ sliced = vocal_audio.slice(start, end)
205
+ # Audio.slice returns a numpy view into the source. Copy so the
206
+ # short voice sample doesn't keep the full vocals array (~1.3 GB
207
+ # for 2h sources) alive across translate + TTS.
208
+ voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
173
209
 
174
210
  return voice_samples
175
211
 
212
+ def _pick_voice_segment(
213
+ self,
214
+ speaker: str,
215
+ segments: list[Any],
216
+ vocal_audio: Any,
217
+ background_audio: Any | None,
218
+ min_duration: float,
219
+ ) -> tuple[Any | None, str | None]:
220
+ """Score eligible segments and pick the best one for ``speaker``.
221
+
222
+ Returns ``(segment, fallback_reason)``. ``fallback_reason`` is None
223
+ when scoring picked a segment cleanly; non-None when every candidate
224
+ was rejected and the longest segment was used instead.
225
+ """
226
+ if not segments:
227
+ return None, None
228
+
229
+ eligible = [s for s in segments if (s.end - s.start) >= min_duration]
230
+
231
+ rejection_reasons: list[str] = []
232
+ scored: list[tuple[float, Any]] = []
233
+ for segment in eligible:
234
+ score, reason = self._score_voice_segment(segment, vocal_audio, background_audio)
235
+ if score is None:
236
+ rejection_reasons.append(reason or "rejected")
237
+ else:
238
+ scored.append((score, segment))
239
+
240
+ if scored:
241
+ scored.sort(key=lambda item: item[0], reverse=True)
242
+ return scored[0][1], None
243
+
244
+ # All eligible segments rejected (or none met the min duration).
245
+ # Fall back to the longest segment overall so the speaker still
246
+ # gets a clone reference.
247
+ longest = max(segments, key=lambda s: s.end - s.start)
248
+ if eligible:
249
+ reason = ", ".join(sorted(set(rejection_reasons)))
250
+ else:
251
+ reason = f"no segment >= {min_duration:.1f}s"
252
+ return longest, reason
253
+
254
+ def _score_voice_segment(
255
+ self,
256
+ segment: Any,
257
+ vocal_audio: Any,
258
+ background_audio: Any | None,
259
+ ) -> tuple[float | None, str | None]:
260
+ """Return ``(score, reason)`` for a candidate segment.
261
+
262
+ ``score`` is ``None`` when the segment is rejected; ``reason`` carries
263
+ the rejection cause so the fallback logger can summarize.
264
+ """
265
+ vocal_slice = vocal_audio.slice(segment.start, segment.end)
266
+ if vocal_slice.data.size == 0:
267
+ return None, "empty slice"
268
+
269
+ peak = float(np.max(np.abs(vocal_slice.data)))
270
+ if peak >= PEAK_CLIP_THRESHOLD:
271
+ return None, "clipped"
272
+
273
+ vocal_rms = float(np.sqrt(np.mean(vocal_slice.data**2)))
274
+
275
+ if background_audio is not None:
276
+ bg_slice = background_audio.slice(segment.start, segment.end)
277
+ if bg_slice.data.size > 0:
278
+ bg_rms = float(np.sqrt(np.mean(bg_slice.data**2)))
279
+ if bg_rms > 0 and (vocal_rms / bg_rms) < MIN_VOCAL_BG_RMS_RATIO:
280
+ return None, "background-dominated"
281
+
282
+ duration = segment.end - segment.start
283
+ duration_penalty = abs(duration - VOICE_SAMPLE_TARGET_DURATION)
284
+ return vocal_rms - 0.05 * duration_penalty, None
285
+
176
286
  def process(
177
287
  self,
178
288
  source_audio: Audio,
@@ -257,6 +367,23 @@ class LocalDubbingPipeline:
257
367
  target_lang=target_lang,
258
368
  )
259
369
 
370
+ # Cheap heuristic gate before the expensive Demucs/translation/TTS
371
+ # stages. Lets strict_quality callers refuse-and-refund without
372
+ # running the rest of the pipeline; non-strict runs continue but
373
+ # surface the assessment on DubbingResult.
374
+ transcript_quality = assess_transcript(transcription, source_audio.metadata.duration_seconds)
375
+ if transcript_quality.recommendation == "reject" and self.strict_quality:
376
+ raise GarbageTranscriptError(
377
+ f"Refusing to dub: {', '.join(transcript_quality.flags)}",
378
+ transcript_quality,
379
+ )
380
+ if transcript_quality.recommendation in ("warn", "reject"):
381
+ logger.warning(
382
+ "Transcript quality flags raised: %s (recommendation=%s)",
383
+ ", ".join(transcript_quality.flags),
384
+ transcript_quality.recommendation,
385
+ )
386
+
260
387
  detected_lang = source_lang or transcription.language or "en"
261
388
 
262
389
  separated_audio: SeparatedAudio | None = None
@@ -294,7 +421,7 @@ class LocalDubbingPipeline:
294
421
  voice_samples: dict[str, Audio] = {}
295
422
  if voice_clone:
296
423
  report_progress("Extracting voice samples", 0.25)
297
- voice_samples = self._extract_voice_samples(vocal_audio, transcription)
424
+ voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
298
425
 
299
426
  # vocals is no longer needed; voice_samples are independent copies.
300
427
  # In low_memory mode this is the only ref keeping the buffer alive
@@ -305,10 +432,19 @@ class LocalDubbingPipeline:
305
432
  if self._translator is None:
306
433
  self._init_translator()
307
434
 
435
+ # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
436
+ # MarianMT runs sequentially over 8-segment batches; on a 15-min
437
+ # source that's minutes of silent dwell on 0.35 without per-batch
438
+ # ticks. Map the [0,1] translation fraction onto that 15% window.
439
+ def _on_translation_progress(fraction: float) -> None:
440
+ clamped = max(0.0, min(1.0, fraction))
441
+ report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
442
+
308
443
  translated_segments = self._translator.translate_segments(
309
444
  segments=transcription.segments,
310
445
  target_lang=target_lang,
311
446
  source_lang=detected_lang,
447
+ progress_callback=_on_translation_progress,
312
448
  )
313
449
  self._maybe_unload("_translator")
314
450
 
@@ -384,7 +520,8 @@ class LocalDubbingPipeline:
384
520
  self._init_synchronizer()
385
521
  assert self._synchronizer is not None
386
522
 
387
- synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
523
+ synchronized_segments, adjustments = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
524
+ timing_summary = TimingSummary.from_adjustments(adjustments)
388
525
  del dubbed_segments
389
526
 
390
527
  report_progress("Assembling final audio", 0.90)
@@ -420,6 +557,8 @@ class LocalDubbingPipeline:
420
557
  target_lang=target_lang,
421
558
  separated_audio=separated_audio,
422
559
  voice_samples=voice_samples,
560
+ timing_summary=timing_summary,
561
+ transcript_quality=transcript_quality,
423
562
  )
424
563
 
425
564
  def revoice(
@@ -477,7 +616,10 @@ class LocalDubbingPipeline:
477
616
  voice_sample: Audio | None = None
478
617
 
479
618
  if transcription.segments:
480
- voice_samples = self._extract_voice_samples(vocal_audio, transcription)
619
+ # revoice doesn't track the background after the low_memory drop,
620
+ # so quality gating degrades to "no RMS check" here. Clipping is
621
+ # still rejected.
622
+ voice_samples = self._extract_voice_samples(vocal_audio, None, transcription)
481
623
  if voice_samples:
482
624
  voice_sample = next(iter(voice_samples.values()))
483
625
 
@@ -0,0 +1,178 @@
1
+ """Cheap heuristics over a Whisper transcription to flag degenerate output.
2
+
3
+ Surfaces three failure modes seen in production where Demucs/translation/TTS
4
+ would otherwise spend minutes producing a useless dub:
5
+
6
+ - Dominant-phrase cascade — one phrase repeats across most segments. The
7
+ classic Whisper failure on ambient music / outro screens
8
+ ("Thank you for watching").
9
+ - Low decoder confidence — median per-segment ``avg_logprob`` is poor.
10
+ - Silent input misread as speech — total speech duration is tiny relative
11
+ to the clip's wall-clock duration (only meaningful on long inputs).
12
+
13
+ Each check raises a flag; a recommendation is derived from how many fired.
14
+ Threshold constants live at module scope so production data can re-tune them
15
+ without touching code structure.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import re
21
+ import statistics
22
+ from collections import Counter
23
+ from dataclasses import dataclass, field
24
+ from typing import TYPE_CHECKING, Any, Literal
25
+
26
+ if TYPE_CHECKING:
27
+ from videopython.base.text.transcription import Transcription
28
+
29
+
30
+ # Tuned conservatively to favor "warn" over "reject"; first-week production
31
+ # data may move them.
32
+ DOMINANT_PHRASE_FRACTION_THRESHOLD = 0.70
33
+ LOW_LOGPROB_MEDIAN_THRESHOLD = -1.5
34
+ LOW_SPEECH_FRACTION_THRESHOLD = 0.05
35
+ SHORT_CLIP_SECONDS = 30.0 # below this, speech-fraction is too unstable to trust
36
+
37
+
38
+ Recommendation = Literal["ok", "warn", "reject"]
39
+
40
+
41
+ _PUNCT_RE = re.compile(r"[^\w\s]+", re.UNICODE)
42
+ _WHITESPACE_RE = re.compile(r"\s+")
43
+
44
+
45
+ def _normalize_phrase(text: str) -> str:
46
+ """Lowercase, strip punctuation, collapse whitespace."""
47
+ cleaned = _PUNCT_RE.sub(" ", text.lower())
48
+ return _WHITESPACE_RE.sub(" ", cleaned).strip()
49
+
50
+
51
@dataclass
class TranscriptQuality:
    """Outcome of the heuristic quality checks on a transcription.

    Attributes:
        recommendation: ``"ok"``, ``"warn"`` or ``"reject"``.
        dominant_phrase: The repeating phrase behind the dominance flag,
            or None when that flag did not fire.
        dominant_phrase_fraction: Character-count share of the most common
            normalized segment phrase.
        median_avg_logprob: Median ``avg_logprob`` over the segments that
            carry one; None when no segment does.
        speech_fraction: Summed segment durations divided by the audio
            duration.
        flags: Human-readable descriptions of the checks that fired.
    """

    recommendation: Recommendation
    dominant_phrase: str | None
    dominant_phrase_fraction: float
    median_avg_logprob: float | None
    speech_fraction: float
    flags: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dictionary; ``flags`` is copied, not shared."""
        payload: dict[str, Any] = {}
        payload["recommendation"] = self.recommendation
        payload["dominant_phrase"] = self.dominant_phrase
        payload["dominant_phrase_fraction"] = self.dominant_phrase_fraction
        payload["median_avg_logprob"] = self.median_avg_logprob
        payload["speech_fraction"] = self.speech_fraction
        payload["flags"] = [*self.flags]
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TranscriptQuality:
        """Rebuild an assessment from a dictionary.

        Only ``"recommendation"`` is required (``KeyError`` otherwise);
        every other key falls back to a neutral default when absent.
        """
        lookup = data.get
        return cls(
            data["recommendation"],
            lookup("dominant_phrase"),
            lookup("dominant_phrase_fraction", 0.0),
            lookup("median_avg_logprob"),
            lookup("speech_fraction", 0.0),
            [*lookup("flags", [])],
        )
96
+
97
+
98
class GarbageTranscriptError(RuntimeError):
    """Signals that a transcript was judged too poor to dub.

    The dubbing pipeline raises it when ``strict_quality=True`` and the
    transcript heuristic returns ``recommendation="reject"``. The
    :class:`TranscriptQuality` that triggered it is kept on ``quality`` so
    handlers can read the flags without re-running the pipeline.
    """

    def __init__(self, message: str, quality: TranscriptQuality):
        self.quality = quality
        super().__init__(message)
109
+
110
+
111
def assess_transcript(
    transcription: Transcription,
    audio_duration_seconds: float,
) -> TranscriptQuality:
    """Score a transcription with three cheap checks and recommend an action.

    Checks: dominant phrase (one normalized phrase holds at least
    ``DOMINANT_PHRASE_FRACTION_THRESHOLD`` of all characters), low median
    ``avg_logprob``, and low speech fraction (only evaluated for audio
    longer than ``SHORT_CLIP_SECONDS``).

    Returns:
        A :class:`TranscriptQuality` whose recommendation is ``"reject"``
        when the dominance flag fires together with at least one other
        flag, ``"warn"`` when any flag fired, and ``"ok"`` otherwise.
    """
    segments = [*transcription.segments]

    # Character weight of every non-empty normalized phrase.
    # NOTE(review): a transcript with a single non-empty segment always has
    # a 100% share, so the dominance flag fires for it.
    weights: Counter[str] = Counter()
    for seg in segments:
        phrase = _normalize_phrase(seg.text)
        if phrase:
            weights[phrase] += len(phrase)

    top_phrase: str | None = None
    top_share = 0.0
    if weights:
        ((top_phrase, top_chars),) = weights.most_common(1)
        top_share = top_chars / sum(weights.values())

    # Segments without a logprob are left out of the median.
    logprobs = [seg.avg_logprob for seg in segments if seg.avg_logprob is not None]
    median_logprob = statistics.median(logprobs) if logprobs else None

    # Negative-length segments contribute nothing to the speech total.
    speech_seconds = sum(max(0.0, seg.end - seg.start) for seg in segments)
    if audio_duration_seconds > 0:
        speech_fraction = speech_seconds / audio_duration_seconds
    else:
        speech_fraction = 0.0

    flags: list[str] = []

    dominance_flag = top_share >= DOMINANT_PHRASE_FRACTION_THRESHOLD
    if dominance_flag:
        flags.append(f"dominant phrase {top_share:.0%}: {top_phrase!r}")

    logprob_flag = median_logprob is not None and median_logprob < LOW_LOGPROB_MEDIAN_THRESHOLD
    if logprob_flag:
        flags.append(f"median avg_logprob {median_logprob:.2f} below {LOW_LOGPROB_MEDIAN_THRESHOLD}")

    # Speech fraction is too unstable on short clips to be trusted there.
    speech_flag = audio_duration_seconds > SHORT_CLIP_SECONDS and speech_fraction < LOW_SPEECH_FRACTION_THRESHOLD
    if speech_flag:
        flags.append(f"speech fraction {speech_fraction:.1%} below {LOW_SPEECH_FRACTION_THRESHOLD:.0%}")

    # Dominance alone only warns: legitimately repetitive content (chants,
    # lyric clips) should not be rejected without a second signal.
    recommendation: Recommendation
    if not flags:
        recommendation = "ok"
    elif dominance_flag and (logprob_flag or speech_flag):
        recommendation = "reject"
    else:
        recommendation = "warn"

    return TranscriptQuality(
        recommendation=recommendation,
        dominant_phrase=top_phrase if dominance_flag else None,
        dominant_phrase_fraction=top_share,
        median_avg_logprob=median_logprob,
        speech_fraction=speech_fraction,
        flags=flags,
    )
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Any
5
+ from typing import Any, Callable
6
6
 
7
7
  from videopython.ai._device import log_device_initialization, release_device_memory, select_device
8
8
  from videopython.ai.dubbing.models import TranslatedSegment
@@ -135,8 +135,15 @@ class TextTranslator:
135
135
  texts: list[str],
136
136
  target_lang: str,
137
137
  source_lang: str | None = None,
138
+ progress_callback: Callable[[float], None] | None = None,
138
139
  ) -> list[str]:
139
- """Translate multiple texts to target language."""
140
+ """Translate multiple texts to target language.
141
+
142
+ ``progress_callback`` is called once per batch with a fraction in
143
+ ``[0, 1]`` representing translation-stage progress. It does not fire
144
+ on the empty-input or same-language shortcuts (those are O(0) work
145
+ and the caller frames its own progress events around the call).
146
+ """
140
147
  import torch
141
148
 
142
149
  if not texts:
@@ -150,8 +157,9 @@ class TextTranslator:
150
157
 
151
158
  translated: list[str] = []
152
159
  batch_size = 8
160
+ total = len(texts)
153
161
 
154
- for i in range(0, len(texts), batch_size):
162
+ for i in range(0, total, batch_size):
155
163
  batch = texts[i : i + batch_size]
156
164
  inputs = self._tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
157
165
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -162,6 +170,9 @@ class TextTranslator:
162
170
  for output in outputs:
163
171
  translated.append(self._tokenizer.decode(output, skip_special_tokens=True))
164
172
 
173
+ if progress_callback is not None:
174
+ progress_callback(min(1.0, (i + len(batch)) / total))
175
+
165
176
  return translated
166
177
 
167
178
  def translate_segments(
@@ -169,6 +180,7 @@ class TextTranslator:
169
180
  segments: list[TranscriptionSegment],
170
181
  target_lang: str,
171
182
  source_lang: str | None = None,
183
+ progress_callback: Callable[[float], None] | None = None,
172
184
  ) -> list[TranslatedSegment]:
173
185
  """Translate transcription segments while preserving timing/speaker info.
174
186
 
@@ -177,12 +189,18 @@ class TextTranslator:
177
189
  ``translated_text=""`` instead. This avoids MarianMT hallucinating
178
190
  full sentences from " .", "...", or single-token Whisper segments,
179
191
  which would otherwise be TTS'd into the dubbed track.
192
+
193
+ ``progress_callback`` is forwarded to :meth:`translate_batch` so
194
+ callers can render translation-stage progress without knowing the
195
+ batch size.
180
196
  """
181
197
  effective_source = source_lang or "en"
182
198
 
183
199
  translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
184
200
  translatable_texts = [segments[i].text for i in translatable_indices]
185
- translated_texts = self.translate_batch(translatable_texts, target_lang, source_lang)
201
+ translated_texts = self.translate_batch(
202
+ translatable_texts, target_lang, source_lang, progress_callback=progress_callback
203
+ )
186
204
 
187
205
  translation_map: dict[int, str] = dict(zip(translatable_indices, translated_texts))
188
206
 
@@ -20,6 +20,20 @@ class AudioToText:
20
20
  voiced regions only — fixes Whisper's tendency to lock onto the wrong
21
21
  language when the file opens with silence, music, or non-vocal credits.
22
22
  Disable with ``enable_vad=False`` to reproduce pre-0.27 behaviour.
23
+
24
+ Three Whisper decoder kwargs are surfaced for anti-hallucination tuning:
25
+
26
+ - ``condition_on_previous_text`` defaults to ``False`` (Whisper's own
27
+ default is ``True``). With conditioning on, a single hallucinated filler
28
+ phrase cascades through the rest of the file because each window's
29
+ decoder is primed by the previous window's decoded text. Turning it off
30
+ is the most commonly recommended fix for that failure mode; the cost on
31
+ clean audio is small (slightly less context for ambiguous homophones
32
+ across sentence boundaries).
33
+ - ``no_speech_threshold`` and ``logprob_threshold`` are forwarded with
34
+ Whisper's documented defaults (``0.6`` and ``-1.0``); raising
35
+ ``no_speech_threshold`` biases toward dropping low-confidence windows
36
+ instead of emitting filler.
23
37
  """
24
38
 
25
39
  PYANNOTE_DIARIZATION_MODEL = "pyannote/speaker-diarization-community-1"
@@ -29,11 +43,17 @@ class AudioToText:
29
43
  model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "turbo",
30
44
  enable_diarization: bool = False,
31
45
  enable_vad: bool = True,
46
+ condition_on_previous_text: bool = False,
47
+ no_speech_threshold: float = 0.6,
48
+ logprob_threshold: float | None = -1.0,
32
49
  device: str | None = None,
33
50
  ):
34
51
  self.model_name = model_name
35
52
  self.enable_diarization = enable_diarization
36
53
  self.enable_vad = enable_vad
54
+ self.condition_on_previous_text = condition_on_previous_text
55
+ self.no_speech_threshold = no_speech_threshold
56
+ self.logprob_threshold = logprob_threshold
37
57
  self.device = select_device(device, mps_allowed=False)
38
58
  log_device_initialization(
39
59
  "AudioToText",
@@ -44,6 +64,16 @@ class AudioToText:
44
64
  self._diarization_pipeline: Any = None
45
65
  self._vad_model: Any = None
46
66
 
67
+ def _transcribe_kwargs(self, language: str | None) -> dict[str, Any]:
68
+ """Kwargs threaded into ``whisper.Whisper.transcribe`` from both call sites."""
69
+ return {
70
+ "word_timestamps": True,
71
+ "language": language,
72
+ "condition_on_previous_text": self.condition_on_previous_text,
73
+ "no_speech_threshold": self.no_speech_threshold,
74
+ "logprob_threshold": self.logprob_threshold,
75
+ }
76
+
47
77
  def _init_local(self) -> None:
48
78
  """Initialize local Whisper model."""
49
79
  import whisper
@@ -92,6 +122,9 @@ class AudioToText:
92
122
  end=segment["end"],
93
123
  text=segment["text"],
94
124
  words=transcription_words,
125
+ avg_logprob=segment.get("avg_logprob"),
126
+ no_speech_prob=segment.get("no_speech_prob"),
127
+ compression_ratio=segment.get("compression_ratio"),
95
128
  )
96
129
  transcription_segments.append(transcription_segment)
97
130
 
@@ -253,7 +286,7 @@ class AudioToText:
253
286
  self._init_diarization()
254
287
 
255
288
  audio_data = audio_mono.data
256
- transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
289
+ transcription_result = self._model.transcribe(audio=audio_data, **self._transcribe_kwargs(language))
257
290
 
258
291
  waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
259
292
  diarization_result = self._diarization_pipeline(
@@ -300,7 +333,7 @@ class AudioToText:
300
333
  if self.enable_diarization:
301
334
  return self._transcribe_with_diarization(audio_mono, language)
302
335
 
303
- transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
336
+ transcription_result = self._model.transcribe(audio=audio_mono.data, **self._transcribe_kwargs(language))
304
337
  return self._process_transcription_result(transcription_result)
305
338
 
306
339
  def transcribe(self, media: Audio | Video) -> Transcription:
@@ -40,6 +40,9 @@ class TranscriptionSegment:
40
40
  text: str
41
41
  words: list[TranscriptionWord]
42
42
  speaker: str | None = None
43
+ avg_logprob: float | None = None
44
+ no_speech_prob: float | None = None
45
+ compression_ratio: float | None = None
43
46
 
44
47
  def to_dict(self) -> dict:
45
48
  """Convert to dictionary for JSON serialization."""
@@ -49,6 +52,9 @@ class TranscriptionSegment:
49
52
  "text": self.text,
50
53
  "words": [w.to_dict() for w in self.words],
51
54
  "speaker": self.speaker,
55
+ "avg_logprob": self.avg_logprob,
56
+ "no_speech_prob": self.no_speech_prob,
57
+ "compression_ratio": self.compression_ratio,
52
58
  }
53
59
 
54
60
  @classmethod
@@ -60,6 +66,9 @@ class TranscriptionSegment:
60
66
  text=data["text"],
61
67
  words=[TranscriptionWord.from_dict(w) for w in data["words"]],
62
68
  speaker=data.get("speaker"),
69
+ avg_logprob=data.get("avg_logprob"),
70
+ no_speech_prob=data.get("no_speech_prob"),
71
+ compression_ratio=data.get("compression_ratio"),
63
72
  )
64
73
 
65
74
 
File without changes
File without changes
File without changes