videopython 0.27.2__tar.gz → 0.28.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {videopython-0.27.2 → videopython-0.28.0}/PKG-INFO +1 -1
  2. {videopython-0.27.2 → videopython-0.28.0}/pyproject.toml +1 -1
  3. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/__init__.py +4 -0
  4. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/dubber.py +10 -0
  5. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/models.py +97 -0
  6. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/pipeline.py +159 -26
  7. videopython-0.28.0/src/videopython/ai/dubbing/quality.py +178 -0
  8. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/translation.py +22 -4
  9. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/audio.py +3 -0
  10. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/text/transcription.py +9 -0
  11. {videopython-0.27.2 → videopython-0.28.0}/.gitignore +0 -0
  12. {videopython-0.27.2 → videopython-0.28.0}/LICENSE +0 -0
  13. {videopython-0.27.2 → videopython-0.28.0}/README.md +0 -0
  14. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/__init__.py +0 -0
  15. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/__init__.py +0 -0
  16. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/_device.py +0 -0
  17. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/remux.py +0 -0
  18. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/timing.py +0 -0
  19. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/__init__.py +0 -0
  20. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/audio.py +0 -0
  21. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/image.py +0 -0
  22. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/video.py +0 -0
  23. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/registry.py +0 -0
  24. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/__init__.py +0 -0
  25. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/inpainter.py +0 -0
  26. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/models.py +0 -0
  27. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/segmenter.py +0 -0
  28. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/swapper.py +0 -0
  29. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/transforms.py +0 -0
  30. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/__init__.py +0 -0
  31. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/image.py +0 -0
  32. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/separation.py +0 -0
  33. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/temporal.py +0 -0
  34. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/video_analysis.py +0 -0
  35. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/__init__.py +0 -0
  36. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/audio/__init__.py +0 -0
  37. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/audio/analysis.py +0 -0
  38. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/audio/audio.py +0 -0
  39. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/combine.py +0 -0
  40. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/description.py +0 -0
  41. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/effects.py +0 -0
  42. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/exceptions.py +0 -0
  43. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/progress.py +0 -0
  44. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/registry.py +0 -0
  45. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/scene.py +0 -0
  46. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/streaming.py +0 -0
  47. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/text/__init__.py +0 -0
  48. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/text/overlay.py +0 -0
  49. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/transforms.py +0 -0
  50. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/transitions.py +0 -0
  51. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/utils.py +0 -0
  52. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/video.py +0 -0
  53. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/__init__.py +0 -0
  54. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/multicam.py +0 -0
  55. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/premiere_xml.py +0 -0
  56. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/video_edit.py +0 -0
  57. {videopython-0.27.2 → videopython-0.28.0}/src/videopython/py.typed +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videopython
- Version: 0.27.2
+ Version: 0.28.0
  Summary: Minimal video generation and processing library.
  Project-URL: Homepage, https://videopython.com
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "videopython"
- version = "0.27.2"
+ version = "0.28.0"
  description = "Minimal video generation and processing library."
  authors = [
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
src/videopython/ai/dubbing/__init__.py
@@ -3,6 +3,7 @@
  from videopython.ai.dubbing.dubber import VideoDubber
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
+ from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
  from videopython.ai.dubbing.timing import TimingSynchronizer

  __all__ = [
@@ -13,4 +14,7 @@ __all__ = [
  "SeparatedAudio",
  "LocalDubbingPipeline",
  "TimingSynchronizer",
+ "GarbageTranscriptError",
+ "TranscriptQuality",
+ "assess_transcript",
  ]
src/videopython/ai/dubbing/dubber.py
@@ -37,6 +37,13 @@ class VideoDubber:
  gate; raise to drop more low-confidence windows.
  logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
  log-probability gate.
+ strict_quality: When True, the pipeline raises
+ :class:`GarbageTranscriptError` before Demucs/translation/TTS run
+ if the transcript-quality heuristic returns ``"reject"``. When
+ False (default), low-quality transcripts are logged at WARNING
+ but processing continues. Either way the
+ :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
+ inspection.
  """

  def __init__(
@@ -47,6 +54,7 @@ class VideoDubber:
  condition_on_previous_text: bool = False,
  no_speech_threshold: float = 0.6,
  logprob_threshold: float | None = -1.0,
+ strict_quality: bool = False,
  ):
  self.device = device
  self.low_memory = low_memory
@@ -54,6 +62,7 @@ class VideoDubber:
  self.condition_on_previous_text = condition_on_previous_text
  self.no_speech_threshold = no_speech_threshold
  self.logprob_threshold = logprob_threshold
+ self.strict_quality = strict_quality
  self._local_pipeline: Any = None
  requested = device.lower() if isinstance(device, str) else "auto"
  logger.info(
@@ -73,6 +82,7 @@ class VideoDubber:
  condition_on_previous_text=self.condition_on_previous_text,
  no_speech_threshold=self.no_speech_threshold,
  logprob_threshold=self.logprob_threshold,
+ strict_quality=self.strict_quality,
  )

  def dub(
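
A usage sketch for the new flag (illustrative only: the hunk above cuts off at ``def dub(``, so the call arguments and the constructor defaults assumed here are not confirmed by this diff; ``VideoDubber`` and ``GarbageTranscriptError`` are exported from ``videopython.ai.dubbing`` per the __init__.py hunk):

    from videopython.ai.dubbing import GarbageTranscriptError, VideoDubber

    # Other constructor parameters are assumed to default.
    dubber = VideoDubber(strict_quality=True)
    try:
        # Hypothetical arguments; dub()'s full signature is not shown above.
        result = dubber.dub("talk.mp4", target_lang="de")
    except GarbageTranscriptError as err:
        # Raised before Demucs/translation/TTS run; the triggering
        # assessment travels on the exception as `quality`.
        print("refused:", ", ".join(err.quality.flags))
    else:
        # Non-strict runs never raise; the assessment is still surfaced
        # on the DubbingResult (assuming dub() returns it).
        if result.transcript_quality is not None:
            print("quality:", result.transcript_quality.recommendation)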
src/videopython/ai/dubbing/models.py
@@ -3,10 +3,21 @@
  from __future__ import annotations

  from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Any

  from videopython.base.audio import Audio
  from videopython.base.text.transcription import Transcription, TranscriptionSegment

+ if TYPE_CHECKING:
+ from videopython.ai.dubbing.quality import TranscriptQuality
+ from videopython.ai.dubbing.timing import TimingAdjustment
+
+
+ # Speed factors within this band of 1.0 are treated as a "clean" timing
+ # adjustment (no perceptible compression/stretch). Heuristic threshold for
+ # the TimingSummary classification only.
+ CLEAN_SPEED_TOLERANCE = 0.01
+

  @dataclass
  class TranslatedSegment:
@@ -73,6 +84,87 @@ class SeparatedAudio:
  return self.music is not None and self.effects is not None


+ @dataclass
+ class TimingSummary:
+ """Aggregate stats over per-segment timing adjustments.
+
+ Surfaces how aggressively the timing synchronizer had to compress or
+ truncate dubbed segments to fit the source's spoken regions. High
+ truncation rates indicate translation produced text too long for the
+ source duration.
+ """
+
+ total_segments: int
+ clean_count: int
+ stretched_count: int
+ truncated_count: int
+ mean_speed_factor: float
+ max_truncation_seconds: float
+
+ @classmethod
+ def from_adjustments(cls, adjustments: list[TimingAdjustment]) -> TimingSummary:
+ """Aggregate a list of TimingAdjustments into a TimingSummary."""
+ total = len(adjustments)
+ if total == 0:
+ return cls(
+ total_segments=0,
+ clean_count=0,
+ stretched_count=0,
+ truncated_count=0,
+ mean_speed_factor=1.0,
+ max_truncation_seconds=0.0,
+ )
+
+ clean = 0
+ stretched = 0
+ truncated = 0
+ speed_sum = 0.0
+ max_truncation = 0.0
+ for adj in adjustments:
+ speed_sum += adj.speed_factor
+ if adj.was_truncated:
+ truncated += 1
+ truncation = adj.original_duration - adj.actual_duration
+ if truncation > max_truncation:
+ max_truncation = truncation
+ elif abs(adj.speed_factor - 1.0) <= CLEAN_SPEED_TOLERANCE:
+ clean += 1
+ else:
+ stretched += 1
+
+ return cls(
+ total_segments=total,
+ clean_count=clean,
+ stretched_count=stretched,
+ truncated_count=truncated,
+ mean_speed_factor=speed_sum / total,
+ max_truncation_seconds=max_truncation,
+ )
+
+ def to_dict(self) -> dict[str, Any]:
+ """Convert to dictionary for JSON serialization."""
+ return {
+ "total_segments": self.total_segments,
+ "clean_count": self.clean_count,
+ "stretched_count": self.stretched_count,
+ "truncated_count": self.truncated_count,
+ "mean_speed_factor": self.mean_speed_factor,
+ "max_truncation_seconds": self.max_truncation_seconds,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
+ """Create TimingSummary from dictionary."""
+ return cls(
+ total_segments=data["total_segments"],
+ clean_count=data["clean_count"],
+ stretched_count=data["stretched_count"],
+ truncated_count=data["truncated_count"],
+ mean_speed_factor=data["mean_speed_factor"],
+ max_truncation_seconds=data["max_truncation_seconds"],
+ )
+
+
  @dataclass
  class DubbingResult:
  """Result of a video dubbing operation.
@@ -85,6 +177,9 @@ class DubbingResult:
  target_lang: Target language for dubbing.
  separated_audio: Separated audio components (if preserve_background=True).
  voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
+ timing_summary: Aggregate stats over per-segment timing adjustments.
+ transcript_quality: Heuristic quality assessment of the transcription
+ (None when the pipeline returned early on an empty transcription).
  """

  dubbed_audio: Audio
@@ -94,6 +189,8 @@ class DubbingResult:
  target_lang: str
  separated_audio: SeparatedAudio | None = None
  voice_samples: dict[str, Audio] = field(default_factory=dict)
+ timing_summary: TimingSummary | None = None
+ transcript_quality: TranscriptQuality | None = None

  @property
  def num_segments(self) -> int:
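
A minimal sketch of the classification rules above. The stand-in dataclass mimics the four fields ``from_adjustments`` actually reads (``speed_factor``, ``was_truncated``, ``original_duration``, ``actual_duration``); the real ``TimingAdjustment`` lives in ``videopython.ai.dubbing.timing`` and is not shown in this diff, so duck typing stands in for it here:

    from dataclasses import dataclass

    from videopython.ai.dubbing.models import TimingSummary

    @dataclass
    class FakeAdjustment:  # stand-in for timing.TimingAdjustment
        speed_factor: float
        was_truncated: bool
        original_duration: float
        actual_duration: float

    summary = TimingSummary.from_adjustments([
        FakeAdjustment(1.0, False, 3.2, 3.2),  # clean: within CLEAN_SPEED_TOLERANCE of 1.0
        FakeAdjustment(1.2, False, 4.0, 4.0),  # stretched: speed off 1.0, no truncation
        FakeAdjustment(1.5, True, 5.0, 4.1),   # truncated by 0.9 s
    ])
    assert (summary.clean_count, summary.stretched_count, summary.truncated_count) == (1, 1, 1)
    assert abs(summary.max_truncation_seconds - 0.9) < 1e-9
    print(summary.to_dict())

Note that ``was_truncated`` wins over the speed check: a truncated segment counts as truncated even when its speed factor is near 1.0.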
src/videopython/ai/dubbing/pipeline.py
@@ -9,7 +9,8 @@ from typing import TYPE_CHECKING, Any, Callable, Literal

  import numpy as np

- from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
+ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
+ from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
  from videopython.ai.dubbing.timing import TimingSynchronizer

  if TYPE_CHECKING:
@@ -46,6 +47,14 @@ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]

  logger = logging.getLogger(__name__)

+ # Voice-sample quality gating thresholds. Tuned conservatively to favor
+ # accepting real-world dialogue over rejecting it; failures fall back to
+ # the longest segment with a WARNING log so we can re-tune from production
+ # data instead of guessing.
+ PEAK_CLIP_THRESHOLD = 0.99
+ MIN_VOCAL_BG_RMS_RATIO = 1.5
+ VOICE_SAMPLE_TARGET_DURATION = 6.0
+

  class LocalDubbingPipeline:
  """Local pipeline for video dubbing.
@@ -64,6 +73,7 @@ class LocalDubbingPipeline:
  condition_on_previous_text: bool = False,
  no_speech_threshold: float = 0.6,
  logprob_threshold: float | None = -1.0,
+ strict_quality: bool = False,
  ):
  self.device = device
  self.low_memory = low_memory
@@ -71,6 +81,7 @@ class LocalDubbingPipeline:
  self.condition_on_previous_text = condition_on_previous_text
  self.no_speech_threshold = no_speech_threshold
  self.logprob_threshold = logprob_threshold
+ self.strict_quality = strict_quality
  requested = device.lower() if isinstance(device, str) else "auto"
  logger.info(
  "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
@@ -141,12 +152,25 @@ class LocalDubbingPipeline:

  def _extract_voice_samples(
  self,
- audio: Any,
+ vocal_audio: Any,
+ background_audio: Any | None,
  transcription: Any,
  min_duration: float = 3.0,
  max_duration: float = 10.0,
  ) -> dict[str, Any]:
- """Extract voice samples for each speaker from the audio."""
+ """Extract a per-speaker voice sample with quality gating.
+
+ Picks the highest-scored segment per speaker after rejecting clipped
+ slices (peak >= ``PEAK_CLIP_THRESHOLD``) and slices where Demucs left
+ the background louder than the vocals
+ (``vocal_rms / bg_rms < MIN_VOCAL_BG_RMS_RATIO``). When the
+ background track isn't available (e.g. ``revoice`` after
+ ``low_memory`` dropped it), the RMS check is skipped silently.
+
+ Falls back to the longest available segment with a WARNING log when
+ every candidate is rejected, so the dub continues with the best
+ sample we have rather than silently dropping the speaker.
+ """
  from videopython.base.audio import Audio

  voice_samples: dict[str, Audio] = {}
@@ -159,29 +183,106 @@ class LocalDubbingPipeline:
  segments_by_speaker[speaker].append(segment)

  for speaker, segments in segments_by_speaker.items():
- target_duration = 6.0
- best_segment = None
- best_diff = float("inf")
-
- for segment in segments:
- duration = segment.end - segment.start
- if duration >= min_duration:
- diff = abs(duration - target_duration)
- if diff < best_diff:
- best_diff = diff
- best_segment = segment
-
- if best_segment is not None:
- start = best_segment.start
- end = min(best_segment.end, start + max_duration)
- sliced = audio.slice(start, end)
- # Audio.slice returns a numpy view into the source. Copy so the
- # short voice sample doesn't keep the full vocals array (~1.3 GB
- # for 2h sources) alive across translate + TTS.
- voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
+ chosen, fallback_reason = self._pick_voice_segment(
+ speaker, segments, vocal_audio, background_audio, min_duration
+ )
+
+ if chosen is None:
+ logger.warning("No usable voice-sample segment for speaker %r (no candidates)", speaker)
+ continue
+
+ if fallback_reason is not None:
+ logger.warning(
+ "Voice-sample quality fallback for speaker %r (%d candidates): %s — using longest segment",
+ speaker,
+ len(segments),
+ fallback_reason,
+ )
+
+ start = chosen.start
+ end = min(chosen.end, start + max_duration)
+ sliced = vocal_audio.slice(start, end)
+ # Audio.slice returns a numpy view into the source. Copy so the
+ # short voice sample doesn't keep the full vocals array (~1.3 GB
+ # for 2h sources) alive across translate + TTS.
+ voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)

  return voice_samples

+ def _pick_voice_segment(
+ self,
+ speaker: str,
+ segments: list[Any],
+ vocal_audio: Any,
+ background_audio: Any | None,
+ min_duration: float,
+ ) -> tuple[Any | None, str | None]:
+ """Score eligible segments and pick the best one for ``speaker``.
+
+ Returns ``(segment, fallback_reason)``. ``fallback_reason`` is None
+ when scoring picked a segment cleanly; non-None when every candidate
+ was rejected and the longest segment was used instead.
+ """
+ if not segments:
+ return None, None
+
+ eligible = [s for s in segments if (s.end - s.start) >= min_duration]
+
+ rejection_reasons: list[str] = []
+ scored: list[tuple[float, Any]] = []
+ for segment in eligible:
+ score, reason = self._score_voice_segment(segment, vocal_audio, background_audio)
+ if score is None:
+ rejection_reasons.append(reason or "rejected")
+ else:
+ scored.append((score, segment))
+
+ if scored:
+ scored.sort(key=lambda item: item[0], reverse=True)
+ return scored[0][1], None
+
+ # All eligible segments rejected (or none met the min duration).
+ # Fall back to the longest segment overall so the speaker still
+ # gets a clone reference.
+ longest = max(segments, key=lambda s: s.end - s.start)
+ if eligible:
+ reason = ", ".join(sorted(set(rejection_reasons)))
+ else:
+ reason = f"no segment >= {min_duration:.1f}s"
+ return longest, reason
+
+ def _score_voice_segment(
+ self,
+ segment: Any,
+ vocal_audio: Any,
+ background_audio: Any | None,
+ ) -> tuple[float | None, str | None]:
+ """Return ``(score, reason)`` for a candidate segment.
+
+ ``score`` is ``None`` when the segment is rejected; ``reason`` carries
+ the rejection cause so the fallback logger can summarize.
+ """
+ vocal_slice = vocal_audio.slice(segment.start, segment.end)
+ if vocal_slice.data.size == 0:
+ return None, "empty slice"
+
+ peak = float(np.max(np.abs(vocal_slice.data)))
+ if peak >= PEAK_CLIP_THRESHOLD:
+ return None, "clipped"
+
+ vocal_rms = float(np.sqrt(np.mean(vocal_slice.data**2)))
+
+ if background_audio is not None:
+ bg_slice = background_audio.slice(segment.start, segment.end)
+ if bg_slice.data.size > 0:
+ bg_rms = float(np.sqrt(np.mean(bg_slice.data**2)))
+ if bg_rms > 0 and (vocal_rms / bg_rms) < MIN_VOCAL_BG_RMS_RATIO:
+ return None, "background-dominated"
+
+ duration = segment.end - segment.start
+ duration_penalty = abs(duration - VOICE_SAMPLE_TARGET_DURATION)
+ return vocal_rms - 0.05 * duration_penalty, None
+
  def process(
  self,
  source_audio: Audio,
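
The two rejection gates in ``_score_voice_segment`` reduce to plain array math. A self-contained sketch with deterministic sine waves standing in for Demucs stems (the thresholds mirror the module constants introduced above):

    import numpy as np

    PEAK_CLIP_THRESHOLD = 0.99
    MIN_VOCAL_BG_RMS_RATIO = 1.5
    VOICE_SAMPLE_TARGET_DURATION = 6.0

    t = np.linspace(0.0, 4.0, 4 * 16_000, endpoint=False)
    vocal = 0.5 * np.sin(2 * np.pi * 220.0 * t)      # peak 0.5: not "clipped"
    background = 0.1 * np.sin(2 * np.pi * 55.0 * t)  # quiet residual bed

    peak = float(np.max(np.abs(vocal)))
    vocal_rms = float(np.sqrt(np.mean(vocal**2)))    # ~0.354 (0.5 / sqrt(2))
    bg_rms = float(np.sqrt(np.mean(background**2)))  # ~0.071

    assert peak < PEAK_CLIP_THRESHOLD                    # gate 1 passes
    assert vocal_rms / bg_rms >= MIN_VOCAL_BG_RMS_RATIO  # gate 2 passes (~5x)

    # Score: RMS minus a mild penalty for straying from the 6 s target.
    duration = 4.0
    score = vocal_rms - 0.05 * abs(duration - VOICE_SAMPLE_TARGET_DURATION)
    print(round(score, 3))  # ~0.254

Loud, clean vocals near six seconds score highest; a clipped slice or one whose background RMS rivals the vocals is rejected outright rather than down-ranked.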
src/videopython/ai/dubbing/pipeline.py (continued)
@@ -266,6 +367,23 @@ class LocalDubbingPipeline:
  target_lang=target_lang,
  )

+ # Cheap heuristic gate before the expensive Demucs/translation/TTS
+ # stages. Lets strict_quality callers refuse-and-refund without
+ # running the rest of the pipeline; non-strict runs continue but
+ # surface the assessment on DubbingResult.
+ transcript_quality = assess_transcript(transcription, source_audio.metadata.duration_seconds)
+ if transcript_quality.recommendation == "reject" and self.strict_quality:
+ raise GarbageTranscriptError(
+ f"Refusing to dub: {', '.join(transcript_quality.flags)}",
+ transcript_quality,
+ )
+ if transcript_quality.recommendation in ("warn", "reject"):
+ logger.warning(
+ "Transcript quality flags raised: %s (recommendation=%s)",
+ ", ".join(transcript_quality.flags),
+ transcript_quality.recommendation,
+ )
+
  detected_lang = source_lang or transcription.language or "en"

  separated_audio: SeparatedAudio | None = None
@@ -303,7 +421,7 @@ class LocalDubbingPipeline:
  voice_samples: dict[str, Audio] = {}
  if voice_clone:
  report_progress("Extracting voice samples", 0.25)
- voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+ voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)

  # vocals is no longer needed; voice_samples are independent copies.
  # In low_memory mode this is the only ref keeping the buffer alive
@@ -314,10 +432,19 @@ class LocalDubbingPipeline:
  if self._translator is None:
  self._init_translator()

+ # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
+ # MarianMT runs sequentially over 8-segment batches; on a 15-min
+ # source that's minutes of silent dwell on 0.35 without per-batch
+ # ticks. Map the [0,1] translation fraction onto that 15% window.
+ def _on_translation_progress(fraction: float) -> None:
+ clamped = max(0.0, min(1.0, fraction))
+ report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
+
  translated_segments = self._translator.translate_segments(
  segments=transcription.segments,
  target_lang=target_lang,
  source_lang=detected_lang,
+ progress_callback=_on_translation_progress,
  )
  self._maybe_unload("_translator")

@@ -393,7 +520,8 @@ class LocalDubbingPipeline:
  self._init_synchronizer()
  assert self._synchronizer is not None

- synchronized_segments, _ = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+ synchronized_segments, adjustments = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
+ timing_summary = TimingSummary.from_adjustments(adjustments)
  del dubbed_segments

  report_progress("Assembling final audio", 0.90)
@@ -429,6 +557,8 @@ class LocalDubbingPipeline:
  target_lang=target_lang,
  separated_audio=separated_audio,
  voice_samples=voice_samples,
+ timing_summary=timing_summary,
+ transcript_quality=transcript_quality,
  )

  def revoice(
@@ -486,7 +616,10 @@ class LocalDubbingPipeline:
  voice_sample: Audio | None = None

  if transcription.segments:
- # revoice doesn't track the background after the low_memory drop,
- voice_samples = self._extract_voice_samples(vocal_audio, transcription)
+ # revoice doesn't track the background after the low_memory drop,
+ # so quality gating degrades to "no RMS check" here. Clipping is
+ # still rejected.
+ voice_samples = self._extract_voice_samples(vocal_audio, None, transcription)
  if voice_samples:
  voice_sample = next(iter(voice_samples.values()))
src/videopython/ai/dubbing/quality.py (new file)
@@ -0,0 +1,178 @@
+ """Cheap heuristics over a Whisper transcription to flag degenerate output.
+
+ Surfaces three failure modes seen in production where Demucs/translation/TTS
+ would otherwise spend minutes producing a useless dub:
+
+ - Dominant-phrase cascade — one phrase repeats across most segments. The
+ classic Whisper failure on ambient music / outro screens
+ ("Thank you for watching").
+ - Low decoder confidence — median per-segment ``avg_logprob`` is poor.
+ - Silent input misread as speech — total speech duration is tiny relative
+ to the clip's wall-clock duration (only meaningful on long inputs).
+
+ Each check raises a flag; a recommendation is derived from how many fired.
+ Threshold constants live at module scope so production data can re-tune them
+ without touching code structure.
+ """
+
+ from __future__ import annotations
+
+ import re
+ import statistics
+ from collections import Counter
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Any, Literal
+
+ if TYPE_CHECKING:
+ from videopython.base.text.transcription import Transcription
+
+
+ # Tuned conservatively to favor "warn" over "reject"; first-week production
+ # data may move them.
+ DOMINANT_PHRASE_FRACTION_THRESHOLD = 0.70
+ LOW_LOGPROB_MEDIAN_THRESHOLD = -1.5
+ LOW_SPEECH_FRACTION_THRESHOLD = 0.05
+ SHORT_CLIP_SECONDS = 30.0  # below this, speech-fraction is too unstable to trust
+
+
+ Recommendation = Literal["ok", "warn", "reject"]
+
+
+ _PUNCT_RE = re.compile(r"[^\w\s]+", re.UNICODE)
+ _WHITESPACE_RE = re.compile(r"\s+")
+
+
+ def _normalize_phrase(text: str) -> str:
+ """Lowercase, strip punctuation, collapse whitespace."""
+ cleaned = _PUNCT_RE.sub(" ", text.lower())
+ return _WHITESPACE_RE.sub(" ", cleaned).strip()
+
+
+ @dataclass
+ class TranscriptQuality:
+ """Quality assessment of a Whisper transcription.
+
+ Attributes:
+ recommendation: ``"ok"`` (continue), ``"warn"`` (continue, log), or
+ ``"reject"`` (caller should refuse to dub if strict_quality).
+ dominant_phrase: The repeating phrase that triggered the dominance
+ flag, or None when the flag didn't fire.
+ dominant_phrase_fraction: Character-count share of the most common
+ normalized segment phrase. 0.0 when no segments.
+ median_avg_logprob: Median of ``avg_logprob`` across segments that
+ carry it; None when no segment had a logprob (e.g. SRT-loaded).
+ speech_fraction: Sum of segment durations divided by the audio's
+ wall-clock duration.
+ flags: Human-readable list of which checks fired.
+ """
+
+ recommendation: Recommendation
+ dominant_phrase: str | None
+ dominant_phrase_fraction: float
+ median_avg_logprob: float | None
+ speech_fraction: float
+ flags: list[str] = field(default_factory=list)
+
+ def to_dict(self) -> dict[str, Any]:
+ return {
+ "recommendation": self.recommendation,
+ "dominant_phrase": self.dominant_phrase,
+ "dominant_phrase_fraction": self.dominant_phrase_fraction,
+ "median_avg_logprob": self.median_avg_logprob,
+ "speech_fraction": self.speech_fraction,
+ "flags": list(self.flags),
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> TranscriptQuality:
+ return cls(
+ recommendation=data["recommendation"],
+ dominant_phrase=data.get("dominant_phrase"),
+ dominant_phrase_fraction=data.get("dominant_phrase_fraction", 0.0),
+ median_avg_logprob=data.get("median_avg_logprob"),
+ speech_fraction=data.get("speech_fraction", 0.0),
+ flags=list(data.get("flags", [])),
+ )
+
+
+ class GarbageTranscriptError(RuntimeError):
+ """Raised by the dubbing pipeline when ``strict_quality=True`` and the
+ transcript heuristic returns ``recommendation="reject"``.
+
+ The triggering :class:`TranscriptQuality` is attached as ``quality`` so
+ callers can introspect the flags without re-running the pipeline.
+ """
+
+ def __init__(self, message: str, quality: TranscriptQuality):
+ super().__init__(message)
+ self.quality = quality
+
+
+ def assess_transcript(
+ transcription: Transcription,
+ audio_duration_seconds: float,
+ ) -> TranscriptQuality:
+ """Run the three quality checks and return a recommendation.
+
+ See module docstring for what each check looks for.
+ """
+ segments = list(transcription.segments)
+
+ # Dominant-phrase share by character count.
+ dominant_phrase: str | None = None
+ dominant_fraction = 0.0
+ if segments:
+ normalized = [_normalize_phrase(s.text) for s in segments]
+ char_counts: Counter[str] = Counter()
+ total_chars = 0
+ for phrase in normalized:
+ if not phrase:
+ continue
+ n = len(phrase)
+ char_counts[phrase] += n
+ total_chars += n
+ if total_chars > 0 and char_counts:
+ most_common_phrase, most_common_chars = char_counts.most_common(1)[0]
+ dominant_fraction = most_common_chars / total_chars
+ dominant_phrase = most_common_phrase
+
+ # Median avg_logprob across segments that carry it.
+ logprobs = [s.avg_logprob for s in segments if s.avg_logprob is not None]
+ median_logprob = statistics.median(logprobs) if logprobs else None
+
+ # Speech fraction = sum of segment durations / audio duration.
+ speech_seconds = sum(max(0.0, s.end - s.start) for s in segments)
+ speech_fraction = speech_seconds / audio_duration_seconds if audio_duration_seconds > 0 else 0.0
+
+ flags: list[str] = []
+ dominance_flag = dominant_fraction >= DOMINANT_PHRASE_FRACTION_THRESHOLD
+ if dominance_flag:
+ flags.append(f"dominant phrase {dominant_fraction:.0%}: {dominant_phrase!r}")
+
+ logprob_flag = median_logprob is not None and median_logprob < LOW_LOGPROB_MEDIAN_THRESHOLD
+ if logprob_flag:
+ flags.append(f"median avg_logprob {median_logprob:.2f} below {LOW_LOGPROB_MEDIAN_THRESHOLD}")
+
+ # Speech-fraction is unstable on short clips; skip it there.
+ speech_flag = audio_duration_seconds > SHORT_CLIP_SECONDS and speech_fraction < LOW_SPEECH_FRACTION_THRESHOLD
+ if speech_flag:
+ flags.append(f"speech fraction {speech_fraction:.1%} below {LOW_SPEECH_FRACTION_THRESHOLD:.0%}")
+
+ # Reject only when dominance + at least one other flag fires; legitimate
+ # repetitive content (chants, lyric clips) should warn, not reject.
+ recommendation: Recommendation
+ if dominance_flag and (logprob_flag or speech_flag):
+ recommendation = "reject"
+ elif flags:
+ recommendation = "warn"
+ else:
+ recommendation = "ok"
+
+ return TranscriptQuality(
+ recommendation=recommendation,
+ dominant_phrase=dominant_phrase if dominance_flag else None,
+ dominant_phrase_fraction=dominant_fraction,
+ median_avg_logprob=median_logprob,
+ speech_fraction=speech_fraction,
+ flags=flags,
+ )
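
A sketch of the reject path. At runtime ``assess_transcript`` only reads ``.segments`` and each segment's ``start``/``end``/``text``/``avg_logprob`` (the ``Transcription`` annotation is TYPE_CHECKING-only), so duck-typed stand-ins are enough for illustration:

    from types import SimpleNamespace

    from videopython.ai.dubbing.quality import assess_transcript

    def seg(start: float, end: float, text: str, logprob: float) -> SimpleNamespace:
        return SimpleNamespace(start=start, end=end, text=text, avg_logprob=logprob)

    # The classic failure: "Thank you for watching." cascading over a long,
    # mostly silent input with poor decoder confidence.
    segments = [seg(i * 30.0, i * 30.0 + 2.0, "Thank you for watching.", -2.1) for i in range(10)]
    fake_transcription = SimpleNamespace(segments=segments)

    quality = assess_transcript(fake_transcription, audio_duration_seconds=600.0)
    # Dominance (100% one phrase) + low median logprob (-2.1 < -1.5) + low
    # speech fraction (20 s / 600 s = 3.3% < 5%) -> "reject".
    assert quality.recommendation == "reject"
    print(quality.flags)

Dropping the low logprob and shortening the silence would leave only the dominance flag, which downgrades the result to "warn" per the rule in the final block above.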
src/videopython/ai/generation/translation.py
@@ -2,7 +2,7 @@

  from __future__ import annotations

- from typing import Any
+ from typing import Any, Callable

  from videopython.ai._device import log_device_initialization, release_device_memory, select_device
  from videopython.ai.dubbing.models import TranslatedSegment
@@ -135,8 +135,15 @@ class TextTranslator:
  texts: list[str],
  target_lang: str,
  source_lang: str | None = None,
+ progress_callback: Callable[[float], None] | None = None,
  ) -> list[str]:
- """Translate multiple texts to target language."""
+ """Translate multiple texts to target language.
+
+ ``progress_callback`` is called once per batch with a fraction in
+ ``[0, 1]`` representing translation-stage progress. It does not fire
+ on the empty-input or same-language shortcuts (those are O(0) work
+ and the caller frames its own progress events around the call).
+ """
  import torch

  if not texts:
@@ -150,8 +157,9 @@ class TextTranslator:

  translated: list[str] = []
  batch_size = 8
+ total = len(texts)

- for i in range(0, len(texts), batch_size):
+ for i in range(0, total, batch_size):
  batch = texts[i : i + batch_size]
  inputs = self._tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -162,6 +170,9 @@ class TextTranslator:
  for output in outputs:
  translated.append(self._tokenizer.decode(output, skip_special_tokens=True))

+ if progress_callback is not None:
+ progress_callback(min(1.0, (i + len(batch)) / total))
+
  return translated

  def translate_segments(
@@ -169,6 +180,7 @@ class TextTranslator:
  segments: list[TranscriptionSegment],
  target_lang: str,
  source_lang: str | None = None,
+ progress_callback: Callable[[float], None] | None = None,
  ) -> list[TranslatedSegment]:
  """Translate transcription segments while preserving timing/speaker info.

@@ -177,12 +189,18 @@ class TextTranslator:
  ``translated_text=""`` instead. This avoids MarianMT hallucinating
  full sentences from " .", "...", or single-token Whisper segments,
  which would otherwise be TTS'd into the dubbed track.
+
+ ``progress_callback`` is forwarded to :meth:`translate_batch` so
+ callers can render translation-stage progress without knowing the
+ batch size.
  """
  effective_source = source_lang or "en"

  translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
  translatable_texts = [segments[i].text for i in translatable_indices]
- translated_texts = self.translate_batch(translatable_texts, target_lang, source_lang)
+ translated_texts = self.translate_batch(
+ translatable_texts, target_lang, source_lang, progress_callback=progress_callback
+ )

  translation_map: dict[int, str] = dict(zip(translatable_indices, translated_texts))
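The per-batch fractions are easy to predict. A standalone arithmetic sketch of what a callback would see for 20 texts at the fixed ``batch_size = 8``, and how ``LocalDubbingPipeline`` maps those fractions onto its 0.35 to 0.50 progress window (no model load involved; ``translate_batch`` computes the same fractions internally):

    batch_size, total = 8, 20
    fractions = []
    for i in range(0, total, batch_size):
        batch_len = min(batch_size, total - i)  # len(texts[i : i + batch_size])
        fractions.append(min(1.0, (i + batch_len) / total))

    assert fractions == [0.4, 0.8, 1.0]  # one tick per batch, ending at 1.0

    # The pipeline's _on_translation_progress maps f onto 0.35 + 0.15 * f.
    print([round(0.35 + 0.15 * f, 3) for f in fractions])  # [0.41, 0.47, 0.5]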
src/videopython/ai/understanding/audio.py
@@ -122,6 +122,9 @@ class AudioToText:
  end=segment["end"],
  text=segment["text"],
  words=transcription_words,
+ avg_logprob=segment.get("avg_logprob"),
+ no_speech_prob=segment.get("no_speech_prob"),
+ compression_ratio=segment.get("compression_ratio"),
  )
  transcription_segments.append(transcription_segment)

src/videopython/base/text/transcription.py
@@ -40,6 +40,9 @@ class TranscriptionSegment:
  text: str
  words: list[TranscriptionWord]
  speaker: str | None = None
+ avg_logprob: float | None = None
+ no_speech_prob: float | None = None
+ compression_ratio: float | None = None

  def to_dict(self) -> dict:
  """Convert to dictionary for JSON serialization."""
@@ -49,6 +52,9 @@ class TranscriptionSegment:
  "text": self.text,
  "words": [w.to_dict() for w in self.words],
  "speaker": self.speaker,
+ "avg_logprob": self.avg_logprob,
+ "no_speech_prob": self.no_speech_prob,
+ "compression_ratio": self.compression_ratio,
  }

  @classmethod
@@ -60,6 +66,9 @@ class TranscriptionSegment:
  text=data["text"],
  words=[TranscriptionWord.from_dict(w) for w in data["words"]],
  speaker=data.get("speaker"),
+ avg_logprob=data.get("avg_logprob"),
+ no_speech_prob=data.get("no_speech_prob"),
+ compression_ratio=data.get("compression_ratio"),
  )

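The new confidence fields are optional end to end, which keeps transcriptions serialized by older versions loadable. A sketch, assuming ``TranscriptionSegment`` is a dataclass constructible with the keyword fields shown above plus its pre-existing ``start``/``end``:

    from videopython.base.text.transcription import TranscriptionSegment

    # Constructor fields beyond text/words/speaker are assumptions from context.
    seg = TranscriptionSegment(start=0.0, end=1.2, text="hello", words=[], avg_logprob=-0.42)
    restored = TranscriptionSegment.from_dict(seg.to_dict())
    assert restored.avg_logprob == -0.42
    assert restored.no_speech_prob is None  # unset fields stay None

    # JSON written by videopython <= 0.27.2 has no confidence keys; the
    # .get() defaults in from_dict keep it loadable.
    legacy = {"start": 0.0, "end": 1.2, "text": "hello", "words": [], "speaker": None}
    assert TranscriptionSegment.from_dict(legacy).avg_logprob is None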