videopython 0.29.0__tar.gz → 0.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {videopython-0.29.0 → videopython-0.30.0}/PKG-INFO +2 -3
  2. {videopython-0.29.0 → videopython-0.30.0}/README.md +1 -2
  3. {videopython-0.29.0 → videopython-0.30.0}/pyproject.toml +1 -1
  4. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/__init__.py +0 -3
  5. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/__init__.py +0 -3
  6. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/dubber.py +9 -12
  7. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/models.py +1 -2
  8. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/pipeline.py +19 -146
  9. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/transforms.py +10 -8
  10. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/audio.py +97 -11
  11. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/temporal.py +1 -17
  12. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/video_analysis.py +12 -12
  13. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/__init__.py +0 -5
  14. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/effects.py +17 -14
  15. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/registry.py +0 -9
  16. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/streaming.py +7 -4
  17. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/text/overlay.py +6 -3
  18. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/transforms.py +15 -12
  19. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/video.py +2 -2
  20. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/editing/__init__.py +0 -2
  21. videopython-0.29.0/src/videopython/ai/dubbing/cache.py +0 -309
  22. videopython-0.29.0/src/videopython/ai/swapping/__init__.py +0 -46
  23. videopython-0.29.0/src/videopython/ai/swapping/inpainter.py +0 -264
  24. videopython-0.29.0/src/videopython/ai/swapping/models.py +0 -221
  25. videopython-0.29.0/src/videopython/ai/swapping/segmenter.py +0 -577
  26. videopython-0.29.0/src/videopython/ai/swapping/swapper.py +0 -524
  27. videopython-0.29.0/src/videopython/base/combine.py +0 -61
  28. videopython-0.29.0/src/videopython/base/progress.py +0 -63
  29. videopython-0.29.0/src/videopython/base/utils.py +0 -6
  30. videopython-0.29.0/src/videopython/editing/premiere_xml.py +0 -313
  31. {videopython-0.29.0 → videopython-0.30.0}/.gitignore +0 -0
  32. {videopython-0.29.0 → videopython-0.30.0}/LICENSE +0 -0
  33. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/__init__.py +0 -0
  34. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/_device.py +0 -0
  35. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/quality.py +0 -0
  36. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/remux.py +0 -0
  37. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/dubbing/timing.py +0 -0
  38. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/__init__.py +0 -0
  39. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/audio.py +0 -0
  40. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/image.py +0 -0
  41. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/qwen3.py +0 -0
  42. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/translation.py +0 -0
  43. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/generation/video.py +0 -0
  44. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/registry.py +0 -0
  45. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/__init__.py +0 -0
  46. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/faces.py +0 -0
  47. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/image.py +0 -0
  48. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/ai/understanding/separation.py +0 -0
  49. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/audio/__init__.py +0 -0
  50. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/audio/analysis.py +0 -0
  51. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/audio/audio.py +0 -0
  52. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/description.py +0 -0
  53. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/exceptions.py +0 -0
  54. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/scene.py +0 -0
  55. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/text/__init__.py +0 -0
  56. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/text/transcription.py +0 -0
  57. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/base/transitions.py +0 -0
  58. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/editing/multicam.py +0 -0
  59. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/editing/video_edit.py +0 -0
  60. {videopython-0.29.0 → videopython-0.30.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.29.0
3
+ Version: 0.30.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -201,9 +201,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
201
201
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
202
202
  | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
203
203
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
204
- | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
205
204
 
206
- API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/) | [Object Swapping](https://videopython.com/api/ai/swapping/)
205
+ API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
207
206
 
208
207
  ## Examples
209
208
 
@@ -152,9 +152,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
152
152
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
153
153
  | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
154
154
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
155
- | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
156
155
 
157
- API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/) | [Object Swapping](https://videopython.com/api/ai/swapping/)
156
+ API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
158
157
 
159
158
  ## Examples
160
159
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.29.0"
3
+ version = "0.30.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -1,7 +1,6 @@
1
1
  from videopython.ai import registry as _ai_registry # noqa: F401
2
2
 
3
3
  from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
4
- from .swapping import ObjectSwapper
5
4
  from .transforms import FaceTrackingCrop, SplitScreenComposite
6
5
  from .understanding import (
7
6
  AudioClassifier,
@@ -28,8 +27,6 @@ __all__ = [
28
27
  # Transforms (AI-powered)
29
28
  "FaceTrackingCrop",
30
29
  "SplitScreenComposite",
31
- # Swapping
32
- "ObjectSwapper",
33
30
  # Video analysis
34
31
  "VideoAnalysis",
35
32
  "VideoAnalysisConfig",
@@ -1,6 +1,5 @@
1
1
  """Local video dubbing functionality."""
2
2
 
3
- from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
4
3
  from videopython.ai.dubbing.dubber import VideoDubber
5
4
  from videopython.ai.dubbing.models import (
6
5
  DubbingResult,
@@ -26,7 +25,5 @@ __all__ = [
26
25
  "TranscriptQuality",
27
26
  "assess_transcript",
28
27
  "UnsupportedLanguageError",
29
- "DubCache",
30
- "dub_cache_clear",
31
28
  "Expressiveness",
32
29
  ]
@@ -37,6 +37,11 @@ class VideoDubber:
37
37
  gate; raise to drop more low-confidence windows.
38
38
  logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
39
39
  log-probability gate.
40
+ vocabulary: Forwarded to ``AudioToText``. Optional list of brand
41
+ names, product names, or proper nouns to bias Whisper's first-
42
+ window decoder via ``initial_prompt``. Recovers near-mishears
43
+ (e.g. Klarna → "carna") on brand-monitoring inputs without new
44
+ model deps.
40
45
  strict_quality: When True, the pipeline raises
41
46
  :class:`GarbageTranscriptError` before Demucs/translation/TTS run
42
47
  if the transcript-quality heuristic returns ``"reject"``. When
@@ -50,13 +55,6 @@ class VideoDubber:
50
55
  See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
51
56
  for tradeoffs (Qwen3 is slower on CPU but produces
52
57
  context-aware, length-budgeted output).
53
- cache_dir: When set, persist transcription, translated segments,
54
- and per-segment TTS WAVs under this directory and skip stages
55
- whose inputs already match a cache entry. Use to resume crashed
56
- long runs or to iterate on dub configuration without paying
57
- transcription cost each time. ``None`` (default) disables
58
- caching. Cache grows unbounded; clear via
59
- :func:`videopython.ai.dubbing.cache.dub_cache_clear`.
60
58
  """
61
59
 
62
60
  def __init__(
@@ -67,9 +65,9 @@ class VideoDubber:
67
65
  condition_on_previous_text: bool = False,
68
66
  no_speech_threshold: float = 0.6,
69
67
  logprob_threshold: float | None = -1.0,
68
+ vocabulary: list[str] | None = None,
70
69
  strict_quality: bool = False,
71
70
  translator: TranslatorChoice = "auto",
72
- cache_dir: str | Path | None = None,
73
71
  ):
74
72
  self.device = device
75
73
  self.low_memory = low_memory
@@ -77,18 +75,17 @@ class VideoDubber:
77
75
  self.condition_on_previous_text = condition_on_previous_text
78
76
  self.no_speech_threshold = no_speech_threshold
79
77
  self.logprob_threshold = logprob_threshold
78
+ self.vocabulary = vocabulary
80
79
  self.strict_quality = strict_quality
81
80
  self.translator = translator
82
- self.cache_dir = cache_dir
83
81
  self._local_pipeline: Any = None
84
82
  requested = device.lower() if isinstance(device, str) else "auto"
85
83
  logger.info(
86
- "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
84
+ "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
87
85
  requested,
88
86
  low_memory,
89
87
  whisper_model,
90
88
  translator,
91
- cache_dir,
92
89
  )
93
90
 
94
91
  def _init_local_pipeline(self) -> None:
@@ -101,9 +98,9 @@ class VideoDubber:
101
98
  condition_on_previous_text=self.condition_on_previous_text,
102
99
  no_speech_threshold=self.no_speech_threshold,
103
100
  logprob_threshold=self.logprob_threshold,
101
+ vocabulary=self.vocabulary,
104
102
  strict_quality=self.strict_quality,
105
103
  translator=self.translator,
106
- cache_dir=self.cache_dir,
107
104
  )
108
105
 
109
106
  def dub(
@@ -41,8 +41,7 @@ class Expressiveness:
41
41
  def as_kwargs(self) -> dict[str, float]:
42
42
  """Knobs as a dict, dropping ``None`` entries.
43
43
 
44
- Suitable for ``**``-expansion into Chatterbox or
45
- :meth:`DubCache.tts_key`.
44
+ Suitable for ``**``-expansion into Chatterbox.
46
45
  """
47
46
  return {
48
47
  name: value
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
10
10
  import numpy as np
11
11
 
12
12
  from videopython.ai._device import select_device
13
- from videopython.ai.dubbing.cache import DubCache
14
13
  from videopython.ai.dubbing.models import DubbingResult, Expressiveness, RevoiceResult, SeparatedAudio, TimingSummary
15
14
  from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
16
15
  from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -170,9 +169,9 @@ class LocalDubbingPipeline:
170
169
  condition_on_previous_text: bool = False,
171
170
  no_speech_threshold: float = 0.6,
172
171
  logprob_threshold: float | None = -1.0,
172
+ vocabulary: list[str] | None = None,
173
173
  strict_quality: bool = False,
174
174
  translator: TranslatorChoice = "auto",
175
- cache_dir: str | Path | None = None,
176
175
  ):
177
176
  self.device = device
178
177
  self.low_memory = low_memory
@@ -180,17 +179,16 @@ class LocalDubbingPipeline:
180
179
  self.condition_on_previous_text = condition_on_previous_text
181
180
  self.no_speech_threshold = no_speech_threshold
182
181
  self.logprob_threshold = logprob_threshold
182
+ self.vocabulary = vocabulary
183
183
  self.strict_quality = strict_quality
184
184
  self.translator = translator
185
- self.cache_dir = Path(cache_dir) if cache_dir is not None else None
186
185
  requested = device.lower() if isinstance(device, str) else "auto"
187
186
  logger.info(
188
- "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
187
+ "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
189
188
  requested,
190
189
  low_memory,
191
190
  whisper_model,
192
191
  translator,
193
- self.cache_dir,
194
192
  )
195
193
 
196
194
  self._transcriber: Any = None
@@ -200,7 +198,6 @@ class LocalDubbingPipeline:
200
198
  self._tts_language: str | None = None
201
199
  self._separator: Any = None
202
200
  self._synchronizer: TimingSynchronizer | None = None
203
- self._cache: DubCache | None = DubCache(self.cache_dir) if self.cache_dir is not None else None
204
201
 
205
202
  def _maybe_unload(self, component_name: str) -> None:
206
203
  """Unload a stage's model when low_memory mode is enabled.
@@ -219,88 +216,42 @@ class LocalDubbingPipeline:
219
216
  logger.info("low_memory: unloading %s", component_name.lstrip("_"))
220
217
  unload()
221
218
 
222
- def _transcribe_with_cache(
219
+ def _transcribe(
223
220
  self,
224
221
  source_audio: Audio,
225
222
  enable_diarization: bool,
226
223
  ) -> Transcription:
227
- """Run transcription with cache-around-the-call.
228
-
229
- Cache miss: lazy-init the transcriber, transcribe, store the
230
- result (including all hashed kwargs in metadata.json so future
231
- invalidators have provenance).
232
- Cache hit: return the deserialized :class:`Transcription` without
233
- touching Whisper/diarization at all.
234
- """
235
- src_hash, kwargs_hash = self._transcription_cache_keys(source_audio, enable_diarization)
236
- if self._cache is not None:
237
- cached = self._cache.get_transcription(src_hash, kwargs_hash)
238
- if cached is not None:
239
- return cached
240
-
224
+ """Lazy-init the transcriber and run it on ``source_audio``."""
241
225
  if self._transcriber is None or self._transcriber_diarization != enable_diarization:
242
226
  self._init_transcriber(enable_diarization=enable_diarization)
243
227
  self._transcriber_diarization = enable_diarization
244
228
 
245
229
  transcription = self._transcriber.transcribe(source_audio)
246
230
  self._maybe_unload("_transcriber")
247
-
248
- if self._cache is not None:
249
- self._cache.put_transcription(
250
- src_hash,
251
- kwargs_hash,
252
- transcription,
253
- hash_inputs={
254
- "whisper_model": self.whisper_model,
255
- "enable_diarization": enable_diarization,
256
- "condition_on_previous_text": self.condition_on_previous_text,
257
- "no_speech_threshold": self.no_speech_threshold,
258
- "logprob_threshold": self.logprob_threshold,
259
- },
260
- )
261
231
  return transcription
262
232
 
263
233
  def _tts_segment_audio(
264
234
  self,
265
235
  segment: TranslatedSegment,
266
236
  speaker: str,
267
- speaker_bytes: bytes | None,
268
237
  target_lang: str,
269
238
  voice_clone: bool,
270
239
  voice_samples: dict[str, Audio],
271
240
  speaker_wav_paths: dict[str, Path],
272
- src_hash_for_tts: str,
273
241
  expressiveness: Expressiveness = Expressiveness(),
274
242
  ) -> Audio | None:
275
- """Produce the TTS audio for a single segment, with cache-around-the-call.
243
+ """Produce the TTS audio for a single segment.
276
244
 
277
245
  Returns the synthesized :class:`Audio`, or ``None`` if Chatterbox
278
- crashed on the segment (the caller skips it). On cache miss the
279
- TTS model is lazy-initialized and the per-speaker temp WAV is
280
- materialized before generation; on cache hit none of that runs,
281
- so a fully-cached run never loads Chatterbox.
246
+ crashed on the segment (the caller skips it). The TTS model is
247
+ lazy-initialized and per-speaker temp WAVs are materialized once
248
+ across the loop.
282
249
 
283
250
  ``expressiveness`` carries the M4 Chatterbox knobs derived from
284
251
  the source segment's prosody. Default is the no-knobs profile —
285
252
  lets Chatterbox use its own defaults — so callers that don't yet
286
253
  derive prosody (e.g. ``revoice``) keep pre-M4 behaviour.
287
254
  """
288
- from videopython.base.audio import Audio as _Audio
289
-
290
- tts_cache_key: str | None = None
291
- if self._cache is not None:
292
- tts_cache_key = DubCache.tts_key(
293
- translated_text=segment.translated_text,
294
- voice_sample_bytes=speaker_bytes,
295
- language=target_lang,
296
- **expressiveness.as_kwargs(),
297
- )
298
- cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
299
- if cached_path is not None:
300
- return _Audio.from_path(cached_path)
301
-
302
- # Cache miss: pay for TTS init + voice-sample WAV exactly once
303
- # across the loop. Both are wasted work when every segment hits.
304
255
  if self._tts is None or self._tts_language != target_lang:
305
256
  self._init_tts(language=target_lang)
306
257
  self._tts_language = target_lang
@@ -311,7 +262,7 @@ class LocalDubbingPipeline:
311
262
 
312
263
  wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
313
264
  try:
314
- dubbed_audio = self._tts.generate_audio(
265
+ return self._tts.generate_audio(
315
266
  segment.translated_text,
316
267
  voice_sample_path=wav_path,
317
268
  **expressiveness.as_kwargs(),
@@ -329,39 +280,19 @@ class LocalDubbingPipeline:
329
280
  )
330
281
  return None
331
282
 
332
- if self._cache is not None and tts_cache_key is not None:
333
- dubbed_audio.save(self._cache.reserve_tts_path(src_hash_for_tts, tts_cache_key))
334
- return dubbed_audio
335
-
336
- def _translate_with_cache(
283
+ def _translate(
337
284
  self,
338
285
  transcription: Transcription,
339
- source_audio: Audio,
340
286
  source_lang: str,
341
287
  target_lang: str,
342
288
  report_progress: Callable[[str, float], None],
343
289
  ) -> tuple[list[TranslatedSegment], list[int]]:
344
- """Run translation with cache-around-the-call.
290
+ """Translate the transcription's segments into ``target_lang``.
345
291
 
346
- Returns ``(translated_segments, translation_failures)``. Only
347
- fully-successful translations are cached — partial Qwen failures
348
- would otherwise lock in an incomplete dub across runs. The
292
+ Returns ``(translated_segments, translation_failures)``. The
349
293
  progress callback maps the backend's [0, 1] fraction onto the
350
294
  pipeline's translation window (0.35 → 0.50).
351
295
  """
352
- from videopython.ai.dubbing.models import TranslatedSegment
353
-
354
- cache_key: str | None = None
355
- if self._cache is not None:
356
- cache_key = DubCache.translation_key(
357
- source_lang=source_lang,
358
- target_lang=target_lang,
359
- translator_class=self._resolved_translator_class_name(source_lang, target_lang),
360
- )
361
- cached = self._cache.get_translation(DubCache.source_key(source_audio), cache_key)
362
- if cached is not None:
363
- return [TranslatedSegment.from_dict(d) for d in cached], []
364
-
365
296
  if self._translator is None:
366
297
  self._init_translator(source_lang=source_lang, target_lang=target_lang)
367
298
 
@@ -384,31 +315,8 @@ class LocalDubbingPipeline:
384
315
  translation_failures = list(self._translator.translation_failures)
385
316
  self._maybe_unload("_translator")
386
317
 
387
- if self._cache is not None and cache_key is not None and not translation_failures:
388
- self._cache.put_translation(
389
- DubCache.source_key(source_audio),
390
- cache_key,
391
- [s.to_dict() for s in translated_segments],
392
- )
393
-
394
318
  return translated_segments, translation_failures
395
319
 
396
- def _transcription_cache_keys(self, source_audio: Audio, enable_diarization: bool = False) -> tuple[str, str]:
397
- """Return ``(src_hash, kwargs_hash)`` for the current transcription config.
398
-
399
- Centralizes the kwarg list so the cache lookup, the put, and any
400
- future invalidator agree on what's hashed.
401
- """
402
- src_hash = DubCache.source_key(source_audio)
403
- kwargs_hash = DubCache.transcription_kwargs_hash(
404
- whisper_model=self.whisper_model,
405
- enable_diarization=enable_diarization,
406
- condition_on_previous_text=self.condition_on_previous_text,
407
- no_speech_threshold=self.no_speech_threshold,
408
- logprob_threshold=self.logprob_threshold,
409
- )
410
- return src_hash, kwargs_hash
411
-
412
320
  def _init_transcriber(self, enable_diarization: bool = False) -> None:
413
321
  """Initialize the transcription model."""
414
322
  from videopython.ai.understanding.audio import AudioToText
@@ -420,6 +328,7 @@ class LocalDubbingPipeline:
420
328
  condition_on_previous_text=self.condition_on_previous_text,
421
329
  no_speech_threshold=self.no_speech_threshold,
422
330
  logprob_threshold=self.logprob_threshold,
331
+ vocabulary=self.vocabulary,
423
332
  )
424
333
 
425
334
  def _init_translator(self, source_lang: str, target_lang: str) -> None:
@@ -439,31 +348,6 @@ class LocalDubbingPipeline:
439
348
  else: # "auto"
440
349
  self._translator = self._resolve_translator_auto(source_lang, target_lang)
441
350
 
442
- def _resolved_translator_class_name(self, source_lang: str, target_lang: str) -> str:
443
- """Return the *class name* of the translator that ``_init_translator``
444
- would pick — without constructing one.
445
-
446
- Used by the cache to key translations on the resolved backend rather
447
- than the user-supplied ``"auto"``: a CPU run that resolves to Marian
448
- must not collide with a GPU run that resolves to Qwen.
449
- """
450
- if self.translator == "marian":
451
- return "MarianTranslator"
452
- if self.translator == "qwen3":
453
- return "Qwen3Translator"
454
- # auto — mirror _resolve_translator_auto's branching, no construction.
455
- device = select_device(self.device, mps_allowed=True)
456
- has_gpu = device in ("cuda", "mps")
457
- if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
458
- return "Qwen3Translator"
459
- if MarianTranslator.has_model_for(source_lang, target_lang):
460
- return "MarianTranslator"
461
- if Qwen3Translator.supports(source_lang, target_lang):
462
- return "Qwen3Translator"
463
- # No backend supports the pair — _init_translator will raise. We
464
- # return a sentinel; the cache miss path will pay that cost.
465
- return "Unsupported"
466
-
467
351
  def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
468
352
  """Pick a backend based on language coverage AND device.
469
353
 
@@ -723,7 +607,7 @@ class LocalDubbingPipeline:
723
607
  )
724
608
  else:
725
609
  report_progress("Transcribing audio", 0.05)
726
- transcription = self._transcribe_with_cache(source_audio, enable_diarization)
610
+ transcription = self._transcribe(source_audio, enable_diarization)
727
611
 
728
612
  if not transcription.segments:
729
613
  return DubbingResult(
@@ -791,8 +675,8 @@ class LocalDubbingPipeline:
791
675
  voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
792
676
 
793
677
  report_progress("Translating text", 0.35)
794
- translated_segments, translation_failures = self._translate_with_cache(
795
- transcription, source_audio, detected_lang, target_lang, report_progress
678
+ translated_segments, translation_failures = self._translate(
679
+ transcription, detected_lang, target_lang, report_progress
796
680
  )
797
681
 
798
682
  # Per-segment expressiveness derived from source vocals RMS.
@@ -818,21 +702,12 @@ class LocalDubbingPipeline:
818
702
 
819
703
  report_progress("Generating dubbed speech", 0.50)
820
704
 
821
- # Per-speaker voice-sample bytes for TTS cache key. Empty when
822
- # voice_clone=False — the cache key still differentiates "no voice
823
- # sample" from "specific clone" via the None path.
824
- voice_sample_bytes: dict[str, bytes] = (
825
- {speaker: sample.data.tobytes() for speaker, sample in voice_samples.items()} if voice_clone else {}
826
- )
827
- src_hash_for_tts = DubCache.source_key(source_audio) if self._cache is not None else ""
828
-
829
705
  dubbed_segments: list[Audio] = []
830
706
  target_durations: list[float] = []
831
707
  start_times: list[float] = []
832
708
 
833
- # Per-speaker temp WAVs are materialized lazily by _tts_segment_audio
834
- # so a fully-cached run never writes one. The dict is loop-scoped
835
- # state so the finally block can clean up regardless of cache outcome.
709
+ # Per-speaker temp WAVs are materialized lazily by _tts_segment_audio.
710
+ # The dict is loop-scoped state so the finally block can clean up.
836
711
  speaker_wav_paths: dict[str, Path] = {}
837
712
  try:
838
713
  for i, segment in enumerate(translated_segments):
@@ -852,12 +727,10 @@ class LocalDubbingPipeline:
852
727
  dubbed_audio = self._tts_segment_audio(
853
728
  segment=segment,
854
729
  speaker=speaker,
855
- speaker_bytes=voice_sample_bytes.get(speaker),
856
730
  target_lang=target_lang,
857
731
  voice_clone=voice_clone,
858
732
  voice_samples=voice_samples,
859
733
  speaker_wav_paths=speaker_wav_paths,
860
- src_hash_for_tts=src_hash_for_tts,
861
734
  expressiveness=expressiveness_per_segment[i],
862
735
  )
863
736
  if dubbed_audio is None:
@@ -54,7 +54,6 @@ class FaceTrackingCrop(Transformation):
54
54
  vertical_offset: float = -0.1,
55
55
  framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = "offset",
56
56
  headroom: float = 0.15,
57
- lead_room: float = 0.1,
58
57
  smoothing: float = 0.8,
59
58
  max_speed: float | None = None,
60
59
  fallback: Literal["center", "last_position", "full_frame"] = "last_position",
@@ -77,7 +76,6 @@ class FaceTrackingCrop(Transformation):
77
76
  - "thirds": Place face near the upper-third line.
78
77
  - "dynamic": Currently same as "headroom".
79
78
  headroom: Headroom amount for framing rules that use it.
80
- lead_room: Reserved for future motion/look-direction framing.
81
79
  smoothing: Position smoothing factor (0-1, higher = smoother).
82
80
  max_speed: Optional max camera movement per frame (normalized).
83
81
  fallback: Behavior when no face detected.
@@ -92,7 +90,6 @@ class FaceTrackingCrop(Transformation):
92
90
  self.vertical_offset = vertical_offset
93
91
  self.framing_rule = framing_rule
94
92
  self.headroom = headroom
95
- self.lead_room = lead_room
96
93
  self.smoothing = smoothing
97
94
  self.max_speed = max_speed
98
95
  self.fallback = fallback
@@ -238,10 +235,15 @@ class FaceTrackingCrop(Transformation):
238
235
  current_position = (0.5, 0.5)
239
236
 
240
237
  framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
241
- print(
242
- "Face tracking crop: "
243
- f"{w}x{h} -> {out_w}x{out_h} "
244
- f"({self.target_aspect[0]}:{self.target_aspect[1]}, framing={framing_label})"
238
+ logger.info(
239
+ "Face tracking crop: %dx%d -> %dx%d (%d:%d, framing=%s)",
240
+ w,
241
+ h,
242
+ out_w,
243
+ out_h,
244
+ self.target_aspect[0],
245
+ self.target_aspect[1],
246
+ framing_label,
245
247
  )
246
248
 
247
249
  new_frames = []
@@ -448,7 +450,7 @@ class SplitScreenComposite(Transformation):
448
450
  for _ in range(len(cell_rects))
449
451
  ]
450
452
 
451
- print(f"Creating {self.layout} split screen: {out_w}x{out_h}")
453
+ logger.info("Creating %s split screen: %dx%d", self.layout, out_w, out_h)
452
454
 
453
455
  new_frames = []
454
456
  for i in tqdm(range(n_frames), desc="Split screen composite"):