videopython 0.29.1__tar.gz → 0.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. {videopython-0.29.1 → videopython-0.30.0}/PKG-INFO +2 -3
  2. {videopython-0.29.1 → videopython-0.30.0}/README.md +1 -2
  3. {videopython-0.29.1 → videopython-0.30.0}/pyproject.toml +1 -1
  4. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/__init__.py +0 -3
  5. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/__init__.py +0 -3
  6. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/dubber.py +1 -12
  7. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/models.py +1 -2
  8. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/pipeline.py +16 -148
  9. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/transforms.py +10 -8
  10. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/temporal.py +1 -17
  11. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/video_analysis.py +12 -12
  12. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/__init__.py +0 -5
  13. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/effects.py +17 -14
  14. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/registry.py +0 -9
  15. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/streaming.py +7 -4
  16. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/text/overlay.py +6 -3
  17. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/transforms.py +15 -12
  18. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/video.py +2 -2
  19. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/editing/__init__.py +0 -2
  20. videopython-0.29.1/src/videopython/ai/dubbing/cache.py +0 -325
  21. videopython-0.29.1/src/videopython/ai/swapping/__init__.py +0 -46
  22. videopython-0.29.1/src/videopython/ai/swapping/inpainter.py +0 -264
  23. videopython-0.29.1/src/videopython/ai/swapping/models.py +0 -221
  24. videopython-0.29.1/src/videopython/ai/swapping/segmenter.py +0 -577
  25. videopython-0.29.1/src/videopython/ai/swapping/swapper.py +0 -524
  26. videopython-0.29.1/src/videopython/base/combine.py +0 -61
  27. videopython-0.29.1/src/videopython/base/progress.py +0 -63
  28. videopython-0.29.1/src/videopython/base/utils.py +0 -6
  29. videopython-0.29.1/src/videopython/editing/premiere_xml.py +0 -313
  30. {videopython-0.29.1 → videopython-0.30.0}/.gitignore +0 -0
  31. {videopython-0.29.1 → videopython-0.30.0}/LICENSE +0 -0
  32. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/__init__.py +0 -0
  33. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/_device.py +0 -0
  34. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/quality.py +0 -0
  35. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/remux.py +0 -0
  36. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/dubbing/timing.py +0 -0
  37. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/__init__.py +0 -0
  38. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/audio.py +0 -0
  39. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/image.py +0 -0
  40. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/qwen3.py +0 -0
  41. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/translation.py +0 -0
  42. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/generation/video.py +0 -0
  43. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/registry.py +0 -0
  44. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/__init__.py +0 -0
  45. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/audio.py +0 -0
  46. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/faces.py +0 -0
  47. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/image.py +0 -0
  48. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/ai/understanding/separation.py +0 -0
  49. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/audio/__init__.py +0 -0
  50. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/audio/analysis.py +0 -0
  51. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/audio/audio.py +0 -0
  52. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/description.py +0 -0
  53. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/exceptions.py +0 -0
  54. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/scene.py +0 -0
  55. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/text/__init__.py +0 -0
  56. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/text/transcription.py +0 -0
  57. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/base/transitions.py +0 -0
  58. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/editing/multicam.py +0 -0
  59. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/editing/video_edit.py +0 -0
  60. {videopython-0.29.1 → videopython-0.30.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.29.1
3
+ Version: 0.30.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -201,9 +201,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
201
201
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
202
202
  | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
203
203
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
204
- | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
205
204
 
206
- API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/) | [Object Swapping](https://videopython.com/api/ai/swapping/)
205
+ API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
207
206
 
208
207
  ## Examples
209
208
 
@@ -152,9 +152,8 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
152
152
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
153
153
  | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
154
154
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
155
- | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
156
155
 
157
- API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/) | [Object Swapping](https://videopython.com/api/ai/swapping/)
156
+ API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
158
157
 
159
158
  ## Examples
160
159
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.29.1"
3
+ version = "0.30.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -1,7 +1,6 @@
1
1
  from videopython.ai import registry as _ai_registry # noqa: F401
2
2
 
3
3
  from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
4
- from .swapping import ObjectSwapper
5
4
  from .transforms import FaceTrackingCrop, SplitScreenComposite
6
5
  from .understanding import (
7
6
  AudioClassifier,
@@ -28,8 +27,6 @@ __all__ = [
28
27
  # Transforms (AI-powered)
29
28
  "FaceTrackingCrop",
30
29
  "SplitScreenComposite",
31
- # Swapping
32
- "ObjectSwapper",
33
30
  # Video analysis
34
31
  "VideoAnalysis",
35
32
  "VideoAnalysisConfig",
@@ -1,6 +1,5 @@
1
1
  """Local video dubbing functionality."""
2
2
 
3
- from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
4
3
  from videopython.ai.dubbing.dubber import VideoDubber
5
4
  from videopython.ai.dubbing.models import (
6
5
  DubbingResult,
@@ -26,7 +25,5 @@ __all__ = [
26
25
  "TranscriptQuality",
27
26
  "assess_transcript",
28
27
  "UnsupportedLanguageError",
29
- "DubCache",
30
- "dub_cache_clear",
31
28
  "Expressiveness",
32
29
  ]
@@ -55,13 +55,6 @@ class VideoDubber:
55
55
  See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
56
56
  for tradeoffs (Qwen3 is slower on CPU but produces
57
57
  context-aware, length-budgeted output).
58
- cache_dir: When set, persist transcription, translated segments,
59
- and per-segment TTS WAVs under this directory and skip stages
60
- whose inputs already match a cache entry. Use to resume crashed
61
- long runs or to iterate on dub configuration without paying
62
- transcription cost each time. ``None`` (default) disables
63
- caching. Cache grows unbounded; clear via
64
- :func:`videopython.ai.dubbing.cache.dub_cache_clear`.
65
58
  """
66
59
 
67
60
  def __init__(
@@ -75,7 +68,6 @@ class VideoDubber:
75
68
  vocabulary: list[str] | None = None,
76
69
  strict_quality: bool = False,
77
70
  translator: TranslatorChoice = "auto",
78
- cache_dir: str | Path | None = None,
79
71
  ):
80
72
  self.device = device
81
73
  self.low_memory = low_memory
@@ -86,16 +78,14 @@ class VideoDubber:
86
78
  self.vocabulary = vocabulary
87
79
  self.strict_quality = strict_quality
88
80
  self.translator = translator
89
- self.cache_dir = cache_dir
90
81
  self._local_pipeline: Any = None
91
82
  requested = device.lower() if isinstance(device, str) else "auto"
92
83
  logger.info(
93
- "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
84
+ "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
94
85
  requested,
95
86
  low_memory,
96
87
  whisper_model,
97
88
  translator,
98
- cache_dir,
99
89
  )
100
90
 
101
91
  def _init_local_pipeline(self) -> None:
@@ -111,7 +101,6 @@ class VideoDubber:
111
101
  vocabulary=self.vocabulary,
112
102
  strict_quality=self.strict_quality,
113
103
  translator=self.translator,
114
- cache_dir=self.cache_dir,
115
104
  )
116
105
 
117
106
  def dub(
@@ -41,8 +41,7 @@ class Expressiveness:
41
41
  def as_kwargs(self) -> dict[str, float]:
42
42
  """Knobs as a dict, dropping ``None`` entries.
43
43
 
44
- Suitable for ``**``-expansion into Chatterbox or
45
- :meth:`DubCache.tts_key`.
44
+ Suitable for ``**``-expansion into Chatterbox.
46
45
  """
47
46
  return {
48
47
  name: value
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
10
10
  import numpy as np
11
11
 
12
12
  from videopython.ai._device import select_device
13
- from videopython.ai.dubbing.cache import DubCache
14
13
  from videopython.ai.dubbing.models import DubbingResult, Expressiveness, RevoiceResult, SeparatedAudio, TimingSummary
15
14
  from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
16
15
  from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -173,7 +172,6 @@ class LocalDubbingPipeline:
173
172
  vocabulary: list[str] | None = None,
174
173
  strict_quality: bool = False,
175
174
  translator: TranslatorChoice = "auto",
176
- cache_dir: str | Path | None = None,
177
175
  ):
178
176
  self.device = device
179
177
  self.low_memory = low_memory
@@ -184,15 +182,13 @@ class LocalDubbingPipeline:
184
182
  self.vocabulary = vocabulary
185
183
  self.strict_quality = strict_quality
186
184
  self.translator = translator
187
- self.cache_dir = Path(cache_dir) if cache_dir is not None else None
188
185
  requested = device.lower() if isinstance(device, str) else "auto"
189
186
  logger.info(
190
- "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s cache_dir=%s",
187
+ "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
191
188
  requested,
192
189
  low_memory,
193
190
  whisper_model,
194
191
  translator,
195
- self.cache_dir,
196
192
  )
197
193
 
198
194
  self._transcriber: Any = None
@@ -202,7 +198,6 @@ class LocalDubbingPipeline:
202
198
  self._tts_language: str | None = None
203
199
  self._separator: Any = None
204
200
  self._synchronizer: TimingSynchronizer | None = None
205
- self._cache: DubCache | None = DubCache(self.cache_dir) if self.cache_dir is not None else None
206
201
 
207
202
  def _maybe_unload(self, component_name: str) -> None:
208
203
  """Unload a stage's model when low_memory mode is enabled.
@@ -221,89 +216,42 @@ class LocalDubbingPipeline:
221
216
  logger.info("low_memory: unloading %s", component_name.lstrip("_"))
222
217
  unload()
223
218
 
224
- def _transcribe_with_cache(
219
+ def _transcribe(
225
220
  self,
226
221
  source_audio: Audio,
227
222
  enable_diarization: bool,
228
223
  ) -> Transcription:
229
- """Run transcription with cache-around-the-call.
230
-
231
- Cache miss: lazy-init the transcriber, transcribe, store the
232
- result (including all hashed kwargs in metadata.json so future
233
- invalidators have provenance).
234
- Cache hit: return the deserialized :class:`Transcription` without
235
- touching Whisper/diarization at all.
236
- """
237
- src_hash, kwargs_hash = self._transcription_cache_keys(source_audio, enable_diarization)
238
- if self._cache is not None:
239
- cached = self._cache.get_transcription(src_hash, kwargs_hash)
240
- if cached is not None:
241
- return cached
242
-
224
+ """Lazy-init the transcriber and run it on ``source_audio``."""
243
225
  if self._transcriber is None or self._transcriber_diarization != enable_diarization:
244
226
  self._init_transcriber(enable_diarization=enable_diarization)
245
227
  self._transcriber_diarization = enable_diarization
246
228
 
247
229
  transcription = self._transcriber.transcribe(source_audio)
248
230
  self._maybe_unload("_transcriber")
249
-
250
- if self._cache is not None:
251
- self._cache.put_transcription(
252
- src_hash,
253
- kwargs_hash,
254
- transcription,
255
- hash_inputs={
256
- "whisper_model": self.whisper_model,
257
- "enable_diarization": enable_diarization,
258
- "condition_on_previous_text": self.condition_on_previous_text,
259
- "no_speech_threshold": self.no_speech_threshold,
260
- "logprob_threshold": self.logprob_threshold,
261
- "vocabulary": self.vocabulary,
262
- },
263
- )
264
231
  return transcription
265
232
 
266
233
  def _tts_segment_audio(
267
234
  self,
268
235
  segment: TranslatedSegment,
269
236
  speaker: str,
270
- speaker_bytes: bytes | None,
271
237
  target_lang: str,
272
238
  voice_clone: bool,
273
239
  voice_samples: dict[str, Audio],
274
240
  speaker_wav_paths: dict[str, Path],
275
- src_hash_for_tts: str,
276
241
  expressiveness: Expressiveness = Expressiveness(),
277
242
  ) -> Audio | None:
278
- """Produce the TTS audio for a single segment, with cache-around-the-call.
243
+ """Produce the TTS audio for a single segment.
279
244
 
280
245
  Returns the synthesized :class:`Audio`, or ``None`` if Chatterbox
281
- crashed on the segment (the caller skips it). On cache miss the
282
- TTS model is lazy-initialized and the per-speaker temp WAV is
283
- materialized before generation; on cache hit none of that runs,
284
- so a fully-cached run never loads Chatterbox.
246
+ crashed on the segment (the caller skips it). The TTS model is
247
+ lazy-initialized and per-speaker temp WAVs are materialized once
248
+ across the loop.
285
249
 
286
250
  ``expressiveness`` carries the M4 Chatterbox knobs derived from
287
251
  the source segment's prosody. Default is the no-knobs profile —
288
252
  lets Chatterbox use its own defaults — so callers that don't yet
289
253
  derive prosody (e.g. ``revoice``) keep pre-M4 behaviour.
290
254
  """
291
- from videopython.base.audio import Audio as _Audio
292
-
293
- tts_cache_key: str | None = None
294
- if self._cache is not None:
295
- tts_cache_key = DubCache.tts_key(
296
- translated_text=segment.translated_text,
297
- voice_sample_bytes=speaker_bytes,
298
- language=target_lang,
299
- **expressiveness.as_kwargs(),
300
- )
301
- cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
302
- if cached_path is not None:
303
- return _Audio.from_path(cached_path)
304
-
305
- # Cache miss: pay for TTS init + voice-sample WAV exactly once
306
- # across the loop. Both are wasted work when every segment hits.
307
255
  if self._tts is None or self._tts_language != target_lang:
308
256
  self._init_tts(language=target_lang)
309
257
  self._tts_language = target_lang
@@ -314,7 +262,7 @@ class LocalDubbingPipeline:
314
262
 
315
263
  wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
316
264
  try:
317
- dubbed_audio = self._tts.generate_audio(
265
+ return self._tts.generate_audio(
318
266
  segment.translated_text,
319
267
  voice_sample_path=wav_path,
320
268
  **expressiveness.as_kwargs(),
@@ -332,39 +280,19 @@ class LocalDubbingPipeline:
332
280
  )
333
281
  return None
334
282
 
335
- if self._cache is not None and tts_cache_key is not None:
336
- dubbed_audio.save(self._cache.reserve_tts_path(src_hash_for_tts, tts_cache_key))
337
- return dubbed_audio
338
-
339
- def _translate_with_cache(
283
+ def _translate(
340
284
  self,
341
285
  transcription: Transcription,
342
- source_audio: Audio,
343
286
  source_lang: str,
344
287
  target_lang: str,
345
288
  report_progress: Callable[[str, float], None],
346
289
  ) -> tuple[list[TranslatedSegment], list[int]]:
347
- """Run translation with cache-around-the-call.
290
+ """Translate the transcription's segments into ``target_lang``.
348
291
 
349
- Returns ``(translated_segments, translation_failures)``. Only
350
- fully-successful translations are cached — partial Qwen failures
351
- would otherwise lock in an incomplete dub across runs. The
292
+ Returns ``(translated_segments, translation_failures)``. The
352
293
  progress callback maps the backend's [0, 1] fraction onto the
353
294
  pipeline's translation window (0.35 → 0.50).
354
295
  """
355
- from videopython.ai.dubbing.models import TranslatedSegment
356
-
357
- cache_key: str | None = None
358
- if self._cache is not None:
359
- cache_key = DubCache.translation_key(
360
- source_lang=source_lang,
361
- target_lang=target_lang,
362
- translator_class=self._resolved_translator_class_name(source_lang, target_lang),
363
- )
364
- cached = self._cache.get_translation(DubCache.source_key(source_audio), cache_key)
365
- if cached is not None:
366
- return [TranslatedSegment.from_dict(d) for d in cached], []
367
-
368
296
  if self._translator is None:
369
297
  self._init_translator(source_lang=source_lang, target_lang=target_lang)
370
298
 
@@ -387,32 +315,8 @@ class LocalDubbingPipeline:
387
315
  translation_failures = list(self._translator.translation_failures)
388
316
  self._maybe_unload("_translator")
389
317
 
390
- if self._cache is not None and cache_key is not None and not translation_failures:
391
- self._cache.put_translation(
392
- DubCache.source_key(source_audio),
393
- cache_key,
394
- [s.to_dict() for s in translated_segments],
395
- )
396
-
397
318
  return translated_segments, translation_failures
398
319
 
399
- def _transcription_cache_keys(self, source_audio: Audio, enable_diarization: bool = False) -> tuple[str, str]:
400
- """Return ``(src_hash, kwargs_hash)`` for the current transcription config.
401
-
402
- Centralizes the kwarg list so the cache lookup, the put, and any
403
- future invalidator agree on what's hashed.
404
- """
405
- src_hash = DubCache.source_key(source_audio)
406
- kwargs_hash = DubCache.transcription_kwargs_hash(
407
- whisper_model=self.whisper_model,
408
- enable_diarization=enable_diarization,
409
- condition_on_previous_text=self.condition_on_previous_text,
410
- no_speech_threshold=self.no_speech_threshold,
411
- logprob_threshold=self.logprob_threshold,
412
- vocabulary=self.vocabulary,
413
- )
414
- return src_hash, kwargs_hash
415
-
416
320
  def _init_transcriber(self, enable_diarization: bool = False) -> None:
417
321
  """Initialize the transcription model."""
418
322
  from videopython.ai.understanding.audio import AudioToText
@@ -444,31 +348,6 @@ class LocalDubbingPipeline:
444
348
  else: # "auto"
445
349
  self._translator = self._resolve_translator_auto(source_lang, target_lang)
446
350
 
447
- def _resolved_translator_class_name(self, source_lang: str, target_lang: str) -> str:
448
- """Return the *class name* of the translator that ``_init_translator``
449
- would pick — without constructing one.
450
-
451
- Used by the cache to key translations on the resolved backend rather
452
- than the user-supplied ``"auto"``: a CPU run that resolves to Marian
453
- must not collide with a GPU run that resolves to Qwen.
454
- """
455
- if self.translator == "marian":
456
- return "MarianTranslator"
457
- if self.translator == "qwen3":
458
- return "Qwen3Translator"
459
- # auto — mirror _resolve_translator_auto's branching, no construction.
460
- device = select_device(self.device, mps_allowed=True)
461
- has_gpu = device in ("cuda", "mps")
462
- if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
463
- return "Qwen3Translator"
464
- if MarianTranslator.has_model_for(source_lang, target_lang):
465
- return "MarianTranslator"
466
- if Qwen3Translator.supports(source_lang, target_lang):
467
- return "Qwen3Translator"
468
- # No backend supports the pair — _init_translator will raise. We
469
- # return a sentinel; the cache miss path will pay that cost.
470
- return "Unsupported"
471
-
472
351
  def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
473
352
  """Pick a backend based on language coverage AND device.
474
353
 
@@ -728,7 +607,7 @@ class LocalDubbingPipeline:
728
607
  )
729
608
  else:
730
609
  report_progress("Transcribing audio", 0.05)
731
- transcription = self._transcribe_with_cache(source_audio, enable_diarization)
610
+ transcription = self._transcribe(source_audio, enable_diarization)
732
611
 
733
612
  if not transcription.segments:
734
613
  return DubbingResult(
@@ -796,8 +675,8 @@ class LocalDubbingPipeline:
796
675
  voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
797
676
 
798
677
  report_progress("Translating text", 0.35)
799
- translated_segments, translation_failures = self._translate_with_cache(
800
- transcription, source_audio, detected_lang, target_lang, report_progress
678
+ translated_segments, translation_failures = self._translate(
679
+ transcription, detected_lang, target_lang, report_progress
801
680
  )
802
681
 
803
682
  # Per-segment expressiveness derived from source vocals RMS.
@@ -823,21 +702,12 @@ class LocalDubbingPipeline:
823
702
 
824
703
  report_progress("Generating dubbed speech", 0.50)
825
704
 
826
- # Per-speaker voice-sample bytes for TTS cache key. Empty when
827
- # voice_clone=False — the cache key still differentiates "no voice
828
- # sample" from "specific clone" via the None path.
829
- voice_sample_bytes: dict[str, bytes] = (
830
- {speaker: sample.data.tobytes() for speaker, sample in voice_samples.items()} if voice_clone else {}
831
- )
832
- src_hash_for_tts = DubCache.source_key(source_audio) if self._cache is not None else ""
833
-
834
705
  dubbed_segments: list[Audio] = []
835
706
  target_durations: list[float] = []
836
707
  start_times: list[float] = []
837
708
 
838
- # Per-speaker temp WAVs are materialized lazily by _tts_segment_audio
839
- # so a fully-cached run never writes one. The dict is loop-scoped
840
- # state so the finally block can clean up regardless of cache outcome.
709
+ # Per-speaker temp WAVs are materialized lazily by _tts_segment_audio.
710
+ # The dict is loop-scoped state so the finally block can clean up.
841
711
  speaker_wav_paths: dict[str, Path] = {}
842
712
  try:
843
713
  for i, segment in enumerate(translated_segments):
@@ -857,12 +727,10 @@ class LocalDubbingPipeline:
857
727
  dubbed_audio = self._tts_segment_audio(
858
728
  segment=segment,
859
729
  speaker=speaker,
860
- speaker_bytes=voice_sample_bytes.get(speaker),
861
730
  target_lang=target_lang,
862
731
  voice_clone=voice_clone,
863
732
  voice_samples=voice_samples,
864
733
  speaker_wav_paths=speaker_wav_paths,
865
- src_hash_for_tts=src_hash_for_tts,
866
734
  expressiveness=expressiveness_per_segment[i],
867
735
  )
868
736
  if dubbed_audio is None:
@@ -54,7 +54,6 @@ class FaceTrackingCrop(Transformation):
54
54
  vertical_offset: float = -0.1,
55
55
  framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = "offset",
56
56
  headroom: float = 0.15,
57
- lead_room: float = 0.1,
58
57
  smoothing: float = 0.8,
59
58
  max_speed: float | None = None,
60
59
  fallback: Literal["center", "last_position", "full_frame"] = "last_position",
@@ -77,7 +76,6 @@ class FaceTrackingCrop(Transformation):
77
76
  - "thirds": Place face near the upper-third line.
78
77
  - "dynamic": Currently same as "headroom".
79
78
  headroom: Headroom amount for framing rules that use it.
80
- lead_room: Reserved for future motion/look-direction framing.
81
79
  smoothing: Position smoothing factor (0-1, higher = smoother).
82
80
  max_speed: Optional max camera movement per frame (normalized).
83
81
  fallback: Behavior when no face detected.
@@ -92,7 +90,6 @@ class FaceTrackingCrop(Transformation):
92
90
  self.vertical_offset = vertical_offset
93
91
  self.framing_rule = framing_rule
94
92
  self.headroom = headroom
95
- self.lead_room = lead_room
96
93
  self.smoothing = smoothing
97
94
  self.max_speed = max_speed
98
95
  self.fallback = fallback
@@ -238,10 +235,15 @@ class FaceTrackingCrop(Transformation):
238
235
  current_position = (0.5, 0.5)
239
236
 
240
237
  framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
241
- print(
242
- "Face tracking crop: "
243
- f"{w}x{h} -> {out_w}x{out_h} "
244
- f"({self.target_aspect[0]}:{self.target_aspect[1]}, framing={framing_label})"
238
+ logger.info(
239
+ "Face tracking crop: %dx%d -> %dx%d (%d:%d, framing=%s)",
240
+ w,
241
+ h,
242
+ out_w,
243
+ out_h,
244
+ self.target_aspect[0],
245
+ self.target_aspect[1],
246
+ framing_label,
245
247
  )
246
248
 
247
249
  new_frames = []
@@ -448,7 +450,7 @@ class SplitScreenComposite(Transformation):
448
450
  for _ in range(len(cell_rects))
449
451
  ]
450
452
 
451
- print(f"Creating {self.layout} split screen: {out_w}x{out_h}")
453
+ logger.info("Creating %s split screen: %dx%d", self.layout, out_w, out_h)
452
454
 
453
455
  new_frames = []
454
456
  for i in tqdm(range(n_frames), desc="Split screen composite"):
@@ -102,12 +102,7 @@ class SemanticSceneDetector:
102
102
  video.save(tmp.name)
103
103
  return self.detect_streaming(tmp.name)
104
104
 
105
- def detect_streaming(
106
- self,
107
- path: str | Path,
108
- start_second: float | None = None,
109
- end_second: float | None = None,
110
- ) -> list[SceneBoundary]:
105
+ def detect_streaming(self, path: str | Path) -> list[SceneBoundary]:
111
106
  """Detect scenes from a video file.
112
107
 
113
108
  Uses TransNetV2 with pretrained weights for accurate shot boundary
@@ -115,21 +110,10 @@ class SemanticSceneDetector:
115
110
 
116
111
  Args:
117
112
  path: Path to video file.
118
- start_second: Optional start time for analysis (not yet supported).
119
- end_second: Optional end time for analysis (not yet supported).
120
113
 
121
114
  Returns:
122
115
  List of SceneBoundary objects representing detected scenes.
123
116
  """
124
- if start_second is not None or end_second is not None:
125
- import warnings
126
-
127
- warnings.warn(
128
- "start_second and end_second are not yet supported by SemanticSceneDetector. Processing entire video.",
129
- UserWarning,
130
- stacklevel=2,
131
- )
132
-
133
117
  self._load_model()
134
118
 
135
119
  # Use TransNetV2's detect_scenes which handles everything internally
@@ -559,7 +559,7 @@ class VideoAnalyzer:
559
559
  return AudioToText(**self.config.get_params(AUDIO_TO_TEXT)).transcribe(
560
560
  Audio.from_path(source_path) if source_path is not None else _require_video(video)
561
561
  )
562
- except Exception:
562
+ except (ImportError, OSError, RuntimeError, ValueError):
563
563
  logger.warning("AudioToText failed, skipping transcription", exc_info=True)
564
564
  return None
565
565
 
@@ -571,7 +571,7 @@ class VideoAnalyzer:
571
571
  if source_path is not None
572
572
  else scene_detector.detect(_require_video(video))
573
573
  )
574
- except Exception:
574
+ except (ImportError, OSError, RuntimeError, ValueError):
575
575
  logger.warning("SemanticSceneDetector failed, using default scene boundaries", exc_info=True)
576
576
  return None
577
577
 
@@ -644,7 +644,7 @@ class VideoAnalyzer:
644
644
  else:
645
645
  try:
646
646
  scene_vlm = SceneVLM(**self.config.get_params(SCENE_VLM)) if SCENE_VLM in enabled else None
647
- except Exception:
647
+ except (ImportError, OSError, RuntimeError, ValueError):
648
648
  logger.warning("Failed to initialize SceneVLM, skipping visual understanding", exc_info=True)
649
649
  scene_vlm = None
650
650
 
@@ -652,7 +652,7 @@ class VideoAnalyzer:
652
652
  audio_classifier = (
653
653
  AudioClassifier(**self.config.get_params(AUDIO_CLASSIFIER)) if AUDIO_CLASSIFIER in enabled else None
654
654
  )
655
- except Exception:
655
+ except (ImportError, OSError, RuntimeError, ValueError):
656
656
  logger.warning("Failed to initialize AudioClassifier, skipping audio classification", exc_info=True)
657
657
  audio_classifier = None
658
658
 
@@ -660,7 +660,7 @@ class VideoAnalyzer:
660
660
  if FACE_TRACKER in enabled:
661
661
  try:
662
662
  face_tracker = FaceTracker(**self.config.get_params(FACE_TRACKER))
663
- except Exception:
663
+ except (ImportError, OSError, RuntimeError, ValueError):
664
664
  logger.warning("Failed to initialize FaceTracker, skipping face tracks", exc_info=True)
665
665
  face_tracker = None
666
666
 
@@ -668,7 +668,7 @@ class VideoAnalyzer:
668
668
  if audio_classifier is not None and source_path is not None:
669
669
  try:
670
670
  path_audio = Audio.from_path(source_path)
671
- except Exception:
671
+ except (OSError, RuntimeError, ValueError):
672
672
  logger.warning(
673
673
  "Failed to load audio from path, audio classification will use clip fallback", exc_info=True
674
674
  )
@@ -686,7 +686,7 @@ class VideoAnalyzer:
686
686
  metadata=metadata,
687
687
  scenes=scenes,
688
688
  )
689
- except Exception:
689
+ except (IndexError, OSError, RuntimeError, ValueError):
690
690
  logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
691
691
 
692
692
  samples: list[SceneAnalysisSample] = []
@@ -714,7 +714,7 @@ class VideoAnalyzer:
714
714
  start_second=scene.start,
715
715
  end_second=scene.end,
716
716
  )
717
- except Exception:
717
+ except (OSError, RuntimeError, ValueError):
718
718
  scene_clip = None
719
719
  sample.audio_classification = self._run_scene_audio_classification(
720
720
  audio_classifier=audio_classifier,
@@ -723,7 +723,7 @@ class VideoAnalyzer:
723
723
  scene_start=scene.start,
724
724
  scene_end=scene.end,
725
725
  )
726
- except Exception:
726
+ except (OSError, RuntimeError, ValueError):
727
727
  logger.warning(
728
728
  "AudioClassifier failed for scene %d (%.1f-%.1fs)",
729
729
  index,
@@ -741,7 +741,7 @@ class VideoAnalyzer:
741
741
  metadata=metadata,
742
742
  scene=scene,
743
743
  )
744
- except Exception:
744
+ except (IndexError, OSError, RuntimeError, ValueError):
745
745
  logger.warning(
746
746
  "FaceTracker failed for scene %d (%.1f-%.1fs)",
747
747
  index,
@@ -867,7 +867,7 @@ class VideoAnalyzer:
867
867
  description: SceneDescription | None = None
868
868
  try:
869
869
  description = scene_vlm.analyze_scene(deduped)
870
- except Exception:
870
+ except (IndexError, OSError, RuntimeError, ValueError):
871
871
  logger.warning(
872
872
  "SceneVLM failed for scenes %d-%d (%.1f-%.1fs)",
873
873
  group[0],
@@ -1044,7 +1044,7 @@ class VideoAnalyzer:
1044
1044
  try:
1045
1045
  result = subprocess.run(cmd, capture_output=True, text=True, check=True)
1046
1046
  payload = json.loads(result.stdout)
1047
- except Exception:
1047
+ except (subprocess.CalledProcessError, json.JSONDecodeError, OSError):
1048
1048
  return {}
1049
1049
 
1050
1050
  tags: dict[str, str] = {}
@@ -37,7 +37,6 @@ from .exceptions import (
37
37
  VideoMetadataError,
38
38
  VideoPythonError,
39
39
  )
40
- from .progress import configure, set_progress, set_verbose
41
40
  from .registry import (
42
41
  OperationCategory,
43
42
  OperationSpec,
@@ -157,8 +156,4 @@ __all__ = [
157
156
  "get_specs_by_tag",
158
157
  "register",
159
158
  "spec_from_class",
160
- # Configuration
161
- "configure",
162
- "set_verbose",
163
- "set_progress",
164
159
  ]