videopython 0.27.0__tar.gz → 0.27.2__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.27.0 → videopython-0.27.2}/PKG-INFO +1 -1
  2. {videopython-0.27.0 → videopython-0.27.2}/pyproject.toml +1 -1
  3. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/dubber.py +19 -2
  4. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/pipeline.py +10 -1
  5. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/audio.py +38 -4
  6. {videopython-0.27.0 → videopython-0.27.2}/.gitignore +0 -0
  7. {videopython-0.27.0 → videopython-0.27.2}/LICENSE +0 -0
  8. {videopython-0.27.0 → videopython-0.27.2}/README.md +0 -0
  9. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/__init__.py +0 -0
  10. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/__init__.py +0 -0
  11. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/_device.py +0 -0
  12. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/__init__.py +0 -0
  13. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/models.py +0 -0
  14. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/remux.py +0 -0
  15. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/timing.py +0 -0
  16. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/__init__.py +0 -0
  17. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/audio.py +0 -0
  18. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/image.py +0 -0
  19. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/translation.py +0 -0
  20. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/image.py +0 -0
  30. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/separation.py +0 -0
  31. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/temporal.py +0 -0
  32. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/video_analysis.py +0 -0
  33. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/description.py +0 -0
  39. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/video.py +0 -0
  52. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.27.0 → videopython-0.27.2}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.27.0
3
+ Version: 0.27.2
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.27.0"
3
+ version = "0.27.2"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -28,18 +28,32 @@ class VideoDubber:
28
28
  whisper_model: Whisper model size used for transcription. Larger models
29
29
  give better accuracy at the cost of VRAM and latency. One of
30
30
  ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
31
- Default ``small``.
31
+ Default ``turbo``.
32
+ condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
33
+ ``False`` (Whisper's own default is ``True``). With conditioning on,
34
+ a single hallucinated filler phrase cascades through the rest of
35
+ the file. See ``AudioToText`` for the full rationale.
36
+ no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
37
+ gate; raise to drop more low-confidence windows.
38
+ logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
39
+ log-probability gate.
32
40
  """
33
41
 
34
42
  def __init__(
35
43
  self,
36
44
  device: str | None = None,
37
45
  low_memory: bool = False,
38
- whisper_model: WhisperModel = "small",
46
+ whisper_model: WhisperModel = "turbo",
47
+ condition_on_previous_text: bool = False,
48
+ no_speech_threshold: float = 0.6,
49
+ logprob_threshold: float | None = -1.0,
39
50
  ):
40
51
  self.device = device
41
52
  self.low_memory = low_memory
42
53
  self.whisper_model = whisper_model
54
+ self.condition_on_previous_text = condition_on_previous_text
55
+ self.no_speech_threshold = no_speech_threshold
56
+ self.logprob_threshold = logprob_threshold
43
57
  self._local_pipeline: Any = None
44
58
  requested = device.lower() if isinstance(device, str) else "auto"
45
59
  logger.info(
@@ -56,6 +70,9 @@ class VideoDubber:
56
70
  device=self.device,
57
71
  low_memory=self.low_memory,
58
72
  whisper_model=self.whisper_model,
73
+ condition_on_previous_text=self.condition_on_previous_text,
74
+ no_speech_threshold=self.no_speech_threshold,
75
+ logprob_threshold=self.logprob_threshold,
59
76
  )
60
77
 
61
78
  def dub(
@@ -60,11 +60,17 @@ class LocalDubbingPipeline:
60
60
  self,
61
61
  device: str | None = None,
62
62
  low_memory: bool = False,
63
- whisper_model: WhisperModel = "small",
63
+ whisper_model: WhisperModel = "turbo",
64
+ condition_on_previous_text: bool = False,
65
+ no_speech_threshold: float = 0.6,
66
+ logprob_threshold: float | None = -1.0,
64
67
  ):
65
68
  self.device = device
66
69
  self.low_memory = low_memory
67
70
  self.whisper_model = whisper_model
71
+ self.condition_on_previous_text = condition_on_previous_text
72
+ self.no_speech_threshold = no_speech_threshold
73
+ self.logprob_threshold = logprob_threshold
68
74
  requested = device.lower() if isinstance(device, str) else "auto"
69
75
  logger.info(
70
76
  "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
@@ -106,6 +112,9 @@ class LocalDubbingPipeline:
106
112
  model_name=self.whisper_model,
107
113
  device=self.device,
108
114
  enable_diarization=enable_diarization,
115
+ condition_on_previous_text=self.condition_on_previous_text,
116
+ no_speech_threshold=self.no_speech_threshold,
117
+ logprob_threshold=self.logprob_threshold,
109
118
  )
110
119
 
111
120
  def _init_translator(self) -> None:
@@ -15,21 +15,45 @@ class AudioToText:
15
15
  """Transcription service for audio and video using local Whisper models.
16
16
 
17
17
  Uses openai-whisper for transcription (with word-level timestamps) and
18
- pyannote-audio for optional speaker diarization.
18
+ pyannote-audio for optional speaker diarization. By default, Silero VAD
19
+ runs before Whisper to gate language detection on a 30s window built from
20
+ voiced regions only — fixes Whisper's tendency to lock onto the wrong
21
+ language when the file opens with silence, music, or non-vocal credits.
22
+ Disable with ``enable_vad=False`` to reproduce pre-0.27 behaviour.
23
+
24
+ Three Whisper decoder kwargs are surfaced for anti-hallucination tuning:
25
+
26
+ - ``condition_on_previous_text`` defaults to ``False`` (Whisper's own
27
+ default is ``True``). With conditioning on, a single hallucinated filler
28
+ phrase cascades through the rest of the file because each window's
29
+ decoder is primed by the previous window's decoded text. Turning it off
30
+ is the most commonly recommended fix for that failure mode; the cost on
31
+ clean audio is small (slightly less context for ambiguous homophones
32
+ across sentence boundaries).
33
+ - ``no_speech_threshold`` and ``logprob_threshold`` are forwarded with
34
+ Whisper's documented defaults (``0.6`` and ``-1.0``); raising
35
+ ``no_speech_threshold`` biases toward dropping low-confidence windows
36
+ instead of emitting filler.
19
37
  """
20
38
 
21
39
  PYANNOTE_DIARIZATION_MODEL = "pyannote/speaker-diarization-community-1"
22
40
 
23
41
  def __init__(
24
42
  self,
25
- model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small",
43
+ model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "turbo",
26
44
  enable_diarization: bool = False,
27
45
  enable_vad: bool = True,
46
+ condition_on_previous_text: bool = False,
47
+ no_speech_threshold: float = 0.6,
48
+ logprob_threshold: float | None = -1.0,
28
49
  device: str | None = None,
29
50
  ):
30
51
  self.model_name = model_name
31
52
  self.enable_diarization = enable_diarization
32
53
  self.enable_vad = enable_vad
54
+ self.condition_on_previous_text = condition_on_previous_text
55
+ self.no_speech_threshold = no_speech_threshold
56
+ self.logprob_threshold = logprob_threshold
33
57
  self.device = select_device(device, mps_allowed=False)
34
58
  log_device_initialization(
35
59
  "AudioToText",
@@ -40,6 +64,16 @@ class AudioToText:
40
64
  self._diarization_pipeline: Any = None
41
65
  self._vad_model: Any = None
42
66
 
67
+ def _transcribe_kwargs(self, language: str | None) -> dict[str, Any]:
68
+ """Kwargs threaded into ``whisper.Whisper.transcribe`` from both call sites."""
69
+ return {
70
+ "word_timestamps": True,
71
+ "language": language,
72
+ "condition_on_previous_text": self.condition_on_previous_text,
73
+ "no_speech_threshold": self.no_speech_threshold,
74
+ "logprob_threshold": self.logprob_threshold,
75
+ }
76
+
43
77
  def _init_local(self) -> None:
44
78
  """Initialize local Whisper model."""
45
79
  import whisper
@@ -249,7 +283,7 @@ class AudioToText:
249
283
  self._init_diarization()
250
284
 
251
285
  audio_data = audio_mono.data
252
- transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
286
+ transcription_result = self._model.transcribe(audio=audio_data, **self._transcribe_kwargs(language))
253
287
 
254
288
  waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
255
289
  diarization_result = self._diarization_pipeline(
@@ -296,7 +330,7 @@ class AudioToText:
296
330
  if self.enable_diarization:
297
331
  return self._transcribe_with_diarization(audio_mono, language)
298
332
 
299
- transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
333
+ transcription_result = self._model.transcribe(audio=audio_mono.data, **self._transcribe_kwargs(language))
300
334
  return self._process_transcription_result(transcription_result)
301
335
 
302
336
  def transcribe(self, media: Audio | Video) -> Transcription:
File without changes
File without changes
File without changes