videopython 0.27.1__tar.gz → 0.27.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.27.1 → videopython-0.27.2}/PKG-INFO +1 -1
  2. {videopython-0.27.1 → videopython-0.27.2}/pyproject.toml +1 -1
  3. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/dubber.py +17 -0
  4. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/pipeline.py +9 -0
  5. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/audio.py +32 -2
  6. {videopython-0.27.1 → videopython-0.27.2}/.gitignore +0 -0
  7. {videopython-0.27.1 → videopython-0.27.2}/LICENSE +0 -0
  8. {videopython-0.27.1 → videopython-0.27.2}/README.md +0 -0
  9. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/__init__.py +0 -0
  10. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/__init__.py +0 -0
  11. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/_device.py +0 -0
  12. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/__init__.py +0 -0
  13. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/models.py +0 -0
  14. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/remux.py +0 -0
  15. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/timing.py +0 -0
  16. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/__init__.py +0 -0
  17. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/audio.py +0 -0
  18. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/image.py +0 -0
  19. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/translation.py +0 -0
  20. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/image.py +0 -0
  30. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/separation.py +0 -0
  31. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/temporal.py +0 -0
  32. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/video_analysis.py +0 -0
  33. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/description.py +0 -0
  39. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/video.py +0 -0
  52. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.27.1 → videopython-0.27.2}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.27.1
3
+ Version: 0.27.2
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.27.1"
3
+ version = "0.27.2"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -29,6 +29,14 @@ class VideoDubber:
29
29
  give better accuracy at the cost of VRAM and latency. One of
30
30
  ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
31
31
  Default ``turbo``.
32
+ condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
33
+ ``False`` (Whisper's own default is ``True``). With conditioning on,
34
+ a single hallucinated filler phrase cascades through the rest of
35
+ the file. See ``AudioToText`` for the full rationale.
36
+ no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
37
+ gate; raise to drop more low-confidence windows.
38
+ logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
39
+ log-probability gate.
32
40
  """
33
41
 
34
42
  def __init__(
@@ -36,10 +44,16 @@ class VideoDubber:
36
44
  device: str | None = None,
37
45
  low_memory: bool = False,
38
46
  whisper_model: WhisperModel = "turbo",
47
+ condition_on_previous_text: bool = False,
48
+ no_speech_threshold: float = 0.6,
49
+ logprob_threshold: float | None = -1.0,
39
50
  ):
40
51
  self.device = device
41
52
  self.low_memory = low_memory
42
53
  self.whisper_model = whisper_model
54
+ self.condition_on_previous_text = condition_on_previous_text
55
+ self.no_speech_threshold = no_speech_threshold
56
+ self.logprob_threshold = logprob_threshold
43
57
  self._local_pipeline: Any = None
44
58
  requested = device.lower() if isinstance(device, str) else "auto"
45
59
  logger.info(
@@ -56,6 +70,9 @@ class VideoDubber:
56
70
  device=self.device,
57
71
  low_memory=self.low_memory,
58
72
  whisper_model=self.whisper_model,
73
+ condition_on_previous_text=self.condition_on_previous_text,
74
+ no_speech_threshold=self.no_speech_threshold,
75
+ logprob_threshold=self.logprob_threshold,
59
76
  )
60
77
 
61
78
  def dub(
@@ -61,10 +61,16 @@ class LocalDubbingPipeline:
61
61
  device: str | None = None,
62
62
  low_memory: bool = False,
63
63
  whisper_model: WhisperModel = "turbo",
64
+ condition_on_previous_text: bool = False,
65
+ no_speech_threshold: float = 0.6,
66
+ logprob_threshold: float | None = -1.0,
64
67
  ):
65
68
  self.device = device
66
69
  self.low_memory = low_memory
67
70
  self.whisper_model = whisper_model
71
+ self.condition_on_previous_text = condition_on_previous_text
72
+ self.no_speech_threshold = no_speech_threshold
73
+ self.logprob_threshold = logprob_threshold
68
74
  requested = device.lower() if isinstance(device, str) else "auto"
69
75
  logger.info(
70
76
  "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
@@ -106,6 +112,9 @@ class LocalDubbingPipeline:
106
112
  model_name=self.whisper_model,
107
113
  device=self.device,
108
114
  enable_diarization=enable_diarization,
115
+ condition_on_previous_text=self.condition_on_previous_text,
116
+ no_speech_threshold=self.no_speech_threshold,
117
+ logprob_threshold=self.logprob_threshold,
109
118
  )
110
119
 
111
120
  def _init_translator(self) -> None:
@@ -20,6 +20,20 @@ class AudioToText:
20
20
  voiced regions only — fixes Whisper's tendency to lock onto the wrong
21
21
  language when the file opens with silence, music, or non-vocal credits.
22
22
  Disable with ``enable_vad=False`` to reproduce pre-0.27 behaviour.
23
+
24
+ Three Whisper decoder kwargs are surfaced for anti-hallucination tuning:
25
+
26
+ - ``condition_on_previous_text`` defaults to ``False`` (Whisper's own
27
+ default is ``True``). With conditioning on, a single hallucinated filler
28
+ phrase cascades through the rest of the file because each window's
29
+ decoder is primed by the previous window's decoded text. Turning it off
30
+ is the most commonly recommended fix for that failure mode; the cost on
31
+ clean audio is small (slightly less context for ambiguous homophones
32
+ across sentence boundaries).
33
+ - ``no_speech_threshold`` and ``logprob_threshold`` are forwarded with
34
+ Whisper's documented defaults (``0.6`` and ``-1.0``); raising
35
+ ``no_speech_threshold`` biases toward dropping low-confidence windows
36
+ instead of emitting filler.
23
37
  """
24
38
 
25
39
  PYANNOTE_DIARIZATION_MODEL = "pyannote/speaker-diarization-community-1"
@@ -29,11 +43,17 @@ class AudioToText:
29
43
  model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "turbo",
30
44
  enable_diarization: bool = False,
31
45
  enable_vad: bool = True,
46
+ condition_on_previous_text: bool = False,
47
+ no_speech_threshold: float = 0.6,
48
+ logprob_threshold: float | None = -1.0,
32
49
  device: str | None = None,
33
50
  ):
34
51
  self.model_name = model_name
35
52
  self.enable_diarization = enable_diarization
36
53
  self.enable_vad = enable_vad
54
+ self.condition_on_previous_text = condition_on_previous_text
55
+ self.no_speech_threshold = no_speech_threshold
56
+ self.logprob_threshold = logprob_threshold
37
57
  self.device = select_device(device, mps_allowed=False)
38
58
  log_device_initialization(
39
59
  "AudioToText",
@@ -44,6 +64,16 @@ class AudioToText:
44
64
  self._diarization_pipeline: Any = None
45
65
  self._vad_model: Any = None
46
66
 
67
+ def _transcribe_kwargs(self, language: str | None) -> dict[str, Any]:
68
+ """Kwargs threaded into ``whisper.Whisper.transcribe`` from both call sites."""
69
+ return {
70
+ "word_timestamps": True,
71
+ "language": language,
72
+ "condition_on_previous_text": self.condition_on_previous_text,
73
+ "no_speech_threshold": self.no_speech_threshold,
74
+ "logprob_threshold": self.logprob_threshold,
75
+ }
76
+
47
77
  def _init_local(self) -> None:
48
78
  """Initialize local Whisper model."""
49
79
  import whisper
@@ -253,7 +283,7 @@ class AudioToText:
253
283
  self._init_diarization()
254
284
 
255
285
  audio_data = audio_mono.data
256
- transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)
286
+ transcription_result = self._model.transcribe(audio=audio_data, **self._transcribe_kwargs(language))
257
287
 
258
288
  waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
259
289
  diarization_result = self._diarization_pipeline(
@@ -300,7 +330,7 @@ class AudioToText:
300
330
  if self.enable_diarization:
301
331
  return self._transcribe_with_diarization(audio_mono, language)
302
332
 
303
- transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
333
+ transcription_result = self._model.transcribe(audio=audio_mono.data, **self._transcribe_kwargs(language))
304
334
  return self._process_transcription_result(transcription_result)
305
335
 
306
336
  def transcribe(self, media: Audio | Video) -> Transcription:
File without changes
File without changes
File without changes