videopython 0.27.1__tar.gz → 0.27.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.27.1 → videopython-0.27.2}/PKG-INFO +1 -1
- {videopython-0.27.1 → videopython-0.27.2}/pyproject.toml +1 -1
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/dubber.py +17 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/pipeline.py +9 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/audio.py +32 -2
- {videopython-0.27.1 → videopython-0.27.2}/.gitignore +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/LICENSE +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/README.md +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/_device.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/registry.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/combine.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/description.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/effects.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/progress.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/registry.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/scene.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/streaming.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/transforms.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/transitions.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/utils.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/base/video.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.27.1 → videopython-0.27.2}/src/videopython/py.typed +0 -0
|
@@ -29,6 +29,14 @@ class VideoDubber:
|
|
|
29
29
|
give better accuracy at the cost of VRAM and latency. One of
|
|
30
30
|
``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
|
|
31
31
|
Default ``turbo``.
|
|
32
|
+
condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
|
|
33
|
+
``False`` (Whisper's own default is ``True``). With conditioning on,
|
|
34
|
+
a single hallucinated filler phrase cascades through the rest of
|
|
35
|
+
the file. See ``AudioToText`` for the full rationale.
|
|
36
|
+
no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
|
|
37
|
+
gate; lower to drop more low-confidence windows.
|
|
38
|
+
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
39
|
+
log-probability gate.
|
|
32
40
|
"""
|
|
33
41
|
|
|
34
42
|
def __init__(
|
|
@@ -36,10 +44,16 @@ class VideoDubber:
|
|
|
36
44
|
device: str | None = None,
|
|
37
45
|
low_memory: bool = False,
|
|
38
46
|
whisper_model: WhisperModel = "turbo",
|
|
47
|
+
condition_on_previous_text: bool = False,
|
|
48
|
+
no_speech_threshold: float = 0.6,
|
|
49
|
+
logprob_threshold: float | None = -1.0,
|
|
39
50
|
):
|
|
40
51
|
self.device = device
|
|
41
52
|
self.low_memory = low_memory
|
|
42
53
|
self.whisper_model = whisper_model
|
|
54
|
+
self.condition_on_previous_text = condition_on_previous_text
|
|
55
|
+
self.no_speech_threshold = no_speech_threshold
|
|
56
|
+
self.logprob_threshold = logprob_threshold
|
|
43
57
|
self._local_pipeline: Any = None
|
|
44
58
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
45
59
|
logger.info(
|
|
@@ -56,6 +70,9 @@ class VideoDubber:
|
|
|
56
70
|
device=self.device,
|
|
57
71
|
low_memory=self.low_memory,
|
|
58
72
|
whisper_model=self.whisper_model,
|
|
73
|
+
condition_on_previous_text=self.condition_on_previous_text,
|
|
74
|
+
no_speech_threshold=self.no_speech_threshold,
|
|
75
|
+
logprob_threshold=self.logprob_threshold,
|
|
59
76
|
)
|
|
60
77
|
|
|
61
78
|
def dub(
|
|
@@ -61,10 +61,16 @@ class LocalDubbingPipeline:
|
|
|
61
61
|
device: str | None = None,
|
|
62
62
|
low_memory: bool = False,
|
|
63
63
|
whisper_model: WhisperModel = "turbo",
|
|
64
|
+
condition_on_previous_text: bool = False,
|
|
65
|
+
no_speech_threshold: float = 0.6,
|
|
66
|
+
logprob_threshold: float | None = -1.0,
|
|
64
67
|
):
|
|
65
68
|
self.device = device
|
|
66
69
|
self.low_memory = low_memory
|
|
67
70
|
self.whisper_model = whisper_model
|
|
71
|
+
self.condition_on_previous_text = condition_on_previous_text
|
|
72
|
+
self.no_speech_threshold = no_speech_threshold
|
|
73
|
+
self.logprob_threshold = logprob_threshold
|
|
68
74
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
69
75
|
logger.info(
|
|
70
76
|
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
|
|
@@ -106,6 +112,9 @@ class LocalDubbingPipeline:
|
|
|
106
112
|
model_name=self.whisper_model,
|
|
107
113
|
device=self.device,
|
|
108
114
|
enable_diarization=enable_diarization,
|
|
115
|
+
condition_on_previous_text=self.condition_on_previous_text,
|
|
116
|
+
no_speech_threshold=self.no_speech_threshold,
|
|
117
|
+
logprob_threshold=self.logprob_threshold,
|
|
109
118
|
)
|
|
110
119
|
|
|
111
120
|
def _init_translator(self) -> None:
|
|
@@ -20,6 +20,20 @@ class AudioToText:
|
|
|
20
20
|
voiced regions only — fixes Whisper's tendency to lock onto the wrong
|
|
21
21
|
language when the file opens with silence, music, or non-vocal credits.
|
|
22
22
|
Disable with ``enable_vad=False`` to reproduce pre-0.27 behaviour.
|
|
23
|
+
|
|
24
|
+
Three Whisper decoder kwargs are surfaced for anti-hallucination tuning:
|
|
25
|
+
|
|
26
|
+
- ``condition_on_previous_text`` defaults to ``False`` (Whisper's own
|
|
27
|
+
default is ``True``). With conditioning on, a single hallucinated filler
|
|
28
|
+
phrase cascades through the rest of the file because each window's
|
|
29
|
+
decoder is primed by the previous window's decoded text. Turning it off
|
|
30
|
+
is the most commonly recommended fix for that failure mode; the cost on
|
|
31
|
+
clean audio is small (slightly less context for ambiguous homophones
|
|
32
|
+
across sentence boundaries).
|
|
33
|
+
- ``no_speech_threshold`` and ``logprob_threshold`` are forwarded with
|
|
34
|
+
Whisper's documented defaults (``0.6`` and ``-1.0``); lowering
|
|
35
|
+
``no_speech_threshold`` biases toward dropping low-confidence windows
|
|
36
|
+
instead of emitting filler.
|
|
23
37
|
"""
|
|
24
38
|
|
|
25
39
|
PYANNOTE_DIARIZATION_MODEL = "pyannote/speaker-diarization-community-1"
|
|
@@ -29,11 +43,17 @@ class AudioToText:
|
|
|
29
43
|
model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "turbo",
|
|
30
44
|
enable_diarization: bool = False,
|
|
31
45
|
enable_vad: bool = True,
|
|
46
|
+
condition_on_previous_text: bool = False,
|
|
47
|
+
no_speech_threshold: float = 0.6,
|
|
48
|
+
logprob_threshold: float | None = -1.0,
|
|
32
49
|
device: str | None = None,
|
|
33
50
|
):
|
|
34
51
|
self.model_name = model_name
|
|
35
52
|
self.enable_diarization = enable_diarization
|
|
36
53
|
self.enable_vad = enable_vad
|
|
54
|
+
self.condition_on_previous_text = condition_on_previous_text
|
|
55
|
+
self.no_speech_threshold = no_speech_threshold
|
|
56
|
+
self.logprob_threshold = logprob_threshold
|
|
37
57
|
self.device = select_device(device, mps_allowed=False)
|
|
38
58
|
log_device_initialization(
|
|
39
59
|
"AudioToText",
|
|
@@ -44,6 +64,16 @@ class AudioToText:
|
|
|
44
64
|
self._diarization_pipeline: Any = None
|
|
45
65
|
self._vad_model: Any = None
|
|
46
66
|
|
|
67
|
+
def _transcribe_kwargs(self, language: str | None) -> dict[str, Any]:
|
|
68
|
+
"""Kwargs threaded into ``whisper.Whisper.transcribe`` from both call sites."""
|
|
69
|
+
return {
|
|
70
|
+
"word_timestamps": True,
|
|
71
|
+
"language": language,
|
|
72
|
+
"condition_on_previous_text": self.condition_on_previous_text,
|
|
73
|
+
"no_speech_threshold": self.no_speech_threshold,
|
|
74
|
+
"logprob_threshold": self.logprob_threshold,
|
|
75
|
+
}
|
|
76
|
+
|
|
47
77
|
def _init_local(self) -> None:
|
|
48
78
|
"""Initialize local Whisper model."""
|
|
49
79
|
import whisper
|
|
@@ -253,7 +283,7 @@ class AudioToText:
|
|
|
253
283
|
self._init_diarization()
|
|
254
284
|
|
|
255
285
|
audio_data = audio_mono.data
|
|
256
|
-
transcription_result = self._model.transcribe(audio=audio_data,
|
|
286
|
+
transcription_result = self._model.transcribe(audio=audio_data, **self._transcribe_kwargs(language))
|
|
257
287
|
|
|
258
288
|
waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
|
|
259
289
|
diarization_result = self._diarization_pipeline(
|
|
@@ -300,7 +330,7 @@ class AudioToText:
|
|
|
300
330
|
if self.enable_diarization:
|
|
301
331
|
return self._transcribe_with_diarization(audio_mono, language)
|
|
302
332
|
|
|
303
|
-
transcription_result = self._model.transcribe(audio=audio_mono.data,
|
|
333
|
+
transcription_result = self._model.transcribe(audio=audio_mono.data, **self._transcribe_kwargs(language))
|
|
304
334
|
return self._process_transcription_result(transcription_result)
|
|
305
335
|
|
|
306
336
|
def transcribe(self, media: Audio | Video) -> Transcription:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|