videopython 0.27.0__tar.gz → 0.27.2__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {videopython-0.27.0 → videopython-0.27.2}/PKG-INFO +1 -1
- {videopython-0.27.0 → videopython-0.27.2}/pyproject.toml +1 -1
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/dubber.py +19 -2
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/pipeline.py +10 -1
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/audio.py +38 -4
- {videopython-0.27.0 → videopython-0.27.2}/.gitignore +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/LICENSE +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/README.md +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/_device.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/registry.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/combine.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/description.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/effects.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/progress.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/registry.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/scene.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/streaming.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/transforms.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/transitions.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/utils.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/base/video.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.27.0 → videopython-0.27.2}/src/videopython/py.typed +0 -0
|
@@ -28,18 +28,32 @@ class VideoDubber:
|
|
|
28
28
|
whisper_model: Whisper model size used for transcription. Larger models
|
|
29
29
|
give better accuracy at the cost of VRAM and latency. One of
|
|
30
30
|
``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
|
|
31
|
-
Default ``
|
|
31
|
+
Default ``turbo``.
|
|
32
|
+
condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
|
|
33
|
+
``False`` (Whisper's own default is ``True``). With conditioning on,
|
|
34
|
+
a single hallucinated filler phrase cascades through the rest of
|
|
35
|
+
the file. See ``AudioToText`` for the full rationale.
|
|
36
|
+
no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
|
|
37
|
+
gate; raise to drop more low-confidence windows.
|
|
38
|
+
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
39
|
+
log-probability gate.
|
|
32
40
|
"""
|
|
33
41
|
|
|
34
42
|
def __init__(
|
|
35
43
|
self,
|
|
36
44
|
device: str | None = None,
|
|
37
45
|
low_memory: bool = False,
|
|
38
|
-
whisper_model: WhisperModel = "
|
|
46
|
+
whisper_model: WhisperModel = "turbo",
|
|
47
|
+
condition_on_previous_text: bool = False,
|
|
48
|
+
no_speech_threshold: float = 0.6,
|
|
49
|
+
logprob_threshold: float | None = -1.0,
|
|
39
50
|
):
|
|
40
51
|
self.device = device
|
|
41
52
|
self.low_memory = low_memory
|
|
42
53
|
self.whisper_model = whisper_model
|
|
54
|
+
self.condition_on_previous_text = condition_on_previous_text
|
|
55
|
+
self.no_speech_threshold = no_speech_threshold
|
|
56
|
+
self.logprob_threshold = logprob_threshold
|
|
43
57
|
self._local_pipeline: Any = None
|
|
44
58
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
45
59
|
logger.info(
|
|
@@ -56,6 +70,9 @@ class VideoDubber:
|
|
|
56
70
|
device=self.device,
|
|
57
71
|
low_memory=self.low_memory,
|
|
58
72
|
whisper_model=self.whisper_model,
|
|
73
|
+
condition_on_previous_text=self.condition_on_previous_text,
|
|
74
|
+
no_speech_threshold=self.no_speech_threshold,
|
|
75
|
+
logprob_threshold=self.logprob_threshold,
|
|
59
76
|
)
|
|
60
77
|
|
|
61
78
|
def dub(
|
|
@@ -60,11 +60,17 @@ class LocalDubbingPipeline:
|
|
|
60
60
|
self,
|
|
61
61
|
device: str | None = None,
|
|
62
62
|
low_memory: bool = False,
|
|
63
|
-
whisper_model: WhisperModel = "
|
|
63
|
+
whisper_model: WhisperModel = "turbo",
|
|
64
|
+
condition_on_previous_text: bool = False,
|
|
65
|
+
no_speech_threshold: float = 0.6,
|
|
66
|
+
logprob_threshold: float | None = -1.0,
|
|
64
67
|
):
|
|
65
68
|
self.device = device
|
|
66
69
|
self.low_memory = low_memory
|
|
67
70
|
self.whisper_model = whisper_model
|
|
71
|
+
self.condition_on_previous_text = condition_on_previous_text
|
|
72
|
+
self.no_speech_threshold = no_speech_threshold
|
|
73
|
+
self.logprob_threshold = logprob_threshold
|
|
68
74
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
69
75
|
logger.info(
|
|
70
76
|
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
|
|
@@ -106,6 +112,9 @@ class LocalDubbingPipeline:
|
|
|
106
112
|
model_name=self.whisper_model,
|
|
107
113
|
device=self.device,
|
|
108
114
|
enable_diarization=enable_diarization,
|
|
115
|
+
condition_on_previous_text=self.condition_on_previous_text,
|
|
116
|
+
no_speech_threshold=self.no_speech_threshold,
|
|
117
|
+
logprob_threshold=self.logprob_threshold,
|
|
109
118
|
)
|
|
110
119
|
|
|
111
120
|
def _init_translator(self) -> None:
|
|
@@ -15,21 +15,45 @@ class AudioToText:
|
|
|
15
15
|
"""Transcription service for audio and video using local Whisper models.
|
|
16
16
|
|
|
17
17
|
Uses openai-whisper for transcription (with word-level timestamps) and
|
|
18
|
-
pyannote-audio for optional speaker diarization.
|
|
18
|
+
pyannote-audio for optional speaker diarization. By default, Silero VAD
|
|
19
|
+
runs before Whisper to gate language detection on a 30s window built from
|
|
20
|
+
voiced regions only — fixes Whisper's tendency to lock onto the wrong
|
|
21
|
+
language when the file opens with silence, music, or non-vocal credits.
|
|
22
|
+
Disable with ``enable_vad=False`` to reproduce pre-0.27 behaviour.
|
|
23
|
+
|
|
24
|
+
Three Whisper decoder kwargs are surfaced for anti-hallucination tuning:
|
|
25
|
+
|
|
26
|
+
- ``condition_on_previous_text`` defaults to ``False`` (Whisper's own
|
|
27
|
+
default is ``True``). With conditioning on, a single hallucinated filler
|
|
28
|
+
phrase cascades through the rest of the file because each window's
|
|
29
|
+
decoder is primed by the previous window's decoded text. Turning it off
|
|
30
|
+
is the most commonly recommended fix for that failure mode; the cost on
|
|
31
|
+
clean audio is small (slightly less context for ambiguous homophones
|
|
32
|
+
across sentence boundaries).
|
|
33
|
+
- ``no_speech_threshold`` and ``logprob_threshold`` are forwarded with
|
|
34
|
+
Whisper's documented defaults (``0.6`` and ``-1.0``); raising
|
|
35
|
+
``no_speech_threshold`` biases toward dropping low-confidence windows
|
|
36
|
+
instead of emitting filler.
|
|
19
37
|
"""
|
|
20
38
|
|
|
21
39
|
PYANNOTE_DIARIZATION_MODEL = "pyannote/speaker-diarization-community-1"
|
|
22
40
|
|
|
23
41
|
def __init__(
|
|
24
42
|
self,
|
|
25
|
-
model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "
|
|
43
|
+
model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "turbo",
|
|
26
44
|
enable_diarization: bool = False,
|
|
27
45
|
enable_vad: bool = True,
|
|
46
|
+
condition_on_previous_text: bool = False,
|
|
47
|
+
no_speech_threshold: float = 0.6,
|
|
48
|
+
logprob_threshold: float | None = -1.0,
|
|
28
49
|
device: str | None = None,
|
|
29
50
|
):
|
|
30
51
|
self.model_name = model_name
|
|
31
52
|
self.enable_diarization = enable_diarization
|
|
32
53
|
self.enable_vad = enable_vad
|
|
54
|
+
self.condition_on_previous_text = condition_on_previous_text
|
|
55
|
+
self.no_speech_threshold = no_speech_threshold
|
|
56
|
+
self.logprob_threshold = logprob_threshold
|
|
33
57
|
self.device = select_device(device, mps_allowed=False)
|
|
34
58
|
log_device_initialization(
|
|
35
59
|
"AudioToText",
|
|
@@ -40,6 +64,16 @@ class AudioToText:
|
|
|
40
64
|
self._diarization_pipeline: Any = None
|
|
41
65
|
self._vad_model: Any = None
|
|
42
66
|
|
|
67
|
+
def _transcribe_kwargs(self, language: str | None) -> dict[str, Any]:
|
|
68
|
+
"""Kwargs threaded into ``whisper.Whisper.transcribe`` from both call sites."""
|
|
69
|
+
return {
|
|
70
|
+
"word_timestamps": True,
|
|
71
|
+
"language": language,
|
|
72
|
+
"condition_on_previous_text": self.condition_on_previous_text,
|
|
73
|
+
"no_speech_threshold": self.no_speech_threshold,
|
|
74
|
+
"logprob_threshold": self.logprob_threshold,
|
|
75
|
+
}
|
|
76
|
+
|
|
43
77
|
def _init_local(self) -> None:
|
|
44
78
|
"""Initialize local Whisper model."""
|
|
45
79
|
import whisper
|
|
@@ -249,7 +283,7 @@ class AudioToText:
|
|
|
249
283
|
self._init_diarization()
|
|
250
284
|
|
|
251
285
|
audio_data = audio_mono.data
|
|
252
|
-
transcription_result = self._model.transcribe(audio=audio_data,
|
|
286
|
+
transcription_result = self._model.transcribe(audio=audio_data, **self._transcribe_kwargs(language))
|
|
253
287
|
|
|
254
288
|
waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
|
|
255
289
|
diarization_result = self._diarization_pipeline(
|
|
@@ -296,7 +330,7 @@ class AudioToText:
|
|
|
296
330
|
if self.enable_diarization:
|
|
297
331
|
return self._transcribe_with_diarization(audio_mono, language)
|
|
298
332
|
|
|
299
|
-
transcription_result = self._model.transcribe(audio=audio_mono.data,
|
|
333
|
+
transcription_result = self._model.transcribe(audio=audio_mono.data, **self._transcribe_kwargs(language))
|
|
300
334
|
return self._process_transcription_result(transcription_result)
|
|
301
335
|
|
|
302
336
|
def transcribe(self, media: Audio | Video) -> Transcription:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|