videopython 0.31.3__tar.gz → 0.33.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.31.3 → videopython-0.33.0}/PKG-INFO +21 -8
- {videopython-0.31.3 → videopython-0.33.0}/README.md +20 -7
- {videopython-0.31.3 → videopython-0.33.0}/pyproject.toml +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/__init__.py +2 -0
- videopython-0.33.0/src/videopython/ai/dubbing/config.py +80 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/dubber.py +14 -78
- videopython-0.33.0/src/videopython/ai/dubbing/expressiveness.py +47 -0
- videopython-0.33.0/src/videopython/ai/dubbing/loudness.py +86 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/models.py +52 -71
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/pipeline.py +73 -343
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/quality.py +6 -27
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/remux.py +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/timing.py +1 -1
- videopython-0.33.0/src/videopython/ai/dubbing/voice_sample.py +152 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/audio.py +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/qwen3.py +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/translation.py +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/transforms.py +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/audio.py +2 -2
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/separation.py +1 -1
- videopython-0.33.0/src/videopython/ai/video_analysis/__init__.py +39 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/analyzer.py +490 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/models.py +228 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/sampling.py +113 -0
- videopython-0.33.0/src/videopython/ai/video_analysis/stages.py +354 -0
- {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython}/audio/audio.py +5 -5
- videopython-0.33.0/src/videopython/base/__init__.py +62 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/_dimensions.py +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/_video_io.py +1 -1
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/description.py +4 -2
- {videopython-0.31.3/src/videopython/base/text → videopython-0.33.0/src/videopython/base}/image_text.py +3 -2
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/video.py +1 -1
- videopython-0.33.0/src/videopython/editing/__init__.py +61 -0
- {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/effects.py +3 -3
- {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/operation.py +1 -1
- {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/streaming.py +2 -2
- videopython-0.31.3/src/videopython/base/text/overlay.py → videopython-0.33.0/src/videopython/editing/transcription_overlay.py +3 -3
- {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/transforms.py +3 -3
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/editing/video_edit.py +6 -6
- videopython-0.31.3/src/videopython/ai/video_analysis.py +0 -1181
- videopython-0.31.3/src/videopython/base/__init__.py +0 -128
- videopython-0.31.3/src/videopython/base/scene.py +0 -456
- videopython-0.31.3/src/videopython/base/text/__init__.py +0 -13
- videopython-0.31.3/src/videopython/editing/__init__.py +0 -6
- {videopython-0.31.3 → videopython-0.33.0}/.gitignore +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/LICENSE +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/__init__.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython}/audio/__init__.py +0 -0
- {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython}/audio/analysis.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/_ffmpeg.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.31.3/src/videopython/base/text → videopython-0.33.0/src/videopython/base}/transcription.py +0 -0
- {videopython-0.31.3 → videopython-0.33.0}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.31.3
|
|
3
|
+
Version: 0.33.0
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -91,7 +91,8 @@ Every editing primitive is an `Operation` subclass — a Pydantic model
|
|
|
91
91
|
whose fields ARE the JSON wire format. Apply one to a `Video`:
|
|
92
92
|
|
|
93
93
|
```python
|
|
94
|
-
from videopython.base import Video, CutSeconds, Resize, Fade
|
|
94
|
+
from videopython.base import Video
|
|
95
|
+
from videopython.editing import CutSeconds, Resize, Fade
|
|
95
96
|
|
|
96
97
|
video = Video.from_path("raw.mp4")
|
|
97
98
|
video = CutSeconds(start=10, end=25).apply(video)
|
|
@@ -141,7 +142,7 @@ instead if you want the result back in memory as a `Video`.
|
|
|
141
142
|
|
|
142
143
|
```python
|
|
143
144
|
from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
|
|
144
|
-
from videopython.base import Resize
|
|
145
|
+
from videopython.editing import Resize
|
|
145
146
|
|
|
146
147
|
image = TextToImage().generate_image("A cinematic mountain sunrise")
|
|
147
148
|
video = ImageToVideo().generate_video(image=image)
|
|
@@ -182,7 +183,7 @@ Every registered op exposes its own Pydantic schema, so an agent can
|
|
|
182
183
|
introspect what's available without hardcoded lists:
|
|
183
184
|
|
|
184
185
|
```python
|
|
185
|
-
from videopython.base import Operation, OpCategory
|
|
186
|
+
from videopython.editing import Operation, OpCategory
|
|
186
187
|
|
|
187
188
|
for op_id, cls in Operation.registry().items():
|
|
188
189
|
print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
|
|
@@ -205,18 +206,30 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https
|
|
|
205
206
|
|
|
206
207
|
## Features
|
|
207
208
|
|
|
208
|
-
### `videopython.base` -
|
|
209
|
+
### `videopython.base` - data containers + I/O (no AI dependencies)
|
|
209
210
|
|
|
210
211
|
| Area | Highlights |
|
|
211
212
|
|---|---|
|
|
212
213
|
| **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
|
|
214
|
+
| **Text rendering** | `ImageText` - generic PIL text-on-image primitive |
|
|
215
|
+
| **Transcription** | `Transcription`, `TranscriptionSegment`, `TranscriptionWord` - data classes returned by transcription backends |
|
|
216
|
+
| **Result types** | `BoundingBox`, `DetectedFace`, `FaceTrack`, `SceneBoundary`, `AudioEvent`, `MotionInfo`, ... - shared by editing and AI |
|
|
217
|
+
|
|
218
|
+
### `videopython.audio` - audio data container
|
|
219
|
+
|
|
220
|
+
| Area | Highlights |
|
|
221
|
+
|---|---|
|
|
222
|
+
| **Audio** | `Audio`, `AudioMetadata` - load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
|
|
223
|
+
|
|
224
|
+
### `videopython.editing` - editing primitives + plan runner
|
|
225
|
+
|
|
226
|
+
| Area | Highlights |
|
|
227
|
+
|---|---|
|
|
213
228
|
| **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
|
|
214
229
|
| **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
|
|
215
230
|
| **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
|
|
216
231
|
| **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
|
|
217
|
-
| **
|
|
218
|
-
| **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
|
|
219
|
-
| **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
|
|
232
|
+
| **Subtitles** | `TranscriptionOverlay` - animated word-by-word subtitle rendering |
|
|
220
233
|
|
|
221
234
|
API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
|
|
222
235
|
|
|
@@ -42,7 +42,8 @@ Every editing primitive is an `Operation` subclass — a Pydantic model
|
|
|
42
42
|
whose fields ARE the JSON wire format. Apply one to a `Video`:
|
|
43
43
|
|
|
44
44
|
```python
|
|
45
|
-
from videopython.base import Video, CutSeconds, Resize, Fade
|
|
45
|
+
from videopython.base import Video
|
|
46
|
+
from videopython.editing import CutSeconds, Resize, Fade
|
|
46
47
|
|
|
47
48
|
video = Video.from_path("raw.mp4")
|
|
48
49
|
video = CutSeconds(start=10, end=25).apply(video)
|
|
@@ -92,7 +93,7 @@ instead if you want the result back in memory as a `Video`.
|
|
|
92
93
|
|
|
93
94
|
```python
|
|
94
95
|
from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
|
|
95
|
-
from videopython.base import Resize
|
|
96
|
+
from videopython.editing import Resize
|
|
96
97
|
|
|
97
98
|
image = TextToImage().generate_image("A cinematic mountain sunrise")
|
|
98
99
|
video = ImageToVideo().generate_video(image=image)
|
|
@@ -133,7 +134,7 @@ Every registered op exposes its own Pydantic schema, so an agent can
|
|
|
133
134
|
introspect what's available without hardcoded lists:
|
|
134
135
|
|
|
135
136
|
```python
|
|
136
|
-
from videopython.base import Operation, OpCategory
|
|
137
|
+
from videopython.editing import Operation, OpCategory
|
|
137
138
|
|
|
138
139
|
for op_id, cls in Operation.registry().items():
|
|
139
140
|
print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
|
|
@@ -156,18 +157,30 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https
|
|
|
156
157
|
|
|
157
158
|
## Features
|
|
158
159
|
|
|
159
|
-
### `videopython.base` -
|
|
160
|
+
### `videopython.base` - data containers + I/O (no AI dependencies)
|
|
160
161
|
|
|
161
162
|
| Area | Highlights |
|
|
162
163
|
|---|---|
|
|
163
164
|
| **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
|
|
165
|
+
| **Text rendering** | `ImageText` - generic PIL text-on-image primitive |
|
|
166
|
+
| **Transcription** | `Transcription`, `TranscriptionSegment`, `TranscriptionWord` - data classes returned by transcription backends |
|
|
167
|
+
| **Result types** | `BoundingBox`, `DetectedFace`, `FaceTrack`, `SceneBoundary`, `AudioEvent`, `MotionInfo`, ... - shared by editing and AI |
|
|
168
|
+
|
|
169
|
+
### `videopython.audio` - audio data container
|
|
170
|
+
|
|
171
|
+
| Area | Highlights |
|
|
172
|
+
|---|---|
|
|
173
|
+
| **Audio** | `Audio`, `AudioMetadata` - load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
|
|
174
|
+
|
|
175
|
+
### `videopython.editing` - editing primitives + plan runner
|
|
176
|
+
|
|
177
|
+
| Area | Highlights |
|
|
178
|
+
|---|---|
|
|
164
179
|
| **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
|
|
165
180
|
| **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
|
|
166
181
|
| **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
|
|
167
182
|
| **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
|
|
168
|
-
| **
|
|
169
|
-
| **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
|
|
170
|
-
| **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
|
|
183
|
+
| **Subtitles** | `TranscriptionOverlay` - animated word-by-word subtitle rendering |
|
|
171
184
|
|
|
172
185
|
API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
|
|
173
186
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Local video dubbing functionality."""
|
|
2
2
|
|
|
3
|
+
from videopython.ai.dubbing.config import DubbingConfig
|
|
3
4
|
from videopython.ai.dubbing.dubber import VideoDubber
|
|
4
5
|
from videopython.ai.dubbing.models import (
|
|
5
6
|
DubbingResult,
|
|
@@ -15,6 +16,7 @@ from videopython.ai.generation.translation import UnsupportedLanguageError
|
|
|
15
16
|
|
|
16
17
|
__all__ = [
|
|
17
18
|
"VideoDubber",
|
|
19
|
+
"DubbingConfig",
|
|
18
20
|
"DubbingResult",
|
|
19
21
|
"RevoiceResult",
|
|
20
22
|
"TranslatedSegment",
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Configuration model for the dubbing pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict
|
|
8
|
+
|
|
9
|
+
TranslatorChoice = Literal["auto", "marian", "qwen3"]
|
|
10
|
+
WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DubbingConfig(BaseModel):
|
|
14
|
+
"""Knobs shared by :class:`VideoDubber` and :class:`LocalDubbingPipeline`.
|
|
15
|
+
|
|
16
|
+
Accepted as either ``config=DubbingConfig(...)`` or flat kwargs on the
|
|
17
|
+
two constructors; the flat path builds a ``DubbingConfig`` internally.
|
|
18
|
+
|
|
19
|
+
Attributes:
|
|
20
|
+
device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
|
|
21
|
+
low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
|
|
22
|
+
Chatterbox TTS) is unloaded from memory after it runs, so only one
|
|
23
|
+
model is resident at a time. Trades per-run latency (~10-30s of
|
|
24
|
+
extra model loads) for a much lower memory ceiling. Recommended
|
|
25
|
+
for GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
|
|
26
|
+
whisper_model: Whisper model size used for transcription. Larger
|
|
27
|
+
models give better accuracy at the cost of VRAM and latency. One
|
|
28
|
+
of ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
|
|
29
|
+
Default ``turbo``.
|
|
30
|
+
condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
|
|
31
|
+
``False`` (Whisper's own default is ``True``). With conditioning
|
|
32
|
+
on, a single hallucinated filler phrase cascades through the rest
|
|
33
|
+
of the file. See ``AudioToText`` for the full rationale.
|
|
34
|
+
no_speech_threshold: Forwarded to ``AudioToText``. Whisper's
|
|
35
|
+
no-speech gate; raise to drop more low-confidence windows.
|
|
36
|
+
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
37
|
+
log-probability gate.
|
|
38
|
+
vocabulary: Forwarded to ``AudioToText``. Optional list of brand
|
|
39
|
+
names, product names, or proper nouns to bias Whisper's
|
|
40
|
+
first-window decoder via ``initial_prompt``. Recovers
|
|
41
|
+
near-mishears (e.g. Klarna -> "carna") on brand-monitoring
|
|
42
|
+
inputs without new model deps.
|
|
43
|
+
strict_quality: When True, the pipeline raises
|
|
44
|
+
:class:`GarbageTranscriptError` before Demucs/translation/TTS
|
|
45
|
+
run if the transcript-quality heuristic returns ``"reject"``.
|
|
46
|
+
When False (default), low-quality transcripts are logged at
|
|
47
|
+
WARNING but processing continues. Either way the
|
|
48
|
+
:class:`TranscriptQuality` is exposed on ``DubbingResult`` for
|
|
49
|
+
inspection.
|
|
50
|
+
translator: Translation backend to use. ``"auto"`` (default) picks
|
|
51
|
+
Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and ``"qwen3"`` force
|
|
52
|
+
the named backend regardless of device. See
|
|
53
|
+
:class:`videopython.ai.generation.qwen3.Qwen3Translator` for
|
|
54
|
+
tradeoffs (Qwen3 is slower on CPU but produces context-aware,
|
|
55
|
+
length-budgeted output).
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
model_config = ConfigDict(frozen=True)
|
|
59
|
+
|
|
60
|
+
device: str | None = None
|
|
61
|
+
low_memory: bool = False
|
|
62
|
+
whisper_model: WhisperModel = "turbo"
|
|
63
|
+
condition_on_previous_text: bool = False
|
|
64
|
+
no_speech_threshold: float = 0.6
|
|
65
|
+
logprob_threshold: float | None = -1.0
|
|
66
|
+
vocabulary: list[str] | None = None
|
|
67
|
+
strict_quality: bool = False
|
|
68
|
+
translator: TranslatorChoice = "auto"
|
|
69
|
+
|
|
70
|
+
def init_log_fields(self) -> dict[str, object]:
|
|
71
|
+
"""Subset of fields surfaced in the init-log line.
|
|
72
|
+
|
|
73
|
+
Hand-picked so log noise stays bounded as the config grows.
|
|
74
|
+
"""
|
|
75
|
+
return {
|
|
76
|
+
"device": self.device.lower() if isinstance(self.device, str) else "auto",
|
|
77
|
+
"low_memory": self.low_memory,
|
|
78
|
+
"whisper_model": self.whisper_model,
|
|
79
|
+
"translator": self.translator,
|
|
80
|
+
}
|
|
@@ -6,8 +6,8 @@ import logging
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import TYPE_CHECKING, Any, Callable
|
|
8
8
|
|
|
9
|
+
from videopython.ai.dubbing.config import DubbingConfig
|
|
9
10
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
|
|
10
|
-
from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
13
|
from videopython.base.video import Video
|
|
@@ -18,90 +18,26 @@ logger = logging.getLogger(__name__)
|
|
|
18
18
|
class VideoDubber:
|
|
19
19
|
"""Dubs videos into different languages using the local pipeline.
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
model is resident at a time. Trades per-run latency (~10-30s of
|
|
26
|
-
extra model loads) for a much lower memory ceiling. Recommended for
|
|
27
|
-
GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
|
|
28
|
-
whisper_model: Whisper model size used for transcription. Larger models
|
|
29
|
-
give better accuracy at the cost of VRAM and latency. One of
|
|
30
|
-
``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
|
|
31
|
-
Default ``turbo``.
|
|
32
|
-
condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
|
|
33
|
-
``False`` (Whisper's own default is ``True``). With conditioning on,
|
|
34
|
-
a single hallucinated filler phrase cascades through the rest of
|
|
35
|
-
the file. See ``AudioToText`` for the full rationale.
|
|
36
|
-
no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
|
|
37
|
-
gate; raise to drop more low-confidence windows.
|
|
38
|
-
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
39
|
-
log-probability gate.
|
|
40
|
-
vocabulary: Forwarded to ``AudioToText``. Optional list of brand
|
|
41
|
-
names, product names, or proper nouns to bias Whisper's first-
|
|
42
|
-
window decoder via ``initial_prompt``. Recovers near-mishears
|
|
43
|
-
(e.g. Klarna → "carna") on brand-monitoring inputs without new
|
|
44
|
-
model deps.
|
|
45
|
-
strict_quality: When True, the pipeline raises
|
|
46
|
-
:class:`GarbageTranscriptError` before Demucs/translation/TTS run
|
|
47
|
-
if the transcript-quality heuristic returns ``"reject"``. When
|
|
48
|
-
False (default), low-quality transcripts are logged at WARNING
|
|
49
|
-
but processing continues. Either way the
|
|
50
|
-
:class:`TranscriptQuality` is exposed on ``DubbingResult`` for
|
|
51
|
-
inspection.
|
|
52
|
-
translator: Translation backend to use. ``"auto"`` (default)
|
|
53
|
-
picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
|
|
54
|
-
``"qwen3"`` force the named backend regardless of device.
|
|
55
|
-
See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
|
|
56
|
-
for tradeoffs (Qwen3 is slower on CPU but produces
|
|
57
|
-
context-aware, length-budgeted output).
|
|
21
|
+
Accepts either a :class:`DubbingConfig` or the same knobs as flat kwargs
|
|
22
|
+
(``device``, ``low_memory``, ``whisper_model``, ``translator``, etc.) --
|
|
23
|
+
the flat path builds a ``DubbingConfig`` internally. See
|
|
24
|
+
:class:`DubbingConfig` for the full knob list and defaults.
|
|
58
25
|
"""
|
|
59
26
|
|
|
60
|
-
def __init__(
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
whisper_model: WhisperModel = "turbo",
|
|
65
|
-
condition_on_previous_text: bool = False,
|
|
66
|
-
no_speech_threshold: float = 0.6,
|
|
67
|
-
logprob_threshold: float | None = -1.0,
|
|
68
|
-
vocabulary: list[str] | None = None,
|
|
69
|
-
strict_quality: bool = False,
|
|
70
|
-
translator: TranslatorChoice = "auto",
|
|
71
|
-
):
|
|
72
|
-
self.device = device
|
|
73
|
-
self.low_memory = low_memory
|
|
74
|
-
self.whisper_model = whisper_model
|
|
75
|
-
self.condition_on_previous_text = condition_on_previous_text
|
|
76
|
-
self.no_speech_threshold = no_speech_threshold
|
|
77
|
-
self.logprob_threshold = logprob_threshold
|
|
78
|
-
self.vocabulary = vocabulary
|
|
79
|
-
self.strict_quality = strict_quality
|
|
80
|
-
self.translator = translator
|
|
27
|
+
def __init__(self, config: DubbingConfig | None = None, **kwargs: Any):
|
|
28
|
+
if config is not None and kwargs:
|
|
29
|
+
raise TypeError("Pass either `config=` or knob kwargs, not both")
|
|
30
|
+
self.config = config or DubbingConfig(**kwargs)
|
|
81
31
|
self._local_pipeline: Any = None
|
|
82
|
-
requested = device.lower() if isinstance(device, str) else "auto"
|
|
83
32
|
logger.info(
|
|
84
|
-
"VideoDubber initialized with
|
|
85
|
-
|
|
86
|
-
low_memory,
|
|
87
|
-
whisper_model,
|
|
88
|
-
translator,
|
|
33
|
+
"VideoDubber initialized with %s",
|
|
34
|
+
" ".join(f"{k}={v}" for k, v in self.config.init_log_fields().items()),
|
|
89
35
|
)
|
|
90
36
|
|
|
91
37
|
def _init_local_pipeline(self) -> None:
|
|
92
38
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
93
39
|
|
|
94
|
-
self._local_pipeline = LocalDubbingPipeline(
|
|
95
|
-
device=self.device,
|
|
96
|
-
low_memory=self.low_memory,
|
|
97
|
-
whisper_model=self.whisper_model,
|
|
98
|
-
condition_on_previous_text=self.condition_on_previous_text,
|
|
99
|
-
no_speech_threshold=self.no_speech_threshold,
|
|
100
|
-
logprob_threshold=self.logprob_threshold,
|
|
101
|
-
vocabulary=self.vocabulary,
|
|
102
|
-
strict_quality=self.strict_quality,
|
|
103
|
-
translator=self.translator,
|
|
104
|
-
)
|
|
40
|
+
self._local_pipeline = LocalDubbingPipeline(config=self.config)
|
|
105
41
|
|
|
106
42
|
def dub(
|
|
107
43
|
self,
|
|
@@ -218,7 +154,7 @@ class VideoDubber:
|
|
|
218
154
|
source transcription. The output video is written to ``output_path``.
|
|
219
155
|
"""
|
|
220
156
|
from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
|
|
221
|
-
from videopython.base.audio import Audio
|
|
157
|
+
from videopython.audio import Audio
|
|
222
158
|
|
|
223
159
|
input_path = Path(input_path)
|
|
224
160
|
output_path = Path(output_path)
|
|
@@ -292,7 +228,7 @@ class VideoDubber:
|
|
|
292
228
|
video_duration = video.total_seconds
|
|
293
229
|
|
|
294
230
|
if video_duration > speech_duration:
|
|
295
|
-
from videopython.base.transforms import CutSeconds
|
|
231
|
+
from videopython.editing.transforms import CutSeconds
|
|
296
232
|
|
|
297
233
|
output_video = CutSeconds(start=0, end=speech_duration).apply(video)
|
|
298
234
|
else:
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Source-prosody-driven expressiveness knobs for Chatterbox TTS."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from videopython.ai.dubbing.models import Expressiveness
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from videopython.audio import Audio
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Prosody-conditioning thresholds. Source-segment RMS / whole-vocals RMS
|
|
16
|
+
# below CALM lands in the calm bucket; above DRAMATIC in the dramatic
|
|
17
|
+
# bucket; in between gets Chatterbox's defaults. Knob values picked
|
|
18
|
+
# by-ear on cam1_1min.mp4 -- see RELEASE_NOTES 0.29.0.
|
|
19
|
+
CALM_RATIO_THRESHOLD = 0.7
|
|
20
|
+
DRAMATIC_RATIO_THRESHOLD = 1.3
|
|
21
|
+
_CALM = Expressiveness(exaggeration=0.3, cfg_weight=0.7)
|
|
22
|
+
_DRAMATIC = Expressiveness(exaggeration=0.85, cfg_weight=0.35)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def rms(data: np.ndarray) -> float:
|
|
26
|
+
"""RMS over samples; ``0.0`` for empty input. float64 reduction so a
|
|
27
|
+
long slice can't overflow the squared accumulator."""
|
|
28
|
+
if data.size == 0:
|
|
29
|
+
return 0.0
|
|
30
|
+
return float(np.sqrt(np.mean(np.square(data, dtype=np.float64))))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def expressiveness_for(source_slice: Audio, baseline_rms: float) -> Expressiveness:
|
|
34
|
+
"""Map a source vocals slice to a Chatterbox expressiveness profile
|
|
35
|
+
by RMS ratio. Falls back to the no-knobs default for empty or silent
|
|
36
|
+
inputs."""
|
|
37
|
+
if baseline_rms <= 0.0:
|
|
38
|
+
return Expressiveness()
|
|
39
|
+
segment_rms = rms(source_slice.data)
|
|
40
|
+
if segment_rms <= 0.0:
|
|
41
|
+
return Expressiveness()
|
|
42
|
+
ratio = segment_rms / baseline_rms
|
|
43
|
+
if ratio < CALM_RATIO_THRESHOLD:
|
|
44
|
+
return _CALM
|
|
45
|
+
if ratio > DRAMATIC_RATIO_THRESHOLD:
|
|
46
|
+
return _DRAMATIC
|
|
47
|
+
return Expressiveness()
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""LUFS / peak loudness matching for dubbed audio."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from videopython.audio import Audio
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# BS.1770 integrated-loudness measurement requires at least 400 ms of audio
|
|
14
|
+
# (one gating block). Below this, fall back to peak match -- pyloudnorm
|
|
15
|
+
# returns -inf or warns, neither of which gives a usable gain.
|
|
16
|
+
_LUFS_MIN_DURATION_SECONDS = 0.4
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def peak_match(target: Audio, reference: Audio) -> Audio:
|
|
20
|
+
"""Scale ``target`` so its peak amplitude matches ``reference``.
|
|
21
|
+
|
|
22
|
+
Used as the fallback when LUFS measurement isn't viable (clip < 0.4s
|
|
23
|
+
or silent input). The new ``Audio`` shares no buffer with ``target``.
|
|
24
|
+
"""
|
|
25
|
+
from videopython.audio import Audio as _Audio
|
|
26
|
+
|
|
27
|
+
target_peak = float(np.max(np.abs(target.data))) if target.data.size else 0.0
|
|
28
|
+
reference_peak = float(np.max(np.abs(reference.data))) if reference.data.size else 0.0
|
|
29
|
+
|
|
30
|
+
if target_peak <= 0.0 or reference_peak <= 0.0:
|
|
31
|
+
return target
|
|
32
|
+
|
|
33
|
+
scale = reference_peak / target_peak
|
|
34
|
+
if abs(scale - 1.0) < 1e-3:
|
|
35
|
+
return target
|
|
36
|
+
|
|
37
|
+
return _Audio(target.data * scale, target.metadata)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def loudness_match(target: Audio, reference: Audio) -> Audio:
|
|
41
|
+
"""Scale ``target`` so its integrated loudness (BS.1770 / LUFS) matches ``reference``.
|
|
42
|
+
|
|
43
|
+
Demucs background normalization and the timing-assembler peak guard
|
|
44
|
+
each clamp at 1.0 instead of restoring perceived loudness, so a
|
|
45
|
+
dubbed mix lands perceptually "thinner" than the source even after
|
|
46
|
+
peak match. LUFS captures the ear-weighted envelope that peak ratio
|
|
47
|
+
misses on dialogue-heavy material.
|
|
48
|
+
|
|
49
|
+
Falls back to :func:`peak_match` when either clip is shorter than
|
|
50
|
+
the BS.1770 gating block (400 ms) or when measurement returns -inf
|
|
51
|
+
(silent or near-silent gated content). After gain is applied, peaks
|
|
52
|
+
are clamped to 0.99 -- BS.1770 has no peak ceiling and a sufficiently
|
|
53
|
+
quiet source can demand gain that would otherwise clip.
|
|
54
|
+
"""
|
|
55
|
+
from videopython.audio import Audio as _Audio
|
|
56
|
+
|
|
57
|
+
target_dur = target.metadata.duration_seconds
|
|
58
|
+
ref_dur = reference.metadata.duration_seconds
|
|
59
|
+
if target_dur < _LUFS_MIN_DURATION_SECONDS or ref_dur < _LUFS_MIN_DURATION_SECONDS:
|
|
60
|
+
return peak_match(target, reference)
|
|
61
|
+
|
|
62
|
+
if not target.data.size or not reference.data.size:
|
|
63
|
+
return target
|
|
64
|
+
|
|
65
|
+
import pyloudnorm
|
|
66
|
+
|
|
67
|
+
target_lufs = pyloudnorm.Meter(target.metadata.sample_rate).integrated_loudness(target.data)
|
|
68
|
+
reference_lufs = pyloudnorm.Meter(reference.metadata.sample_rate).integrated_loudness(reference.data)
|
|
69
|
+
|
|
70
|
+
# Either clip's gated content was below -70 LUFS (effectively silent
|
|
71
|
+
# under BS.1770). Gain would be undefined -- fall back to peak match,
|
|
72
|
+
# which has its own silent-input no-op.
|
|
73
|
+
if not np.isfinite(target_lufs) or not np.isfinite(reference_lufs):
|
|
74
|
+
return peak_match(target, reference)
|
|
75
|
+
|
|
76
|
+
gain_db = reference_lufs - target_lufs
|
|
77
|
+
if abs(gain_db) < 0.1:
|
|
78
|
+
return target
|
|
79
|
+
scale = float(10 ** (gain_db / 20.0))
|
|
80
|
+
|
|
81
|
+
scaled = target.data * scale
|
|
82
|
+
peak = float(np.max(np.abs(scaled)))
|
|
83
|
+
if peak > 0.99:
|
|
84
|
+
scaled = scaled * (0.99 / peak)
|
|
85
|
+
|
|
86
|
+
return _Audio(scaled, target.metadata)
|