videopython 0.31.3__tar.gz → 0.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {videopython-0.31.3 → videopython-0.33.0}/PKG-INFO +21 -8
  2. {videopython-0.31.3 → videopython-0.33.0}/README.md +20 -7
  3. {videopython-0.31.3 → videopython-0.33.0}/pyproject.toml +1 -1
  4. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/__init__.py +2 -0
  5. videopython-0.33.0/src/videopython/ai/dubbing/config.py +80 -0
  6. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/dubber.py +14 -78
  7. videopython-0.33.0/src/videopython/ai/dubbing/expressiveness.py +47 -0
  8. videopython-0.33.0/src/videopython/ai/dubbing/loudness.py +86 -0
  9. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/models.py +52 -71
  10. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/pipeline.py +73 -343
  11. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/quality.py +6 -27
  12. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/remux.py +1 -1
  13. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/dubbing/timing.py +1 -1
  14. videopython-0.33.0/src/videopython/ai/dubbing/voice_sample.py +152 -0
  15. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/audio.py +1 -1
  16. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/qwen3.py +1 -1
  17. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/translation.py +1 -1
  18. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/transforms.py +1 -1
  19. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/audio.py +2 -2
  20. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/separation.py +1 -1
  21. videopython-0.33.0/src/videopython/ai/video_analysis/__init__.py +39 -0
  22. videopython-0.33.0/src/videopython/ai/video_analysis/analyzer.py +490 -0
  23. videopython-0.33.0/src/videopython/ai/video_analysis/models.py +228 -0
  24. videopython-0.33.0/src/videopython/ai/video_analysis/sampling.py +113 -0
  25. videopython-0.33.0/src/videopython/ai/video_analysis/stages.py +354 -0
  26. {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython}/audio/audio.py +5 -5
  27. videopython-0.33.0/src/videopython/base/__init__.py +62 -0
  28. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/_dimensions.py +1 -1
  29. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/_video_io.py +1 -1
  30. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/description.py +4 -2
  31. {videopython-0.31.3/src/videopython/base/text → videopython-0.33.0/src/videopython/base}/image_text.py +3 -2
  32. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/video.py +1 -1
  33. videopython-0.33.0/src/videopython/editing/__init__.py +61 -0
  34. {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/effects.py +3 -3
  35. {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/operation.py +1 -1
  36. {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/streaming.py +2 -2
  37. videopython-0.31.3/src/videopython/base/text/overlay.py → videopython-0.33.0/src/videopython/editing/transcription_overlay.py +3 -3
  38. {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython/editing}/transforms.py +3 -3
  39. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/editing/video_edit.py +6 -6
  40. videopython-0.31.3/src/videopython/ai/video_analysis.py +0 -1181
  41. videopython-0.31.3/src/videopython/base/__init__.py +0 -128
  42. videopython-0.31.3/src/videopython/base/scene.py +0 -456
  43. videopython-0.31.3/src/videopython/base/text/__init__.py +0 -13
  44. videopython-0.31.3/src/videopython/editing/__init__.py +0 -6
  45. {videopython-0.31.3 → videopython-0.33.0}/.gitignore +0 -0
  46. {videopython-0.31.3 → videopython-0.33.0}/LICENSE +0 -0
  47. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/__init__.py +0 -0
  48. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/__init__.py +0 -0
  49. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/_device.py +0 -0
  50. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/__init__.py +0 -0
  51. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/image.py +0 -0
  52. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/generation/video.py +0 -0
  53. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/__init__.py +0 -0
  54. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/faces.py +0 -0
  55. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/image.py +0 -0
  56. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/ai/understanding/temporal.py +0 -0
  57. {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython}/audio/__init__.py +0 -0
  58. {videopython-0.31.3/src/videopython/base → videopython-0.33.0/src/videopython}/audio/analysis.py +0 -0
  59. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/_ffmpeg.py +0 -0
  60. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/base/exceptions.py +0 -0
  61. {videopython-0.31.3/src/videopython/base/text → videopython-0.33.0/src/videopython/base}/transcription.py +0 -0
  62. {videopython-0.31.3 → videopython-0.33.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.31.3
3
+ Version: 0.33.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -91,7 +91,8 @@ Every editing primitive is an `Operation` subclass — a Pydantic model
91
91
  whose fields ARE the JSON wire format. Apply one to a `Video`:
92
92
 
93
93
  ```python
94
- from videopython.base import Video, CutSeconds, Resize, Fade
94
+ from videopython.base import Video
95
+ from videopython.editing import CutSeconds, Resize, Fade
95
96
 
96
97
  video = Video.from_path("raw.mp4")
97
98
  video = CutSeconds(start=10, end=25).apply(video)
@@ -141,7 +142,7 @@ instead if you want the result back in memory as a `Video`.
141
142
 
142
143
  ```python
143
144
  from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
144
- from videopython.base import Resize
145
+ from videopython.editing import Resize
145
146
 
146
147
  image = TextToImage().generate_image("A cinematic mountain sunrise")
147
148
  video = ImageToVideo().generate_video(image=image)
@@ -182,7 +183,7 @@ Every registered op exposes its own Pydantic schema, so an agent can
182
183
  introspect what's available without hardcoded lists:
183
184
 
184
185
  ```python
185
- from videopython.base import Operation, OpCategory
186
+ from videopython.editing import Operation, OpCategory
186
187
 
187
188
  for op_id, cls in Operation.registry().items():
188
189
  print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
@@ -205,18 +206,30 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https
205
206
 
206
207
  ## Features
207
208
 
208
- ### `videopython.base` - core editing (no AI dependencies)
209
+ ### `videopython.base` - data containers + I/O (no AI dependencies)
209
210
 
210
211
  | Area | Highlights |
211
212
  |---|---|
212
213
  | **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
214
+ | **Text rendering** | `ImageText` - generic PIL text-on-image primitive |
215
+ | **Transcription** | `Transcription`, `TranscriptionSegment`, `TranscriptionWord` - data classes returned by transcription backends |
216
+ | **Result types** | `BoundingBox`, `DetectedFace`, `FaceTrack`, `SceneBoundary`, `AudioEvent`, `MotionInfo`, ... - shared by editing and AI |
217
+
218
+ ### `videopython.audio` - audio data container
219
+
220
+ | Area | Highlights |
221
+ |---|---|
222
+ | **Audio** | `Audio`, `AudioMetadata` - load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
223
+
224
+ ### `videopython.editing` - editing primitives + plan runner
225
+
226
+ | Area | Highlights |
227
+ |---|---|
213
228
  | **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
214
229
  | **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
215
230
  | **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
216
231
  | **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
217
- | **Audio** | Load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
218
- | **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
219
- | **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
232
+ | **Subtitles** | `TranscriptionOverlay` - animated word-by-word subtitle rendering |
220
233
 
221
234
  API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
222
235
 
@@ -42,7 +42,8 @@ Every editing primitive is an `Operation` subclass — a Pydantic model
42
42
  whose fields ARE the JSON wire format. Apply one to a `Video`:
43
43
 
44
44
  ```python
45
- from videopython.base import Video, CutSeconds, Resize, Fade
45
+ from videopython.base import Video
46
+ from videopython.editing import CutSeconds, Resize, Fade
46
47
 
47
48
  video = Video.from_path("raw.mp4")
48
49
  video = CutSeconds(start=10, end=25).apply(video)
@@ -92,7 +93,7 @@ instead if you want the result back in memory as a `Video`.
92
93
 
93
94
  ```python
94
95
  from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
95
- from videopython.base import Resize
96
+ from videopython.editing import Resize
96
97
 
97
98
  image = TextToImage().generate_image("A cinematic mountain sunrise")
98
99
  video = ImageToVideo().generate_video(image=image)
@@ -133,7 +134,7 @@ Every registered op exposes its own Pydantic schema, so an agent can
133
134
  introspect what's available without hardcoded lists:
134
135
 
135
136
  ```python
136
- from videopython.base import Operation, OpCategory
137
+ from videopython.editing import Operation, OpCategory
137
138
 
138
139
  for op_id, cls in Operation.registry().items():
139
140
  print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
@@ -156,18 +157,30 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https
156
157
 
157
158
  ## Features
158
159
 
159
- ### `videopython.base` - core editing (no AI dependencies)
160
+ ### `videopython.base` - data containers + I/O (no AI dependencies)
160
161
 
161
162
  | Area | Highlights |
162
163
  |---|---|
163
164
  | **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
165
+ | **Text rendering** | `ImageText` - generic PIL text-on-image primitive |
166
+ | **Transcription** | `Transcription`, `TranscriptionSegment`, `TranscriptionWord` - data classes returned by transcription backends |
167
+ | **Result types** | `BoundingBox`, `DetectedFace`, `FaceTrack`, `SceneBoundary`, `AudioEvent`, `MotionInfo`, ... - shared by editing and AI |
168
+
169
+ ### `videopython.audio` - audio data container
170
+
171
+ | Area | Highlights |
172
+ |---|---|
173
+ | **Audio** | `Audio`, `AudioMetadata` - load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
174
+
175
+ ### `videopython.editing` - editing primitives + plan runner
176
+
177
+ | Area | Highlights |
178
+ |---|---|
164
179
  | **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
165
180
  | **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
166
181
  | **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
167
182
  | **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
168
- | **Audio** | Load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
169
- | **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
170
- | **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
183
+ | **Subtitles** | `TranscriptionOverlay` - animated word-by-word subtitle rendering |
171
184
 
172
185
  API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
173
186
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.31.3"
3
+ version = "0.33.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -1,5 +1,6 @@
1
1
  """Local video dubbing functionality."""
2
2
 
3
+ from videopython.ai.dubbing.config import DubbingConfig
3
4
  from videopython.ai.dubbing.dubber import VideoDubber
4
5
  from videopython.ai.dubbing.models import (
5
6
  DubbingResult,
@@ -15,6 +16,7 @@ from videopython.ai.generation.translation import UnsupportedLanguageError
15
16
 
16
17
  __all__ = [
17
18
  "VideoDubber",
19
+ "DubbingConfig",
18
20
  "DubbingResult",
19
21
  "RevoiceResult",
20
22
  "TranslatedSegment",
@@ -0,0 +1,80 @@
1
+ """Configuration model for the dubbing pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict
8
+
9
+ TranslatorChoice = Literal["auto", "marian", "qwen3"]
10
+ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
11
+
12
+
13
+ class DubbingConfig(BaseModel):
14
+ """Knobs shared by :class:`VideoDubber` and :class:`LocalDubbingPipeline`.
15
+
16
+ Accepted as either ``config=DubbingConfig(...)`` or flat kwargs on the
17
+ two constructors; the flat path builds a ``DubbingConfig`` internally.
18
+
19
+ Attributes:
20
+ device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
21
+ low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
22
+ Chatterbox TTS) is unloaded from memory after it runs, so only one
23
+ model is resident at a time. Trades per-run latency (~10-30s of
24
+ extra model loads) for a much lower memory ceiling. Recommended
25
+ for GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
26
+ whisper_model: Whisper model size used for transcription. Larger
27
+ models give better accuracy at the cost of VRAM and latency. One
28
+ of ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
29
+ Default ``turbo``.
30
+ condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
31
+ ``False`` (Whisper's own default is ``True``). With conditioning
32
+ on, a single hallucinated filler phrase cascades through the rest
33
+ of the file. See ``AudioToText`` for the full rationale.
34
+ no_speech_threshold: Forwarded to ``AudioToText``. Whisper's
35
+ no-speech gate; raise to drop more low-confidence windows.
36
+ logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
37
+ log-probability gate.
38
+ vocabulary: Forwarded to ``AudioToText``. Optional list of brand
39
+ names, product names, or proper nouns to bias Whisper's
40
+ first-window decoder via ``initial_prompt``. Recovers
41
+ near-mishears (e.g. Klarna -> "carna") on brand-monitoring
42
+ inputs without new model deps.
43
+ strict_quality: When True, the pipeline raises
44
+ :class:`GarbageTranscriptError` before Demucs/translation/TTS
45
+ run if the transcript-quality heuristic returns ``"reject"``.
46
+ When False (default), low-quality transcripts are logged at
47
+ WARNING but processing continues. Either way the
48
+ :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
49
+ inspection.
50
+ translator: Translation backend to use. ``"auto"`` (default) picks
51
+ Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and ``"qwen3"`` force
52
+ the named backend regardless of device. See
53
+ :class:`videopython.ai.generation.qwen3.Qwen3Translator` for
54
+ tradeoffs (Qwen3 is slower on CPU but produces context-aware,
55
+ length-budgeted output).
56
+ """
57
+
58
+ model_config = ConfigDict(frozen=True)
59
+
60
+ device: str | None = None
61
+ low_memory: bool = False
62
+ whisper_model: WhisperModel = "turbo"
63
+ condition_on_previous_text: bool = False
64
+ no_speech_threshold: float = 0.6
65
+ logprob_threshold: float | None = -1.0
66
+ vocabulary: list[str] | None = None
67
+ strict_quality: bool = False
68
+ translator: TranslatorChoice = "auto"
69
+
70
+ def init_log_fields(self) -> dict[str, object]:
71
+ """Subset of fields surfaced in the init-log line.
72
+
73
+ Hand-picked so log noise stays bounded as the config grows.
74
+ """
75
+ return {
76
+ "device": self.device.lower() if isinstance(self.device, str) else "auto",
77
+ "low_memory": self.low_memory,
78
+ "whisper_model": self.whisper_model,
79
+ "translator": self.translator,
80
+ }
@@ -6,8 +6,8 @@ import logging
6
6
  from pathlib import Path
7
7
  from typing import TYPE_CHECKING, Any, Callable
8
8
 
9
+ from videopython.ai.dubbing.config import DubbingConfig
9
10
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
10
- from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
11
11
 
12
12
  if TYPE_CHECKING:
13
13
  from videopython.base.video import Video
@@ -18,90 +18,26 @@ logger = logging.getLogger(__name__)
18
18
  class VideoDubber:
19
19
  """Dubs videos into different languages using the local pipeline.
20
20
 
21
- Args:
22
- device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
23
- low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
24
- Chatterbox TTS) is unloaded from memory after it runs, so only one
25
- model is resident at a time. Trades per-run latency (~10-30s of
26
- extra model loads) for a much lower memory ceiling. Recommended for
27
- GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
28
- whisper_model: Whisper model size used for transcription. Larger models
29
- give better accuracy at the cost of VRAM and latency. One of
30
- ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
31
- Default ``turbo``.
32
- condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
33
- ``False`` (Whisper's own default is ``True``). With conditioning on,
34
- a single hallucinated filler phrase cascades through the rest of
35
- the file. See ``AudioToText`` for the full rationale.
36
- no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
37
- gate; raise to drop more low-confidence windows.
38
- logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
39
- log-probability gate.
40
- vocabulary: Forwarded to ``AudioToText``. Optional list of brand
41
- names, product names, or proper nouns to bias Whisper's first-
42
- window decoder via ``initial_prompt``. Recovers near-mishears
43
- (e.g. Klarna → "carna") on brand-monitoring inputs without new
44
- model deps.
45
- strict_quality: When True, the pipeline raises
46
- :class:`GarbageTranscriptError` before Demucs/translation/TTS run
47
- if the transcript-quality heuristic returns ``"reject"``. When
48
- False (default), low-quality transcripts are logged at WARNING
49
- but processing continues. Either way the
50
- :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
51
- inspection.
52
- translator: Translation backend to use. ``"auto"`` (default)
53
- picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
54
- ``"qwen3"`` force the named backend regardless of device.
55
- See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
56
- for tradeoffs (Qwen3 is slower on CPU but produces
57
- context-aware, length-budgeted output).
21
+ Accepts either a :class:`DubbingConfig` or the same knobs as flat kwargs
22
+ (``device``, ``low_memory``, ``whisper_model``, ``translator``, etc.) --
23
+ the flat path builds a ``DubbingConfig`` internally. See
24
+ :class:`DubbingConfig` for the full knob list and defaults.
58
25
  """
59
26
 
60
- def __init__(
61
- self,
62
- device: str | None = None,
63
- low_memory: bool = False,
64
- whisper_model: WhisperModel = "turbo",
65
- condition_on_previous_text: bool = False,
66
- no_speech_threshold: float = 0.6,
67
- logprob_threshold: float | None = -1.0,
68
- vocabulary: list[str] | None = None,
69
- strict_quality: bool = False,
70
- translator: TranslatorChoice = "auto",
71
- ):
72
- self.device = device
73
- self.low_memory = low_memory
74
- self.whisper_model = whisper_model
75
- self.condition_on_previous_text = condition_on_previous_text
76
- self.no_speech_threshold = no_speech_threshold
77
- self.logprob_threshold = logprob_threshold
78
- self.vocabulary = vocabulary
79
- self.strict_quality = strict_quality
80
- self.translator = translator
27
+ def __init__(self, config: DubbingConfig | None = None, **kwargs: Any):
28
+ if config is not None and kwargs:
29
+ raise TypeError("Pass either `config=` or knob kwargs, not both")
30
+ self.config = config or DubbingConfig(**kwargs)
81
31
  self._local_pipeline: Any = None
82
- requested = device.lower() if isinstance(device, str) else "auto"
83
32
  logger.info(
84
- "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
85
- requested,
86
- low_memory,
87
- whisper_model,
88
- translator,
33
+ "VideoDubber initialized with %s",
34
+ " ".join(f"{k}={v}" for k, v in self.config.init_log_fields().items()),
89
35
  )
90
36
 
91
37
  def _init_local_pipeline(self) -> None:
92
38
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
93
39
 
94
- self._local_pipeline = LocalDubbingPipeline(
95
- device=self.device,
96
- low_memory=self.low_memory,
97
- whisper_model=self.whisper_model,
98
- condition_on_previous_text=self.condition_on_previous_text,
99
- no_speech_threshold=self.no_speech_threshold,
100
- logprob_threshold=self.logprob_threshold,
101
- vocabulary=self.vocabulary,
102
- strict_quality=self.strict_quality,
103
- translator=self.translator,
104
- )
40
+ self._local_pipeline = LocalDubbingPipeline(config=self.config)
105
41
 
106
42
  def dub(
107
43
  self,
@@ -218,7 +154,7 @@ class VideoDubber:
218
154
  source transcription. The output video is written to ``output_path``.
219
155
  """
220
156
  from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
221
- from videopython.base.audio import Audio
157
+ from videopython.audio import Audio
222
158
 
223
159
  input_path = Path(input_path)
224
160
  output_path = Path(output_path)
@@ -292,7 +228,7 @@ class VideoDubber:
292
228
  video_duration = video.total_seconds
293
229
 
294
230
  if video_duration > speech_duration:
295
- from videopython.base.transforms import CutSeconds
231
+ from videopython.editing.transforms import CutSeconds
296
232
 
297
233
  output_video = CutSeconds(start=0, end=speech_duration).apply(video)
298
234
  else:
@@ -0,0 +1,47 @@
1
+ """Source-prosody-driven expressiveness knobs for Chatterbox TTS."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+
9
+ from videopython.ai.dubbing.models import Expressiveness
10
+
11
+ if TYPE_CHECKING:
12
+ from videopython.audio import Audio
13
+
14
+
15
+ # Prosody-conditioning thresholds. Source-segment RMS / whole-vocals RMS
16
+ # below CALM lands in the calm bucket; above DRAMATIC in the dramatic
17
+ # bucket; in between gets Chatterbox's defaults. Knob values picked
18
+ # by-ear on cam1_1min.mp4 -- see RELEASE_NOTES 0.29.0.
19
+ CALM_RATIO_THRESHOLD = 0.7
20
+ DRAMATIC_RATIO_THRESHOLD = 1.3
21
+ _CALM = Expressiveness(exaggeration=0.3, cfg_weight=0.7)
22
+ _DRAMATIC = Expressiveness(exaggeration=0.85, cfg_weight=0.35)
23
+
24
+
25
+ def rms(data: np.ndarray) -> float:
26
+ """RMS over samples; ``0.0`` for empty input. float64 reduction so a
27
+ long slice can't overflow the squared accumulator."""
28
+ if data.size == 0:
29
+ return 0.0
30
+ return float(np.sqrt(np.mean(np.square(data, dtype=np.float64))))
31
+
32
+
33
+ def expressiveness_for(source_slice: Audio, baseline_rms: float) -> Expressiveness:
34
+ """Map a source vocals slice to a Chatterbox expressiveness profile
35
+ by RMS ratio. Falls back to the no-knobs default for empty or silent
36
+ inputs."""
37
+ if baseline_rms <= 0.0:
38
+ return Expressiveness()
39
+ segment_rms = rms(source_slice.data)
40
+ if segment_rms <= 0.0:
41
+ return Expressiveness()
42
+ ratio = segment_rms / baseline_rms
43
+ if ratio < CALM_RATIO_THRESHOLD:
44
+ return _CALM
45
+ if ratio > DRAMATIC_RATIO_THRESHOLD:
46
+ return _DRAMATIC
47
+ return Expressiveness()
@@ -0,0 +1,86 @@
1
+ """LUFS / peak loudness matching for dubbed audio."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+
9
+ if TYPE_CHECKING:
10
+ from videopython.audio import Audio
11
+
12
+
13
+ # BS.1770 integrated-loudness measurement requires at least 400 ms of audio
14
+ # (one gating block). Below this, fall back to peak match -- pyloudnorm
15
+ # returns -inf or warns, neither of which gives a usable gain.
16
+ _LUFS_MIN_DURATION_SECONDS = 0.4
17
+
18
+
19
+ def peak_match(target: Audio, reference: Audio) -> Audio:
20
+ """Scale ``target`` so its peak amplitude matches ``reference``.
21
+
22
+ Used as the fallback when LUFS measurement isn't viable (clip < 0.4s
23
+ or silent input). The new ``Audio`` shares no buffer with ``target``.
24
+ """
25
+ from videopython.audio import Audio as _Audio
26
+
27
+ target_peak = float(np.max(np.abs(target.data))) if target.data.size else 0.0
28
+ reference_peak = float(np.max(np.abs(reference.data))) if reference.data.size else 0.0
29
+
30
+ if target_peak <= 0.0 or reference_peak <= 0.0:
31
+ return target
32
+
33
+ scale = reference_peak / target_peak
34
+ if abs(scale - 1.0) < 1e-3:
35
+ return target
36
+
37
+ return _Audio(target.data * scale, target.metadata)
38
+
39
+
40
+ def loudness_match(target: Audio, reference: Audio) -> Audio:
41
+ """Scale ``target`` so its integrated loudness (BS.1770 / LUFS) matches ``reference``.
42
+
43
+ Demucs background normalization and the timing-assembler peak guard
44
+ each clamp at 1.0 instead of restoring perceived loudness, so a
45
+ dubbed mix lands perceptually "thinner" than the source even after
46
+ peak match. LUFS captures the ear-weighted envelope that peak ratio
47
+ misses on dialogue-heavy material.
48
+
49
+ Falls back to :func:`peak_match` when either clip is shorter than
50
+ the BS.1770 gating block (400 ms) or when measurement returns -inf
51
+ (silent or near-silent gated content). After gain is applied, peaks
52
+ are clamped to 0.99 -- BS.1770 has no peak ceiling and a sufficiently
53
+ quiet source can demand gain that would otherwise clip.
54
+ """
55
+ from videopython.audio import Audio as _Audio
56
+
57
+ target_dur = target.metadata.duration_seconds
58
+ ref_dur = reference.metadata.duration_seconds
59
+ if target_dur < _LUFS_MIN_DURATION_SECONDS or ref_dur < _LUFS_MIN_DURATION_SECONDS:
60
+ return peak_match(target, reference)
61
+
62
+ if not target.data.size or not reference.data.size:
63
+ return target
64
+
65
+ import pyloudnorm
66
+
67
+ target_lufs = pyloudnorm.Meter(target.metadata.sample_rate).integrated_loudness(target.data)
68
+ reference_lufs = pyloudnorm.Meter(reference.metadata.sample_rate).integrated_loudness(reference.data)
69
+
70
+ # Either clip's gated content was below -70 LUFS (effectively silent
71
+ # under BS.1770). Gain would be undefined -- fall back to peak match,
72
+ # which has its own silent-input no-op.
73
+ if not np.isfinite(target_lufs) or not np.isfinite(reference_lufs):
74
+ return peak_match(target, reference)
75
+
76
+ gain_db = reference_lufs - target_lufs
77
+ if abs(gain_db) < 0.1:
78
+ return target
79
+ scale = float(10 ** (gain_db / 20.0))
80
+
81
+ scaled = target.data * scale
82
+ peak = float(np.max(np.abs(scaled)))
83
+ if peak > 0.99:
84
+ scaled = scaled * (0.99 / peak)
85
+
86
+ return _Audio(scaled, target.metadata)