videopython 0.28.2__tar.gz → 0.29.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {videopython-0.28.2 → videopython-0.29.0}/PKG-INFO +7 -4
  2. {videopython-0.28.2 → videopython-0.29.0}/README.md +4 -2
  3. {videopython-0.28.2 → videopython-0.29.0}/pyproject.toml +13 -4
  4. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/__init__.py +3 -5
  5. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/__init__.py +8 -1
  6. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/cache.py +14 -1
  7. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/models.py +36 -0
  8. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/pipeline.py +69 -10
  9. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/generation/audio.py +24 -0
  10. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/transforms.py +2 -478
  11. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/understanding/__init__.py +3 -3
  12. videopython-0.29.0/src/videopython/ai/understanding/faces.py +592 -0
  13. videopython-0.29.0/src/videopython/ai/understanding/image.py +397 -0
  14. videopython-0.29.0/src/videopython/ai/understanding/temporal.py +218 -0
  15. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/video_analysis.py +217 -37
  16. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/__init__.py +4 -2
  17. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/description.py +82 -52
  18. videopython-0.28.2/src/videopython/ai/understanding/image.py +0 -215
  19. videopython-0.28.2/src/videopython/ai/understanding/temporal.py +0 -464
  20. {videopython-0.28.2 → videopython-0.29.0}/.gitignore +0 -0
  21. {videopython-0.28.2 → videopython-0.29.0}/LICENSE +0 -0
  22. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/__init__.py +0 -0
  23. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/_device.py +0 -0
  24. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/dubber.py +0 -0
  25. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/quality.py +0 -0
  26. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/remux.py +0 -0
  27. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/dubbing/timing.py +0 -0
  28. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/generation/__init__.py +0 -0
  29. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/generation/image.py +0 -0
  30. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/generation/qwen3.py +0 -0
  31. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/generation/translation.py +0 -0
  32. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/generation/video.py +0 -0
  33. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/registry.py +0 -0
  34. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/swapping/__init__.py +0 -0
  35. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/swapping/inpainter.py +0 -0
  36. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/swapping/models.py +0 -0
  37. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/swapping/segmenter.py +0 -0
  38. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/swapping/swapper.py +0 -0
  39. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/understanding/audio.py +0 -0
  40. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/ai/understanding/separation.py +0 -0
  41. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/audio/__init__.py +0 -0
  42. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/audio/analysis.py +0 -0
  43. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/audio/audio.py +0 -0
  44. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/combine.py +0 -0
  45. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/effects.py +0 -0
  46. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/exceptions.py +0 -0
  47. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/progress.py +0 -0
  48. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/registry.py +0 -0
  49. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/scene.py +0 -0
  50. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/streaming.py +0 -0
  51. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/text/__init__.py +0 -0
  52. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/text/overlay.py +0 -0
  53. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/text/transcription.py +0 -0
  54. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/transforms.py +0 -0
  55. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/transitions.py +0 -0
  56. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/utils.py +0 -0
  57. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/base/video.py +0 -0
  58. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/editing/__init__.py +0 -0
  59. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/editing/multicam.py +0 -0
  60. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/editing/premiere_xml.py +0 -0
  61. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/editing/video_edit.py +0 -0
  62. {videopython-0.28.2 → videopython-0.29.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.28.2
3
+ Version: 0.29.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -27,14 +27,15 @@ Requires-Dist: accelerate>=0.29.2; extra == 'ai'
27
27
  Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
28
28
  Requires-Dist: demucs>=4.0.0; extra == 'ai'
29
29
  Requires-Dist: diffusers>=0.30.0; extra == 'ai'
30
- Requires-Dist: easyocr>=1.7.0; extra == 'ai'
31
30
  Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
31
+ Requires-Dist: imagehash>=4.3; extra == 'ai'
32
32
  Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
33
33
  Requires-Dist: numba>=0.61.0; extra == 'ai'
34
34
  Requires-Dist: ollama>=0.4.5; extra == 'ai'
35
35
  Requires-Dist: openai-whisper>=20240930; extra == 'ai'
36
36
  Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
37
37
  Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
38
+ Requires-Dist: qwen-vl-utils>=0.0.10; extra == 'ai'
38
39
  Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
39
40
  Requires-Dist: scipy>=1.10.0; extra == 'ai'
40
41
  Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
@@ -56,6 +57,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
56
57
 
57
58
  Full documentation: [videopython.com](https://videopython.com)
58
59
 
60
+ > **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
61
+
59
62
  ## Installation
60
63
 
61
64
  ### 1. Install FFmpeg
@@ -193,10 +196,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
193
196
  | Area | Highlights |
194
197
  |---|---|
195
198
  | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
196
- | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `ActionRecognizer` |
199
+ | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
197
200
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
198
201
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
199
- | **Transforms** | `FaceTracker`, `FaceTrackingCrop`, `SplitScreenComposite` |
202
+ | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
200
203
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
201
204
  | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
202
205
 
@@ -8,6 +8,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
8
8
 
9
9
  Full documentation: [videopython.com](https://videopython.com)
10
10
 
11
+ > **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
12
+
11
13
  ## Installation
12
14
 
13
15
  ### 1. Install FFmpeg
@@ -145,10 +147,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
145
147
  | Area | Highlights |
146
148
  |---|---|
147
149
  | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
148
- | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `ActionRecognizer` |
150
+ | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
149
151
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
150
152
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
151
- | **Transforms** | `FaceTracker`, `FaceTrackingCrop`, `SplitScreenComposite` |
153
+ | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
152
154
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
153
155
  | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
154
156
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.28.2"
3
+ version = "0.29.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -70,7 +70,6 @@ ai = [
70
70
  "scikit-learn>=1.3.0",
71
71
  # Detection backends
72
72
  "ultralytics>=8.0.0",
73
- "easyocr>=1.7.0",
74
73
  # Audio classification (AST via transformers - no separate dep needed)
75
74
  # Scene detection
76
75
  "transnetv2-pytorch>=1.0.5",
@@ -84,6 +83,11 @@ ai = [
84
83
  "llama-cpp-python>=0.3.0",
85
84
  # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
86
85
  "pyloudnorm>=0.1.1",
86
+ # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
87
+ # for AutoModelForImageTextToText with image/video chat templates.
88
+ "qwen-vl-utils>=0.0.10",
89
+ # Perceptual hashing for SceneVLM frame dedup (M5)
90
+ "imagehash>=4.3",
87
91
  ]
88
92
 
89
93
  # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -105,7 +109,6 @@ ai = [
105
109
  "scikit-learn>=1.3.0",
106
110
  # Detection backends
107
111
  "ultralytics>=8.0.0",
108
- "easyocr>=1.7.0",
109
112
  # Audio classification (AST via transformers - no separate dep needed)
110
113
  # Scene detection
111
114
  "transnetv2-pytorch>=1.0.5",
@@ -119,6 +122,11 @@ ai = [
119
122
  "llama-cpp-python>=0.3.0",
120
123
  # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
121
124
  "pyloudnorm>=0.1.1",
125
+ # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
126
+ # for AutoModelForImageTextToText with image/video chat templates.
127
+ "qwen-vl-utils>=0.0.10",
128
+ # Perceptual hashing for SceneVLM frame dedup (M5)
129
+ "imagehash>=4.3",
122
130
  ]
123
131
 
124
132
  [project.urls]
@@ -135,7 +143,6 @@ module = [
135
143
  "diffusers", "diffusers.*",
136
144
  "ollama", "ollama.*",
137
145
  "ultralytics", "ultralytics.*",
138
- "easyocr", "easyocr.*",
139
146
  "transformers", "transformers.*",
140
147
  "transnetv2_pytorch", "transnetv2_pytorch.*",
141
148
  "chatterbox", "chatterbox.*",
@@ -146,6 +153,8 @@ module = [
146
153
  "cv2", "cv2.*",
147
154
  "llama_cpp", "llama_cpp.*",
148
155
  "pyloudnorm", "pyloudnorm.*",
156
+ "qwen_vl_utils", "qwen_vl_utils.*",
157
+ "imagehash", "imagehash.*",
149
158
  ]
150
159
  ignore_missing_imports = true
151
160
 
@@ -2,11 +2,11 @@ from videopython.ai import registry as _ai_registry # noqa: F401
2
2
 
3
3
  from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
4
4
  from .swapping import ObjectSwapper
5
- from .transforms import FaceTracker, FaceTrackingCrop, SplitScreenComposite
5
+ from .transforms import FaceTrackingCrop, SplitScreenComposite
6
6
  from .understanding import (
7
- ActionRecognizer,
8
7
  AudioClassifier,
9
8
  AudioToText,
9
+ FaceTracker,
10
10
  SceneVLM,
11
11
  SemanticSceneDetector,
12
12
  )
@@ -22,12 +22,10 @@ __all__ = [
22
22
  # Understanding
23
23
  "AudioToText",
24
24
  "AudioClassifier",
25
+ "FaceTracker",
25
26
  "SceneVLM",
26
- # Temporal
27
- "ActionRecognizer",
28
27
  "SemanticSceneDetector",
29
28
  # Transforms (AI-powered)
30
- "FaceTracker",
31
29
  "FaceTrackingCrop",
32
30
  "SplitScreenComposite",
33
31
  # Swapping
@@ -2,7 +2,13 @@
2
2
 
3
3
  from videopython.ai.dubbing.cache import DubCache, dub_cache_clear
4
4
  from videopython.ai.dubbing.dubber import VideoDubber
5
- from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
5
+ from videopython.ai.dubbing.models import (
6
+ DubbingResult,
7
+ Expressiveness,
8
+ RevoiceResult,
9
+ SeparatedAudio,
10
+ TranslatedSegment,
11
+ )
6
12
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
7
13
  from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
8
14
  from videopython.ai.dubbing.timing import TimingSynchronizer
@@ -22,4 +28,5 @@ __all__ = [
22
28
  "UnsupportedLanguageError",
23
29
  "DubCache",
24
30
  "dub_cache_clear",
31
+ "Expressiveness",
25
32
  ]
@@ -157,14 +157,27 @@ class DubCache:
157
157
  translated_text: str,
158
158
  voice_sample_bytes: bytes | None,
159
159
  language: str,
160
+ exaggeration: float | None = None,
161
+ cfg_weight: float | None = None,
162
+ temperature: float | None = None,
160
163
  ) -> str:
161
- """Per-segment key over text + voice sample + language."""
164
+ """Per-segment key over text + voice sample + language + expressiveness.
165
+
166
+ ``exaggeration`` / ``cfg_weight`` / ``temperature`` are the M4
167
+ Chatterbox knobs. Defaulting to ``None`` keeps pre-M4 callers that
168
+ omit them hashing the same way (no-knob profile collides with
169
+ absent kwargs), so cache invalidation is driven by *passing
170
+ non-None values*, not by the M4 code path being present.
171
+ """
162
172
  h = hashlib.sha256()
163
173
  h.update(translated_text.encode("utf-8"))
164
174
  h.update(b"\x00")
165
175
  h.update(voice_sample_bytes or b"")
166
176
  h.update(b"\x00")
167
177
  h.update(language.encode("utf-8"))
178
+ for knob in (exaggeration, cfg_weight, temperature):
179
+ h.update(b"\x00")
180
+ h.update(repr(knob).encode("utf-8"))
168
181
  return h.hexdigest()[:16]
169
182
 
170
183
  # ----- path resolution -------------------------------------------------
@@ -19,6 +19,42 @@ if TYPE_CHECKING:
19
19
  CLEAN_SPEED_TOLERANCE = 0.01
20
20
 
21
21
 
22
+ @dataclass(frozen=True)
23
+ class Expressiveness:
24
+ """Chatterbox ``generate()`` knobs derived from source-segment prosody.
25
+
26
+ ``None`` on any field means "let Chatterbox use its own default" —
27
+ avoids pinning the dub against future Chatterbox default changes.
28
+
29
+ Attributes:
30
+ exaggeration: Emotional intensity. Chatterbox default ``0.5``;
31
+ ``0.7+`` produces dramatic output.
32
+ cfg_weight: Classifier-free guidance weight. Chatterbox default
33
+ ``0.5``; lower values (~``0.3``) slow pacing.
34
+ temperature: Sampling temperature. Chatterbox default ``0.8``.
35
+ """
36
+
37
+ exaggeration: float | None = None
38
+ cfg_weight: float | None = None
39
+ temperature: float | None = None
40
+
41
+ def as_kwargs(self) -> dict[str, float]:
42
+ """Knobs as a dict, dropping ``None`` entries.
43
+
44
+ Suitable for ``**``-expansion into Chatterbox or
45
+ :meth:`DubCache.tts_key`.
46
+ """
47
+ return {
48
+ name: value
49
+ for name, value in (
50
+ ("exaggeration", self.exaggeration),
51
+ ("cfg_weight", self.cfg_weight),
52
+ ("temperature", self.temperature),
53
+ )
54
+ if value is not None
55
+ }
56
+
57
+
22
58
  @dataclass
23
59
  class TranslatedSegment:
24
60
  """A segment of translated text with timing information.
@@ -11,7 +11,7 @@ import numpy as np
11
11
 
12
12
  from videopython.ai._device import select_device
13
13
  from videopython.ai.dubbing.cache import DubCache
14
- from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
14
+ from videopython.ai.dubbing.models import DubbingResult, Expressiveness, RevoiceResult, SeparatedAudio, TimingSummary
15
15
  from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
16
16
  from videopython.ai.dubbing.timing import TimingSynchronizer
17
17
  from videopython.ai.generation.qwen3 import Qwen3Translator
@@ -118,6 +118,40 @@ PEAK_CLIP_THRESHOLD = 0.99
118
118
  MIN_VOCAL_BG_RMS_RATIO = 1.5
119
119
  VOICE_SAMPLE_TARGET_DURATION = 6.0
120
120
 
121
+ # Prosody-conditioning thresholds. Source-segment RMS / whole-vocals RMS
122
+ # below CALM lands in the calm bucket; above DRAMATIC in the dramatic
123
+ # bucket; in between gets Chatterbox's defaults. Knob values picked
124
+ # by-ear on cam1_1min.mp4 — see RELEASE_NOTES 0.29.0.
125
+ CALM_RATIO_THRESHOLD = 0.7
126
+ DRAMATIC_RATIO_THRESHOLD = 1.3
127
+ _CALM = Expressiveness(exaggeration=0.3, cfg_weight=0.7)
128
+ _DRAMATIC = Expressiveness(exaggeration=0.85, cfg_weight=0.35)
129
+
130
+
131
+ def _rms(data: np.ndarray) -> float:
132
+ """RMS over samples; ``0.0`` for empty input. float64 reduction so a
133
+ long slice can't overflow the squared accumulator."""
134
+ if data.size == 0:
135
+ return 0.0
136
+ return float(np.sqrt(np.mean(np.square(data, dtype=np.float64))))
137
+
138
+
139
+ def _expressiveness_for(source_slice: Audio, baseline_rms: float) -> Expressiveness:
140
+ """Map a source vocals slice to a Chatterbox expressiveness profile
141
+ by RMS ratio. Falls back to the no-knobs default for empty or silent
142
+ inputs."""
143
+ if baseline_rms <= 0.0:
144
+ return Expressiveness()
145
+ segment_rms = _rms(source_slice.data)
146
+ if segment_rms <= 0.0:
147
+ return Expressiveness()
148
+ ratio = segment_rms / baseline_rms
149
+ if ratio < CALM_RATIO_THRESHOLD:
150
+ return _CALM
151
+ if ratio > DRAMATIC_RATIO_THRESHOLD:
152
+ return _DRAMATIC
153
+ return Expressiveness()
154
+
121
155
 
122
156
  class LocalDubbingPipeline:
123
157
  """Local pipeline for video dubbing.
@@ -236,6 +270,7 @@ class LocalDubbingPipeline:
236
270
  voice_samples: dict[str, Audio],
237
271
  speaker_wav_paths: dict[str, Path],
238
272
  src_hash_for_tts: str,
273
+ expressiveness: Expressiveness = Expressiveness(),
239
274
  ) -> Audio | None:
240
275
  """Produce the TTS audio for a single segment, with cache-around-the-call.
241
276
 
@@ -244,6 +279,11 @@ class LocalDubbingPipeline:
244
279
  TTS model is lazy-initialized and the per-speaker temp WAV is
245
280
  materialized before generation; on cache hit none of that runs,
246
281
  so a fully-cached run never loads Chatterbox.
282
+
283
+ ``expressiveness`` carries the M4 Chatterbox knobs derived from
284
+ the source segment's prosody. Default is the no-knobs profile —
285
+ lets Chatterbox use its own defaults — so callers that don't yet
286
+ derive prosody (e.g. ``revoice``) keep pre-M4 behaviour.
247
287
  """
248
288
  from videopython.base.audio import Audio as _Audio
249
289
 
@@ -253,6 +293,7 @@ class LocalDubbingPipeline:
253
293
  translated_text=segment.translated_text,
254
294
  voice_sample_bytes=speaker_bytes,
255
295
  language=target_lang,
296
+ **expressiveness.as_kwargs(),
256
297
  )
257
298
  cached_path = self._cache.get_tts_path(src_hash_for_tts, tts_cache_key)
258
299
  if cached_path is not None:
@@ -270,10 +311,11 @@ class LocalDubbingPipeline:
270
311
 
271
312
  wav_path = speaker_wav_paths.get(speaker) if voice_clone else None
272
313
  try:
273
- if wav_path is not None:
274
- dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=wav_path)
275
- else:
276
- dubbed_audio = self._tts.generate_audio(segment.translated_text)
314
+ dubbed_audio = self._tts.generate_audio(
315
+ segment.translated_text,
316
+ voice_sample_path=wav_path,
317
+ **expressiveness.as_kwargs(),
318
+ )
277
319
  except Exception as exc:
278
320
  # Chatterbox occasionally crashes on short translated text
279
321
  # (alignment_stream_analyzer indexing on tensors with <=5
@@ -748,16 +790,32 @@ class LocalDubbingPipeline:
748
790
  report_progress("Extracting voice samples", 0.25)
749
791
  voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
750
792
 
751
- # vocals is no longer needed; voice_samples are independent copies.
752
- # In low_memory mode this is the only ref keeping the buffer alive
753
- # (separated_audio was dropped above), so dropping the local frees it.
754
- del vocal_audio
755
-
756
793
  report_progress("Translating text", 0.35)
757
794
  translated_segments, translation_failures = self._translate_with_cache(
758
795
  transcription, source_audio, detected_lang, target_lang, report_progress
759
796
  )
760
797
 
798
+ # Per-segment expressiveness derived from source vocals RMS.
799
+ # Computed before vocal_audio is released so the TTS loop doesn't
800
+ # hold the buffer. Segment ends are clamped to the vocals duration
801
+ # — transcription timestamps can drift past the buffer tail
802
+ # (especially on synthetic test audio) and Audio.slice rejects
803
+ # out-of-range ends past a 0.1s tolerance.
804
+ baseline_rms = _rms(vocal_audio.data)
805
+ vocal_duration = vocal_audio.metadata.duration_seconds
806
+ expressiveness_per_segment = [
807
+ _expressiveness_for(
808
+ vocal_audio.slice(min(s.start, vocal_duration), min(s.end, vocal_duration)),
809
+ baseline_rms,
810
+ )
811
+ for s in translated_segments
812
+ ]
813
+
814
+ # vocals is no longer needed; voice_samples are independent copies.
815
+ # In low_memory mode this is the only ref keeping the buffer alive
816
+ # (separated_audio was dropped above), so dropping the local frees it.
817
+ del vocal_audio
818
+
761
819
  report_progress("Generating dubbed speech", 0.50)
762
820
 
763
821
  # Per-speaker voice-sample bytes for TTS cache key. Empty when
@@ -800,6 +858,7 @@ class LocalDubbingPipeline:
800
858
  voice_samples=voice_samples,
801
859
  speaker_wav_paths=speaker_wav_paths,
802
860
  src_hash_for_tts=src_hash_for_tts,
861
+ expressiveness=expressiveness_per_segment[i],
803
862
  )
804
863
  if dubbed_audio is None:
805
864
  continue
@@ -51,6 +51,9 @@ class TextToSpeech:
51
51
  text: str,
52
52
  voice_sample: Audio | None = None,
53
53
  voice_sample_path: str | Path | None = None,
54
+ exaggeration: float | None = None,
55
+ cfg_weight: float | None = None,
56
+ temperature: float | None = None,
54
57
  ) -> Audio:
55
58
  """Generate speech audio from text.
56
59
 
@@ -64,6 +67,15 @@ class TextToSpeech:
64
67
  precedence over ``voice_sample`` and ``self.voice``. Used by
65
68
  the dubbing pipeline to encode each speaker's sample once and
66
69
  reuse it across all of that speaker's segments.
70
+ exaggeration: Chatterbox emotional-intensity knob (default
71
+ ``0.5``). ``None`` (default) means do not pass the kwarg —
72
+ Chatterbox uses its own default and we stay forward-compatible
73
+ with changes to it. ``0.7+`` produces dramatic output.
74
+ cfg_weight: Chatterbox classifier-free-guidance weight (default
75
+ ``0.5``). ``None`` means do not pass. Lower values (~``0.3``)
76
+ slow pacing.
77
+ temperature: Chatterbox sampling temperature (default ``0.8``).
78
+ ``None`` means do not pass.
67
79
  """
68
80
  import tempfile
69
81
  from pathlib import Path
@@ -86,11 +98,23 @@ class TextToSpeech:
86
98
  speaker_wav_path = Path(f.name)
87
99
  cleanup_path = True
88
100
 
101
+ # Only forward knobs the caller explicitly set. Passing nothing
102
+ # for a knob lets Chatterbox use its own default — important so a
103
+ # future Chatterbox default change doesn't get pinned by us.
104
+ knobs: dict[str, float] = {}
105
+ if exaggeration is not None:
106
+ knobs["exaggeration"] = exaggeration
107
+ if cfg_weight is not None:
108
+ knobs["cfg_weight"] = cfg_weight
109
+ if temperature is not None:
110
+ knobs["temperature"] = temperature
111
+
89
112
  try:
90
113
  wav = self._model.generate(
91
114
  text=text,
92
115
  language_id=self.language,
93
116
  audio_prompt_path=str(speaker_wav_path) if speaker_wav_path else None,
117
+ **knobs,
94
118
  )
95
119
 
96
120
  audio_data = wav.cpu().float().numpy().squeeze()