videopython 0.25.4__tar.gz → 0.25.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {videopython-0.25.4 → videopython-0.25.6}/PKG-INFO +7 -14
  2. {videopython-0.25.4 → videopython-0.25.6}/README.md +1 -1
  3. {videopython-0.25.4 → videopython-0.25.6}/pyproject.toml +35 -20
  4. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/dubber.py +10 -1
  5. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/pipeline.py +28 -14
  6. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/timing.py +2 -2
  7. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/generation/audio.py +27 -48
  8. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/generation/translation.py +7 -3
  9. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/effects.py +21 -33
  10. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/transitions.py +2 -1
  11. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/editing/multicam.py +26 -13
  12. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/editing/premiere_xml.py +3 -3
  13. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/editing/video_edit.py +11 -12
  14. {videopython-0.25.4 → videopython-0.25.6}/.gitignore +0 -0
  15. {videopython-0.25.4 → videopython-0.25.6}/LICENSE +0 -0
  16. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/__init__.py +0 -0
  17. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/__init__.py +0 -0
  18. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/_device.py +0 -0
  19. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/__init__.py +0 -0
  20. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/models.py +0 -0
  21. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/generation/__init__.py +0 -0
  22. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/generation/image.py +0 -0
  23. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/generation/video.py +0 -0
  24. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/registry.py +0 -0
  25. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/swapping/__init__.py +0 -0
  26. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/swapping/inpainter.py +0 -0
  27. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/swapping/models.py +0 -0
  28. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/swapping/segmenter.py +0 -0
  29. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/swapping/swapper.py +0 -0
  30. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/transforms.py +0 -0
  31. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/understanding/__init__.py +0 -0
  32. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/understanding/audio.py +0 -0
  33. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/understanding/image.py +0 -0
  34. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/understanding/separation.py +0 -0
  35. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/understanding/temporal.py +0 -0
  36. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/video_analysis.py +0 -0
  37. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/__init__.py +0 -0
  38. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/audio/__init__.py +0 -0
  39. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/audio/analysis.py +0 -0
  40. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/audio/audio.py +0 -0
  41. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/combine.py +0 -0
  42. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/description.py +0 -0
  43. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/exceptions.py +0 -0
  44. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/progress.py +0 -0
  45. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/registry.py +0 -0
  46. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/scene.py +0 -0
  47. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/text/__init__.py +0 -0
  48. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/text/overlay.py +0 -0
  49. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/text/transcription.py +0 -0
  50. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/transforms.py +0 -0
  51. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/utils.py +0 -0
  52. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/video.py +0 -0
  53. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/editing/__init__.py +0 -0
  54. {videopython-0.25.4 → videopython-0.25.6}/src/videopython/py.typed +0 -0
{videopython-0.25.4 → videopython-0.25.6}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.25.4
+Version: 0.25.6
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -21,13 +21,12 @@ Requires-Dist: numpy>=1.25.2
 Requires-Dist: opencv-python-headless>=4.9.0.80
 Requires-Dist: pillow>=12.1.1
 Requires-Dist: pydantic>=2.8.0
-Requires-Dist: torchcodec>=0.9.1
 Requires-Dist: tqdm>=4.66.3
 Provides-Extra: ai
 Requires-Dist: accelerate>=0.29.2; extra == 'ai'
-Requires-Dist: coqui-tts>=0.24.0; extra == 'ai'
+Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
 Requires-Dist: demucs>=4.0.0; extra == 'ai'
-Requires-Dist: diffusers>=0.26.3; extra == 'ai'
+Requires-Dist: diffusers>=0.30.0; extra == 'ai'
 Requires-Dist: easyocr>=1.7.0; extra == 'ai'
 Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
 Requires-Dist: numba>=0.61.0; extra == 'ai'
@@ -36,18 +35,12 @@ Requires-Dist: openai-whisper>=20240930; extra == 'ai'
 Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
 Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
 Requires-Dist: scipy>=1.10.0; extra == 'ai'
-Requires-Dist: torch>=2.1.0; extra == 'ai'
+Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
+Requires-Dist: torch>=2.8.0; extra == 'ai'
+Requires-Dist: torchaudio>=2.8.0; extra == 'ai'
 Requires-Dist: transformers>=5.2.0; extra == 'ai'
 Requires-Dist: transnetv2-pytorch>=1.0.5; extra == 'ai'
 Requires-Dist: ultralytics>=8.0.0; extra == 'ai'
-Provides-Extra: dev
-Requires-Dist: mypy>=1.8.0; extra == 'dev'
-Requires-Dist: pre-commit>=3.8.0; extra == 'dev'
-Requires-Dist: pytest-cov>=6.1.1; extra == 'dev'
-Requires-Dist: pytest>=7.4.0; extra == 'dev'
-Requires-Dist: ruff>=0.1.14; extra == 'dev'
-Requires-Dist: types-pillow>=10.2.0.20240213; extra == 'dev'
-Requires-Dist: types-tqdm>=4.66.0.20240106; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # videopython
@@ -133,7 +126,7 @@ final.save("output.mp4")
 from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
 
 image = TextToImage().generate_image("A cinematic mountain sunrise")
-video = ImageToVideo().generate_video(image=image, fps=24).resize(1080, 1920)
+video = ImageToVideo().generate_video(image=image).resize(1080, 1920)
 audio = TextToSpeech().generate_audio("Welcome to videopython.")
 video.add_audio(audio).save("ai_video.mp4")
 ```
{videopython-0.25.4 → videopython-0.25.6}/README.md
@@ -81,7 +81,7 @@ final.save("output.mp4")
 from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
 
 image = TextToImage().generate_image("A cinematic mountain sunrise")
-video = ImageToVideo().generate_video(image=image, fps=24).resize(1080, 1920)
+video = ImageToVideo().generate_video(image=image).resize(1080, 1920)
 audio = TextToSpeech().generate_audio("Welcome to videopython.")
 video.add_audio(audio).save("ai_video.mp4")
 ```
{videopython-0.25.4 → videopython-0.25.6}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.25.4"
+version = "0.25.6"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -35,7 +35,6 @@ dependencies = [
     "numpy>=1.25.2",
     "opencv-python-headless>=4.9.0.80",
     "pillow>=12.1.1",
-    "torchcodec>=0.9.1",
     "tqdm>=4.66.3",
     "pydantic>=2.8.0",
 ]
@@ -56,9 +55,10 @@ dev = [
 ]
 ai = [
     "accelerate>=0.29.2",
-    "diffusers>=0.26.3",
+    "diffusers>=0.30.0",
     "hf-transfer>=0.1.9",
-    "torch>=2.1.0",
+    "torch>=2.8.0",
+    "torchaudio>=2.8.0",
     "transformers>=5.2.0",
     "openai-whisper>=20240930",
     "pyannote-audio>=4.0.0",
@@ -72,28 +72,22 @@ ai = [
     # Audio classification (AST via transformers - no separate dep needed)
     # Scene detection
     "transnetv2-pytorch>=1.0.5",
-    # Voice cloning TTS (coqui-tts is the maintained fork of TTS)
-    "coqui-tts>=0.24.0",
+    # Voice cloning TTS (Chatterbox Multilingual by Resemble AI)
+    "chatterbox-tts>=0.1.7",
+    # Translation (Marian MT tokenizer requires sentencepiece)
+    "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
 ]
 
 # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
 [project.optional-dependencies]
-dev = [
-    "pre-commit>=3.8.0",
-    "ruff>=0.1.14",
-    "mypy>=1.8.0",
-    "pytest>=7.4.0",
-    "types-Pillow>=10.2.0.20240213",
-    "types-tqdm>=4.66.0.20240106",
-    "pytest-cov>=6.1.1",
-]
 ai = [
     "accelerate>=0.29.2",
-    "diffusers>=0.26.3",
+    "diffusers>=0.30.0",
     "hf-transfer>=0.1.9",
-    "torch>=2.1.0",
+    "torch>=2.8.0",
+    "torchaudio>=2.8.0",
     "transformers>=5.2.0",
     "openai-whisper>=20240930",
     "pyannote-audio>=4.0.0",
@@ -107,8 +101,10 @@ ai = [
     # Audio classification (AST via transformers - no separate dep needed)
     # Scene detection
     "transnetv2-pytorch>=1.0.5",
-    # Voice cloning TTS (coqui-tts is the maintained fork of TTS)
-    "coqui-tts>=0.24.0",
+    # Voice cloning TTS (Chatterbox Multilingual by Resemble AI)
+    "chatterbox-tts>=0.1.7",
+    # Translation (Marian MT tokenizer requires sentencepiece)
+    "sentencepiece>=0.1.99",
     # Audio source separation
     "demucs>=4.0.0",
 ]
@@ -130,13 +126,32 @@ module = [
     "easyocr", "easyocr.*",
     "transformers", "transformers.*",
     "transnetv2_pytorch", "transnetv2_pytorch.*",
-    "TTS", "TTS.*", "coqui", "coqui.*",
+    "chatterbox", "chatterbox.*",
     "demucs", "demucs.*",
+    "huggingface_hub", "huggingface_hub.*",
     "pyannote", "pyannote.*",
     "cv2", "cv2.*",
 ]
 ignore_missing_imports = true
 
+[tool.uv]
+# chatterbox-tts 0.1.7 pins strict versions of torch, torchaudio, numpy, and
+# diffusers that conflict with pyannote-audio (torch>=2.8) and CogVideoX
+# (diffusers>=0.30). Override to let the resolver pick compatible versions.
+# The ai dependency floors are aligned with these overrides to keep pip and uv
+# resolving similar versions.
+override-dependencies = [
+    "torch>=2.8.0", "torchaudio>=2.8.0", "numpy>=2.0.0", "diffusers>=0.30.0",
+    # ultralytics depends on opencv-python which conflicts with our
+    # opencv-python-headless (both provide cv2). Exclude opencv-python so
+    # only the headless variant is installed.
+    "opencv-python ; sys_platform == '_'",
+]
+# Pin minimum versions for transitive deps with known vulnerabilities.
+# Pygments 2.20.0 has a security fix but breaks mkdocs (passes None to html.escape).
+# Keep pygments<2.20.0 until a compatible release is available.
+constraint-dependencies = ["requests>=2.33.0", "pygments>=2.19.2,<2.20.0"]
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/dubber.py
@@ -34,9 +34,15 @@ class VideoDubber:
         source_lang: str | None = None,
         preserve_background: bool = True,
         voice_clone: bool = True,
+        enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
     ) -> DubbingResult:
-        """Dub a video into a target language."""
+        """Dub a video into a target language.
+
+        Args:
+            enable_diarization: Enable speaker diarization to clone each speaker's
+                voice separately. Requires additional VRAM for the diarization model.
+        """
         if self._local_pipeline is None:
             self._init_local_pipeline()
 
@@ -46,6 +52,7 @@ class VideoDubber:
             source_lang=source_lang,
             preserve_background=preserve_background,
             voice_clone=voice_clone,
+            enable_diarization=enable_diarization,
             progress_callback=progress_callback,
         )
 
@@ -56,6 +63,7 @@ class VideoDubber:
         source_lang: str | None = None,
         preserve_background: bool = True,
         voice_clone: bool = True,
+        enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
     ) -> Video:
         """Dub a video and return a new video with the dubbed audio."""
@@ -65,6 +73,7 @@
             source_lang=source_lang,
             preserve_background=preserve_background,
             voice_clone=voice_clone,
+            enable_diarization=enable_diarization,
             progress_callback=progress_callback,
         )
         return video.add_audio(result.dubbed_audio, overlay=False)
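The new `enable_diarization` flag threads through both public entry points above. A minimal sketch of calling it follows; the method name `dub` is a hypothetical stand-in (the hunks only show parameter lists), while the keyword arguments and `Video.from_path` are confirmed elsewhere in this diff:

```python
# Hedged sketch: `dub` is an assumed method name; only the parameters are
# confirmed by the hunks above. Video.from_path(...) appears in multicam.py.
from videopython.ai.dubbing.dubber import VideoDubber
from videopython.base.video import Video

video = Video.from_path("interview.mp4")
result = VideoDubber().dub(
    video,
    target_lang="de",
    enable_diarization=True,  # new in 0.25.6: clone each detected speaker's voice
)
```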
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/pipeline.py
@@ -23,16 +23,19 @@ class LocalDubbingPipeline:
         logger.info("LocalDubbingPipeline initialized with device=%s", requested)
 
         self._transcriber: Any = None
+        self._transcriber_diarization: bool | None = None
         self._translator: Any = None
         self._tts: Any = None
+        self._tts_voice_clone: bool | None = None
+        self._tts_language: str | None = None
         self._separator: Any = None
         self._synchronizer: TimingSynchronizer | None = None
 
-    def _init_transcriber(self) -> None:
+    def _init_transcriber(self, enable_diarization: bool = False) -> None:
         """Initialize the transcription model."""
         from videopython.ai.understanding.audio import AudioToText
 
-        self._transcriber = AudioToText(device=self.device)
+        self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
 
     def _init_translator(self) -> None:
         """Initialize the translation model."""
@@ -40,17 +43,18 @@
 
         self._translator = TextTranslator(device=self.device)
 
-    def _init_tts(self, voice_clone: bool = False) -> None:
+    def _init_tts(self, voice_clone: bool = False, language: str = "en") -> None:
         """Initialize the text-to-speech model."""
         from videopython.ai.generation.audio import TextToSpeech
 
         if voice_clone:
             self._tts = TextToSpeech(
-                model_size="xtts",
+                model_size="chatterbox",
                 device=self.device,
+                language=language,
             )
         else:
-            self._tts = TextToSpeech(device=self.device)
+            self._tts = TextToSpeech(device=self.device, language=language)
 
     def _init_separator(self) -> None:
         """Initialize the audio separator."""
@@ -108,6 +112,7 @@
         source_lang: str | None = None,
         preserve_background: bool = True,
         voice_clone: bool = True,
+        enable_diarization: bool = False,
         progress_callback: Callable[[str, float], None] | None = None,
     ) -> DubbingResult:
         """Process a video through the local dubbing pipeline."""
@@ -118,8 +123,9 @@
             progress_callback(stage, progress)
 
         report_progress("Transcribing audio", 0.05)
-        if self._transcriber is None:
-            self._init_transcriber()
+        if self._transcriber is None or self._transcriber_diarization != enable_diarization:
+            self._init_transcriber(enable_diarization=enable_diarization)
+            self._transcriber_diarization = enable_diarization
 
         source_audio = video.audio
         transcription = self._transcriber.transcribe(source_audio)
@@ -133,7 +139,7 @@
             target_lang=target_lang,
         )
 
-        detected_lang = source_lang or "en"
+        detected_lang = source_lang or transcription.language or "en"
 
         separated_audio: SeparatedAudio | None = None
         vocal_audio = source_audio
@@ -162,14 +168,19 @@
         )
 
         report_progress("Generating dubbed speech", 0.50)
-        if self._tts is None:
-            self._init_tts(voice_clone=voice_clone)
+        if self._tts is None or self._tts_voice_clone != voice_clone or self._tts_language != target_lang:
+            self._init_tts(voice_clone=voice_clone, language=target_lang)
+            self._tts_voice_clone = voice_clone
+            self._tts_language = target_lang
 
         dubbed_segments: list[Audio] = []
         target_durations: list[float] = []
         start_times: list[float] = []
 
         for i, segment in enumerate(translated_segments):
+            if segment.duration < 0.1:
+                continue
+
             progress = 0.50 + (0.30 * (i / len(translated_segments)))
             report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
 
@@ -235,8 +246,9 @@
         original_duration = source_audio.metadata.duration_seconds
 
         report_progress("Analyzing audio", 0.05)
-        if self._transcriber is None:
-            self._init_transcriber()
+        if self._transcriber is None or self._transcriber_diarization is not False:
+            self._init_transcriber(enable_diarization=False)
+            self._transcriber_diarization = False
 
         transcription = self._transcriber.transcribe(source_audio)
 
@@ -264,8 +276,10 @@
         voice_sample = vocal_audio.slice(0, sample_duration)
 
         report_progress("Generating speech", 0.60)
-        if self._tts is None:
-            self._init_tts(voice_clone=True)
+        if self._tts is None or self._tts_voice_clone is not True or self._tts_language != "en":
+            self._init_tts(voice_clone=True, language="en")
+            self._tts_voice_clone = True
+            self._tts_language = "en"
 
         generated_speech = self._tts.generate_audio(text, voice_sample=voice_sample)
         speech_duration = generated_speech.metadata.duration_seconds
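Each lazily-initialized model now carries the configuration that produced it, so a changed flag or target language triggers a rebuild instead of silently reusing a stale model. The pattern from these hunks, reduced to a standalone sketch (names here are illustrative, not from the package):

```python
# Cache the config that produced the model alongside the model itself, and
# rebuild only when the requested config differs from the cached one.
class LazyTTS:
    def __init__(self) -> None:
        self._tts: object | None = None
        self._voice_clone: bool | None = None
        self._language: str | None = None

    def get(self, voice_clone: bool, language: str) -> object:
        if self._tts is None or self._voice_clone != voice_clone or self._language != language:
            self._tts = ("tts-model", voice_clone, language)  # stand-in for the real load
            self._voice_clone = voice_clone
            self._language = language
        return self._tts
```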
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/dubbing/timing.py
@@ -79,8 +79,8 @@ class TimingSynchronizer:
         """
         original_duration = audio.metadata.duration_seconds
 
-        if original_duration <= 0:
-            # Empty audio, return as-is
+        if original_duration <= 0 or target_duration <= 0:
+            # Empty audio or zero-length target, return as-is
             return audio, TimingAdjustment(
                 segment_index=segment_index,
                 original_duration=original_duration,
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/generation/audio.py
@@ -11,10 +11,13 @@ from videopython.base.audio import Audio, AudioMetadata
 class TextToSpeech:
     """Generates speech audio from text using local models.
 
-    Supports Bark (`base`, `small`) and XTTS voice cloning (`xtts`).
+    Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
+    (`chatterbox`) for multilingual voice cloning.
     """
 
-    SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "xtts"]
+    SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
+
+    CHATTERBOX_SAMPLE_RATE: int = 24000
 
     def __init__(
         self,
@@ -32,7 +35,7 @@ class TextToSpeech:
         self.language = language
         self._model: Any = None
         self._processor: Any = None
-        self._xtts_model: Any = None
+        self._chatterbox_model: Any = None
 
     def _init_local(self) -> None:
         """Initialize local Bark model."""
@@ -51,43 +54,14 @@ class TextToSpeech:
             resolved_device=device,
         )
 
-    def _patch_xtts_load_audio(self) -> None:
-        """Patch XTTS load_audio to avoid torchcodec dependency issues."""
-        import TTS.tts.models.xtts as xtts_module
-
-        def load_audio_soundfile(audiopath: str, sampling_rate: int):
-            import soundfile as sf  # type: ignore[import-untyped]
-            import torch
-            import torchaudio.functional as F  # type: ignore[import-untyped]
-
-            audio_np, sr = sf.read(audiopath, dtype="float32")
-
-            audio = torch.from_numpy(audio_np)
-            if audio.dim() == 1:
-                audio = audio.unsqueeze(0)
-            else:
-                audio = audio.T
-
-            if audio.size(0) != 1:
-                audio = torch.mean(audio, dim=0, keepdim=True)
-
-            if sr != sampling_rate:
-                audio = F.resample(audio, sr, sampling_rate)
-
-            return audio
-
-        xtts_module.load_audio = load_audio_soundfile
-
-    def _init_xtts(self) -> None:
-        """Initialize XTTS-v2 model for voice cloning."""
-        from TTS.api import TTS
-
-        self._patch_xtts_load_audio()
+    def _init_chatterbox(self) -> None:
+        """Initialize Chatterbox Multilingual model for voice cloning."""
+        from chatterbox.mtl_tts import ChatterboxMultilingualTTS  # type: ignore[import-untyped]
 
         requested_device = self.device
         device = select_device(self.device, mps_allowed=False)
 
-        self._xtts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+        self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
         self.device = device
         log_device_initialization(
             "TextToSpeech",
@@ -120,28 +94,32 @@ class TextToSpeech:
         )
         return Audio(audio_data, metadata)
 
-    def _generate_xtts(self, text: str, voice_sample: Audio) -> Audio:
-        """Generate speech using XTTS-v2 with voice cloning."""
+    def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
+        """Generate speech using Chatterbox Multilingual with voice cloning."""
         import tempfile
         from pathlib import Path
 
         import numpy as np
 
-        if self._xtts_model is None:
-            self._init_xtts()
+        if self._chatterbox_model is None:
+            self._init_chatterbox()
 
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             voice_sample.save(f.name)
             speaker_wav_path = Path(f.name)
 
         try:
-            audio_list = self._xtts_model.tts(
+            wav = self._chatterbox_model.generate(
                 text=text,
-                speaker_wav=str(speaker_wav_path),
-                language=self.language,
+                language_id=self.language,
+                audio_prompt_path=str(speaker_wav_path),
             )
-            audio_data = np.array(audio_list, dtype=np.float32)
-            sample_rate = 24000
+
+            audio_data = wav.cpu().float().numpy().squeeze()
+            if audio_data.ndim == 0:
+                audio_data = np.array([audio_data], dtype=np.float32)
+
+            sample_rate = self.CHATTERBOX_SAMPLE_RATE
 
             metadata = AudioMetadata(
                 sample_rate=sample_rate,
@@ -163,12 +141,13 @@
         """Generate speech audio from text."""
         effective_voice = voice_preset or self.voice
 
-        if self.model_size == "xtts" or voice_sample is not None:
+        if self.model_size == "chatterbox" or voice_sample is not None:
             if voice_sample is None:
                 raise ValueError(
-                    "voice_sample is required for XTTS voice cloning. Provide an Audio sample of the voice to clone."
+                    "voice_sample is required for Chatterbox voice cloning. "
+                    "Provide an Audio sample of the voice to clone."
                 )
-            return self._generate_xtts(text, voice_sample)
+            return self._generate_chatterbox(text, voice_sample)
 
         return self._generate_local(text, effective_voice)
 
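Putting the new Chatterbox path together: the constructor arguments and `generate_audio` call below match the hunks above and the pipeline's `_init_tts`; the input file name and device choice are illustrative assumptions.

```python
# Sketch of voice cloning with the new "chatterbox" model. Video.from_path,
# .audio, .slice() and .save() all appear elsewhere in this diff.
from videopython.ai.generation.audio import TextToSpeech
from videopython.base.video import Video

# Use the first 10 seconds of an existing recording as the cloning prompt.
voice_sample = Video.from_path("speaker.mp4").audio.slice(0, 10)

tts = TextToSpeech(model_size="chatterbox", device="cuda", language="fr")
audio = tts.generate_audio("Bonjour et bienvenue.", voice_sample=voice_sample)
audio.save("cloned.wav")  # 24 kHz output, per CHATTERBOX_SAMPLE_RATE above
```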
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/ai/generation/translation.py
@@ -61,15 +61,15 @@ class TextTranslator:
         return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
 
     def _init_local(self, source_lang: str, target_lang: str) -> None:
-        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer  # type: ignore[attr-defined]
+        from transformers import MarianMTModel, MarianTokenizer  # type: ignore[attr-defined]
 
         model_name = self._get_local_model_name(source_lang, target_lang)
 
         requested_device = self.device
         device = select_device(self.device, mps_allowed=True)
 
-        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self._model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
+        self._tokenizer = MarianTokenizer.from_pretrained(model_name)
+        self._model = MarianMTModel.from_pretrained(model_name).to(device)
         self.device = device
         log_device_initialization(
             "TextTranslator",
@@ -103,6 +103,8 @@
             return text
 
         effective_source = source_lang or "en"
+        if effective_source == target_lang:
+            return text
         return self._translate_local(text, target_lang, effective_source)
 
     def translate_batch(
@@ -118,6 +120,8 @@
             return []
 
         effective_source = source_lang or "en"
+        if effective_source == target_lang:
+            return list(texts)
         if self._model is None or self._current_lang_pair != (effective_source, target_lang):
             self._init_local(effective_source, target_lang)
 
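With the new short-circuit, translating into the source language returns the input untouched instead of loading a Marian model. A sketch, assuming the parameter names visible in the hunk bodies (`texts`, `target_lang`, `source_lang` usage is shown; the exact signature is not):

```python
# Hedged sketch of the same-language fast path; constructor arguments beyond
# `device` are assumptions.
from videopython.ai.generation.translation import TextTranslator

translator = TextTranslator(device="cpu")
# Same source and target: returned as-is, no model download or load.
print(translator.translate_batch(["hello", "world"], target_lang="en", source_lang="en"))
# ['hello', 'world']
```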
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/effects.py
@@ -32,6 +32,22 @@ __all__ = [
 ]
 
 
+def _resolve_time_range(start: float | None, stop: float | None, total_seconds: float) -> tuple[float, float]:
+    """Clamp and validate an effect time range against the video duration.
+
+    Returns resolved (start, stop) in seconds.
+    """
+    start_s = start if start is not None else 0
+    stop_s = stop if stop is not None else total_seconds
+    stop_s = min(stop_s, total_seconds)
+    start_s = min(start_s, total_seconds)
+    if start_s < 0:
+        raise ValueError(f"Effect start must be non-negative, got {start_s}!")
+    if stop_s < start_s:
+        raise ValueError(f"Effect stop ({stop_s}) must be >= start ({start_s})!")
+    return start_s, stop_s
+
+
 class Effect(ABC):
     """Abstract class for effect on frames of video.
 
@@ -54,20 +70,10 @@
             Only set when the effect should end before the video does.
         """
         original_shape = video.video_shape
-        start = start if start is not None else 0
-        stop = stop if stop is not None else video.total_seconds
-        # Clamp to video duration (frame rounding can make stop slightly exceed
-        # actual duration after segment assembly).
-        stop = min(stop, video.total_seconds)
-        start = min(start, video.total_seconds)
-        # Check for start and stop correctness
-        if start < 0:
-            raise ValueError(f"Effect start must be non-negative, got {start}!")
-        if stop < start:
-            raise ValueError(f"Effect stop ({stop}) must be >= start ({start})!")
+        start_s, stop_s = _resolve_time_range(start, stop, video.total_seconds)
         # Apply effect on video slice
-        effect_start_frame = round(start * video.fps)
-        effect_end_frame = round(stop * video.fps)
+        effect_start_frame = round(start_s * video.fps)
+        effect_end_frame = round(stop_s * video.fps)
         video_with_effect = self._apply(video[effect_start_frame:effect_end_frame])
         old_audio = video.audio
         video = Video.from_frames(
@@ -601,16 +607,7 @@ class Fade(Effect):
             Only set when the effect should end before the video does.
         """
         original_shape = video.video_shape
-        start_s = start if start is not None else 0
-        stop_s = stop if stop is not None else video.total_seconds
-        # Clamp to video duration (frame rounding can make stop slightly exceed
-        # actual duration after segment assembly).
-        stop_s = min(stop_s, video.total_seconds)
-        start_s = min(start_s, video.total_seconds)
-        if start_s < 0:
-            raise ValueError(f"Effect start must be non-negative, got {start_s}!")
-        if stop_s < start_s:
-            raise ValueError(f"Effect stop ({stop_s}) must be >= start ({start_s})!")
+        start_s, stop_s = _resolve_time_range(start, stop, video.total_seconds)
 
         effect_start_frame = round(start_s * video.fps)
         effect_end_frame = round(stop_s * video.fps)
@@ -689,16 +686,7 @@ class AudioEffect(Effect):
             stop: Stop time in seconds. Omit to apply until the end.
                 Only set when the effect should end before the video does.
         """
-        start_s = start if start is not None else 0
-        stop_s = stop if stop is not None else video.total_seconds
-        # Clamp to video duration (frame rounding can make stop slightly exceed
-        # actual duration after segment assembly).
-        stop_s = min(stop_s, video.total_seconds)
-        start_s = min(start_s, video.total_seconds)
-        if start_s < 0:
-            raise ValueError(f"Effect start must be non-negative, got {start_s}!")
-        if stop_s < start_s:
-            raise ValueError(f"Effect stop ({stop_s}) must be >= start ({start_s})!")
+        start_s, stop_s = _resolve_time_range(start, stop, video.total_seconds)
         video.audio = self._apply_audio(video.audio, start_s, stop_s, video.fps)
         return video
 
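The three duplicated validation blocks in `Effect`, `Fade`, and `AudioEffect` now share `_resolve_time_range`, whose full definition appears in the first hunk. Its behavior, exercised directly:

```python
# Behavior of the extracted helper, taken from its definition above.
from videopython.base.effects import _resolve_time_range  # private; imported only to illustrate

assert _resolve_time_range(None, None, 10.0) == (0, 10.0)   # omitted bounds -> full range
assert _resolve_time_range(2.0, 99.0, 10.0) == (2.0, 10.0)  # stop clamped to the duration
```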
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/base/transitions.py
@@ -49,8 +49,9 @@ class Transition(ABC):
         return _TRANSITION_REGISTRY[transition_type]._from_dict(data)
 
     @classmethod
+    @abstractmethod
     def _from_dict(cls, data: dict[str, Any]) -> "Transition":
-        raise NotImplementedError
+        pass
 
 
 class InstantTransition(Transition):
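Stacking `@classmethod` over `@abstractmethod` moves the failure from call time (`raise NotImplementedError`) to instantiation time. The pattern in isolation, with hypothetical class names:

```python
# A subclass that forgets to implement the abstract classmethod cannot be
# instantiated at all, instead of failing only when _from_dict is called.
from abc import ABC, abstractmethod
from typing import Any

class Base(ABC):
    @classmethod
    @abstractmethod
    def _from_dict(cls, data: dict[str, Any]) -> "Base": ...

class Concrete(Base):
    @classmethod
    def _from_dict(cls, data: dict[str, Any]) -> "Concrete":
        return cls()

Concrete._from_dict({})  # OK; instantiating an incomplete subclass raises TypeError
```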
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/editing/multicam.py
@@ -118,14 +118,12 @@ class MultiCamEdit:
 
         # Cache source metadata for validate() and run()
         self._source_meta = first
-        self._source_duration = first.total_seconds
+        self._source_duration = min(m.total_seconds for m in meta_list)
         self._source_metas = metas
 
         # Build per-camera time ranges (cut start, cut end) from the timeline
         camera_ranges: dict[str, list[tuple[float, float]]] = {}
-        for i, cut in enumerate(self.cuts):
-            start = cut.time
-            end = self.cuts[i + 1].time if i + 1 < len(self.cuts) else self._source_duration
+        for cut, start, end in self._cut_ranges():
             camera_ranges.setdefault(cut.camera, []).append((start, end))
 
         # Validate adjusted seek positions per source
@@ -146,20 +144,20 @@
                 f"exceeds source duration ({source_dur}s)"
             )
 
-    def run(self) -> Video:
-        """Execute the multicam edit and return the final video."""
-        source_duration = self._source_duration
-
-        # Build time ranges: each segment runs from its cut time to the next cut time
-        segments: list[tuple[CutPoint, float, float]] = []
+    def _cut_ranges(self) -> list[tuple[CutPoint, float, float]]:
+        """Build (cut, start_time, end_time) for each segment in the timeline."""
+        ranges: list[tuple[CutPoint, float, float]] = []
         for i, cut in enumerate(self.cuts):
             start = cut.time
-            end = self.cuts[i + 1].time if i + 1 < len(self.cuts) else source_duration
-            segments.append((cut, start, end))
+            end = self.cuts[i + 1].time if i + 1 < len(self.cuts) else self._source_duration
+            ranges.append((cut, start, end))
+        return ranges
 
+    def run(self) -> Video:
+        """Execute the multicam edit and return the final video."""
         # Load and join segments
         result: Video | None = None
-        for i, (cut, start, end) in enumerate(segments):
+        for i, (cut, start, end) in enumerate(self._cut_ranges()):
             source_path = self.sources[cut.camera]
             offset = self.source_offsets.get(cut.camera, 0.0)
             segment = Video.from_path(str(source_path), start_second=start - offset, end_second=end - offset)
@@ -185,6 +183,21 @@
 
         return result
 
+    @property
+    def source_meta(self) -> VideoMetadata:
+        """Metadata of the reference source (first listed)."""
+        return self._source_meta
+
+    @property
+    def source_duration(self) -> float:
+        """Timeline duration in seconds (minimum across all sources)."""
+        return self._source_duration
+
+    @property
+    def source_metas(self) -> dict[str, VideoMetadata]:
+        """Per-camera metadata keyed by source name."""
+        return dict(self._source_metas)
+
     def validate(self) -> VideoMetadata:
         """Validate the plan and predict output metadata without loading frames."""
         total_seconds = self._source_duration
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/editing/premiere_xml.py
@@ -173,9 +173,9 @@ def to_premiere_xml(edit: MultiCamEdit) -> str:
     """
    from videopython.base.transitions import FadeTransition
 
-    meta = edit._source_meta
+    meta = edit.source_meta
     fps = meta.fps
-    source_duration = edit._source_duration
+    source_duration = edit.source_duration
     total_frames = _seconds_to_frames(source_duration, fps)
 
     def frames(s: float) -> int:
@@ -244,7 +244,7 @@ def to_premiere_xml(edit: MultiCamEdit) -> str:
 
         file_id = f"file-{camera}"
         if file_id not in defined_file_ids:
-            src_meta = edit._source_metas[camera]
+            src_meta = edit.source_metas[camera]
             src_dur_frames = _seconds_to_frames(src_meta.total_seconds, fps)
             _build_file_element(
                 ci,
{videopython-0.25.4 → videopython-0.25.6}/src/videopython/editing/video_edit.py
@@ -664,9 +664,9 @@ def _normalize_effect_apply_args(apply_args: Mapping[str, Any], location: str) -
     """
     normalized = dict(apply_args)
     if "start" in normalized:
-        normalized["start"] = _coerce_optional_number_at_location(normalized["start"], f"{location}.start")
+        normalized["start"] = _coerce_optional_number(normalized["start"], "start", location=f"{location}.start")
     if "stop" in normalized:
-        normalized["stop"] = _coerce_optional_number_at_location(normalized["stop"], f"{location}.stop")
+        normalized["stop"] = _coerce_optional_number(normalized["stop"], "stop", location=f"{location}.stop")
     return normalized
 
 
@@ -1030,6 +1030,12 @@ def _predict_crop_metadata(meta: VideoMetadata, args: Mapping[str, Any]) -> Vide
 
 
 def _crop_value_to_pixels(value: Any, dimension: int) -> int:
+    """Convert a crop value to pixels.
+
+    Float values in the range (0, 1] are treated as fractions of *dimension*
+    (e.g. 0.5 means 50%). All other numeric values (including integers) are
+    treated as absolute pixel counts.
+    """
     if isinstance(value, bool) or not isinstance(value, (int, float)):
         raise ValueError("crop values must be numeric")
     if isinstance(value, float) and 0 < value <= 1:
@@ -1088,17 +1094,10 @@ def _require_number(value: Any, location: str) -> float
     return float(value)
 
 
-def _coerce_optional_number(value: Any, param_name: str) -> float | None:
+def _coerce_optional_number(value: Any, param_name: str, *, location: str | None = None) -> float | None:
     if value is None:
         return None
     if isinstance(value, bool) or not isinstance(value, (int, float)):
-        raise ValueError(f"Effect apply parameter '{param_name}' must be a number")
-    return float(value)
-
-
-def _coerce_optional_number_at_location(value: Any, location: str) -> float | None:
-    if value is None:
-        return None
-    if isinstance(value, bool) or not isinstance(value, (int, float)):
-        raise ValueError(f"{location} must be a number")
+        label = location if location is not None else f"Effect apply parameter '{param_name}'"
+        raise ValueError(f"{label} must be a number")
     return float(value)
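The two coercion helpers are merged behind a keyword-only `location` label, as the full definition above shows. Exercising both error-label styles (importing the private helper purely for illustration):

```python
# Both call styles the merged helper now supports.
from videopython.editing.video_edit import _coerce_optional_number

print(_coerce_optional_number(1, "start"))    # 1.0
print(_coerce_optional_number(None, "stop"))  # None
try:
    _coerce_optional_number("x", "stop", location="effects[0].stop")
except ValueError as e:
    print(e)  # effects[0].stop must be a number
```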