videopython 0.25.4__tar.gz → 0.25.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {videopython-0.25.4 → videopython-0.25.5}/PKG-INFO +3 -2
  2. {videopython-0.25.4 → videopython-0.25.5}/pyproject.toml +16 -6
  3. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/dubber.py +10 -1
  4. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/pipeline.py +13 -8
  5. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/timing.py +2 -2
  6. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/audio.py +27 -48
  7. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/translation.py +7 -3
  8. {videopython-0.25.4 → videopython-0.25.5}/.gitignore +0 -0
  9. {videopython-0.25.4 → videopython-0.25.5}/LICENSE +0 -0
  10. {videopython-0.25.4 → videopython-0.25.5}/README.md +0 -0
  11. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/__init__.py +0 -0
  12. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/__init__.py +0 -0
  13. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/_device.py +0 -0
  14. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/__init__.py +0 -0
  15. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/models.py +0 -0
  16. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/__init__.py +0 -0
  17. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/image.py +0 -0
  18. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/video.py +0 -0
  19. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/registry.py +0 -0
  20. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/__init__.py +0 -0
  21. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/inpainter.py +0 -0
  22. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/models.py +0 -0
  23. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/segmenter.py +0 -0
  24. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/swapper.py +0 -0
  25. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/transforms.py +0 -0
  26. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/__init__.py +0 -0
  27. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/audio.py +0 -0
  28. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/image.py +0 -0
  29. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/separation.py +0 -0
  30. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/temporal.py +0 -0
  31. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/video_analysis.py +0 -0
  32. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/__init__.py +0 -0
  33. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/audio/__init__.py +0 -0
  34. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/audio/analysis.py +0 -0
  35. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/audio/audio.py +0 -0
  36. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/combine.py +0 -0
  37. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/description.py +0 -0
  38. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/effects.py +0 -0
  39. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/exceptions.py +0 -0
  40. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/progress.py +0 -0
  41. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/registry.py +0 -0
  42. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/scene.py +0 -0
  43. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/text/__init__.py +0 -0
  44. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/text/overlay.py +0 -0
  45. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/text/transcription.py +0 -0
  46. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/transforms.py +0 -0
  47. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/transitions.py +0 -0
  48. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/utils.py +0 -0
  49. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/video.py +0 -0
  50. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/__init__.py +0 -0
  51. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/multicam.py +0 -0
  52. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/premiere_xml.py +0 -0
  53. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/video_edit.py +0 -0
  54. {videopython-0.25.4 → videopython-0.25.5}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.25.4
3
+ Version: 0.25.5
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -25,7 +25,7 @@ Requires-Dist: torchcodec>=0.9.1
25
25
  Requires-Dist: tqdm>=4.66.3
26
26
  Provides-Extra: ai
27
27
  Requires-Dist: accelerate>=0.29.2; extra == 'ai'
28
- Requires-Dist: coqui-tts>=0.24.0; extra == 'ai'
28
+ Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
29
29
  Requires-Dist: demucs>=4.0.0; extra == 'ai'
30
30
  Requires-Dist: diffusers>=0.26.3; extra == 'ai'
31
31
  Requires-Dist: easyocr>=1.7.0; extra == 'ai'
@@ -36,6 +36,7 @@ Requires-Dist: openai-whisper>=20240930; extra == 'ai'
36
36
  Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
37
37
  Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
38
38
  Requires-Dist: scipy>=1.10.0; extra == 'ai'
39
+ Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
39
40
  Requires-Dist: torch>=2.1.0; extra == 'ai'
40
41
  Requires-Dist: transformers>=5.2.0; extra == 'ai'
41
42
  Requires-Dist: transnetv2-pytorch>=1.0.5; extra == 'ai'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.25.4"
3
+ version = "0.25.5"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -72,8 +72,10 @@ ai = [
72
72
  # Audio classification (AST via transformers - no separate dep needed)
73
73
  # Scene detection
74
74
  "transnetv2-pytorch>=1.0.5",
75
- # Voice cloning TTS (coqui-tts is the maintained fork of TTS)
76
- "coqui-tts>=0.24.0",
75
+ # Voice cloning TTS (Chatterbox Multilingual by Resemble AI)
76
+ "chatterbox-tts>=0.1.7",
77
+ # Translation (Marian MT tokenizer requires sentencepiece)
78
+ "sentencepiece>=0.1.99",
77
79
  # Audio source separation
78
80
  "demucs>=4.0.0",
79
81
  ]
@@ -107,8 +109,10 @@ ai = [
107
109
  # Audio classification (AST via transformers - no separate dep needed)
108
110
  # Scene detection
109
111
  "transnetv2-pytorch>=1.0.5",
110
- # Voice cloning TTS (coqui-tts is the maintained fork of TTS)
111
- "coqui-tts>=0.24.0",
112
+ # Voice cloning TTS (Chatterbox Multilingual by Resemble AI)
113
+ "chatterbox-tts>=0.1.7",
114
+ # Translation (Marian MT tokenizer requires sentencepiece)
115
+ "sentencepiece>=0.1.99",
112
116
  # Audio source separation
113
117
  "demucs>=4.0.0",
114
118
  ]
@@ -130,13 +134,19 @@ module = [
130
134
  "easyocr", "easyocr.*",
131
135
  "transformers", "transformers.*",
132
136
  "transnetv2_pytorch", "transnetv2_pytorch.*",
133
- "TTS", "TTS.*", "coqui", "coqui.*",
137
+ "chatterbox", "chatterbox.*",
134
138
  "demucs", "demucs.*",
135
139
  "pyannote", "pyannote.*",
136
140
  "cv2", "cv2.*",
137
141
  ]
138
142
  ignore_missing_imports = true
139
143
 
144
+ [tool.uv]
145
+ # chatterbox-tts 0.1.7 pins strict versions of torch, torchaudio, numpy, and
146
+ # diffusers that conflict with pyannote-audio (torch>=2.8) and CogVideoX
147
+ # (diffusers>=0.30). Override to let the resolver pick compatible versions.
148
+ override-dependencies = ["torch>=2.8.0", "torchaudio>=2.8.0", "numpy>=2.0.0", "diffusers>=0.30.0"]
149
+
140
150
  [build-system]
141
151
  requires = ["hatchling"]
142
152
  build-backend = "hatchling.build"
@@ -34,9 +34,15 @@ class VideoDubber:
34
34
  source_lang: str | None = None,
35
35
  preserve_background: bool = True,
36
36
  voice_clone: bool = True,
37
+ enable_diarization: bool = False,
37
38
  progress_callback: Callable[[str, float], None] | None = None,
38
39
  ) -> DubbingResult:
39
- """Dub a video into a target language."""
40
+ """Dub a video into a target language.
41
+
42
+ Args:
43
+ enable_diarization: Enable speaker diarization to clone each speaker's
44
+ voice separately. Requires additional VRAM for the diarization model.
45
+ """
40
46
  if self._local_pipeline is None:
41
47
  self._init_local_pipeline()
42
48
 
@@ -46,6 +52,7 @@ class VideoDubber:
46
52
  source_lang=source_lang,
47
53
  preserve_background=preserve_background,
48
54
  voice_clone=voice_clone,
55
+ enable_diarization=enable_diarization,
49
56
  progress_callback=progress_callback,
50
57
  )
51
58
 
@@ -56,6 +63,7 @@ class VideoDubber:
56
63
  source_lang: str | None = None,
57
64
  preserve_background: bool = True,
58
65
  voice_clone: bool = True,
66
+ enable_diarization: bool = False,
59
67
  progress_callback: Callable[[str, float], None] | None = None,
60
68
  ) -> Video:
61
69
  """Dub a video and return a new video with the dubbed audio."""
@@ -65,6 +73,7 @@ class VideoDubber:
65
73
  source_lang=source_lang,
66
74
  preserve_background=preserve_background,
67
75
  voice_clone=voice_clone,
76
+ enable_diarization=enable_diarization,
68
77
  progress_callback=progress_callback,
69
78
  )
70
79
  return video.add_audio(result.dubbed_audio, overlay=False)
@@ -28,11 +28,11 @@ class LocalDubbingPipeline:
28
28
  self._separator: Any = None
29
29
  self._synchronizer: TimingSynchronizer | None = None
30
30
 
31
- def _init_transcriber(self) -> None:
31
+ def _init_transcriber(self, enable_diarization: bool = False) -> None:
32
32
  """Initialize the transcription model."""
33
33
  from videopython.ai.understanding.audio import AudioToText
34
34
 
35
- self._transcriber = AudioToText(device=self.device)
35
+ self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
36
36
 
37
37
  def _init_translator(self) -> None:
38
38
  """Initialize the translation model."""
@@ -40,17 +40,18 @@ class LocalDubbingPipeline:
40
40
 
41
41
  self._translator = TextTranslator(device=self.device)
42
42
 
43
- def _init_tts(self, voice_clone: bool = False) -> None:
43
+ def _init_tts(self, voice_clone: bool = False, language: str = "en") -> None:
44
44
  """Initialize the text-to-speech model."""
45
45
  from videopython.ai.generation.audio import TextToSpeech
46
46
 
47
47
  if voice_clone:
48
48
  self._tts = TextToSpeech(
49
- model_size="xtts",
49
+ model_size="chatterbox",
50
50
  device=self.device,
51
+ language=language,
51
52
  )
52
53
  else:
53
- self._tts = TextToSpeech(device=self.device)
54
+ self._tts = TextToSpeech(device=self.device, language=language)
54
55
 
55
56
  def _init_separator(self) -> None:
56
57
  """Initialize the audio separator."""
@@ -108,6 +109,7 @@ class LocalDubbingPipeline:
108
109
  source_lang: str | None = None,
109
110
  preserve_background: bool = True,
110
111
  voice_clone: bool = True,
112
+ enable_diarization: bool = False,
111
113
  progress_callback: Callable[[str, float], None] | None = None,
112
114
  ) -> DubbingResult:
113
115
  """Process a video through the local dubbing pipeline."""
@@ -119,7 +121,7 @@ class LocalDubbingPipeline:
119
121
 
120
122
  report_progress("Transcribing audio", 0.05)
121
123
  if self._transcriber is None:
122
- self._init_transcriber()
124
+ self._init_transcriber(enable_diarization=enable_diarization)
123
125
 
124
126
  source_audio = video.audio
125
127
  transcription = self._transcriber.transcribe(source_audio)
@@ -133,7 +135,7 @@ class LocalDubbingPipeline:
133
135
  target_lang=target_lang,
134
136
  )
135
137
 
136
- detected_lang = source_lang or "en"
138
+ detected_lang = source_lang or transcription.language or "en"
137
139
 
138
140
  separated_audio: SeparatedAudio | None = None
139
141
  vocal_audio = source_audio
@@ -163,13 +165,16 @@ class LocalDubbingPipeline:
163
165
 
164
166
  report_progress("Generating dubbed speech", 0.50)
165
167
  if self._tts is None:
166
- self._init_tts(voice_clone=voice_clone)
168
+ self._init_tts(voice_clone=voice_clone, language=target_lang)
167
169
 
168
170
  dubbed_segments: list[Audio] = []
169
171
  target_durations: list[float] = []
170
172
  start_times: list[float] = []
171
173
 
172
174
  for i, segment in enumerate(translated_segments):
175
+ if segment.duration < 0.1:
176
+ continue
177
+
173
178
  progress = 0.50 + (0.30 * (i / len(translated_segments)))
174
179
  report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
175
180
 
@@ -79,8 +79,8 @@ class TimingSynchronizer:
79
79
  """
80
80
  original_duration = audio.metadata.duration_seconds
81
81
 
82
- if original_duration <= 0:
83
- # Empty audio, return as-is
82
+ if original_duration <= 0 or target_duration <= 0:
83
+ # Empty audio or zero-length target, return as-is
84
84
  return audio, TimingAdjustment(
85
85
  segment_index=segment_index,
86
86
  original_duration=original_duration,
@@ -11,10 +11,13 @@ from videopython.base.audio import Audio, AudioMetadata
11
11
  class TextToSpeech:
12
12
  """Generates speech audio from text using local models.
13
13
 
14
- Supports Bark (`base`, `small`) and XTTS voice cloning (`xtts`).
14
+ Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
15
+ (`chatterbox`) for multilingual voice cloning.
15
16
  """
16
17
 
17
- SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "xtts"]
18
+ SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
19
+
20
+ CHATTERBOX_SAMPLE_RATE: int = 24000
18
21
 
19
22
  def __init__(
20
23
  self,
@@ -32,7 +35,7 @@ class TextToSpeech:
32
35
  self.language = language
33
36
  self._model: Any = None
34
37
  self._processor: Any = None
35
- self._xtts_model: Any = None
38
+ self._chatterbox_model: Any = None
36
39
 
37
40
  def _init_local(self) -> None:
38
41
  """Initialize local Bark model."""
@@ -51,43 +54,14 @@ class TextToSpeech:
51
54
  resolved_device=device,
52
55
  )
53
56
 
54
- def _patch_xtts_load_audio(self) -> None:
55
- """Patch XTTS load_audio to avoid torchcodec dependency issues."""
56
- import TTS.tts.models.xtts as xtts_module
57
-
58
- def load_audio_soundfile(audiopath: str, sampling_rate: int):
59
- import soundfile as sf # type: ignore[import-untyped]
60
- import torch
61
- import torchaudio.functional as F # type: ignore[import-untyped]
62
-
63
- audio_np, sr = sf.read(audiopath, dtype="float32")
64
-
65
- audio = torch.from_numpy(audio_np)
66
- if audio.dim() == 1:
67
- audio = audio.unsqueeze(0)
68
- else:
69
- audio = audio.T
70
-
71
- if audio.size(0) != 1:
72
- audio = torch.mean(audio, dim=0, keepdim=True)
73
-
74
- if sr != sampling_rate:
75
- audio = F.resample(audio, sr, sampling_rate)
76
-
77
- return audio
78
-
79
- xtts_module.load_audio = load_audio_soundfile
80
-
81
- def _init_xtts(self) -> None:
82
- """Initialize XTTS-v2 model for voice cloning."""
83
- from TTS.api import TTS
84
-
85
- self._patch_xtts_load_audio()
57
+ def _init_chatterbox(self) -> None:
58
+ """Initialize Chatterbox Multilingual model for voice cloning."""
59
+ from chatterbox.mtl_tts import ChatterboxMultilingualTTS # type: ignore[import-untyped]
86
60
 
87
61
  requested_device = self.device
88
62
  device = select_device(self.device, mps_allowed=False)
89
63
 
90
- self._xtts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
64
+ self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
91
65
  self.device = device
92
66
  log_device_initialization(
93
67
  "TextToSpeech",
@@ -120,28 +94,32 @@ class TextToSpeech:
120
94
  )
121
95
  return Audio(audio_data, metadata)
122
96
 
123
- def _generate_xtts(self, text: str, voice_sample: Audio) -> Audio:
124
- """Generate speech using XTTS-v2 with voice cloning."""
97
+ def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
98
+ """Generate speech using Chatterbox Multilingual with voice cloning."""
125
99
  import tempfile
126
100
  from pathlib import Path
127
101
 
128
102
  import numpy as np
129
103
 
130
- if self._xtts_model is None:
131
- self._init_xtts()
104
+ if self._chatterbox_model is None:
105
+ self._init_chatterbox()
132
106
 
133
107
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
134
108
  voice_sample.save(f.name)
135
109
  speaker_wav_path = Path(f.name)
136
110
 
137
111
  try:
138
- audio_list = self._xtts_model.tts(
112
+ wav = self._chatterbox_model.generate(
139
113
  text=text,
140
- speaker_wav=str(speaker_wav_path),
141
- language=self.language,
114
+ language_id=self.language,
115
+ audio_prompt_path=str(speaker_wav_path),
142
116
  )
143
- audio_data = np.array(audio_list, dtype=np.float32)
144
- sample_rate = 24000
117
+
118
+ audio_data = wav.cpu().float().numpy().squeeze()
119
+ if audio_data.ndim == 0:
120
+ audio_data = np.array([audio_data], dtype=np.float32)
121
+
122
+ sample_rate = self.CHATTERBOX_SAMPLE_RATE
145
123
 
146
124
  metadata = AudioMetadata(
147
125
  sample_rate=sample_rate,
@@ -163,12 +141,13 @@ class TextToSpeech:
163
141
  """Generate speech audio from text."""
164
142
  effective_voice = voice_preset or self.voice
165
143
 
166
- if self.model_size == "xtts" or voice_sample is not None:
144
+ if self.model_size == "chatterbox" or voice_sample is not None:
167
145
  if voice_sample is None:
168
146
  raise ValueError(
169
- "voice_sample is required for XTTS voice cloning. Provide an Audio sample of the voice to clone."
147
+ "voice_sample is required for Chatterbox voice cloning. "
148
+ "Provide an Audio sample of the voice to clone."
170
149
  )
171
- return self._generate_xtts(text, voice_sample)
150
+ return self._generate_chatterbox(text, voice_sample)
172
151
 
173
152
  return self._generate_local(text, effective_voice)
174
153
 
@@ -61,15 +61,15 @@ class TextTranslator:
61
61
  return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
62
62
 
63
63
  def _init_local(self, source_lang: str, target_lang: str) -> None:
64
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer # type: ignore[attr-defined]
64
+ from transformers import MarianMTModel, MarianTokenizer # type: ignore[attr-defined]
65
65
 
66
66
  model_name = self._get_local_model_name(source_lang, target_lang)
67
67
 
68
68
  requested_device = self.device
69
69
  device = select_device(self.device, mps_allowed=True)
70
70
 
71
- self._tokenizer = AutoTokenizer.from_pretrained(model_name)
72
- self._model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
71
+ self._tokenizer = MarianTokenizer.from_pretrained(model_name)
72
+ self._model = MarianMTModel.from_pretrained(model_name).to(device)
73
73
  self.device = device
74
74
  log_device_initialization(
75
75
  "TextTranslator",
@@ -103,6 +103,8 @@ class TextTranslator:
103
103
  return text
104
104
 
105
105
  effective_source = source_lang or "en"
106
+ if effective_source == target_lang:
107
+ return text
106
108
  return self._translate_local(text, target_lang, effective_source)
107
109
 
108
110
  def translate_batch(
@@ -118,6 +120,8 @@ class TextTranslator:
118
120
  return []
119
121
 
120
122
  effective_source = source_lang or "en"
123
+ if effective_source == target_lang:
124
+ return list(texts)
121
125
  if self._model is None or self._current_lang_pair != (effective_source, target_lang):
122
126
  self._init_local(effective_source, target_lang)
123
127
 
File without changes
File without changes
File without changes