videopython 0.25.4__tar.gz → 0.25.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.25.4 → videopython-0.25.5}/PKG-INFO +3 -2
- {videopython-0.25.4 → videopython-0.25.5}/pyproject.toml +16 -6
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/dubber.py +10 -1
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/pipeline.py +13 -8
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/timing.py +2 -2
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/audio.py +27 -48
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/translation.py +7 -3
- {videopython-0.25.4 → videopython-0.25.5}/.gitignore +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/LICENSE +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/README.md +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/_device.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/registry.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/combine.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/description.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/effects.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/progress.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/registry.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/scene.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/transforms.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/transitions.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/utils.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/base/video.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.25.4 → videopython-0.25.5}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.25.4
|
|
3
|
+
Version: 0.25.5
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -25,7 +25,7 @@ Requires-Dist: torchcodec>=0.9.1
|
|
|
25
25
|
Requires-Dist: tqdm>=4.66.3
|
|
26
26
|
Provides-Extra: ai
|
|
27
27
|
Requires-Dist: accelerate>=0.29.2; extra == 'ai'
|
|
28
|
-
Requires-Dist:
|
|
28
|
+
Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
|
|
29
29
|
Requires-Dist: demucs>=4.0.0; extra == 'ai'
|
|
30
30
|
Requires-Dist: diffusers>=0.26.3; extra == 'ai'
|
|
31
31
|
Requires-Dist: easyocr>=1.7.0; extra == 'ai'
|
|
@@ -36,6 +36,7 @@ Requires-Dist: openai-whisper>=20240930; extra == 'ai'
|
|
|
36
36
|
Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
|
|
37
37
|
Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
|
|
38
38
|
Requires-Dist: scipy>=1.10.0; extra == 'ai'
|
|
39
|
+
Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
|
|
39
40
|
Requires-Dist: torch>=2.1.0; extra == 'ai'
|
|
40
41
|
Requires-Dist: transformers>=5.2.0; extra == 'ai'
|
|
41
42
|
Requires-Dist: transnetv2-pytorch>=1.0.5; extra == 'ai'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.25.4"
|
|
3
|
+
version = "0.25.5"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -72,8 +72,10 @@ ai = [
|
|
|
72
72
|
# Audio classification (AST via transformers - no separate dep needed)
|
|
73
73
|
# Scene detection
|
|
74
74
|
"transnetv2-pytorch>=1.0.5",
|
|
75
|
-
# Voice cloning TTS (
|
|
76
|
-
"
|
|
75
|
+
# Voice cloning TTS (Chatterbox Multilingual by Resemble AI)
|
|
76
|
+
"chatterbox-tts>=0.1.7",
|
|
77
|
+
# Translation (Marian MT tokenizer requires sentencepiece)
|
|
78
|
+
"sentencepiece>=0.1.99",
|
|
77
79
|
# Audio source separation
|
|
78
80
|
"demucs>=4.0.0",
|
|
79
81
|
]
|
|
@@ -107,8 +109,10 @@ ai = [
|
|
|
107
109
|
# Audio classification (AST via transformers - no separate dep needed)
|
|
108
110
|
# Scene detection
|
|
109
111
|
"transnetv2-pytorch>=1.0.5",
|
|
110
|
-
# Voice cloning TTS (
|
|
111
|
-
"
|
|
112
|
+
# Voice cloning TTS (Chatterbox Multilingual by Resemble AI)
|
|
113
|
+
"chatterbox-tts>=0.1.7",
|
|
114
|
+
# Translation (Marian MT tokenizer requires sentencepiece)
|
|
115
|
+
"sentencepiece>=0.1.99",
|
|
112
116
|
# Audio source separation
|
|
113
117
|
"demucs>=4.0.0",
|
|
114
118
|
]
|
|
@@ -130,13 +134,19 @@ module = [
|
|
|
130
134
|
"easyocr", "easyocr.*",
|
|
131
135
|
"transformers", "transformers.*",
|
|
132
136
|
"transnetv2_pytorch", "transnetv2_pytorch.*",
|
|
133
|
-
"
|
|
137
|
+
"chatterbox", "chatterbox.*",
|
|
134
138
|
"demucs", "demucs.*",
|
|
135
139
|
"pyannote", "pyannote.*",
|
|
136
140
|
"cv2", "cv2.*",
|
|
137
141
|
]
|
|
138
142
|
ignore_missing_imports = true
|
|
139
143
|
|
|
144
|
+
[tool.uv]
|
|
145
|
+
# chatterbox-tts 0.1.7 pins strict versions of torch, torchaudio, numpy, and
|
|
146
|
+
# diffusers that conflict with pyannote-audio (torch>=2.8) and CogVideoX
|
|
147
|
+
# (diffusers>=0.30). Override to let the resolver pick compatible versions.
|
|
148
|
+
override-dependencies = ["torch>=2.8.0", "torchaudio>=2.8.0", "numpy>=2.0.0", "diffusers>=0.30.0"]
|
|
149
|
+
|
|
140
150
|
[build-system]
|
|
141
151
|
requires = ["hatchling"]
|
|
142
152
|
build-backend = "hatchling.build"
|
|
@@ -34,9 +34,15 @@ class VideoDubber:
|
|
|
34
34
|
source_lang: str | None = None,
|
|
35
35
|
preserve_background: bool = True,
|
|
36
36
|
voice_clone: bool = True,
|
|
37
|
+
enable_diarization: bool = False,
|
|
37
38
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
38
39
|
) -> DubbingResult:
|
|
39
|
-
"""Dub a video into a target language.
|
|
40
|
+
"""Dub a video into a target language.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
enable_diarization: Enable speaker diarization to clone each speaker's
|
|
44
|
+
voice separately. Requires additional VRAM for the diarization model.
|
|
45
|
+
"""
|
|
40
46
|
if self._local_pipeline is None:
|
|
41
47
|
self._init_local_pipeline()
|
|
42
48
|
|
|
@@ -46,6 +52,7 @@ class VideoDubber:
|
|
|
46
52
|
source_lang=source_lang,
|
|
47
53
|
preserve_background=preserve_background,
|
|
48
54
|
voice_clone=voice_clone,
|
|
55
|
+
enable_diarization=enable_diarization,
|
|
49
56
|
progress_callback=progress_callback,
|
|
50
57
|
)
|
|
51
58
|
|
|
@@ -56,6 +63,7 @@ class VideoDubber:
|
|
|
56
63
|
source_lang: str | None = None,
|
|
57
64
|
preserve_background: bool = True,
|
|
58
65
|
voice_clone: bool = True,
|
|
66
|
+
enable_diarization: bool = False,
|
|
59
67
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
60
68
|
) -> Video:
|
|
61
69
|
"""Dub a video and return a new video with the dubbed audio."""
|
|
@@ -65,6 +73,7 @@ class VideoDubber:
|
|
|
65
73
|
source_lang=source_lang,
|
|
66
74
|
preserve_background=preserve_background,
|
|
67
75
|
voice_clone=voice_clone,
|
|
76
|
+
enable_diarization=enable_diarization,
|
|
68
77
|
progress_callback=progress_callback,
|
|
69
78
|
)
|
|
70
79
|
return video.add_audio(result.dubbed_audio, overlay=False)
|
|
@@ -28,11 +28,11 @@ class LocalDubbingPipeline:
|
|
|
28
28
|
self._separator: Any = None
|
|
29
29
|
self._synchronizer: TimingSynchronizer | None = None
|
|
30
30
|
|
|
31
|
-
def _init_transcriber(self) -> None:
|
|
31
|
+
def _init_transcriber(self, enable_diarization: bool = False) -> None:
|
|
32
32
|
"""Initialize the transcription model."""
|
|
33
33
|
from videopython.ai.understanding.audio import AudioToText
|
|
34
34
|
|
|
35
|
-
self._transcriber = AudioToText(device=self.device)
|
|
35
|
+
self._transcriber = AudioToText(device=self.device, enable_diarization=enable_diarization)
|
|
36
36
|
|
|
37
37
|
def _init_translator(self) -> None:
|
|
38
38
|
"""Initialize the translation model."""
|
|
@@ -40,17 +40,18 @@ class LocalDubbingPipeline:
|
|
|
40
40
|
|
|
41
41
|
self._translator = TextTranslator(device=self.device)
|
|
42
42
|
|
|
43
|
-
def _init_tts(self, voice_clone: bool = False) -> None:
|
|
43
|
+
def _init_tts(self, voice_clone: bool = False, language: str = "en") -> None:
|
|
44
44
|
"""Initialize the text-to-speech model."""
|
|
45
45
|
from videopython.ai.generation.audio import TextToSpeech
|
|
46
46
|
|
|
47
47
|
if voice_clone:
|
|
48
48
|
self._tts = TextToSpeech(
|
|
49
|
-
model_size="
|
|
49
|
+
model_size="chatterbox",
|
|
50
50
|
device=self.device,
|
|
51
|
+
language=language,
|
|
51
52
|
)
|
|
52
53
|
else:
|
|
53
|
-
self._tts = TextToSpeech(device=self.device)
|
|
54
|
+
self._tts = TextToSpeech(device=self.device, language=language)
|
|
54
55
|
|
|
55
56
|
def _init_separator(self) -> None:
|
|
56
57
|
"""Initialize the audio separator."""
|
|
@@ -108,6 +109,7 @@ class LocalDubbingPipeline:
|
|
|
108
109
|
source_lang: str | None = None,
|
|
109
110
|
preserve_background: bool = True,
|
|
110
111
|
voice_clone: bool = True,
|
|
112
|
+
enable_diarization: bool = False,
|
|
111
113
|
progress_callback: Callable[[str, float], None] | None = None,
|
|
112
114
|
) -> DubbingResult:
|
|
113
115
|
"""Process a video through the local dubbing pipeline."""
|
|
@@ -119,7 +121,7 @@ class LocalDubbingPipeline:
|
|
|
119
121
|
|
|
120
122
|
report_progress("Transcribing audio", 0.05)
|
|
121
123
|
if self._transcriber is None:
|
|
122
|
-
self._init_transcriber()
|
|
124
|
+
self._init_transcriber(enable_diarization=enable_diarization)
|
|
123
125
|
|
|
124
126
|
source_audio = video.audio
|
|
125
127
|
transcription = self._transcriber.transcribe(source_audio)
|
|
@@ -133,7 +135,7 @@ class LocalDubbingPipeline:
|
|
|
133
135
|
target_lang=target_lang,
|
|
134
136
|
)
|
|
135
137
|
|
|
136
|
-
detected_lang = source_lang or "en"
|
|
138
|
+
detected_lang = source_lang or transcription.language or "en"
|
|
137
139
|
|
|
138
140
|
separated_audio: SeparatedAudio | None = None
|
|
139
141
|
vocal_audio = source_audio
|
|
@@ -163,13 +165,16 @@ class LocalDubbingPipeline:
|
|
|
163
165
|
|
|
164
166
|
report_progress("Generating dubbed speech", 0.50)
|
|
165
167
|
if self._tts is None:
|
|
166
|
-
self._init_tts(voice_clone=voice_clone)
|
|
168
|
+
self._init_tts(voice_clone=voice_clone, language=target_lang)
|
|
167
169
|
|
|
168
170
|
dubbed_segments: list[Audio] = []
|
|
169
171
|
target_durations: list[float] = []
|
|
170
172
|
start_times: list[float] = []
|
|
171
173
|
|
|
172
174
|
for i, segment in enumerate(translated_segments):
|
|
175
|
+
if segment.duration < 0.1:
|
|
176
|
+
continue
|
|
177
|
+
|
|
173
178
|
progress = 0.50 + (0.30 * (i / len(translated_segments)))
|
|
174
179
|
report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
|
|
175
180
|
|
|
@@ -79,8 +79,8 @@ class TimingSynchronizer:
|
|
|
79
79
|
"""
|
|
80
80
|
original_duration = audio.metadata.duration_seconds
|
|
81
81
|
|
|
82
|
-
if original_duration <= 0:
|
|
83
|
-
# Empty audio, return as-is
|
|
82
|
+
if original_duration <= 0 or target_duration <= 0:
|
|
83
|
+
# Empty audio or zero-length target, return as-is
|
|
84
84
|
return audio, TimingAdjustment(
|
|
85
85
|
segment_index=segment_index,
|
|
86
86
|
original_duration=original_duration,
|
|
@@ -11,10 +11,13 @@ from videopython.base.audio import Audio, AudioMetadata
|
|
|
11
11
|
class TextToSpeech:
|
|
12
12
|
"""Generates speech audio from text using local models.
|
|
13
13
|
|
|
14
|
-
Supports Bark (`base`, `small`)
|
|
14
|
+
Supports Bark (`base`, `small`) for general TTS and Chatterbox Multilingual
|
|
15
|
+
(`chatterbox`) for multilingual voice cloning.
|
|
15
16
|
"""
|
|
16
17
|
|
|
17
|
-
SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "
|
|
18
|
+
SUPPORTED_LOCAL_MODELS: list[str] = ["base", "small", "chatterbox"]
|
|
19
|
+
|
|
20
|
+
CHATTERBOX_SAMPLE_RATE: int = 24000
|
|
18
21
|
|
|
19
22
|
def __init__(
|
|
20
23
|
self,
|
|
@@ -32,7 +35,7 @@ class TextToSpeech:
|
|
|
32
35
|
self.language = language
|
|
33
36
|
self._model: Any = None
|
|
34
37
|
self._processor: Any = None
|
|
35
|
-
self.
|
|
38
|
+
self._chatterbox_model: Any = None
|
|
36
39
|
|
|
37
40
|
def _init_local(self) -> None:
|
|
38
41
|
"""Initialize local Bark model."""
|
|
@@ -51,43 +54,14 @@ class TextToSpeech:
|
|
|
51
54
|
resolved_device=device,
|
|
52
55
|
)
|
|
53
56
|
|
|
54
|
-
def
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def load_audio_soundfile(audiopath: str, sampling_rate: int):
|
|
59
|
-
import soundfile as sf # type: ignore[import-untyped]
|
|
60
|
-
import torch
|
|
61
|
-
import torchaudio.functional as F # type: ignore[import-untyped]
|
|
62
|
-
|
|
63
|
-
audio_np, sr = sf.read(audiopath, dtype="float32")
|
|
64
|
-
|
|
65
|
-
audio = torch.from_numpy(audio_np)
|
|
66
|
-
if audio.dim() == 1:
|
|
67
|
-
audio = audio.unsqueeze(0)
|
|
68
|
-
else:
|
|
69
|
-
audio = audio.T
|
|
70
|
-
|
|
71
|
-
if audio.size(0) != 1:
|
|
72
|
-
audio = torch.mean(audio, dim=0, keepdim=True)
|
|
73
|
-
|
|
74
|
-
if sr != sampling_rate:
|
|
75
|
-
audio = F.resample(audio, sr, sampling_rate)
|
|
76
|
-
|
|
77
|
-
return audio
|
|
78
|
-
|
|
79
|
-
xtts_module.load_audio = load_audio_soundfile
|
|
80
|
-
|
|
81
|
-
def _init_xtts(self) -> None:
|
|
82
|
-
"""Initialize XTTS-v2 model for voice cloning."""
|
|
83
|
-
from TTS.api import TTS
|
|
84
|
-
|
|
85
|
-
self._patch_xtts_load_audio()
|
|
57
|
+
def _init_chatterbox(self) -> None:
|
|
58
|
+
"""Initialize Chatterbox Multilingual model for voice cloning."""
|
|
59
|
+
from chatterbox.mtl_tts import ChatterboxMultilingualTTS # type: ignore[import-untyped]
|
|
86
60
|
|
|
87
61
|
requested_device = self.device
|
|
88
62
|
device = select_device(self.device, mps_allowed=False)
|
|
89
63
|
|
|
90
|
-
self.
|
|
64
|
+
self._chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
|
|
91
65
|
self.device = device
|
|
92
66
|
log_device_initialization(
|
|
93
67
|
"TextToSpeech",
|
|
@@ -120,28 +94,32 @@ class TextToSpeech:
|
|
|
120
94
|
)
|
|
121
95
|
return Audio(audio_data, metadata)
|
|
122
96
|
|
|
123
|
-
def
|
|
124
|
-
"""Generate speech using
|
|
97
|
+
def _generate_chatterbox(self, text: str, voice_sample: Audio) -> Audio:
|
|
98
|
+
"""Generate speech using Chatterbox Multilingual with voice cloning."""
|
|
125
99
|
import tempfile
|
|
126
100
|
from pathlib import Path
|
|
127
101
|
|
|
128
102
|
import numpy as np
|
|
129
103
|
|
|
130
|
-
if self.
|
|
131
|
-
self.
|
|
104
|
+
if self._chatterbox_model is None:
|
|
105
|
+
self._init_chatterbox()
|
|
132
106
|
|
|
133
107
|
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
|
134
108
|
voice_sample.save(f.name)
|
|
135
109
|
speaker_wav_path = Path(f.name)
|
|
136
110
|
|
|
137
111
|
try:
|
|
138
|
-
|
|
112
|
+
wav = self._chatterbox_model.generate(
|
|
139
113
|
text=text,
|
|
140
|
-
|
|
141
|
-
|
|
114
|
+
language_id=self.language,
|
|
115
|
+
audio_prompt_path=str(speaker_wav_path),
|
|
142
116
|
)
|
|
143
|
-
|
|
144
|
-
|
|
117
|
+
|
|
118
|
+
audio_data = wav.cpu().float().numpy().squeeze()
|
|
119
|
+
if audio_data.ndim == 0:
|
|
120
|
+
audio_data = np.array([audio_data], dtype=np.float32)
|
|
121
|
+
|
|
122
|
+
sample_rate = self.CHATTERBOX_SAMPLE_RATE
|
|
145
123
|
|
|
146
124
|
metadata = AudioMetadata(
|
|
147
125
|
sample_rate=sample_rate,
|
|
@@ -163,12 +141,13 @@ class TextToSpeech:
|
|
|
163
141
|
"""Generate speech audio from text."""
|
|
164
142
|
effective_voice = voice_preset or self.voice
|
|
165
143
|
|
|
166
|
-
if self.model_size == "
|
|
144
|
+
if self.model_size == "chatterbox" or voice_sample is not None:
|
|
167
145
|
if voice_sample is None:
|
|
168
146
|
raise ValueError(
|
|
169
|
-
"voice_sample is required for
|
|
147
|
+
"voice_sample is required for Chatterbox voice cloning. "
|
|
148
|
+
"Provide an Audio sample of the voice to clone."
|
|
170
149
|
)
|
|
171
|
-
return self.
|
|
150
|
+
return self._generate_chatterbox(text, voice_sample)
|
|
172
151
|
|
|
173
152
|
return self._generate_local(text, effective_voice)
|
|
174
153
|
|
|
@@ -61,15 +61,15 @@ class TextTranslator:
|
|
|
61
61
|
return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
|
|
62
62
|
|
|
63
63
|
def _init_local(self, source_lang: str, target_lang: str) -> None:
|
|
64
|
-
from transformers import
|
|
64
|
+
from transformers import MarianMTModel, MarianTokenizer # type: ignore[attr-defined]
|
|
65
65
|
|
|
66
66
|
model_name = self._get_local_model_name(source_lang, target_lang)
|
|
67
67
|
|
|
68
68
|
requested_device = self.device
|
|
69
69
|
device = select_device(self.device, mps_allowed=True)
|
|
70
70
|
|
|
71
|
-
self._tokenizer =
|
|
72
|
-
self._model =
|
|
71
|
+
self._tokenizer = MarianTokenizer.from_pretrained(model_name)
|
|
72
|
+
self._model = MarianMTModel.from_pretrained(model_name).to(device)
|
|
73
73
|
self.device = device
|
|
74
74
|
log_device_initialization(
|
|
75
75
|
"TextTranslator",
|
|
@@ -103,6 +103,8 @@ class TextTranslator:
|
|
|
103
103
|
return text
|
|
104
104
|
|
|
105
105
|
effective_source = source_lang or "en"
|
|
106
|
+
if effective_source == target_lang:
|
|
107
|
+
return text
|
|
106
108
|
return self._translate_local(text, target_lang, effective_source)
|
|
107
109
|
|
|
108
110
|
def translate_batch(
|
|
@@ -118,6 +120,8 @@ class TextTranslator:
|
|
|
118
120
|
return []
|
|
119
121
|
|
|
120
122
|
effective_source = source_lang or "en"
|
|
123
|
+
if effective_source == target_lang:
|
|
124
|
+
return list(texts)
|
|
121
125
|
if self._model is None or self._current_lang_pair != (effective_source, target_lang):
|
|
122
126
|
self._init_local(effective_source, target_lang)
|
|
123
127
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|