sinapsis-speech 0.3.5__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/PKG-INFO +38 -2
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/README.md +35 -1
- sinapsis_speech-0.4.0/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +100 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +2 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +7 -7
- sinapsis_speech-0.4.0/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +95 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +8 -7
- sinapsis_speech-0.4.0/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +123 -0
- sinapsis_speech-0.4.0/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +20 -0
- sinapsis_speech-0.4.0/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +300 -0
- sinapsis_speech-0.4.0/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +69 -0
- sinapsis_speech-0.4.0/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +20 -0
- sinapsis_speech-0.4.0/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +270 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_speech.egg-info/PKG-INFO +38 -2
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_speech.egg-info/SOURCES.txt +5 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_speech.egg-info/requires.txt +2 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_speech.egg-info/top_level.txt +2 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/pyproject.toml +6 -1
- sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +0 -64
- sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +0 -56
- sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +0 -51
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/LICENSE +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_speech.egg-info/dependency_links.txt +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sinapsis-speech
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Generate speech using various libraries.
|
|
5
5
|
Author-email: SinapsisAI <dev@sinapsis.tech>
|
|
6
6
|
Project-URL: Homepage, https://sinapsis.tech
|
|
@@ -18,6 +18,8 @@ Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
|
|
|
18
18
|
Requires-Dist: sinapsis-kokoro[all]; extra == "all"
|
|
19
19
|
Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
|
|
20
20
|
Requires-Dist: sinapsis-zonos[all]; extra == "all"
|
|
21
|
+
Requires-Dist: sinapsis-parakeet-tdt[all]; extra == "all"
|
|
22
|
+
Requires-Dist: sinapsis-orpheus-cpp[all]; extra == "all"
|
|
21
23
|
Provides-Extra: gradio-app
|
|
22
24
|
Requires-Dist: sinapsis[webapp]>=0.2.3; extra == "gradio-app"
|
|
23
25
|
Dynamic: license-file
|
|
@@ -55,8 +57,10 @@ This repo includes packages for performing speech synthesis using different tool
|
|
|
55
57
|
|
|
56
58
|
* <code>sinapsis-elevenlabs</code>
|
|
57
59
|
* <code>sinapsis-f5-tts</code>
|
|
58
|
-
*
|
|
60
|
+
* <code>sinapsis-kokoro</code>
|
|
59
61
|
* <code>sinapsis-zonos</code>
|
|
62
|
+
* <code>sinapsis-orpheus-cpp</code>
|
|
63
|
+
* <code>sinapsis-parakeet</code>
|
|
60
64
|
|
|
61
65
|
Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
|
|
62
66
|
|
|
@@ -205,6 +209,16 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
|
|
|
205
209
|
docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
|
|
206
210
|
```
|
|
207
211
|
|
|
212
|
+
- For Orpheus-CPP:
|
|
213
|
+
```bash
|
|
214
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
- For Parakeet:
|
|
218
|
+
```bash
|
|
219
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
|
|
220
|
+
```
|
|
221
|
+
|
|
208
222
|
3. **Check the logs**
|
|
209
223
|
|
|
210
224
|
- For ElevenLabs:
|
|
@@ -224,6 +238,17 @@ docker logs -f sinapsis-kokoro
|
|
|
224
238
|
```bash
|
|
225
239
|
docker logs -f sinapsis-zonos
|
|
226
240
|
```
|
|
241
|
+
|
|
242
|
+
- For Orpheus-CPP:
|
|
243
|
+
```bash
|
|
244
|
+
docker logs -f sinapsis-orpheus-tts
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
- For Parakeet:
|
|
248
|
+
```bash
|
|
249
|
+
docker logs -f sinapsis-parakeet
|
|
250
|
+
```
|
|
251
|
+
|
|
227
252
|
4. **The logs will display the URL to access the webapp, e.g.,:**:
|
|
228
253
|
```bash
|
|
229
254
|
Running on local URL: http://127.0.0.1:7860
|
|
@@ -240,6 +265,17 @@ docker compose -f docker/compose_apps.yaml down
|
|
|
240
265
|
|
|
241
266
|
To run the webapp using the <code>uv</code> package manager, follow these steps:
|
|
242
267
|
|
|
268
|
+
|
|
269
|
+
> [!IMPORTANT]
|
|
270
|
+
> If you're using sinapsis-orpheus-cpp, you need to export cuda environment variables:
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
```bash
|
|
274
|
+
export CMAKE_ARGS="-DGGML_CUDA=on"
|
|
275
|
+
export FORCE_CMAKE="1"
|
|
276
|
+
export CUDACXX=$(command -v nvcc)
|
|
277
|
+
```
|
|
278
|
+
|
|
243
279
|
1. **Sync the virtual environment**:
|
|
244
280
|
|
|
245
281
|
```bash
|
|
@@ -31,8 +31,10 @@ This repo includes packages for performing speech synthesis using different tool
|
|
|
31
31
|
|
|
32
32
|
* <code>sinapsis-elevenlabs</code>
|
|
33
33
|
* <code>sinapsis-f5-tts</code>
|
|
34
|
-
*
|
|
34
|
+
* <code>sinapsis-kokoro</code>
|
|
35
35
|
* <code>sinapsis-zonos</code>
|
|
36
|
+
* <code>sinapsis-orpheus-cpp</code>
|
|
37
|
+
* <code>sinapsis-parakeet</code>
|
|
36
38
|
|
|
37
39
|
Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
|
|
38
40
|
|
|
@@ -181,6 +183,16 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
|
|
|
181
183
|
docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
|
|
182
184
|
```
|
|
183
185
|
|
|
186
|
+
- For Orpheus-CPP:
|
|
187
|
+
```bash
|
|
188
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
- For Parakeet:
|
|
192
|
+
```bash
|
|
193
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
|
|
194
|
+
```
|
|
195
|
+
|
|
184
196
|
3. **Check the logs**
|
|
185
197
|
|
|
186
198
|
- For ElevenLabs:
|
|
@@ -200,6 +212,17 @@ docker logs -f sinapsis-kokoro
|
|
|
200
212
|
```bash
|
|
201
213
|
docker logs -f sinapsis-zonos
|
|
202
214
|
```
|
|
215
|
+
|
|
216
|
+
- For Orpheus-CPP:
|
|
217
|
+
```bash
|
|
218
|
+
docker logs -f sinapsis-orpheus-tts
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
- For Parakeet:
|
|
222
|
+
```bash
|
|
223
|
+
docker logs -f sinapsis-parakeet
|
|
224
|
+
```
|
|
225
|
+
|
|
203
226
|
4. **The logs will display the URL to access the webapp, e.g.,:**:
|
|
204
227
|
```bash
|
|
205
228
|
Running on local URL: http://127.0.0.1:7860
|
|
@@ -216,6 +239,17 @@ docker compose -f docker/compose_apps.yaml down
|
|
|
216
239
|
|
|
217
240
|
To run the webapp using the <code>uv</code> package manager, follow these steps:
|
|
218
241
|
|
|
242
|
+
|
|
243
|
+
> [!IMPORTANT]
|
|
244
|
+
> If you're using sinapsis-orpheus-cpp, you need to export cuda environment variables:
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
export CMAKE_ARGS="-DGGML_CUDA=on"
|
|
249
|
+
export FORCE_CMAKE="1"
|
|
250
|
+
export CUDACXX=$(command -v nvcc)
|
|
251
|
+
```
|
|
252
|
+
|
|
219
253
|
1. **Sync the virtual environment**:
|
|
220
254
|
|
|
221
255
|
```bash
|
sinapsis_speech-0.4.0/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
from elevenlabs import Voice, VoiceSettings
|
|
5
|
+
from elevenlabs.client import ElevenLabs
|
|
6
|
+
from sinapsis_core.data_containers.data_packet import TextPacket
|
|
7
|
+
from sinapsis_core.utils.logging_utils import sinapsis_logger
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def create_voice_settings(settings: VoiceSettings, as_json: bool = False) -> VoiceSettings | None | str:
|
|
11
|
+
"""
|
|
12
|
+
Creates or updates a `VoiceSettings` object based on the provided settings.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
settings (VoiceSettings | None): An instance of `VoiceSettings` containing the settings to be applied.
|
|
16
|
+
If `None`, the function returns the default settings.
|
|
17
|
+
as_json (bool): Whether to return the settings as JSON string.
|
|
18
|
+
|
|
19
|
+
Returns:
|
|
20
|
+
VoiceSettings | None | str: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
|
|
21
|
+
`None` is returned for default settings.
|
|
22
|
+
"""
|
|
23
|
+
if not settings:
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
if as_json:
|
|
27
|
+
return json.dumps(settings.model_dump(exclude_none=True))
|
|
28
|
+
|
|
29
|
+
return settings
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_voice_id(client: ElevenLabs, voice: str | Voice | None) -> str:
|
|
33
|
+
"""
|
|
34
|
+
Resolves the voice ID for a given voice name or ID.
|
|
35
|
+
|
|
36
|
+
This function searches through available voices from the ElevenLabs API
|
|
37
|
+
to match the provided voice name or ID. If the specified voice is not found,
|
|
38
|
+
it logs the error and returns the first available voice ID as a fallback.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
client (ElevenLabs): The ElevenLabs API client instance.
|
|
42
|
+
voice (str | Voice | None): The name or ID of the desired voice.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
str: The resolved voice ID.
|
|
46
|
+
|
|
47
|
+
Raises:
|
|
48
|
+
ValueError: If no voices are available to resolve.
|
|
49
|
+
"""
|
|
50
|
+
if not voice:
|
|
51
|
+
return get_default_voice(client).voice_id
|
|
52
|
+
|
|
53
|
+
if isinstance(voice, Voice):
|
|
54
|
+
sinapsis_logger.debug(f"Voice object provided, using voice_id: {voice.voice_id}")
|
|
55
|
+
return voice.voice_id
|
|
56
|
+
|
|
57
|
+
try:
|
|
58
|
+
voices_response = client.voices.get_all()
|
|
59
|
+
voices = voices_response.voices
|
|
60
|
+
|
|
61
|
+
for v in voices:
|
|
62
|
+
if voice == v.name or voice == v.voice_id:
|
|
63
|
+
sinapsis_logger.debug(f"Voice {voice} resolved to ID: {v.voice_id}")
|
|
64
|
+
return v.voice_id
|
|
65
|
+
|
|
66
|
+
sinapsis_logger.error(f"Voice {voice} is not available.")
|
|
67
|
+
if voices:
|
|
68
|
+
sinapsis_logger.info(f"Returning default voice ID: {voices[0].voice_id}")
|
|
69
|
+
return voices[0].voice_id
|
|
70
|
+
|
|
71
|
+
raise ValueError("No available voices to resolve. Ensure the client is configured correctly.")
|
|
72
|
+
except Exception as e:
|
|
73
|
+
sinapsis_logger.error(f"Error resolving voice ID: {e}")
|
|
74
|
+
raise
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_default_voice(client: ElevenLabs) -> Voice:
|
|
78
|
+
"""
|
|
79
|
+
Gets the first available voice as default.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
client (ElevenLabs): The ElevenLabs API client instance.
|
|
83
|
+
|
|
84
|
+
Returns:
|
|
85
|
+
Voice: The default voice object.
|
|
86
|
+
"""
|
|
87
|
+
try:
|
|
88
|
+
voices_response = client.voices.get_all()
|
|
89
|
+
voices = voices_response.voices
|
|
90
|
+
if voices:
|
|
91
|
+
return voices[0]
|
|
92
|
+
raise ValueError("No voices available")
|
|
93
|
+
except Exception as e:
|
|
94
|
+
sinapsis_logger.error(f"Error getting default voice: {e}")
|
|
95
|
+
raise
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def load_input_text(input_data: list[TextPacket]) -> str:
|
|
99
|
+
"""Loads and concatenates the text content from a list of TextPacket objects."""
|
|
100
|
+
return "".join([item.content for item in input_data])
|
|
@@ -7,6 +7,8 @@ _root_lib_path = "sinapsis_elevenlabs.templates"
|
|
|
7
7
|
_template_lookup = {
|
|
8
8
|
"ElevenLabsTTS": f"{_root_lib_path}.elevenlabs_tts",
|
|
9
9
|
"ElevenLabsVoiceGeneration": f"{_root_lib_path}.elevenlabs_voice_generation",
|
|
10
|
+
"ElevenLabsVoiceClone": f"{_root_lib_path}.elevenlabs_voice_clone",
|
|
11
|
+
"ElevenLabsSTS": f"{_root_lib_path}.elevenlabs_sts",
|
|
10
12
|
}
|
|
11
13
|
|
|
12
14
|
|
|
@@ -5,10 +5,10 @@ import abc
|
|
|
5
5
|
import os
|
|
6
6
|
import uuid
|
|
7
7
|
from io import BytesIO
|
|
8
|
-
from typing import IO, Iterator, Literal
|
|
8
|
+
from typing import IO, Iterable, Iterator, Literal
|
|
9
9
|
|
|
10
10
|
from elevenlabs import Voice, VoiceSettings, save
|
|
11
|
-
from elevenlabs.client import ElevenLabs
|
|
11
|
+
from elevenlabs.client import ElevenLabs
|
|
12
12
|
from elevenlabs.types import OutputFormat
|
|
13
13
|
from pydantic import Field
|
|
14
14
|
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
|
|
@@ -53,7 +53,7 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
53
53
|
"mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
|
|
54
54
|
output_folder (str): The folder where generated audio files will be saved.
|
|
55
55
|
stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
|
|
56
|
-
voice (
|
|
56
|
+
voice (str | Voice | None): The voice to use for speech synthesis. This can be a voice ID (str),
|
|
57
57
|
a voice name (str) or an elevenlabs voice object (Voice).
|
|
58
58
|
voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
|
|
59
59
|
- stability (float)
|
|
@@ -76,7 +76,7 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
76
76
|
output_format: OutputFormat = "mp3_44100_128"
|
|
77
77
|
output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
|
|
78
78
|
stream: bool = False
|
|
79
|
-
voice:
|
|
79
|
+
voice: str | Voice | None = None
|
|
80
80
|
voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]
|
|
81
81
|
|
|
82
82
|
UIProperties = UIPropertiesMetadata(category="Elevenlabs", output_type=OutputTypes.AUDIO)
|
|
@@ -100,7 +100,7 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
100
100
|
def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
|
|
101
101
|
"""Abstract method for ElevenLabs speech synthesis."""
|
|
102
102
|
|
|
103
|
-
def _save_audio(self, response:
|
|
103
|
+
def _save_audio(self, response: Iterable | bytes, file_format: str, idx: int) -> str:
|
|
104
104
|
"""Saves the audio to a file and returns the file path."""
|
|
105
105
|
if self.attributes.output_file_name:
|
|
106
106
|
file_name = self.attributes.output_file_name + "_" + str(idx)
|
|
@@ -116,7 +116,7 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
116
116
|
self.logger.error(f"File system error while saving speech to file: {e}")
|
|
117
117
|
raise
|
|
118
118
|
|
|
119
|
-
def _generate_audio_stream(self, response:
|
|
119
|
+
def _generate_audio_stream(self, response: Iterable | bytes) -> IO[bytes]:
|
|
120
120
|
"""Generates and returns the audio stream."""
|
|
121
121
|
audio_stream = BytesIO()
|
|
122
122
|
try:
|
|
@@ -139,7 +139,7 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
139
139
|
self.logger.error(f"Value error while processing audio chunks: {e}")
|
|
140
140
|
raise
|
|
141
141
|
|
|
142
|
-
def _process_audio_output(self, idx: int, response:
|
|
142
|
+
def _process_audio_output(self, idx: int, response: Iterable | bytes) -> str | IO[bytes]:
|
|
143
143
|
"""Processes a single audio output (either stream or file)."""
|
|
144
144
|
if self.attributes.stream:
|
|
145
145
|
return self._generate_audio_stream(response)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Speech-To-Speech template for ElevenLabs."""
|
|
3
|
+
|
|
4
|
+
from typing import Callable, Iterator, Literal
|
|
5
|
+
|
|
6
|
+
from sinapsis_core.data_containers.data_packet import AudioPacket
|
|
7
|
+
|
|
8
|
+
from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id
|
|
9
|
+
from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ElevenLabsSTS(ElevenLabsBase):
|
|
13
|
+
"""Template to interact with the ElevenLabs Speech-to-Speech API.
|
|
14
|
+
|
|
15
|
+
This template takes an input audio and converts it to a new voice using
|
|
16
|
+
the ElevenLabs Speech-to-Speech (STS) API.
|
|
17
|
+
|
|
18
|
+
Usage example:
|
|
19
|
+
|
|
20
|
+
agent:
|
|
21
|
+
name: my_test_agent
|
|
22
|
+
templates:
|
|
23
|
+
- template_name: InputTemplate
|
|
24
|
+
class_name: InputTemplate
|
|
25
|
+
attributes: {}
|
|
26
|
+
- template_name: ElevenLabsSTS
|
|
27
|
+
class_name: ElevenLabsSTS
|
|
28
|
+
template_input: InputTemplate
|
|
29
|
+
attributes:
|
|
30
|
+
api_key: null
|
|
31
|
+
model: eleven_multilingual_sts_v2
|
|
32
|
+
output_file_name: null
|
|
33
|
+
output_format: mp3_44100_128
|
|
34
|
+
output_folder: ~/.cache/sinapsis/elevenlabs/audios
|
|
35
|
+
stream: false
|
|
36
|
+
voice: null
|
|
37
|
+
voice_settings:
|
|
38
|
+
stability: null
|
|
39
|
+
similarity_boost: null
|
|
40
|
+
style: null
|
|
41
|
+
use_speaker_boost: null
|
|
42
|
+
speed: null
|
|
43
|
+
streaming_latency: null
|
|
44
|
+
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
PACKET_TYPE_NAME: str = "audios"
|
|
48
|
+
|
|
49
|
+
class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
|
|
50
|
+
"""Attributes specific to ElevenLabs STS API interaction.
|
|
51
|
+
|
|
52
|
+
Attributes:
|
|
53
|
+
model (Literal): The STS model to use. Options are "eleven_english_sts_v2" or "eleven_multilingual_sts_v2".
|
|
54
|
+
streaming_latency (int | None): Optional latency optimization for streaming. Defaults to None.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
model: Literal["eleven_english_sts_v2", "eleven_multilingual_sts_v2"] = "eleven_multilingual_sts_v2"
|
|
58
|
+
streaming_latency: int | None = None
|
|
59
|
+
|
|
60
|
+
def synthesize_speech(self, input_data: list[AudioPacket]) -> Iterator[bytes]:
|
|
61
|
+
"""Sends an audio input to the ElevenLabs API for speech-to-speech synthesis.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
input_data (list[AudioPacket]): List of AudioPacket objects containing the audio to be converted.
|
|
65
|
+
Only the first AudioPacket in the list is used.
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
Iterator[bytes]: An iterator yielding audio data chunks in the output format specified.
|
|
69
|
+
|
|
70
|
+
Raises:
|
|
71
|
+
ValueError: If there is a problem with the input data or parameters.
|
|
72
|
+
TypeError: If the input data or files are of incorrect type.
|
|
73
|
+
KeyError: If the expected key is missing in the API response.
|
|
74
|
+
"""
|
|
75
|
+
try:
|
|
76
|
+
method: Callable[..., Iterator[bytes]] = (
|
|
77
|
+
self.client.speech_to_speech.stream if self.attributes.stream else self.client.speech_to_speech.convert
|
|
78
|
+
)
|
|
79
|
+
return method(
|
|
80
|
+
voice_id=get_voice_id(self.client, voice=self.attributes.voice),
|
|
81
|
+
audio=input_data[0].content,
|
|
82
|
+
model_id=self.attributes.model,
|
|
83
|
+
voice_settings=create_voice_settings(self.attributes.voice_settings, as_json=True),
|
|
84
|
+
output_format=self.attributes.output_format,
|
|
85
|
+
optimize_streaming_latency=self.attributes.streaming_latency,
|
|
86
|
+
)
|
|
87
|
+
except ValueError as e:
|
|
88
|
+
self.logger.error(f"Value error synthesizing speech: {e}")
|
|
89
|
+
raise
|
|
90
|
+
except TypeError as e:
|
|
91
|
+
self.logger.error(f"Type error in input data or parameters: {e}")
|
|
92
|
+
raise
|
|
93
|
+
except KeyError as e:
|
|
94
|
+
self.logger.error(f"Missing key in input data or settings: {e}")
|
|
95
|
+
raise
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
"""Text-To-Speech template for ElevenLabs"""
|
|
3
3
|
|
|
4
|
-
from typing import Iterator, Literal
|
|
4
|
+
from typing import Callable, Iterator, Literal
|
|
5
5
|
|
|
6
6
|
from sinapsis_core.data_containers.data_packet import TextPacket
|
|
7
7
|
|
|
8
8
|
from sinapsis_elevenlabs.helpers.voice_utils import (
|
|
9
9
|
create_voice_settings,
|
|
10
|
+
get_voice_id,
|
|
10
11
|
load_input_text,
|
|
11
12
|
)
|
|
12
13
|
from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
|
|
@@ -64,16 +65,16 @@ class ElevenLabsTTS(ElevenLabsBase):
|
|
|
64
65
|
"""
|
|
65
66
|
input_text: str = load_input_text(input_data)
|
|
66
67
|
try:
|
|
67
|
-
|
|
68
|
+
method: Callable[..., Iterator[bytes]] = (
|
|
69
|
+
self.client.text_to_speech.stream if self.attributes.stream else self.client.text_to_speech.convert
|
|
70
|
+
)
|
|
71
|
+
return method(
|
|
68
72
|
text=input_text,
|
|
69
|
-
|
|
70
|
-
|
|
73
|
+
voice_id=get_voice_id(self.client, self.attributes.voice),
|
|
74
|
+
model_id=self.attributes.model,
|
|
71
75
|
voice_settings=create_voice_settings(self.attributes.voice_settings),
|
|
72
76
|
output_format=self.attributes.output_format,
|
|
73
|
-
stream=self.attributes.stream,
|
|
74
77
|
)
|
|
75
|
-
|
|
76
|
-
return response
|
|
77
78
|
except ValueError as e:
|
|
78
79
|
self.logger.error(f"Value error synthesizing speech: {e}")
|
|
79
80
|
raise
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Text-To-Speech template for ElevenLabs Voice Cloning."""
|
|
3
|
+
|
|
4
|
+
from elevenlabs import Voice
|
|
5
|
+
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
|
|
6
|
+
|
|
7
|
+
from sinapsis_elevenlabs.templates.elevenlabs_tts import ElevenLabsTTS
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ElevenLabsVoiceClone(ElevenLabsTTS):
|
|
11
|
+
"""Template to clone a voice using the ElevenLabs API.
|
|
12
|
+
|
|
13
|
+
This template allows you to create a new custom voice in ElevenLabs by providing
|
|
14
|
+
one or more audio samples. The cloned voice can then be used for subsequent
|
|
15
|
+
text-to-speech synthesis within the Sinapsis pipeline.
|
|
16
|
+
|
|
17
|
+
Usage example:
|
|
18
|
+
|
|
19
|
+
agent:
|
|
20
|
+
name: my_test_agent
|
|
21
|
+
templates:
|
|
22
|
+
- template_name: InputTemplate
|
|
23
|
+
class_name: InputTemplate
|
|
24
|
+
attributes: {}
|
|
25
|
+
- template_name: ElevenLabsVoiceClone
|
|
26
|
+
class_name: ElevenLabsVoiceClone
|
|
27
|
+
template_input: InputTemplate
|
|
28
|
+
attributes:
|
|
29
|
+
api_key: null
|
|
30
|
+
model: eleven_turbo_v2_5
|
|
31
|
+
output_file_name: null
|
|
32
|
+
output_format: mp3_44100_128
|
|
33
|
+
output_folder: ~/.cache/sinapsis/elevenlabs/audios
|
|
34
|
+
stream: false
|
|
35
|
+
voice: null
|
|
36
|
+
voice_settings:
|
|
37
|
+
stability: null
|
|
38
|
+
similarity_boost: null
|
|
39
|
+
style: null
|
|
40
|
+
use_speaker_boost: null
|
|
41
|
+
speed: null
|
|
42
|
+
name: null
|
|
43
|
+
description: null
|
|
44
|
+
remove_background_noise: false
|
|
45
|
+
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
class AttributesBaseModel(ElevenLabsTTS.AttributesBaseModel):
|
|
49
|
+
"""Attributes specific to the ElevenLabsVoiceClone class.
|
|
50
|
+
|
|
51
|
+
Attributes:
|
|
52
|
+
name (str | None): Name for the cloned voice. If None, a default name may be used.
|
|
53
|
+
description (str | None): Description for the cloned voice. Optional.
|
|
54
|
+
remove_background_noise (bool): Whether to remove background noise from samples. Defaults to False.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
name: str | None = None
|
|
58
|
+
description: str | None = None
|
|
59
|
+
remove_background_noise: bool = False
|
|
60
|
+
|
|
61
|
+
def clone_voice(self, input_data: list[AudioPacket]) -> Voice:
|
|
62
|
+
"""Clones a voice using the provided audio files.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
input_data (list[AudioPacket]): List of AudioPacket objects containing the audio samples
|
|
66
|
+
to be used for voice cloning. Each AudioPacket's `content` should be a file-like object
|
|
67
|
+
or bytes representing the audio data.
|
|
68
|
+
**NOTE:** All provided audio packets are used as reference for a single cloned voice.
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
Voice: The cloned Voice object as returned by the ElevenLabs API.
|
|
72
|
+
|
|
73
|
+
Raises:
|
|
74
|
+
ValueError: If there is a problem with the input data or parameters.
|
|
75
|
+
TypeError: If the input data or files are of incorrect type.
|
|
76
|
+
KeyError: If the expected key is missing in the API response.
|
|
77
|
+
"""
|
|
78
|
+
files = [audio.content for audio in input_data]
|
|
79
|
+
try:
|
|
80
|
+
clone_response = self.client.voices.ivc.create(
|
|
81
|
+
name=self.attributes.name,
|
|
82
|
+
files=files,
|
|
83
|
+
description=self.attributes.description,
|
|
84
|
+
remove_background_noise=self.attributes.remove_background_noise,
|
|
85
|
+
)
|
|
86
|
+
cloned_voice = self.client.voices.get(clone_response.voice_id)
|
|
87
|
+
self.logger.info(f"Voice cloned successfully with IVC: {cloned_voice.name}")
|
|
88
|
+
return cloned_voice
|
|
89
|
+
except ValueError as e:
|
|
90
|
+
self.logger.error(f"Value error in input data or parameters: {e}")
|
|
91
|
+
raise
|
|
92
|
+
except TypeError as e:
|
|
93
|
+
self.logger.error(f"Type error with input data or files: {e}")
|
|
94
|
+
raise
|
|
95
|
+
except KeyError as e:
|
|
96
|
+
self.logger.error(f"Missing expected key in API response: {e}")
|
|
97
|
+
raise
|
|
98
|
+
|
|
99
|
+
def execute(self, container: DataContainer) -> DataContainer:
|
|
100
|
+
"""Executes the voice cloning process and generates the speech output.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
container (DataContainer): The input DataContainer, expected to contain
|
|
104
|
+
one or more AudioPacket objects in the `audios` attribute.
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
DataContainer: The updated DataContainer. If cloning is successful,
|
|
108
|
+
the cloned voice is set in `self.attributes.voice` and the parent
|
|
109
|
+
TTS execution is performed using the new voice.
|
|
110
|
+
|
|
111
|
+
Side Effects:
|
|
112
|
+
- Updates `self.attributes.voice` with the cloned Voice object.
|
|
113
|
+
- May log errors or info messages.
|
|
114
|
+
"""
|
|
115
|
+
audios = container.audios
|
|
116
|
+
if not audios:
|
|
117
|
+
self.logger.debug("No audios provided to clone voice")
|
|
118
|
+
return container
|
|
119
|
+
self.attributes.voice = self.clone_voice(audios)
|
|
120
|
+
|
|
121
|
+
container = super().execute(container)
|
|
122
|
+
|
|
123
|
+
return container
|
sinapsis_speech-0.4.0/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import importlib
|
|
3
|
+
from typing import Callable
|
|
4
|
+
|
|
5
|
+
_root_lib_path = "sinapsis_orpheus_cpp.templates"
|
|
6
|
+
|
|
7
|
+
_template_lookup = {
|
|
8
|
+
"OrpheusTTS": f"{_root_lib_path}.orpheus_tts",
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def __getattr__(name: str) -> Callable:
|
|
13
|
+
if name in _template_lookup:
|
|
14
|
+
module = importlib.import_module(_template_lookup[name])
|
|
15
|
+
return getattr(module, name)
|
|
16
|
+
|
|
17
|
+
raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
__all__ = list(_template_lookup.keys())
|