sinapsis-speech 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +35 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +80 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +21 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +204 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +85 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +78 -0
- sinapsis_speech-0.1.0.dist-info/METADATA +872 -0
- sinapsis_speech-0.1.0.dist-info/RECORD +13 -0
- sinapsis_speech-0.1.0.dist-info/WHEEL +5 -0
- sinapsis_speech-0.1.0.dist-info/licenses/LICENSE +661 -0
- sinapsis_speech-0.1.0.dist-info/top_level.txt +1 -0
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from sinapsis_core.utils.env_var_keys import EnvVarEntry, doc_str, return_docs_for_vars
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _ElevenlabsKeys(BaseModel):
    """
    Env vars for Elevenlabs.

    Each field is an ``EnvVarEntry`` describing one environment variable:
    its name, default value, allowed values, and a short description.
    """

    # API key used to authenticate against the ElevenLabs API.
    # NOTE(review): the default is a single space (" "), not None, so callers
    # that compare the resolved value against None will never see a missing
    # key — confirm this default is intended.
    ELEVENLABS_API_KEY: EnvVarEntry = EnvVarEntry(
        var_name="ELEVENLABS_API_KEY",
        default_value=" ",
        allowed_values=None,
        description="set api key for Elevenlabs",
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Singleton instance exposing the ElevenLabs env var entries for this package.
ElevenlabsEnvVars = _ElevenlabsKeys()

# Rebind the imported `doc_str` to include documentation for these env vars,
# then publish the combined text as this module's docstring.
doc_str = return_docs_for_vars(ElevenlabsEnvVars, docs=doc_str, string_for_doc="""Elevenlabs env vars available: \n""")
__doc__ = doc_str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def __getattr__(name: str) -> Any:
    """Expose each env var's resolved value as a module-level attribute.

    Intended for read-only import use, when updating the value is not
    important. Raises AttributeError for names that are not env vars.
    """
    if name not in ElevenlabsEnvVars.model_fields:
        raise AttributeError(f"Agent does not have `{name}` env var")
    return ElevenlabsEnvVars.model_fields[name].default.value
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Public API of this module: every env var name plus the singleton container.
# Fixes the original `_all__` typo, which left `__all__` undefined and so
# silently disabled explicit star-import export control.
__all__ = (*list(ElevenlabsEnvVars.model_fields.keys()), "ElevenlabsEnvVars")
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from elevenlabs import VoiceSettings
|
|
3
|
+
from elevenlabs.client import DEFAULT_VOICE, ElevenLabs, VoiceId, VoiceName
|
|
4
|
+
from sinapsis_core.data_containers.data_packet import TextPacket
|
|
5
|
+
from sinapsis_core.utils.logging_utils import sinapsis_logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_voice_settings(settings: VoiceSettings) -> VoiceSettings | None:
|
|
9
|
+
"""
|
|
10
|
+
Creates or updates a `VoiceSettings` object based on the provided settings.
|
|
11
|
+
|
|
12
|
+
This function attempts to create or update a `VoiceSettings` object using the provided
|
|
13
|
+
`VoiceSettings` instance. If any of the fields in the settings contain `None`,
|
|
14
|
+
the corresponding field is populated with a default value from `DEFAULT_VOICE.settings`.
|
|
15
|
+
If all fields are valid (i.e., none are `None`), the provided `settings` object is returned unchanged.
|
|
16
|
+
|
|
17
|
+
If the settings argument is `None` or if no valid settings are provided, the function
|
|
18
|
+
returns `None`.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
settings (VoiceSettings): An instance of `VoiceSettings` containing the settings to be applied.
|
|
22
|
+
This object may have fields with `None` values that should be replaced with default values.
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
VoiceSettings: A `VoiceSettings` object created or updated with the provided settings. If any field
|
|
26
|
+
was `None`, it is updated with default values. If the settings are invalid or empty,
|
|
27
|
+
`None` is returned.
|
|
28
|
+
"""
|
|
29
|
+
if settings:
|
|
30
|
+
settings_dict = settings.model_dump()
|
|
31
|
+
if any(value is None for value in settings_dict.values()):
|
|
32
|
+
for field, value in settings_dict.items():
|
|
33
|
+
if value is None:
|
|
34
|
+
settings_dict[field] = getattr(DEFAULT_VOICE.settings, field)
|
|
35
|
+
|
|
36
|
+
return VoiceSettings(**settings_dict)
|
|
37
|
+
else:
|
|
38
|
+
return settings
|
|
39
|
+
return None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_voice_id(client: ElevenLabs, voice: VoiceId | VoiceName) -> VoiceId:
    """
    Resolves the voice ID for a given voice name or ID.

    Searches the voices available through the ElevenLabs API for one whose
    name or ID matches ``voice``. When no match is found, the mismatch is
    logged and the first available voice ID is returned as a fallback.

    Args:
        client (ElevenLabs): The ElevenLabs API client instance.
        voice (VoiceId | VoiceName): The name or ID of the desired voice.

    Returns:
        VoiceId: The resolved voice ID.

    Raises:
        ValueError: If no voices are available to resolve.
    """
    try:
        available = client.voices.get_all().voices
        for candidate in available:
            if voice in (candidate.name, candidate.voice_id):
                sinapsis_logger.debug("Voice '%s' resolved to ID: %s", voice, candidate.voice_id)
                return candidate.voice_id

        sinapsis_logger.error("Voice '%s' is not available.", voice)
        if not available:
            raise ValueError("No available voices to resolve. Ensure the client is configured correctly.")

        fallback = available[0].voice_id
        sinapsis_logger.info("Returning default voice ID: %s", fallback)
        return fallback
    except Exception as e:
        sinapsis_logger.error("Error resolving voice ID: %s", e)
        raise
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def load_input_text(input_data: list[TextPacket]) -> str:
    """Concatenate the ``content`` of every TextPacket into a single string."""
    text_fragments = (packet.content for packet in input_data)
    return "".join(text_fragments)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import importlib
|
|
3
|
+
from typing import Callable
|
|
4
|
+
|
|
5
|
+
# Root import path for the template modules shipped in this package.
_root_lib_path = "sinapsis_elevenlabs.templates"

# Maps each public template class name to the module that defines it, so the
# classes can be imported lazily on first attribute access (see __getattr__).
_template_lookup = {
    "ElevenLabsTTS": f"{_root_lib_path}.elevenlabs_tts",
    "ElevenLabsVoiceGeneration": f"{_root_lib_path}.elevenlabs_voice_generation",
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def __getattr__(name: str) -> Callable:
    """Lazily import and return the template class registered under ``name``.

    Raises AttributeError when ``name`` is not a known template.
    """
    module_path = _template_lookup.get(name)
    if module_path is None:
        raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
    return getattr(importlib.import_module(module_path), name)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Names exported by `from sinapsis_elevenlabs.templates import *`.
__all__ = list(_template_lookup.keys())
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Base template for ElevenLabs speech synthesis"""
|
|
3
|
+
|
|
4
|
+
import abc
|
|
5
|
+
import os
|
|
6
|
+
import uuid
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import IO, Iterator, Literal
|
|
9
|
+
|
|
10
|
+
from elevenlabs import Voice, VoiceSettings, save
|
|
11
|
+
from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
|
|
12
|
+
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
|
|
13
|
+
from sinapsis_core.template_base.template import (
|
|
14
|
+
Template,
|
|
15
|
+
TemplateAttributes,
|
|
16
|
+
TemplateAttributeType,
|
|
17
|
+
)
|
|
18
|
+
from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
|
|
19
|
+
|
|
20
|
+
from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
|
|
21
|
+
|
|
22
|
+
RESPONSE_TYPE = Iterator[bytes] | list[bytes] | list[Iterator[bytes]] | None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ElevenLabsBase(Template, abc.ABC):
    """
    Base template to perform audio generation tasks using the Elevenlabs package.

    The template takes as attributes the elevenlabs api key, the voice for the generated audio,
    settings associated with the audio (such as stability, style, etc.), the model to be used,
    the format for the audio, the path, etc. It implements methods to process the
    DataContainer, initialize the Elevenlabs client, perform the inference,
    and store the audio.

    Subclasses must implement `synthesize_speech`, which produces the raw
    audio bytes (or iterators of bytes) consumed by `generate_speech`.
    """

    # Attribute of the DataContainer that holds the input packets ("texts" by default).
    PACKET_TYPE_NAME: str = "texts"

    class AttributesBaseModel(TemplateAttributes):
        """
        Attributes for ElevenLabs Base Class.
        Args:
            api_key (str): The API key to authenticate with ElevenLabs' API.
            voice (str|elevenlabs.Voice): The voice to use for speech synthesis. This can be a voice ID (str),
                a voice name (str) or an elevenlabs voice object (Voice).
            voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
                - stability (float)
                - similarity_boost (float)
                - style (float)
                - use_speaker_boost (bool)
            model (Literal): The model identifier to use for speech synthesis.
            output_format (Literal): The output audio format and quality. Options include:
                ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
                "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
            output_folder (str): The folder where generated audio files will be saved.
            stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
        """

        api_key: str | None = None
        # NOTE(review): annotation widened with `| None` so the declared default
        # (None) and the documented `voice: null` config examples validate.
        voice: VoiceId | VoiceName | Voice | None = None
        voice_settings: VoiceSettings | None = None
        model: Literal[
            "eleven_turbo_v2_5",
            "eleven_multilingual_v2",
            "eleven_turbo_v2",
            "eleven_monolingual_v1",
            "eleven_multilingual_v1",
            "eleven_english_sts_v2",
            "eleven_multilingual_sts_v2",
        ] = "eleven_turbo_v2_5"
        output_format: Literal[
            "mp3_22050_32",
            "mp3_44100_32",
            "mp3_44100_64",
            "mp3_44100_96",
            "mp3_44100_128",
            "mp3_44100_192",
            "pcm_16000",
            "pcm_22050",
            "pcm_24000",
            "pcm_44100",
            "ulaw_8000",
        ] = "mp3_44100_128"
        output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
        stream: bool = False

    def __init__(self, attributes: TemplateAttributeType) -> None:
        """Initializes the ElevenLabs API client with the given attributes."""
        super().__init__(attributes)
        # Ensure the destination folder exists before any audio is written.
        os.makedirs(self.attributes.output_folder, exist_ok=True)
        self.client = self.init_elevenlabs_client()

    def init_elevenlabs_client(self) -> ElevenLabs:
        """Resets client object"""
        # Explicit attribute takes precedence over the environment-provided key.
        # NOTE(review): ELEVENLABS_API_KEY is declared with default " " (a space),
        # so this fallback may hand the client a blank key when the env var is
        # unset — confirm EnvVarEntry resolves the real environment value here.
        key = self.attributes.api_key if self.attributes.api_key else ELEVENLABS_API_KEY
        return ElevenLabs(api_key=key)

    def reset_state(self) -> None:
        """Resets state of model by re-creating the ElevenLabs client."""
        self.client = self.init_elevenlabs_client()

    @abc.abstractmethod
    def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
        """Abstract method for ElevenLabs speech synthesis."""

    def _save_audio(self, response: Iterator[bytes] | bytes, file_format: str) -> str:
        """Saves the audio to a file and returns the file path.

        Raises:
            OSError: If the file cannot be written (logged before re-raising).
        """
        # A random UUID filename avoids collisions between successive generations.
        output_file = os.path.join(self.attributes.output_folder, f"{uuid.uuid4()}.{file_format}")
        try:
            save(response, output_file)
            self.logger.info(f"Audio saved to: {output_file}")
            return output_file
        except OSError as e:
            self.logger.error(f"File system error while saving speech to file: {e}")
            raise

    def _generate_audio_stream(self, response: Iterator[bytes] | bytes) -> IO[bytes]:
        """Generates and returns the audio stream.

        Accepts either a chunked iterator of bytes or a single bytes blob and
        copies it into an in-memory buffer, rewound and ready for reading.

        Raises:
            TypeError: If `response` is neither an Iterator nor bytes.
        """
        audio_stream = BytesIO()
        try:
            if isinstance(response, Iterator):
                # Skip empty chunks the API may emit between data frames.
                for chunk in response:
                    if chunk:
                        audio_stream.write(chunk)
            elif isinstance(response, bytes):
                audio_stream.write(response)
            else:
                raise TypeError(f"Unsupported response type: {type(response)}")

            # Rewind so callers can read the stream from the beginning.
            audio_stream.seek(0)
            self.logger.info("Returning audio stream")
            return audio_stream
        except IOError as e:
            self.logger.error(f"I/O error while processing the audio stream: {e}")
            raise
        except ValueError as e:
            self.logger.error(f"Value error while processing audio chunks: {e}")
            raise

    def _process_audio_output(self, response: Iterator[bytes] | bytes) -> str | IO[bytes]:
        """Processes a single audio output (either stream or file).

        Returns an in-memory stream when `stream` is enabled, otherwise the
        path of the saved file.
        """
        if self.attributes.stream:
            return self._generate_audio_stream(response)
        else:
            # Non-mp3 formats (pcm_*, ulaw_*) are saved with a .wav extension.
            file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
            return self._save_audio(response, file_format)

    def generate_speech(self, input_data: list[Packet]) -> list[str | IO[bytes]] | None:
        """Generates speech and saves it to a file.

        Returns a list of file paths or audio streams, or None when the
        subclass produced no response.
        """
        responses: RESPONSE_TYPE = self.synthesize_speech(input_data)
        # NOTE: an Iterator is always truthy, so only None/empty list short-circuit here.
        if not responses:
            return None

        # Normalize a single iterator response into a one-element list.
        if isinstance(responses, Iterator):
            responses = [responses]

        audio_outputs = [self._process_audio_output(response) for response in responses]
        return audio_outputs

    def _handle_streaming_output(self, audio_outputs: list[str | IO[bytes]]) -> list[AudioPacket]:
        """Handles audio stream output by adding it to the container as AudioPackets."""
        generated_audios: list[AudioPacket] = []
        # Sample rate is the middle token of the format, e.g. "mp3_44100_128" -> 44100.
        sample_rate = int(self.attributes.output_format.split("_")[1])
        for audio_output in audio_outputs:
            audio_packet = AudioPacket(
                content=audio_output,
                sample_rate=sample_rate,
            )
            generated_audios.append(audio_packet)
        return generated_audios

    def _handle_audio_outputs(self, audio_outputs: list[str | IO[bytes]], container: DataContainer) -> None:
        """Handles the audio outputs by appending to the container based on the output type (stream or file)."""
        if self.attributes.stream:
            container.audios = container.audios or []
            container.audios.extend(self._handle_streaming_output(audio_outputs))
        else:
            # File paths are attached as generic data rather than AudioPackets.
            self._set_generic_data(container, audio_outputs)

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Processes the input data and generates a speech output.
        Depending on the configuration, either a file or a stream of audio is
        generated and added to the provided `container`.
        """

        # NOTE(review): ELEVENLABS_API_KEY defaults to " " (see env_var_keys), so
        # this `is None` guard likely never fires even when no key is configured
        # — verify the missing-key case is actually caught here.
        if ELEVENLABS_API_KEY is None and self.attributes.api_key is None:
            self.logger.error("Api key was not provided")
            return container

        data_packet = getattr(container, self.PACKET_TYPE_NAME)

        if not data_packet:
            self.logger.debug("No query to enter")
            return container

        audio_outputs = self.generate_speech(data_packet)
        if not audio_outputs:
            self.logger.error("Unable to generate speech")
            return container

        self._handle_audio_outputs(audio_outputs, container)

        return container
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Text-To-Speech template for ElevenLabs"""
|
|
3
|
+
|
|
4
|
+
from typing import Iterator, Literal
|
|
5
|
+
|
|
6
|
+
from sinapsis_core.data_containers.data_packet import TextPacket
|
|
7
|
+
|
|
8
|
+
from sinapsis_elevenlabs.helpers.voice_utils import (
|
|
9
|
+
create_voice_settings,
|
|
10
|
+
load_input_text,
|
|
11
|
+
)
|
|
12
|
+
from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ElevenLabsTTS(ElevenLabsBase):
    """Template to interact with ElevenLabs text-to-speech API.

    Converts the text packets found in the incoming DataContainer into audio
    via the ElevenLabs text-to-speech endpoint. The voice, model, voice
    settings and output audio format are all configurable through attributes.

    Usage example:

    agent:
      name: my_test_agent
      templates:
      - template_name: InputTemplate
        class_name: InputTemplate
        attributes: {}
      - template_name: ElevenLabsTTS
        class_name: ElevenLabsTTS
        template_input: InputTemplate
        attributes:
          voice: null
          voice_settings: null
          model: eleven_turbo_v2_5
          output_format: mp3_44100_128
          output_folder: /sinapsis/cache/dir/elevenlabs/audios
          stream: false

    """

    class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
        """Attributes specific to ElevenLabs TTS API interaction.

        Narrows the `model` choices inherited from `ElevenLabsBase` to the
        models supported by the text-to-speech endpoint.
        """

        model: Literal[
            "eleven_turbo_v2_5",
            "eleven_multilingual_v2",
            "eleven_turbo_v2",
            "eleven_monolingual_v1",
            "eleven_multilingual_v1",
        ] = "eleven_turbo_v2_5"

    def synthesize_speech(self, input_data: list[TextPacket]) -> Iterator[bytes]:
        """
        Sends the text to ElevenLabs API to generate speech.

        Concatenates the incoming text packets and asks the ElevenLabs client
        for audio using the configured voice, model and output settings.

        Raises:
            ValueError | TypeError | KeyError: Propagated from the client
                after being logged.
        """
        text_to_speak: str = load_input_text(input_data)
        try:
            audio_chunks: Iterator[bytes] = self.client.generate(
                text=text_to_speak,
                voice=self.attributes.voice,
                model=self.attributes.model,
                voice_settings=create_voice_settings(self.attributes.voice_settings),
                output_format=self.attributes.output_format,
                stream=self.attributes.stream,
            )
        except ValueError as e:
            self.logger.error(f"Value error synthesizing speech: {e}")
            raise
        except TypeError as e:
            self.logger.error(f"Type error in input data or parameters: {e}")
            raise
        except KeyError as e:
            self.logger.error(f"Missing key in input data or settings: {e}")
            raise
        return audio_chunks
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Text-To-Speech template for ElevenLabs"""
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
|
|
6
|
+
from sinapsis_core.data_containers.data_packet import TextPacket
|
|
7
|
+
|
|
8
|
+
from sinapsis_elevenlabs.helpers.voice_utils import load_input_text
|
|
9
|
+
from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ElevenLabsVoiceGeneration(ElevenLabsBase):
    """
    Template to generate a voice using ElevenLabs API.

    The voice description is supplied as an attribute, while the prompt to be
    spoken arrives as TextPackets in the DataContainer; the resulting preview
    audios are stored back into the DataContainer.

    Usage example:

    agent:
      name: my_test_agent
      templates:
      - template_name: InputTemplate
        class_name: InputTemplate
        attributes: {}
      - template_name: ElevenLabsVoiceGeneration
        class_name: ElevenLabsVoiceGeneration
        template_input: InputTemplate
        attributes:
          voice: null
          voice_settings: null
          model: eleven_turbo_v2_5
          output_format: mp3_44100_128
          output_folder: /sinapsis/cache/dir/elevenlabs/audios
          stream: false
          voice_description: An old British male with a raspy, deep voice. Professional,
            relaxed and assertive
    """

    class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
        """
        Attributes for voice generation in ElevenLabs API.
        """

        # Natural-language description of the voice to be generated.
        voice_description: str = "An old British male with a raspy, deep voice. Professional, relaxed and assertive"

    def synthesize_speech(self, input_data: list[TextPacket]) -> list[bytes] | None:
        """
        Sends the text to ElevenLabs API to generate speech.

        Creates voice previews for the configured description using the
        concatenated input text as the spoken sample, returning each preview
        decoded to raw bytes. Returns None when the text is too short.

        Raises:
            ValueError | TypeError | KeyError: Propagated from the client
                after being logged.
        """

        prompt_text: str = load_input_text(input_data)
        # The previews endpoint requires a minimum sample length.
        if len(prompt_text) < 100:
            self.logger.error("The text to be spoken must be at least 100 characters long.")
            return None
        try:
            preview_bundle = self.client.text_to_voice.create_previews(
                voice_description=self.attributes.voice_description,
                text=prompt_text,
            )
            decoded_previews: list[bytes] = [
                base64.b64decode(preview.audio_base_64) for preview in preview_bundle.previews
            ]
            return decoded_previews
        except ValueError as e:
            self.logger.error(f"Value error with voice description or input text: {e}")
            raise
        except TypeError as e:
            self.logger.error(f"Type error with input data or voice preview parameters: {e}")
            raise
        except KeyError as e:
            self.logger.error(f"Missing expected key in voice preview response: {e}")
            raise
|