sinapsis-speech 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
+ # -*- coding: utf-8 -*-
+ from typing import Any
+
+ from pydantic import BaseModel
+ from sinapsis_core.utils.env_var_keys import EnvVarEntry, doc_str, return_docs_for_vars
+
+
+ class _ElevenlabsKeys(BaseModel):
+     """
+     Env vars for ElevenLabs.
+     """
+
+     ELEVENLABS_API_KEY: EnvVarEntry = EnvVarEntry(
+         var_name="ELEVENLABS_API_KEY",
+         default_value=" ",
+         allowed_values=None,
+         description="Set the API key for ElevenLabs",
+     )
+
+
+ ElevenlabsEnvVars = _ElevenlabsKeys()
+
+ doc_str = return_docs_for_vars(ElevenlabsEnvVars, docs=doc_str, string_for_doc="""Elevenlabs env vars available: \n""")
+ __doc__ = doc_str
+
+
+ def __getattr__(name: str) -> Any:
+     """Returns the default value of an env var, so it can be imported directly when live updates are not needed."""
+     if name in ElevenlabsEnvVars.model_fields:
+         return ElevenlabsEnvVars.model_fields[name].default.value
+
+     raise AttributeError(f"Agent does not have `{name}` env var")
+
+
+ __all__ = (*list(ElevenlabsEnvVars.model_fields.keys()), "ElevenlabsEnvVars")
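
For context, a minimal usage sketch of the module above: the module-level __getattr__ lets callers import the resolved env-var value directly, which is how the base template later in this diff consumes it. The printed values are assumptions based on the EnvVarEntry defaults shown above.

    # Hedged sketch; the value returned for ELEVENLABS_API_KEY is served by the module-level __getattr__.
    from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY, ElevenlabsEnvVars

    print(ElevenlabsEnvVars.ELEVENLABS_API_KEY.var_name)  # "ELEVENLABS_API_KEY"
    print(ELEVENLABS_API_KEY)                             # resolved value (or the " " default)
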
@@ -0,0 +1,75 @@
+ # -*- coding: utf-8 -*-
+ from elevenlabs import VoiceSettings
+ from elevenlabs.client import DEFAULT_VOICE, ElevenLabs, VoiceId, VoiceName
+ from sinapsis_core.data_containers.data_packet import TextPacket
+ from sinapsis_core.utils.logging_utils import sinapsis_logger
+
+
+ def create_voice_settings(settings: VoiceSettings | None) -> VoiceSettings | None:
+     """
+     Completes a `VoiceSettings` object, filling missing fields with defaults.
+
+     If any field of the provided settings is `None`, it is replaced with the
+     corresponding value from `DEFAULT_VOICE.settings`. If every field is already
+     set, the provided `settings` object is returned unchanged. If `settings` is
+     `None`, the function returns `None`.
+
+     Args:
+         settings (VoiceSettings | None): The settings to be applied. Fields left as
+             `None` are replaced with default values.
+
+     Returns:
+         VoiceSettings | None: The completed `VoiceSettings` object, or `None` if no
+             settings were provided.
+     """
+     if settings:
+         settings_dict = settings.model_dump()
+         if any(value is None for value in settings_dict.values()):
+             for field, value in settings_dict.items():
+                 if value is None:
+                     settings_dict[field] = getattr(DEFAULT_VOICE.settings, field)
+
+             return VoiceSettings(**settings_dict)
+         return settings
+     return None
+
+
+ def get_voice_id(client: ElevenLabs, voice: VoiceId | VoiceName) -> VoiceId:
+     """
+     Resolves the voice ID for a given voice name or ID.
+
+     Searches the voices available from the ElevenLabs API for a match on the
+     provided voice name or ID. If the specified voice is not found, the error is
+     logged and the first available voice ID is returned as a fallback.
+
+     Args:
+         client (ElevenLabs): The ElevenLabs API client instance.
+         voice (VoiceId | VoiceName): The name or ID of the desired voice.
+
+     Returns:
+         VoiceId: The resolved voice ID.
+
+     Raises:
+         ValueError: If no voices are available to resolve.
+     """
+     try:
+         voices = client.voices.get_all().voices
+         for v in voices:
+             if voice == v.name or voice == v.voice_id:
+                 sinapsis_logger.debug("Voice '%s' resolved to ID: %s", voice, v.voice_id)
+                 return v.voice_id
+
+         sinapsis_logger.error("Voice '%s' is not available.", voice)
+         if voices:
+             sinapsis_logger.info("Returning default voice ID: %s", voices[0].voice_id)
+             return voices[0].voice_id
+
+         raise ValueError("No available voices to resolve. Ensure the client is configured correctly.")
+     except Exception as e:
+         sinapsis_logger.error("Error resolving voice ID: %s", e)
+         raise
+
+
+ def load_input_text(input_data: list[TextPacket]) -> str:
+     """Loads and concatenates the text content from a list of TextPacket objects."""
+     return "".join(item.content for item in input_data)
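
The create_voice_settings helper above is the one the TTS template calls before hitting the API; a minimal sketch of its None-filling behavior, assuming the SDK's VoiceSettings fields accept None (which the helper's own logic implies):

    # Hedged sketch; field names follow the voice_settings docstring in the base template below.
    from elevenlabs import VoiceSettings

    from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings

    partial = VoiceSettings(stability=0.3, similarity_boost=None, style=None, use_speaker_boost=None)
    complete = create_voice_settings(partial)  # None fields are filled from DEFAULT_VOICE.settings
    print(complete)
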
@@ -0,0 +1,21 @@
+ # -*- coding: utf-8 -*-
+ import importlib
+ from typing import Callable
+
+ _root_lib_path = "sinapsis_elevenlabs.templates"
+
+ _template_lookup = {
+     "ElevenLabsTTS": f"{_root_lib_path}.elevenlabs_tts",
+     "ElevenLabsVoiceGeneration": f"{_root_lib_path}.elevenlabs_voice_generation",
+ }
+
+
+ def __getattr__(name: str) -> Callable:
+     if name in _template_lookup:
+         module = importlib.import_module(_template_lookup[name])
+         return getattr(module, name)
+
+     raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+
+
+ __all__ = list(_template_lookup.keys())
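
Because the lookup above is resolved through a module-level __getattr__ (PEP 562), templates are only imported when first accessed; a minimal sketch:

    # Hedged sketch; accessing the attribute triggers importlib.import_module under the hood.
    from sinapsis_elevenlabs import templates

    tts_template_cls = templates.ElevenLabsTTS  # lazily imports sinapsis_elevenlabs.templates.elevenlabs_tts
    print(templates.__all__)                    # ['ElevenLabsTTS', 'ElevenLabsVoiceGeneration']
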
@@ -0,0 +1,204 @@
+ # -*- coding: utf-8 -*-
+ """Base template for ElevenLabs speech synthesis"""
+
+ import abc
+ import os
+ import uuid
+ from io import BytesIO
+ from typing import IO, Iterator, Literal
+
+ from elevenlabs import Voice, VoiceSettings, save
+ from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
+ from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
+ from sinapsis_core.template_base.template import (
+     Template,
+     TemplateAttributes,
+     TemplateAttributeType,
+ )
+ from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
+
+ from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
+
+ RESPONSE_TYPE = Iterator[bytes] | list[bytes] | list[Iterator[bytes]] | None
+
+
+ class ElevenLabsBase(Template, abc.ABC):
+     """
+     Base template to perform audio generation tasks using the ElevenLabs package.
+
+     The template takes as attributes the ElevenLabs API key, the voice for the generated audio,
+     settings associated with the audio (such as stability, style, etc.), the model to be used,
+     the format for the audio, the output path, etc. It implements methods to process the
+     DataContainer, initialize the ElevenLabs client, perform the inference,
+     and store the audio.
+
+     """
+
+     PACKET_TYPE_NAME: str = "texts"
+
+     class AttributesBaseModel(TemplateAttributes):
+         """
+         Attributes for the ElevenLabs base class.
+         Args:
+             api_key (str): The API key used to authenticate with the ElevenLabs API.
+             voice (str | elevenlabs.Voice): The voice to use for speech synthesis. This can be a voice ID (str),
+                 a voice name (str) or an ElevenLabs voice object (Voice).
+             voice_settings (VoiceSettings): Settings that control the behavior of the voice:
+                 - stability (float)
+                 - similarity_boost (float)
+                 - style (float)
+                 - use_speaker_boost (bool)
+             model (Literal): The model identifier to use for speech synthesis.
+             output_format (Literal): The output audio format and quality. Options include:
+                 ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
+                 "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
+             output_folder (str): The folder where generated audio files will be saved.
+             stream (bool): If True, the audio is returned as a stream; otherwise, it is saved to a file.
+         """
+
+         api_key: str | None = None
+         voice: VoiceId | VoiceName | Voice | None = None
+         voice_settings: VoiceSettings | None = None
+         model: Literal[
+             "eleven_turbo_v2_5",
+             "eleven_multilingual_v2",
+             "eleven_turbo_v2",
+             "eleven_monolingual_v1",
+             "eleven_multilingual_v1",
+             "eleven_english_sts_v2",
+             "eleven_multilingual_sts_v2",
+         ] = "eleven_turbo_v2_5"
+         output_format: Literal[
+             "mp3_22050_32",
+             "mp3_44100_32",
+             "mp3_44100_64",
+             "mp3_44100_96",
+             "mp3_44100_128",
+             "mp3_44100_192",
+             "pcm_16000",
+             "pcm_22050",
+             "pcm_24000",
+             "pcm_44100",
+             "ulaw_8000",
+         ] = "mp3_44100_128"
+         output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
+         stream: bool = False
+
+     def __init__(self, attributes: TemplateAttributeType) -> None:
+         """Initializes the ElevenLabs API client with the given attributes."""
+         super().__init__(attributes)
+         os.makedirs(self.attributes.output_folder, exist_ok=True)
+         self.client = self.init_elevenlabs_client()
+
+     def init_elevenlabs_client(self) -> ElevenLabs:
+         """Creates an ElevenLabs client from the configured or environment API key."""
+         key = self.attributes.api_key if self.attributes.api_key else ELEVENLABS_API_KEY
+         return ElevenLabs(api_key=key)
+
+     def reset_state(self) -> None:
+         """Resets the template state by re-initializing the client."""
+         self.client = self.init_elevenlabs_client()
+
+     @abc.abstractmethod
+     def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
+         """Abstract method for ElevenLabs speech synthesis."""
+
+     def _save_audio(self, response: Iterator[bytes] | bytes, file_format: str) -> str:
+         """Saves the audio to a file and returns the file path."""
+         output_file = os.path.join(self.attributes.output_folder, f"{uuid.uuid4()}.{file_format}")
+         try:
+             save(response, output_file)
+             self.logger.info(f"Audio saved to: {output_file}")
+             return output_file
+         except OSError as e:
+             self.logger.error(f"File system error while saving speech to file: {e}")
+             raise
+
+     def _generate_audio_stream(self, response: Iterator[bytes] | bytes) -> IO[bytes]:
+         """Generates and returns the audio stream."""
+         audio_stream = BytesIO()
+         try:
+             if isinstance(response, Iterator):
+                 for chunk in response:
+                     if chunk:
+                         audio_stream.write(chunk)
+             elif isinstance(response, bytes):
+                 audio_stream.write(response)
+             else:
+                 raise TypeError(f"Unsupported response type: {type(response)}")
+
+             audio_stream.seek(0)
+             self.logger.info("Returning audio stream")
+             return audio_stream
+         except IOError as e:
+             self.logger.error(f"I/O error while processing the audio stream: {e}")
+             raise
+         except ValueError as e:
+             self.logger.error(f"Value error while processing audio chunks: {e}")
+             raise
+
+     def _process_audio_output(self, response: Iterator[bytes] | bytes) -> str | IO[bytes]:
+         """Processes a single audio output (either stream or file)."""
+         if self.attributes.stream:
+             return self._generate_audio_stream(response)
+         else:
+             file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
+             return self._save_audio(response, file_format)
+
+     def generate_speech(self, input_data: list[Packet]) -> list[str | IO[bytes]] | None:
+         """Generates speech and returns file paths or audio streams."""
+         responses: RESPONSE_TYPE = self.synthesize_speech(input_data)
+         if not responses:
+             return None
+
+         if isinstance(responses, Iterator):
+             responses = [responses]
+
+         audio_outputs = [self._process_audio_output(response) for response in responses]
+         return audio_outputs
+
+     def _handle_streaming_output(self, audio_outputs: list[str | IO[bytes]]) -> list[AudioPacket]:
+         """Wraps streamed audio outputs in AudioPackets to be added to the container."""
+         generated_audios: list[AudioPacket] = []
+         sample_rate = int(self.attributes.output_format.split("_")[1])
+         for audio_output in audio_outputs:
+             audio_packet = AudioPacket(
+                 content=audio_output,
+                 sample_rate=sample_rate,
+             )
+             generated_audios.append(audio_packet)
+         return generated_audios
+
+     def _handle_audio_outputs(self, audio_outputs: list[str | IO[bytes]], container: DataContainer) -> None:
+         """Handles the audio outputs by appending to the container based on the output type (stream or file)."""
+         if self.attributes.stream:
+             container.audios = container.audios or []
+             container.audios.extend(self._handle_streaming_output(audio_outputs))
+         else:
+             self._set_generic_data(container, audio_outputs)
+
+     def execute(self, container: DataContainer) -> DataContainer:
+         """
+         Processes the input data and generates a speech output.
+         Depending on the configuration, either a file or a stream of audio is
+         generated and added to the provided `container`.
+         """
+
+         if ELEVENLABS_API_KEY is None and self.attributes.api_key is None:
+             self.logger.error("API key was not provided")
+             return container
+
+         data_packet = getattr(container, self.PACKET_TYPE_NAME)
+
+         if not data_packet:
+             self.logger.debug("No text packets to process")
+             return container
+
+         audio_outputs = self.generate_speech(data_packet)
+         if not audio_outputs:
+             self.logger.error("Unable to generate speech")
+             return container
+
+         self._handle_audio_outputs(audio_outputs, container)
+
+         return container
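
The base class leaves only synthesize_speech abstract; saving, streaming, and container handling come from execute() and its helpers. A minimal subclass sketch (the class name is hypothetical):

    # Hedged sketch; a concrete template only needs to return audio bytes or an iterator of chunks.
    from typing import Iterator

    from sinapsis_core.data_containers.data_packet import Packet

    from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase


    class MyElevenLabsTemplate(ElevenLabsBase):  # hypothetical subclass name
        def synthesize_speech(self, input_data: list[Packet]) -> Iterator[bytes] | None:
            # Produce audio with self.client here and return it; execute() handles the rest.
            ...
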
@@ -0,0 +1,85 @@
+ # -*- coding: utf-8 -*-
+ """Text-To-Speech template for ElevenLabs"""
+
+ from typing import Iterator, Literal
+
+ from sinapsis_core.data_containers.data_packet import TextPacket
+
+ from sinapsis_elevenlabs.helpers.voice_utils import (
+     create_voice_settings,
+     load_input_text,
+ )
+ from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
+
+
+ class ElevenLabsTTS(ElevenLabsBase):
+     """Template to interact with the ElevenLabs text-to-speech API.
+
+     This class provides an implementation to generate speech from text using the
+     ElevenLabs text-to-speech API. It allows customization of voice, model settings,
+     and audio output format.
+
+     Usage example:
+
+     agent:
+       name: my_test_agent
+       templates:
+       - template_name: InputTemplate
+         class_name: InputTemplate
+         attributes: {}
+       - template_name: ElevenLabsTTS
+         class_name: ElevenLabsTTS
+         template_input: InputTemplate
+         attributes:
+           voice: null
+           voice_settings: null
+           model: eleven_turbo_v2_5
+           output_format: mp3_44100_128
+           output_folder: /sinapsis/cache/dir/elevenlabs/audios
+           stream: false
+
+     """
+
+     class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
+         """Attributes specific to ElevenLabs TTS API interaction.
+
+         This class overrides the base attributes of `ElevenLabsBase` to define
+         default models specific to the ElevenLabs TTS system.
+         """
+
+         model: Literal[
+             "eleven_turbo_v2_5",
+             "eleven_multilingual_v2",
+             "eleven_turbo_v2",
+             "eleven_monolingual_v1",
+             "eleven_multilingual_v1",
+         ] = "eleven_turbo_v2_5"
+
+     def synthesize_speech(self, input_data: list[TextPacket]) -> Iterator[bytes]:
+         """
+         Sends the text to the ElevenLabs API to generate speech.
+
+         This method communicates with the ElevenLabs API to generate the audio
+         response based on the provided text, voice, and model settings.
+         """
+         input_text: str = load_input_text(input_data)
+         try:
+             response: Iterator[bytes] = self.client.generate(
+                 text=input_text,
+                 voice=self.attributes.voice,
+                 model=self.attributes.model,
+                 voice_settings=create_voice_settings(self.attributes.voice_settings),
+                 output_format=self.attributes.output_format,
+                 stream=self.attributes.stream,
+             )
+
+             return response
+         except ValueError as e:
+             self.logger.error(f"Value error synthesizing speech: {e}")
+             raise
+         except TypeError as e:
+             self.logger.error(f"Type error in input data or parameters: {e}")
+             raise
+         except KeyError as e:
+             self.logger.error(f"Missing key in input data or settings: {e}")
+             raise
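
Note that ElevenLabsTTS produces a single clip per container: the base class concatenates every incoming TextPacket through load_input_text before the API call. A minimal sketch (assumes a TextPacket can be built from its content alone):

    # Hedged sketch; two packets are joined into one synthesis request.
    from sinapsis_core.data_containers.data_packet import TextPacket

    from sinapsis_elevenlabs.helpers.voice_utils import load_input_text

    packets = [TextPacket(content="Hello. "), TextPacket(content="This becomes one clip.")]
    print(load_input_text(packets))  # "Hello. This becomes one clip."
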
@@ -0,0 +1,78 @@
+ # -*- coding: utf-8 -*-
+ """Voice generation template for ElevenLabs"""
+
+ import base64
+
+ from sinapsis_core.data_containers.data_packet import TextPacket
+
+ from sinapsis_elevenlabs.helpers.voice_utils import load_input_text
+ from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
+
+
+ class ElevenLabsVoiceGeneration(ElevenLabsBase):
+     """
+     Template to generate a voice using the ElevenLabs API.
+
+     The template takes the voice description as an attribute and
+     the prompt for the audio as a TextPacket stored in the DataContainer,
+     and stores the generated audio in the DataContainer.
+
+     Usage example:
+
+     agent:
+       name: my_test_agent
+       templates:
+       - template_name: InputTemplate
+         class_name: InputTemplate
+         attributes: {}
+       - template_name: ElevenLabsVoiceGeneration
+         class_name: ElevenLabsVoiceGeneration
+         template_input: InputTemplate
+         attributes:
+           voice: null
+           voice_settings: null
+           model: eleven_turbo_v2_5
+           output_format: mp3_44100_128
+           output_folder: /sinapsis/cache/dir/elevenlabs/audios
+           stream: false
+           voice_description: An old British male with a raspy, deep voice. Professional,
+             relaxed and assertive
+     """
+
+     class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
+         """
+         Attributes for voice generation in the ElevenLabs API.
+         """
+
+         voice_description: str = "An old British male with a raspy, deep voice. Professional, relaxed and assertive"
+
+     def synthesize_speech(self, input_data: list[TextPacket]) -> list[bytes] | None:
+         """
+         Sends the text and voice description to the ElevenLabs API to generate voice previews.
+
+         This method communicates with the ElevenLabs API to create voice previews
+         based on the provided text and voice description.
+         """
+
+         input_text: str = load_input_text(input_data)
+         if len(input_text) < 100:
+             self.logger.error("The text to be spoken must be at least 100 characters long.")
+             return None
+         try:
+             voice_previews = self.client.text_to_voice.create_previews(
+                 voice_description=self.attributes.voice_description,
+                 text=input_text,
+             )
+
+             responses: list[bytes] = [base64.b64decode(preview.audio_base_64) for preview in voice_previews.previews]
+
+             return responses
+         except ValueError as e:
+             self.logger.error(f"Value error with voice description or input text: {e}")
+             raise
+         except TypeError as e:
+             self.logger.error(f"Type error with input data or voice preview parameters: {e}")
+             raise
+         except KeyError as e:
+             self.logger.error(f"Missing expected key in voice preview response: {e}")
+             raise