sinapsis-speech 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py +15 -0
  2. sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +50 -14
  3. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +2 -0
  4. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +40 -54
  5. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +60 -17
  6. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +12 -8
  7. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +89 -11
  8. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +7 -1
  9. sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/__init__.py +0 -0
  10. sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py +10 -0
  11. sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +13 -1
  12. sinapsis_kokoro/src/sinapsis_kokoro/__init__.py +0 -0
  13. sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py +10 -0
  14. sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +14 -3
  15. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py +0 -0
  16. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py +0 -0
  17. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py +10 -0
  18. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +20 -0
  19. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +312 -0
  20. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +69 -0
  21. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py +0 -0
  22. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py +0 -0
  23. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py +11 -0
  24. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +20 -0
  25. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +289 -0
  26. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/METADATA +68 -5
  27. sinapsis_speech-0.4.1.dist-info/RECORD +44 -0
  28. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/WHEEL +1 -1
  29. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/top_level.txt +2 -0
  30. sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py +11 -0
  31. sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +1 -1
  32. sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +13 -13
  33. sinapsis_speech-0.3.5.dist-info/RECORD +0 -27
  34. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/licenses/LICENSE +0 -0
sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class Tags(Enum):
+    AUDIO = "audio"
+    AUDIO_GENERATION = "audio_generation"
+    ELEVENLABS = "elevenlabs"
+    PROMPT = "prompt"
+    SPEECH = "speech"
+    SPEECH_TO_SPEECH = "speech_to_speech"
+    TEXT_TO_SPEECH = "text_to_speech"
+    VOICE_CONVERSION = "voice_conversion"
+    VOICE_CLONING = "voice_cloning"
+    VOICE_GENERATION = "voice_generation"
sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py
@@ -1,29 +1,35 @@
 # -*- coding: utf-8 -*-
-from elevenlabs import VoiceSettings
-from elevenlabs.client import DEFAULT_VOICE, ElevenLabs, VoiceId, VoiceName
+import json
+
+from elevenlabs import Voice, VoiceSettings
+from elevenlabs.client import ElevenLabs
 from sinapsis_core.data_containers.data_packet import TextPacket
 from sinapsis_core.utils.logging_utils import sinapsis_logger
 
 
-def create_voice_settings(settings: VoiceSettings) -> VoiceSettings | None:
+def create_voice_settings(settings: VoiceSettings, as_json: bool = False) -> VoiceSettings | None | str:
     """
     Creates or updates a `VoiceSettings` object based on the provided settings.
 
     Args:
         settings (VoiceSettings | None): An instance of `VoiceSettings` containing the settings to be applied.
             If `None`, the function returns the default settings.
+        as_json (bool): Whether to return the settings as JSON string.
 
     Returns:
-        VoiceSettings: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
-            `DEFAULT_VOICE.settings` is returned.
+        VoiceSettings | None | str: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
+            `None` is returned for default settings.
     """
     if not settings:
-        return DEFAULT_VOICE.settings
+        return None
+
+    if as_json:
+        return json.dumps(settings.model_dump(exclude_none=True))
 
     return settings
 
 
-def get_voice_id(client: ElevenLabs, voice: VoiceId | VoiceName) -> VoiceId:
+def get_voice_id(client: ElevenLabs, voice: str | Voice | None) -> str:
     """
     Resolves the voice ID for a given voice name or ID.
 
@@ -33,29 +39,59 @@ def get_voice_id(client: ElevenLabs, voice: VoiceId | VoiceName) -> VoiceId:
 
     Args:
         client (ElevenLabs): The ElevenLabs API client instance.
-        voice (VoiceId | VoiceName): The name or ID of the desired voice.
+        voice (str | Voice | None): The name or ID of the desired voice.
 
     Returns:
-        VoiceId: The resolved voice ID.
+        str: The resolved voice ID.
 
     Raises:
        ValueError: If no voices are available to resolve.
    """
+    if not voice:
+        return get_default_voice(client).voice_id
+
+    if isinstance(voice, Voice):
+        sinapsis_logger.debug(f"Voice object provided, using voice_id: {voice.voice_id}")
+        return voice.voice_id
+
    try:
-        voices = client.voices.get_all().voices
+        voices_response = client.voices.get_all()
+        voices = voices_response.voices
+
        for v in voices:
            if voice == v.name or voice == v.voice_id:
-                sinapsis_logger.debug("Voice '%s' resolved to ID: %s", voice, v.voice_id)
+                sinapsis_logger.debug(f"Voice {voice} resolved to ID: {v.voice_id}")
                return v.voice_id
 
-        sinapsis_logger.error("Voice '%s' is not available.", voice)
+        sinapsis_logger.error(f"Voice {voice} is not available.")
        if voices:
-            sinapsis_logger.info("Returning default voice ID: %s", voices[0].voice_id)
+            sinapsis_logger.info(f"Returning default voice ID: {voices[0].voice_id}")
            return voices[0].voice_id
 
        raise ValueError("No available voices to resolve. Ensure the client is configured correctly.")
    except Exception as e:
-        sinapsis_logger.error("Error resolving voice ID: %s", e)
+        sinapsis_logger.error(f"Error resolving voice ID: {e}")
+        raise
+
+
+def get_default_voice(client: ElevenLabs) -> Voice:
+    """
+    Gets the first available voice as default.
+
+    Args:
+        client (ElevenLabs): The ElevenLabs API client instance.
+
+    Returns:
+        Voice: The default voice object.
+    """
+    try:
+        voices_response = client.voices.get_all()
+        voices = voices_response.voices
+        if voices:
+            return voices[0]
+        raise ValueError("No voices available")
+    except Exception as e:
+        sinapsis_logger.error(f"Error getting default voice: {e}")
        raise
 
 
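Note on the updated helpers: `DEFAULT_VOICE` and the `VoiceId`/`VoiceName` aliases are gone, unset settings now resolve to `None`, settings can be serialized to JSON for endpoints that expect a string payload, and a name, an ID, a `Voice` object, or `None` all resolve to a concrete voice ID. A minimal usage sketch, assuming only the signatures shown in the hunks above (the voice name "Rachel" is hypothetical):

# Sketch only: exercises the helpers as defined in the diff above.
import os

from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs

from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id

client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])

# With no settings, the helper now returns None so the API falls back to its defaults.
assert create_voice_settings(None) is None

# as_json=True serializes the populated fields for endpoints expecting a JSON string.
settings_json = create_voice_settings(VoiceSettings(stability=0.5, similarity_boost=0.75), as_json=True)

# A voice name, a voice ID, a Voice object, or None (first available voice) all resolve to an ID.
voice_id = get_voice_id(client, voice="Rachel")
print(voice_id, settings_json)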
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py
@@ -7,6 +7,8 @@ _root_lib_path = "sinapsis_elevenlabs.templates"
 _template_lookup = {
     "ElevenLabsTTS": f"{_root_lib_path}.elevenlabs_tts",
     "ElevenLabsVoiceGeneration": f"{_root_lib_path}.elevenlabs_voice_generation",
+    "ElevenLabsVoiceClone": f"{_root_lib_path}.elevenlabs_voice_clone",
+    "ElevenLabsSTS": f"{_root_lib_path}.elevenlabs_sts",
 }
 
 
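The two new entries register the voice-clone and speech-to-speech templates in the package's lookup table. A hedged sketch of how a name-to-module map like this is typically resolved (the actual sinapsis loader in this `__init__.py` may differ):

# Sketch only: illustrative lazy import driven by a lookup like _template_lookup.
import importlib

_root_lib_path = "sinapsis_elevenlabs.templates"
_template_lookup = {
    "ElevenLabsVoiceClone": f"{_root_lib_path}.elevenlabs_voice_clone",
    "ElevenLabsSTS": f"{_root_lib_path}.elevenlabs_sts",
}


def load_template(name: str) -> type:
    # Import the module on demand and return the template class of the same name.
    module = importlib.import_module(_template_lookup[name])
    return getattr(module, name)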
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py
@@ -3,12 +3,11 @@
 
 import abc
 import os
-import uuid
-from io import BytesIO
-from typing import IO, Iterator, Literal
+from typing import Generator, Iterable, Iterator, Literal
 
-from elevenlabs import Voice, VoiceSettings, save
-from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
+import numpy as np
+from elevenlabs import Voice, VoiceSettings
+from elevenlabs.client import ElevenLabs
 from elevenlabs.types import OutputFormat
 from pydantic import Field
 from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
@@ -19,9 +18,11 @@ from sinapsis_core.template_base.base_models import (
     UIPropertiesMetadata,
 )
 from sinapsis_core.template_base.template import Template
-from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
+from sinapsis_core.utils.env_var_keys import WORKING_DIR
+from sinapsis_generic_data_tools.helpers.audio_encoder import audio_bytes_to_numpy
 
 from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
+from sinapsis_elevenlabs.helpers.tags import Tags
 
 RESPONSE_TYPE = Iterator[bytes] | list[bytes] | list[Iterator[bytes]] | None
 
@@ -51,9 +52,7 @@ class ElevenLabsBase(Template, abc.ABC):
         output_format (OutputFormat): The output audio format and quality. Options include:
             ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
             "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
-        output_folder (str): The folder where generated audio files will be saved.
-        stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
-        voice (VoiceId | VoiceName | Voice): The voice to use for speech synthesis. This can be a voice ID (str),
+        voice (str | Voice | None): The voice to use for speech synthesis. This can be a voice ID (str),
            a voice name (str) or an elevenlabs voice object (Voice).
        voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
            - stability (float)
@@ -74,17 +73,20 @@
        ] = "eleven_turbo_v2_5"
        output_file_name: str | None = None
        output_format: OutputFormat = "mp3_44100_128"
-        output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
+        output_folder: str = os.path.join(WORKING_DIR, "elevenlabs", "audios")
        stream: bool = False
-        voice: VoiceId | VoiceName | Voice = None
+        voice: str | Voice | None = None
        voice_settings: VoiceSettings = Field(default_factory=dict)  # type: ignore[arg-type]
 
-    UIProperties = UIPropertiesMetadata(category="Elevenlabs", output_type=OutputTypes.AUDIO)
+    UIProperties = UIPropertiesMetadata(
+        category="Elevenlabs",
+        output_type=OutputTypes.AUDIO,
+        tags=[Tags.AUDIO, Tags.ELEVENLABS, Tags.SPEECH],
+    )
 
     def __init__(self, attributes: TemplateAttributeType) -> None:
         """Initializes the ElevenLabs API client with the given attributes."""
         super().__init__(attributes)
-        os.makedirs(self.attributes.output_folder, exist_ok=True)
         self.client = self.init_elevenlabs_client()
 
     def init_elevenlabs_client(self) -> ElevenLabs:
@@ -92,44 +94,27 @@
         key = self.attributes.api_key if self.attributes.api_key else ELEVENLABS_API_KEY
         return ElevenLabs(api_key=key)
 
-    def reset_state(self) -> None:
+    def reset_state(self, template_name: str | None = None) -> None:
         """Resets state of model"""
+        _ = template_name
         self.client = self.init_elevenlabs_client()
 
     @abc.abstractmethod
     def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
         """Abstract method for ElevenLabs speech synthesis."""
 
-    def _save_audio(self, response: Iterator[bytes] | bytes, file_format: str, idx: int) -> str:
-        """Saves the audio to a file and returns the file path."""
-        if self.attributes.output_file_name:
-            file_name = self.attributes.output_file_name + "_" + str(idx)
-        else:
-            file_name = uuid.uuid4()
-
-        output_file = os.path.join(self.attributes.output_folder, f"{file_name}.{file_format}")
-        try:
-            save(response, output_file)
-            self.logger.info(f"Audio saved to: {output_file}")
-            return output_file
-        except OSError as e:
-            self.logger.error(f"File system error while saving speech to file: {e}")
-            raise
-
-    def _generate_audio_stream(self, response: Iterator[bytes] | bytes) -> IO[bytes]:
+    def _generate_audio_stream(self, response: Iterable | bytes) -> bytes:
         """Generates and returns the audio stream."""
-        audio_stream = BytesIO()
+
         try:
             if isinstance(response, Iterator):
-                for chunk in response:
-                    if chunk:
-                        audio_stream.write(chunk)
+                audio_stream = b"".join(chunk for chunk in response)
             elif isinstance(response, bytes):
-                audio_stream.write(response)
+                audio_stream = response
+
             else:
                 raise TypeError(f"Unsupported response type: {type(response)}")
 
-            audio_stream.seek(0)
             self.logger.info("Returning audio stream")
             return audio_stream
         except IOError as e:
@@ -139,14 +124,15 @@
             self.logger.error(f"Value error while processing audio chunks: {e}")
             raise
 
-    def _process_audio_output(self, idx: int, response: Iterator[bytes] | bytes) -> str | IO[bytes]:
+    def _process_audio_output(self, response: Iterable | bytes) -> tuple[np.ndarray, int]:
         """Processes a single audio output (either stream or file)."""
-        if self.attributes.stream:
-            return self._generate_audio_stream(response)
-        file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
-        return self._save_audio(response, file_format, idx)
 
-    def generate_speech(self, input_data: list[Packet]) -> list[str | IO[bytes]] | None:
+        result = self._generate_audio_stream(response)
+        audio_np, sample_rate = audio_bytes_to_numpy(result)
+
+        return audio_np, sample_rate
+
+    def generate_speech(self, input_data: list[Packet]) -> list[tuple] | None:
         """Generates speech and saves it to a file."""
         responses: RESPONSE_TYPE = self.synthesize_speech(input_data)
         if not responses:
@@ -154,29 +140,29 @@
 
         if isinstance(responses, Iterator):
             responses = [responses]
-
-        audio_outputs = [self._process_audio_output(idx, response) for idx, response in enumerate(responses)]
+        elif isinstance(responses, Generator):
+            responses = list(responses)
+        audio_outputs = [self._process_audio_output(response) for response in responses]
         return audio_outputs
 
-    def _handle_streaming_output(self, audio_outputs: list[str | IO[bytes]]) -> list[AudioPacket]:
+    def _handle_streaming_output(self, audio_outputs: list[tuple]) -> list[AudioPacket]:
         """Handles audio stream output by adding it to the container as AudioPackets."""
         generated_audios: list[AudioPacket] = []
-        sample_rate = int(self.attributes.output_format.split("_")[1])
+        # sample_rate = int(self.attributes.output_format.split("_")[1])
         for audio_output in audio_outputs:
+            audio = audio_output[0]
+            sample_rate = audio_output[1]
             audio_packet = AudioPacket(
-                content=audio_output,
+                content=audio,
                 sample_rate=sample_rate,
             )
             generated_audios.append(audio_packet)
         return generated_audios
 
-    def _handle_audio_outputs(self, audio_outputs: list[str | IO[bytes]], container: DataContainer) -> None:
+    def _handle_audio_outputs(self, audio_outputs: list[tuple], container: DataContainer) -> None:
         """Handles the audio outputs by appending to the container based on the output type (stream or file)."""
-        if self.attributes.stream:
-            container.audios = container.audios or []
-            container.audios.extend(self._handle_streaming_output(audio_outputs))
-        else:
-            self._set_generic_data(container, audio_outputs)
+        container.audios = container.audios or []
+        container.audios = self._handle_streaming_output(audio_outputs)
 
     def execute(self, container: DataContainer) -> DataContainer:
         """
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py
@@ -1,49 +1,92 @@
 # -*- coding: utf-8 -*-
-"""Speech-To-Speech template for ElevenLabs"""
+"""Speech-To-Speech template for ElevenLabs."""
 
 from typing import Callable, Iterator, Literal
 
 from sinapsis_core.data_containers.data_packet import AudioPacket
 
+from sinapsis_elevenlabs.helpers.tags import Tags
 from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id
 from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
 
+ElevenLabsSTSUIProperties = ElevenLabsBase.UIProperties
+ElevenLabsSTSUIProperties.tags.extend([Tags.SPEECH_TO_SPEECH, Tags.VOICE_CONVERSION])
+
 
 class ElevenLabsSTS(ElevenLabsBase):
-    """Template to interact with ElevenLabs speech-to-speech API."""
+    """Template to interact with the ElevenLabs Speech-to-Speech API.
+
+    This template takes an input audio and converts it to a new voice using
+    the ElevenLabs Speech-to-Speech (STS) API.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+      - template_name: ElevenLabsSTS
+        class_name: ElevenLabsSTS
+        template_input: InputTemplate
+        attributes:
+          api_key: null
+          model: eleven_multilingual_sts_v2
+          output_file_name: null
+          output_format: mp3_44100_128
+          output_folder: <WORKING_DIR>/elevenlabs/audios
+          stream: false
+          voice: null
+          voice_settings:
+            stability: null
+            similarity_boost: null
+            style: null
+            use_speaker_boost: null
+            speed: null
+          streaming_latency: null
+
+    """
 
     PACKET_TYPE_NAME: str = "audios"
+    UIProperties = ElevenLabsSTSUIProperties
 
     class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
         """Attributes specific to ElevenLabs STS API interaction.
 
-        This class overrides the base attributes of `ElevenLabsBase` to define
-        default models specific to the ElevenLabs STS system.
+        Attributes:
+            model (Literal): The STS model to use. Options are "eleven_english_sts_v2" or "eleven_multilingual_sts_v2".
+            streaming_latency (int | None): Optional latency optimization for streaming. Defaults to None.
        """
 
        model: Literal["eleven_english_sts_v2", "eleven_multilingual_sts_v2"] = "eleven_multilingual_sts_v2"
+        streaming_latency: int | None = None
 
     def synthesize_speech(self, input_data: list[AudioPacket]) -> Iterator[bytes]:
-        """
-        Sends an audio input to the ElevenLabs API for speech-to-speech synthesis.
+        """Sends an audio input to the ElevenLabs API for speech-to-speech synthesis.
 
-        This method processes the provided audio input using the specified voice, model,
-        and settings to generate a new audio response.
-        """
+        Args:
+            input_data (list[AudioPacket]): List of AudioPacket objects containing the audio to be converted.
+                Only the first AudioPacket in the list is used.
 
+        Returns:
+            Iterator[bytes]: An iterator yielding audio data chunks in the output format specified.
+
+        Raises:
+            ValueError: If there is a problem with the input data or parameters.
+            TypeError: If the input data or files are of incorrect type.
+            KeyError: If the expected key is missing in the API response.
+        """
        try:
-            method: Callable[..., Iterator[bytes]] = (
-                self.client.speech_to_speech.convert_as_stream
-                if self.attributes.stream
-                else self.client.speech_to_speech.convert
-            )
+            method: Callable[..., Iterator[bytes]] = self.client.speech_to_speech.stream  # (
+
            return method(
-                audio=input_data[0].content,
                voice_id=get_voice_id(self.client, voice=self.attributes.voice),
+                audio=input_data[0].content,
                model_id=self.attributes.model,
-                voice_settings=create_voice_settings(self.attributes.voice_settings),
+                voice_settings=create_voice_settings(self.attributes.voice_settings, as_json=True),
                output_format=self.attributes.output_format,
-                optimize_streaming_latency=str(self.attributes.streaming_latency),
+                optimize_streaming_latency=self.attributes.streaming_latency,
            )
        except ValueError as e:
            self.logger.error(f"Value error synthesizing speech: {e}")
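Outside of the template, the same conversion can be reproduced directly against the SDK client, assuming the `speech_to_speech.stream` keyword arguments used in the hunk above (the voice ID and input file below are hypothetical):

# Sketch only: standalone equivalent of ElevenLabsSTS.synthesize_speech.
import os

from elevenlabs.client import ElevenLabs

client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])

with open("reference_input.wav", "rb") as source_audio:
    chunks = client.speech_to_speech.stream(
        voice_id="your_voice_id",
        audio=source_audio,
        model_id="eleven_multilingual_sts_v2",
        output_format="mp3_44100_128",
    )
    # Collapse the streamed chunks into a single encoded audio payload.
    converted = b"".join(chunks)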
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py
@@ -1,16 +1,21 @@
 # -*- coding: utf-8 -*-
 """Text-To-Speech template for ElevenLabs"""
 
-from typing import Iterator, Literal
+from typing import Callable, Iterator, Literal
 
 from sinapsis_core.data_containers.data_packet import TextPacket
 
+from sinapsis_elevenlabs.helpers.tags import Tags
 from sinapsis_elevenlabs.helpers.voice_utils import (
     create_voice_settings,
+    get_voice_id,
     load_input_text,
 )
 from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
 
+ElevenLabsTTSUIProperties = ElevenLabsBase.UIProperties
+ElevenLabsTTSUIProperties.tags.extend([Tags.TEXT_TO_SPEECH])
+
 
 
 class ElevenLabsTTS(ElevenLabsBase):
@@ -35,7 +40,7 @@ class ElevenLabsTTS(ElevenLabsBase):
          voice_settings: null
          model: eleven_turbo_v2_5
          output_format: mp3_44100_128
-          output_folder: /sinapsis/cache/dir/elevenlabs/audios
+          output_folder: <WORKING_DIR>/elevenlabs/audios
          stream: false
 
    """
@@ -64,16 +69,15 @@ class ElevenLabsTTS(ElevenLabsBase):
        """
        input_text: str = load_input_text(input_data)
        try:
-            response: Iterator[bytes] = self.client.generate(
+            method: Callable[..., Iterator[bytes]] = self.client.text_to_speech.stream
+
+            return method(
                text=input_text,
-                voice=self.attributes.voice,
-                model=self.attributes.model,
+                voice_id=get_voice_id(self.client, self.attributes.voice),
+                model_id=self.attributes.model,
                voice_settings=create_voice_settings(self.attributes.voice_settings),
                output_format=self.attributes.output_format,
-                stream=self.attributes.stream,
            )
-
-            return response
        except ValueError as e:
            self.logger.error(f"Value error synthesizing speech: {e}")
            raise
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py
@@ -1,32 +1,96 @@
 # -*- coding: utf-8 -*-
-"""Text-To-Speech template for ElevenLabs"""
+"""Text-To-Speech template for ElevenLabs Voice Cloning."""
 
 from elevenlabs import Voice
 from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
 
+from sinapsis_elevenlabs.helpers.tags import Tags
 from sinapsis_elevenlabs.templates.elevenlabs_tts import ElevenLabsTTS
 
+ElevenLabsVoiceCloneUIProperties = ElevenLabsTTS.UIProperties
+ElevenLabsVoiceCloneUIProperties.tags.extend([Tags.VOICE_CLONING])
+
 
 class ElevenLabsVoiceClone(ElevenLabsTTS):
-    """Template to clone a voice using ElevenLabs API."""
+    """Template to clone a voice using the ElevenLabs API.
+
+    This template allows you to create a new custom voice in ElevenLabs by providing
+    one or more audio samples. The cloned voice can then be used for subsequent
+    text-to-speech synthesis within the Sinapsis pipeline.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+      - template_name: ElevenLabsVoiceClone
+        class_name: ElevenLabsVoiceClone
+        template_input: InputTemplate
+        attributes:
+          api_key: null
+          model: eleven_turbo_v2_5
+          output_file_name: null
+          output_format: mp3_44100_128
+          output_folder: <WORKING_DIR>/elevenlabs/audios
+          stream: false
+          voice: null
+          voice_settings:
+            stability: null
+            similarity_boost: null
+            style: null
+            use_speaker_boost: null
+            speed: null
+          name: null
+          description: null
+          remove_background_noise: false
+
+    """
+
+    UIProperties = ElevenLabsVoiceCloneUIProperties
 
     class AttributesBaseModel(ElevenLabsTTS.AttributesBaseModel):
-        """Attributes specific to the ElevenLabsVoiceClone class."""
+        """Attributes specific to the ElevenLabsVoiceClone class.
+
+        Attributes:
+            name (str | None): Name for the cloned voice. If None, a default name may be used.
+            description (str | None): Description for the cloned voice. Optional.
+            remove_background_noise (bool): Whether to remove background noise from samples. Defaults to False.
+        """
 
         name: str | None = None
         description: str | None = None
+        remove_background_noise: bool = False
 
     def clone_voice(self, input_data: list[AudioPacket]) -> Voice:
-        """Clones a voice using the provided audio files."""
-        files = [f.content for f in input_data]
+        """Clones a voice using the provided audio files.
+
+        Args:
+            input_data (list[AudioPacket]): List of AudioPacket objects containing the audio samples
+                to be used for voice cloning. Each AudioPacket's `content` should be a file-like object
+                or bytes representing the audio data.
+                **NOTE:** All provided audio packets are used as reference for a single cloned voice.
+
+        Returns:
+            Voice: The cloned Voice object as returned by the ElevenLabs API.
+
+        Raises:
+            ValueError: If there is a problem with the input data or parameters.
+            TypeError: If the input data or files are of incorrect type.
+            KeyError: If the expected key is missing in the API response.
+        """
+        files = [audio.content for audio in input_data]
        try:
-            add_voice_response = self.client.voices.add(
+            clone_response = self.client.voices.ivc.create(
                name=self.attributes.name,
-                description=self.attributes.description,
                files=files,
+                description=self.attributes.description,
+                remove_background_noise=self.attributes.remove_background_noise,
            )
-            cloned_voice = self.client.voices.get(add_voice_response.voice_id)
-            self.logger.info(f"Voice cloned successfully: {cloned_voice.name}")
+            cloned_voice = self.client.voices.get(clone_response.voice_id)
+            self.logger.info(f"Voice cloned successfully with IVC: {cloned_voice.name}")
            return cloned_voice
        except ValueError as e:
            self.logger.error(f"Value error in input data or parameters: {e}")
@@ -39,8 +103,22 @@ class ElevenLabsVoiceClone(ElevenLabsTTS):
             raise
 
     def execute(self, container: DataContainer) -> DataContainer:
-        """Executes the voice cloning process and generates the speech output."""
-        audios = getattr(container, "audios", None)
+        """Executes the voice cloning process and generates the speech output.
+
+        Args:
+            container (DataContainer): The input DataContainer, expected to contain
+                one or more AudioPacket objects in the `audios` attribute.
+
+        Returns:
+            DataContainer: The updated DataContainer. If cloning is successful,
+                the cloned voice is set in `self.attributes.voice` and the parent
+                TTS execution is performed using the new voice.
+
+        Side Effects:
+            - Updates `self.attributes.voice` with the cloned Voice object.
+            - May log errors or info messages.
+        """
+        audios = container.audios
        if not audios:
            self.logger.debug("No audios provided to clone voice")
            return container
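The cloning flow now goes through the instant-voice-cloning endpoint: samples are uploaded once, the returned voice ID is fetched back as a full `Voice` object, and that voice is reused for the subsequent TTS pass. A condensed sketch of the same sequence against the client, assuming the `voices.ivc.create` and `text_to_speech.stream` calls shown in the hunks above (file paths and the voice name are hypothetical):

# Sketch only: clone from reference samples, then synthesize with the new voice.
import os

from elevenlabs.client import ElevenLabs

client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])

with open("sample_1.mp3", "rb") as s1, open("sample_2.mp3", "rb") as s2:
    # All provided samples are used as reference for a single cloned voice.
    clone_response = client.voices.ivc.create(
        name="my_cloned_voice",
        files=[s1, s2],
        description="Voice cloned from two reference samples",
        remove_background_noise=False,
    )

cloned_voice = client.voices.get(clone_response.voice_id)
audio = client.text_to_speech.stream(
    text="Hello from the cloned voice.",
    voice_id=cloned_voice.voice_id,
    model_id="eleven_turbo_v2_5",
    output_format="mp3_44100_128",
)
speech_bytes = b"".join(audio)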
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py
@@ -5,9 +5,13 @@ import base64
 
 from sinapsis_core.data_containers.data_packet import TextPacket
 
+from sinapsis_elevenlabs.helpers.tags import Tags
 from sinapsis_elevenlabs.helpers.voice_utils import load_input_text
 from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
 
+ElevenLabsVoiceGenerationUIProperties = ElevenLabsBase.UIProperties
+ElevenLabsVoiceGenerationUIProperties.tags.extend([Tags.VOICE_GENERATION, Tags.PROMPT])
+
 
 class ElevenLabsVoiceGeneration(ElevenLabsBase):
     """
@@ -33,12 +37,14 @@ class ElevenLabsVoiceGeneration(ElevenLabsBase):
          voice_settings: null
          model: eleven_turbo_v2_5
          output_format: mp3_44100_128
-          output_folder: /sinapsis/cache/dir/elevenlabs/audios
+          output_folder: <WORKING_DIR>/elevenlabs/audios
          stream: false
          voice_description: An old British male with a raspy, deep voice. Professional,
            relaxed and assertive
    """
 
+    UIProperties = ElevenLabsVoiceGenerationUIProperties
+
     class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
        """
        Attributes for voice generation in ElevenLabs API.
sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class Tags(Enum):
+    AUDIO = "audio"
+    AUDIO_GENERATION = "audio_generation"
+    F5TTS = "f5tts"
+    SPEECH = "speech"
+    TEXT_TO_SPEECH = "text_to_speech"