sinapsis-speech 0.3.5__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {sinapsis_speech-0.3.5/packages/sinapsis_speech.egg-info → sinapsis_speech-0.4.1}/PKG-INFO +68 -5
  2. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/README.md +65 -4
  3. sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py +15 -0
  4. sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +100 -0
  5. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +2 -0
  6. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +40 -54
  7. sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +99 -0
  8. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +12 -8
  9. sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +129 -0
  10. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +7 -1
  11. sinapsis_speech-0.4.1/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py +10 -0
  12. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +13 -1
  13. sinapsis_speech-0.4.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py +10 -0
  14. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +14 -3
  15. sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py +0 -0
  16. sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py +0 -0
  17. sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py +10 -0
  18. sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +20 -0
  19. sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +312 -0
  20. sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +69 -0
  21. sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py +0 -0
  22. sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py +0 -0
  23. sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py +11 -0
  24. sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +20 -0
  25. sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +289 -0
  26. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1/packages/sinapsis_speech.egg-info}/PKG-INFO +68 -5
  27. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/SOURCES.txt +17 -0
  28. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/requires.txt +2 -0
  29. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/top_level.txt +2 -0
  30. sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
  31. sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
  32. sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py +11 -0
  33. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +1 -1
  34. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +13 -13
  35. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/pyproject.toml +8 -1
  36. sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +0 -64
  37. sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +0 -56
  38. sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +0 -51
  39. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/LICENSE +0 -0
  40. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
  41. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
  42. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +0 -0
  43. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
  44. {sinapsis_speech-0.3.5/packages/sinapsis_zonos/src/sinapsis_zonos → sinapsis_speech-0.4.1/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers}/__init__.py +0 -0
  45. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +0 -0
  46. {sinapsis_speech-0.3.5/packages/sinapsis_zonos/src/sinapsis_zonos/helpers → sinapsis_speech-0.4.1/packages/sinapsis_kokoro/src/sinapsis_kokoro}/__init__.py +0 -0
  47. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py +0 -0
  48. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py +0 -0
  49. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/dependency_links.txt +0 -0
  50. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +0 -0
  51. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +0 -0
  52. {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sinapsis-speech
3
- Version: 0.3.5
3
+ Version: 0.4.1
4
4
  Summary: Generate speech using various libraries.
5
5
  Author-email: SinapsisAI <dev@sinapsis.tech>
6
6
  Project-URL: Homepage, https://sinapsis.tech
@@ -18,6 +18,8 @@ Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
18
18
  Requires-Dist: sinapsis-kokoro[all]; extra == "all"
19
19
  Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
20
20
  Requires-Dist: sinapsis-zonos[all]; extra == "all"
21
+ Requires-Dist: sinapsis-parakeet-tdt[all]; extra == "all"
22
+ Requires-Dist: sinapsis-orpheus-cpp[all]; extra == "all"
21
23
  Provides-Extra: gradio-app
22
24
  Requires-Dist: sinapsis[webapp]>=0.2.3; extra == "gradio-app"
23
25
  Dynamic: license-file
@@ -33,7 +35,7 @@ Sinapsis Speech
33
35
  <br>
34
36
  </h1>
35
37
 
36
- <h4 align="center"> Templates for a wide range of voice generation tasks.</h4>
38
+ <h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
37
39
 
38
40
  <p align="center">
39
41
  <a href="#installation">🐍 Installation</a> •
@@ -55,8 +57,10 @@ This repo includes packages for performing speech synthesis using different tool
55
57
 
56
58
  * <code>sinapsis-elevenlabs</code>
57
59
  * <code>sinapsis-f5-tts</code>
58
- * * <code>sinapsis-kokoro</code>
60
+ * <code>sinapsis-kokoro</code>
59
61
  * <code>sinapsis-zonos</code>
62
+ * <code>sinapsis-orpheus-cpp</code>
63
+ * <code>sinapsis-parakeet-tdt</code>
60
64
 
61
65
  Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
62
66
 
@@ -104,10 +108,14 @@ This repository is organized into modular packages, each designed for integratio
104
108
  <details>
105
109
  <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
106
110
 
107
- This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
111
+ This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
112
+
113
+ - **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
108
114
 
109
115
  - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
110
116
 
117
+ - **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
118
+
111
119
  - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
112
120
 
113
121
  For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
@@ -144,6 +152,30 @@ For specific instructions and further details, see the [README.md](https://githu
144
152
 
145
153
  </details>
146
154
 
155
+
156
+ <details>
157
+ <summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
158
+
159
+ This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
160
+
161
+ - **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
162
+
163
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
164
+
165
+ </details>
166
+
167
+ <details>
168
+ <summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
169
+
170
+ This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
171
+
172
+
173
+ - **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
174
+
175
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
176
+
177
+ </details>
178
+
147
179
  <h2 id="webapp">🌐 Webapps</h2>
148
180
  The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
149
181
 
@@ -182,7 +214,6 @@ cd sinapsis-speech
182
214
  docker compose -f docker/compose.yaml build
183
215
  ```
184
216
 
185
-
186
217
  2. **Start the app container**:
187
218
 
188
219
  - For ElevenLabs:
@@ -205,6 +236,16 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
205
236
  docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
206
237
  ```
207
238
 
239
+ - For Orpheus-CPP:
240
+ ```bash
241
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
242
+ ```
243
+
244
+ - For Parakeet:
245
+ ```bash
246
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
247
+ ```
248
+
208
249
  3. **Check the logs**
209
250
 
210
251
  - For ElevenLabs:
@@ -224,6 +265,17 @@ docker logs -f sinapsis-kokoro
224
265
  ```bash
225
266
  docker logs -f sinapsis-zonos
226
267
  ```
268
+
269
+ - For Orpheus-CPP:
270
+ ```bash
271
+ docker logs -f sinapsis-orpheus-tts
272
+ ```
273
+
274
+ - For Parakeet:
275
+ ```bash
276
+ docker logs -f sinapsis-parakeet
277
+ ```
278
+
227
279
  4. **The logs will display the URL to access the webapp, e.g.:**
228
280
  ```bash
229
281
  Running on local URL: http://127.0.0.1:7860
@@ -240,6 +292,17 @@ docker compose -f docker/compose_apps.yaml down
240
292
 
241
293
  To run the webapp using the <code>uv</code> package manager, follow these steps:
242
294
 
295
+
296
+ > [!IMPORTANT]
297
+ > If you're using sinapsis-orpheus-cpp, you need to export the following CUDA environment variables:
298
+
299
+
300
+ ```bash
301
+ export CMAKE_ARGS="-DGGML_CUDA=on"
302
+ export FORCE_CMAKE="1"
303
+ export CUDACXX=$(command -v nvcc)
304
+ ```
305
+
243
306
  1. **Sync the virtual environment**:
244
307
 
245
308
  ```bash
@@ -9,7 +9,7 @@ Sinapsis Speech
9
9
  <br>
10
10
  </h1>
11
11
 
12
- <h4 align="center"> Templates for a wide range of voice generation tasks.</h4>
12
+ <h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
13
13
 
14
14
  <p align="center">
15
15
  <a href="#installation">🐍 Installation</a> •
@@ -31,8 +31,10 @@ This repo includes packages for performing speech synthesis using different tool
31
31
 
32
32
  * <code>sinapsis-elevenlabs</code>
33
33
  * <code>sinapsis-f5-tts</code>
34
- * * <code>sinapsis-kokoro</code>
34
+ * <code>sinapsis-kokoro</code>
35
35
  * <code>sinapsis-zonos</code>
36
+ * <code>sinapsis-orpheus-cpp</code>
37
+ * <code>sinapsis-parakeet-tdt</code>
36
38
 
37
39
  Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
38
40
 
@@ -80,10 +82,14 @@ This repository is organized into modular packages, each designed for integratio
80
82
  <details>
81
83
  <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
82
84
 
83
- This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
85
+ This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
86
+
87
+ - **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
84
88
 
85
89
  - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
86
90
 
91
+ - **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
92
+
87
93
  - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
88
94
 
89
95
  For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
@@ -120,6 +126,30 @@ For specific instructions and further details, see the [README.md](https://githu
120
126
 
121
127
  </details>
122
128
 
129
+
130
+ <details>
131
+ <summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
132
+
133
+ This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
134
+
135
+ - **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
136
+
137
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
138
+
139
+ </details>
140
+
141
+ <details>
142
+ <summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
143
+
144
+ This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
145
+
146
+
147
+ - **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
148
+
149
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
150
+
151
+ </details>
152
+
123
153
  <h2 id="webapp">🌐 Webapps</h2>
124
154
  The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
125
155
 
@@ -158,7 +188,6 @@ cd sinapsis-speech
158
188
  docker compose -f docker/compose.yaml build
159
189
  ```
160
190
 
161
-
162
191
  2. **Start the app container**:
163
192
 
164
193
  - For ElevenLabs:
@@ -181,6 +210,16 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
181
210
  docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
182
211
  ```
183
212
 
213
+ - For Orpheus-CPP:
214
+ ```bash
215
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
216
+ ```
217
+
218
+ - For Parakeet:
219
+ ```bash
220
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
221
+ ```
222
+
184
223
  3. **Check the logs**
185
224
 
186
225
  - For ElevenLabs:
@@ -200,6 +239,17 @@ docker logs -f sinapsis-kokoro
200
239
  ```bash
201
240
  docker logs -f sinapsis-zonos
202
241
  ```
242
+
243
+ - For Orpheus-CPP:
244
+ ```bash
245
+ docker logs -f sinapsis-orpheus-tts
246
+ ```
247
+
248
+ - For Parakeet:
249
+ ```bash
250
+ docker logs -f sinapsis-parakeet
251
+ ```
252
+
203
253
  4. **The logs will display the URL to access the webapp, e.g.:**
204
254
  ```bash
205
255
  Running on local URL: http://127.0.0.1:7860
@@ -216,6 +266,17 @@ docker compose -f docker/compose_apps.yaml down
216
266
 
217
267
  To run the webapp using the <code>uv</code> package manager, follow these steps:
218
268
 
269
+
270
+ > [!IMPORTANT]
271
+ > If you're using sinapsis-orpheus-cpp, you need to export the following CUDA environment variables:
272
+
273
+
274
+ ```bash
275
+ export CMAKE_ARGS="-DGGML_CUDA=on"
276
+ export FORCE_CMAKE="1"
277
+ export CUDACXX=$(command -v nvcc)
278
+ ```
279
+
219
280
  1. **Sync the virtual environment**:
220
281
 
221
282
  ```bash
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ from enum import Enum
3
+
4
+
5
+ class Tags(Enum):
6
+ AUDIO = "audio"
7
+ AUDIO_GENERATION = "audio_generation"
8
+ ELEVENLABS = "elevenlabs"
9
+ PROMPT = "prompt"
10
+ SPEECH = "speech"
11
+ SPEECH_TO_SPEECH = "speech_to_speech"
12
+ TEXT_TO_SPEECH = "text_to_speech"
13
+ VOICE_CONVERSION = "voice_conversion"
14
+ VOICE_CLONING = "voice_cloning"
15
+ VOICE_GENERATION = "voice_generation"
@@ -0,0 +1,100 @@
1
+ # -*- coding: utf-8 -*-
2
+ import json
3
+
4
+ from elevenlabs import Voice, VoiceSettings
5
+ from elevenlabs.client import ElevenLabs
6
+ from sinapsis_core.data_containers.data_packet import TextPacket
7
+ from sinapsis_core.utils.logging_utils import sinapsis_logger
8
+
9
+
10
+ def create_voice_settings(settings: VoiceSettings, as_json: bool = False) -> VoiceSettings | None | str:
11
+ """
12
+ Creates or updates a `VoiceSettings` object based on the provided settings.
13
+
14
+ Args:
15
+ settings (VoiceSettings | None): An instance of `VoiceSettings` containing the settings to be applied.
16
+ If `None`, the function returns the default settings.
17
+ as_json (bool): Whether to return the settings as JSON string.
18
+
19
+ Returns:
20
+ VoiceSettings | None | str: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
21
+ `None` is returned for default settings.
22
+ """
23
+ if not settings:
24
+ return None
25
+
26
+ if as_json:
27
+ return json.dumps(settings.model_dump(exclude_none=True))
28
+
29
+ return settings
30
+
31
+
32
+ def get_voice_id(client: ElevenLabs, voice: str | Voice | None) -> str:
33
+ """
34
+ Resolves the voice ID for a given voice name or ID.
35
+
36
+ This function searches through available voices from the ElevenLabs API
37
+ to match the provided voice name or ID. If the specified voice is not found,
38
+ it logs the error and returns the first available voice ID as a fallback.
39
+
40
+ Args:
41
+ client (ElevenLabs): The ElevenLabs API client instance.
42
+ voice (str | Voice | None): The name or ID of the desired voice.
43
+
44
+ Returns:
45
+ str: The resolved voice ID.
46
+
47
+ Raises:
48
+ ValueError: If no voices are available to resolve.
49
+ """
50
+ if not voice:
51
+ return get_default_voice(client).voice_id
52
+
53
+ if isinstance(voice, Voice):
54
+ sinapsis_logger.debug(f"Voice object provided, using voice_id: {voice.voice_id}")
55
+ return voice.voice_id
56
+
57
+ try:
58
+ voices_response = client.voices.get_all()
59
+ voices = voices_response.voices
60
+
61
+ for v in voices:
62
+ if voice == v.name or voice == v.voice_id:
63
+ sinapsis_logger.debug(f"Voice {voice} resolved to ID: {v.voice_id}")
64
+ return v.voice_id
65
+
66
+ sinapsis_logger.error(f"Voice {voice} is not available.")
67
+ if voices:
68
+ sinapsis_logger.info(f"Returning default voice ID: {voices[0].voice_id}")
69
+ return voices[0].voice_id
70
+
71
+ raise ValueError("No available voices to resolve. Ensure the client is configured correctly.")
72
+ except Exception as e:
73
+ sinapsis_logger.error(f"Error resolving voice ID: {e}")
74
+ raise
75
+
76
+
77
+ def get_default_voice(client: ElevenLabs) -> Voice:
78
+ """
79
+ Gets the first available voice as default.
80
+
81
+ Args:
82
+ client (ElevenLabs): The ElevenLabs API client instance.
83
+
84
+ Returns:
85
+ Voice: The default voice object.
86
+ """
87
+ try:
88
+ voices_response = client.voices.get_all()
89
+ voices = voices_response.voices
90
+ if voices:
91
+ return voices[0]
92
+ raise ValueError("No voices available")
93
+ except Exception as e:
94
+ sinapsis_logger.error(f"Error getting default voice: {e}")
95
+ raise
96
+
97
+
98
+ def load_input_text(input_data: list[TextPacket]) -> str:
99
+ """Loads and concatenates the text content from a list of TextPacket objects."""
100
+ return "".join([item.content for item in input_data])
@@ -7,6 +7,8 @@ _root_lib_path = "sinapsis_elevenlabs.templates"
7
7
  _template_lookup = {
8
8
  "ElevenLabsTTS": f"{_root_lib_path}.elevenlabs_tts",
9
9
  "ElevenLabsVoiceGeneration": f"{_root_lib_path}.elevenlabs_voice_generation",
10
+ "ElevenLabsVoiceClone": f"{_root_lib_path}.elevenlabs_voice_clone",
11
+ "ElevenLabsSTS": f"{_root_lib_path}.elevenlabs_sts",
10
12
  }
11
13
 
12
14
 
@@ -3,12 +3,11 @@
3
3
 
4
4
  import abc
5
5
  import os
6
- import uuid
7
- from io import BytesIO
8
- from typing import IO, Iterator, Literal
6
+ from typing import Generator, Iterable, Iterator, Literal
9
7
 
10
- from elevenlabs import Voice, VoiceSettings, save
11
- from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
8
+ import numpy as np
9
+ from elevenlabs import Voice, VoiceSettings
10
+ from elevenlabs.client import ElevenLabs
12
11
  from elevenlabs.types import OutputFormat
13
12
  from pydantic import Field
14
13
  from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
@@ -19,9 +18,11 @@ from sinapsis_core.template_base.base_models import (
19
18
  UIPropertiesMetadata,
20
19
  )
21
20
  from sinapsis_core.template_base.template import Template
22
- from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
21
+ from sinapsis_core.utils.env_var_keys import WORKING_DIR
22
+ from sinapsis_generic_data_tools.helpers.audio_encoder import audio_bytes_to_numpy
23
23
 
24
24
  from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
25
+ from sinapsis_elevenlabs.helpers.tags import Tags
25
26
 
26
27
  RESPONSE_TYPE = Iterator[bytes] | list[bytes] | list[Iterator[bytes]] | None
27
28
 
@@ -51,9 +52,7 @@ class ElevenLabsBase(Template, abc.ABC):
51
52
  output_format (OutputFormat): The output audio format and quality. Options include:
52
53
  ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
53
54
  "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
54
- output_folder (str): The folder where generated audio files will be saved.
55
- stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
56
- voice (VoiceId | VoiceName | Voice): The voice to use for speech synthesis. This can be a voice ID (str),
55
+ voice (str | Voice | None): The voice to use for speech synthesis. This can be a voice ID (str),
57
56
  a voice name (str) or an elevenlabs voice object (Voice).
58
57
  voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
59
58
  - stability (float)
@@ -74,17 +73,20 @@ class ElevenLabsBase(Template, abc.ABC):
74
73
  ] = "eleven_turbo_v2_5"
75
74
  output_file_name: str | None = None
76
75
  output_format: OutputFormat = "mp3_44100_128"
77
- output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
76
+ output_folder: str = os.path.join(WORKING_DIR, "elevenlabs", "audios")
78
77
  stream: bool = False
79
- voice: VoiceId | VoiceName | Voice = None
78
+ voice: str | Voice | None = None
80
79
  voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]
81
80
 
82
- UIProperties = UIPropertiesMetadata(category="Elevenlabs", output_type=OutputTypes.AUDIO)
81
+ UIProperties = UIPropertiesMetadata(
82
+ category="Elevenlabs",
83
+ output_type=OutputTypes.AUDIO,
84
+ tags=[Tags.AUDIO, Tags.ELEVENLABS, Tags.SPEECH],
85
+ )
83
86
 
84
87
  def __init__(self, attributes: TemplateAttributeType) -> None:
85
88
  """Initializes the ElevenLabs API client with the given attributes."""
86
89
  super().__init__(attributes)
87
- os.makedirs(self.attributes.output_folder, exist_ok=True)
88
90
  self.client = self.init_elevenlabs_client()
89
91
 
90
92
  def init_elevenlabs_client(self) -> ElevenLabs:
@@ -92,44 +94,27 @@ class ElevenLabsBase(Template, abc.ABC):
92
94
  key = self.attributes.api_key if self.attributes.api_key else ELEVENLABS_API_KEY
93
95
  return ElevenLabs(api_key=key)
94
96
 
95
- def reset_state(self) -> None:
97
+ def reset_state(self, template_name: str | None = None) -> None:
96
98
  """Resets state of model"""
99
+ _ = template_name
97
100
  self.client = self.init_elevenlabs_client()
98
101
 
99
102
  @abc.abstractmethod
100
103
  def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
101
104
  """Abstract method for ElevenLabs speech synthesis."""
102
105
 
103
- def _save_audio(self, response: Iterator[bytes] | bytes, file_format: str, idx: int) -> str:
104
- """Saves the audio to a file and returns the file path."""
105
- if self.attributes.output_file_name:
106
- file_name = self.attributes.output_file_name + "_" + str(idx)
107
- else:
108
- file_name = uuid.uuid4()
109
-
110
- output_file = os.path.join(self.attributes.output_folder, f"{file_name}.{file_format}")
111
- try:
112
- save(response, output_file)
113
- self.logger.info(f"Audio saved to: {output_file}")
114
- return output_file
115
- except OSError as e:
116
- self.logger.error(f"File system error while saving speech to file: {e}")
117
- raise
118
-
119
- def _generate_audio_stream(self, response: Iterator[bytes] | bytes) -> IO[bytes]:
106
+ def _generate_audio_stream(self, response: Iterable | bytes) -> bytes:
120
107
  """Generates and returns the audio stream."""
121
- audio_stream = BytesIO()
108
+
122
109
  try:
123
110
  if isinstance(response, Iterator):
124
- for chunk in response:
125
- if chunk:
126
- audio_stream.write(chunk)
111
+ audio_stream = b"".join(chunk for chunk in response)
127
112
  elif isinstance(response, bytes):
128
- audio_stream.write(response)
113
+ audio_stream = response
114
+
129
115
  else:
130
116
  raise TypeError(f"Unsupported response type: {type(response)}")
131
117
 
132
- audio_stream.seek(0)
133
118
  self.logger.info("Returning audio stream")
134
119
  return audio_stream
135
120
  except IOError as e:
@@ -139,14 +124,15 @@ class ElevenLabsBase(Template, abc.ABC):
139
124
  self.logger.error(f"Value error while processing audio chunks: {e}")
140
125
  raise
141
126
 
142
- def _process_audio_output(self, idx: int, response: Iterator[bytes] | bytes) -> str | IO[bytes]:
127
+ def _process_audio_output(self, response: Iterable | bytes) -> tuple[np.ndarray, int]:
143
128
  """Processes a single audio output (either stream or file)."""
144
- if self.attributes.stream:
145
- return self._generate_audio_stream(response)
146
- file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
147
- return self._save_audio(response, file_format, idx)
148
129
 
149
- def generate_speech(self, input_data: list[Packet]) -> list[str | IO[bytes]] | None:
130
+ result = self._generate_audio_stream(response)
131
+ audio_np, sample_rate = audio_bytes_to_numpy(result)
132
+
133
+ return audio_np, sample_rate
134
+
135
+ def generate_speech(self, input_data: list[Packet]) -> list[tuple] | None:
150
136
  """Generates speech and saves it to a file."""
151
137
  responses: RESPONSE_TYPE = self.synthesize_speech(input_data)
152
138
  if not responses:
@@ -154,29 +140,29 @@ class ElevenLabsBase(Template, abc.ABC):
154
140
 
155
141
  if isinstance(responses, Iterator):
156
142
  responses = [responses]
157
-
158
- audio_outputs = [self._process_audio_output(idx, response) for idx, response in enumerate(responses)]
143
+ elif isinstance(responses, Generator):
144
+ responses = list(responses)
145
+ audio_outputs = [self._process_audio_output(response) for response in responses]
159
146
  return audio_outputs
160
147
 
161
- def _handle_streaming_output(self, audio_outputs: list[str | IO[bytes]]) -> list[AudioPacket]:
148
+ def _handle_streaming_output(self, audio_outputs: list[tuple]) -> list[AudioPacket]:
162
149
  """Handles audio stream output by adding it to the container as AudioPackets."""
163
150
  generated_audios: list[AudioPacket] = []
164
- sample_rate = int(self.attributes.output_format.split("_")[1])
151
+ # sample_rate = int(self.attributes.output_format.split("_")[1])
165
152
  for audio_output in audio_outputs:
153
+ audio = audio_output[0]
154
+ sample_rate = audio_output[1]
166
155
  audio_packet = AudioPacket(
167
- content=audio_output,
156
+ content=audio,
168
157
  sample_rate=sample_rate,
169
158
  )
170
159
  generated_audios.append(audio_packet)
171
160
  return generated_audios
172
161
 
173
- def _handle_audio_outputs(self, audio_outputs: list[str | IO[bytes]], container: DataContainer) -> None:
162
+ def _handle_audio_outputs(self, audio_outputs: list[tuple], container: DataContainer) -> None:
174
163
  """Handles the audio outputs by appending to the container based on the output type (stream or file)."""
175
- if self.attributes.stream:
176
- container.audios = container.audios or []
177
- container.audios.extend(self._handle_streaming_output(audio_outputs))
178
- else:
179
- self._set_generic_data(container, audio_outputs)
164
+ container.audios = container.audios or []
165
+ container.audios = self._handle_streaming_output(audio_outputs)
180
166
 
181
167
  def execute(self, container: DataContainer) -> DataContainer:
182
168
  """
@@ -0,0 +1,99 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Speech-To-Speech template for ElevenLabs."""
3
+
4
+ from typing import Callable, Iterator, Literal
5
+
6
+ from sinapsis_core.data_containers.data_packet import AudioPacket
7
+
8
+ from sinapsis_elevenlabs.helpers.tags import Tags
9
+ from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id
10
+ from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
11
+
12
+ ElevenLabsSTSUIProperties = ElevenLabsBase.UIProperties
13
+ ElevenLabsSTSUIProperties.tags.extend([Tags.SPEECH_TO_SPEECH, Tags.VOICE_CONVERSION])
14
+
15
+
16
+ class ElevenLabsSTS(ElevenLabsBase):
17
+ """Template to interact with the ElevenLabs Speech-to-Speech API.
18
+
19
+ This template takes an input audio and converts it to a new voice using
20
+ the ElevenLabs Speech-to-Speech (STS) API.
21
+
22
+ Usage example:
23
+
24
+ agent:
25
+ name: my_test_agent
26
+ templates:
27
+ - template_name: InputTemplate
28
+ class_name: InputTemplate
29
+ attributes: {}
30
+ - template_name: ElevenLabsSTS
31
+ class_name: ElevenLabsSTS
32
+ template_input: InputTemplate
33
+ attributes:
34
+ api_key: null
35
+ model: eleven_multilingual_sts_v2
36
+ output_file_name: null
37
+ output_format: mp3_44100_128
38
+ output_folder: <WORKING_DIR>/elevenlabs/audios
39
+ stream: false
40
+ voice: null
41
+ voice_settings:
42
+ stability: null
43
+ similarity_boost: null
44
+ style: null
45
+ use_speaker_boost: null
46
+ speed: null
47
+ streaming_latency: null
48
+
49
+ """
50
+
51
+ PACKET_TYPE_NAME: str = "audios"
52
+ UIProperties = ElevenLabsSTSUIProperties
53
+
54
+ class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
55
+ """Attributes specific to ElevenLabs STS API interaction.
56
+
57
+ Attributes:
58
+ model (Literal): The STS model to use. Options are "eleven_english_sts_v2" or "eleven_multilingual_sts_v2".
59
+ streaming_latency (int | None): Optional latency optimization for streaming. Defaults to None.
60
+ """
61
+
62
+ model: Literal["eleven_english_sts_v2", "eleven_multilingual_sts_v2"] = "eleven_multilingual_sts_v2"
63
+ streaming_latency: int | None = None
64
+
65
+ def synthesize_speech(self, input_data: list[AudioPacket]) -> Iterator[bytes]:
66
+ """Sends an audio input to the ElevenLabs API for speech-to-speech synthesis.
67
+
68
+ Args:
69
+ input_data (list[AudioPacket]): List of AudioPacket objects containing the audio to be converted.
70
+ Only the first AudioPacket in the list is used.
71
+
72
+ Returns:
73
+ Iterator[bytes]: An iterator yielding audio data chunks in the output format specified.
74
+
75
+ Raises:
76
+ ValueError: If there is a problem with the input data or parameters.
77
+ TypeError: If the input data or files are of incorrect type.
78
+ KeyError: If the expected key is missing in the API response.
79
+ """
80
+ try:
81
+ method: Callable[..., Iterator[bytes]] = self.client.speech_to_speech.stream # (
82
+
83
+ return method(
84
+ voice_id=get_voice_id(self.client, voice=self.attributes.voice),
85
+ audio=input_data[0].content,
86
+ model_id=self.attributes.model,
87
+ voice_settings=create_voice_settings(self.attributes.voice_settings, as_json=True),
88
+ output_format=self.attributes.output_format,
89
+ optimize_streaming_latency=self.attributes.streaming_latency,
90
+ )
91
+ except ValueError as e:
92
+ self.logger.error(f"Value error synthesizing speech: {e}")
93
+ raise
94
+ except TypeError as e:
95
+ self.logger.error(f"Type error in input data or parameters: {e}")
96
+ raise
97
+ except KeyError as e:
98
+ self.logger.error(f"Missing key in input data or settings: {e}")
99
+ raise