sinapsis-speech 0.3.5__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sinapsis_speech-0.3.5/packages/sinapsis_speech.egg-info → sinapsis_speech-0.4.1}/PKG-INFO +68 -5
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/README.md +65 -4
- sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py +15 -0
- sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +100 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +2 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +40 -54
- sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +99 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +12 -8
- sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +129 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +7 -1
- sinapsis_speech-0.4.1/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py +10 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +13 -1
- sinapsis_speech-0.4.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py +10 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +14 -3
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py +10 -0
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +20 -0
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +312 -0
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +69 -0
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py +11 -0
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +20 -0
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +289 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1/packages/sinapsis_speech.egg-info}/PKG-INFO +68 -5
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/SOURCES.txt +17 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/requires.txt +2 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/top_level.txt +2 -0
- sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py +11 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +1 -1
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +13 -13
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/pyproject.toml +8 -1
- sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +0 -64
- sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +0 -56
- sinapsis_speech-0.3.5/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +0 -51
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/LICENSE +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
- {sinapsis_speech-0.3.5/packages/sinapsis_zonos/src/sinapsis_zonos → sinapsis_speech-0.4.1/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers}/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +0 -0
- {sinapsis_speech-0.3.5/packages/sinapsis_zonos/src/sinapsis_zonos/helpers → sinapsis_speech-0.4.1/packages/sinapsis_kokoro/src/sinapsis_kokoro}/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/dependency_links.txt +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +0 -0
- {sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/setup.cfg +0 -0
{sinapsis_speech-0.3.5/packages/sinapsis_speech.egg-info → sinapsis_speech-0.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sinapsis-speech
-Version: 0.3.5
+Version: 0.4.1
 Summary: Generate speech using various libraries.
 Author-email: SinapsisAI <dev@sinapsis.tech>
 Project-URL: Homepage, https://sinapsis.tech
@@ -18,6 +18,8 @@ Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
 Requires-Dist: sinapsis-kokoro[all]; extra == "all"
 Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
 Requires-Dist: sinapsis-zonos[all]; extra == "all"
+Requires-Dist: sinapsis-parakeet-tdt[all]; extra == "all"
+Requires-Dist: sinapsis-orpheus-cpp[all]; extra == "all"
 Provides-Extra: gradio-app
 Requires-Dist: sinapsis[webapp]>=0.2.3; extra == "gradio-app"
 Dynamic: license-file
@@ -33,7 +35,7 @@ Sinapsis Speech
 <br>
 </h1>

-<h4 align="center">
+<h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>

 <p align="center">
 <a href="#installation">🐍 Installation</a> •
@@ -55,8 +57,10 @@ This repo includes packages for performing speech synthesis using different tool

 * <code>sinapsis-elevenlabs</code>
 * <code>sinapsis-f5-tts</code>
-*
+* <code>sinapsis-kokoro</code>
 * <code>sinapsis-zonos</code>
+* <code>sinapsis-orpheus-cpp</code>
+* <code>sinapsis-parakeet</code>

 Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).

@@ -104,10 +108,14 @@ This repository is organized into modular packages, each designed for integratio
 <details>
 <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>

-This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
+This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
+
+- **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.

 - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.

+- **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
+
 - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.

 For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
@@ -144,6 +152,30 @@ For specific instructions and further details, see the [README.md](https://githu

 </details>

+
+<details>
+<summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orppheus-CPP</span></strong></summary>
+
+This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
+
+- **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions
+
+For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
+
+</details>
+
+<details>
+<summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
+
+This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
+
+
+- **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
+
+For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
+
+</details>
+
 <h2 id="webapp">🌐 Webapps</h2>
 The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.

@@ -182,7 +214,6 @@ cd sinapsis-speech
 docker compose -f docker/compose.yaml build
 ```

-
 2. **Start the app container**:

 - For ElevenLabs:
@@ -205,6 +236,16 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
 docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
 ```

+- For Orpheus-CPP:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
+```
+
+- For Parakeet:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
+```
+
 3. **Check the logs**

 - For ElevenLabs:
@@ -224,6 +265,17 @@ docker logs -f sinapsis-kokoro
 ```bash
 docker logs -f sinapsis-zonos
 ```
+
+- For Orpheus-CPP:
+```bash
+docker logs -f sinapsis-orpheus-tts
+```
+
+- For Parakeet:
+```bash
+docker logs -f sinapsis-parakeet
+```
+
 4. **The logs will display the URL to access the webapp, e.g.,:**:
 ```bash
 Running on local URL: http://127.0.0.1:7860
@@ -240,6 +292,17 @@ docker compose -f docker/compose_apps.yaml down

 To run the webapp using the <code>uv</code> package manager, follow these steps:

+
+> [!IMPORTANT]
+> If you're using sinapsis-orpheus-cpp, you need to export cuda environment variables:
+
+
+```bash
+export CMAKE_ARGS="-DGGML_CUDA=on"
+export FORCE_CMAKE="1"
+export CUDACXX=$(command -v nvcc)
+```
+
 1. **Sync the virtual environment**:

 ```bash
{sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/README.md

@@ -9,7 +9,7 @@ Sinapsis Speech
 <br>
 </h1>

-<h4 align="center">
+<h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>

 <p align="center">
 <a href="#installation">🐍 Installation</a> •
@@ -31,8 +31,10 @@ This repo includes packages for performing speech synthesis using different tool

 * <code>sinapsis-elevenlabs</code>
 * <code>sinapsis-f5-tts</code>
-*
+* <code>sinapsis-kokoro</code>
 * <code>sinapsis-zonos</code>
+* <code>sinapsis-orpheus-cpp</code>
+* <code>sinapsis-parakeet</code>

 Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).

@@ -80,10 +82,14 @@ This repository is organized into modular packages, each designed for integratio
 <details>
 <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>

-This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
+This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
+
+- **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.

 - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.

+- **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
+
 - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.

 For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
@@ -120,6 +126,30 @@ For specific instructions and further details, see the [README.md](https://githu

 </details>

+
+<details>
+<summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orppheus-CPP</span></strong></summary>
+
+This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
+
+- **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions
+
+For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
+
+</details>
+
+<details>
+<summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
+
+This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
+
+
+- **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
+
+For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
+
+</details>
+
 <h2 id="webapp">🌐 Webapps</h2>
 The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.

@@ -158,7 +188,6 @@ cd sinapsis-speech
 docker compose -f docker/compose.yaml build
 ```

-
 2. **Start the app container**:

 - For ElevenLabs:
@@ -181,6 +210,16 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
 docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
 ```

+- For Orpheus-CPP:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
+```
+
+- For Parakeet:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
+```
+
 3. **Check the logs**

 - For ElevenLabs:
@@ -200,6 +239,17 @@ docker logs -f sinapsis-kokoro
 ```bash
 docker logs -f sinapsis-zonos
 ```
+
+- For Orpheus-CPP:
+```bash
+docker logs -f sinapsis-orpheus-tts
+```
+
+- For Parakeet:
+```bash
+docker logs -f sinapsis-parakeet
+```
+
 4. **The logs will display the URL to access the webapp, e.g.,:**:
 ```bash
 Running on local URL: http://127.0.0.1:7860
@@ -216,6 +266,17 @@ docker compose -f docker/compose_apps.yaml down

 To run the webapp using the <code>uv</code> package manager, follow these steps:

+
+> [!IMPORTANT]
+> If you're using sinapsis-orpheus-cpp, you need to export cuda environment variables:
+
+
+```bash
+export CMAKE_ARGS="-DGGML_CUDA=on"
+export FORCE_CMAKE="1"
+export CUDACXX=$(command -v nvcc)
+```
+
 1. **Sync the virtual environment**:

 ```bash
sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py
ADDED

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class Tags(Enum):
+    AUDIO = "audio"
+    AUDIO_GENERATION = "audio_generation"
+    ELEVENLABS = "elevenlabs"
+    PROMPT = "prompt"
+    SPEECH = "speech"
+    SPEECH_TO_SPEECH = "speech_to_speech"
+    TEXT_TO_SPEECH = "text_to_speech"
+    VOICE_CONVERSION = "voice_conversion"
+    VOICE_CLONING = "voice_cloning"
+    VOICE_GENERATION = "voice_generation"
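These tags feed the `UIProperties` metadata on each template. A minimal sketch of how they are combined, based on the `ElevenLabsBase` and `ElevenLabsSTS` code elsewhere in this diff (the `sinapsis_elevenlabs` import assumes an installed environment):

```python
# Sketch only: mirrors how tags are attached in this diff; not the package's exact code.
from sinapsis_elevenlabs.helpers.tags import Tags

# ElevenLabsBase declares base tags for every ElevenLabs template ...
base_tags = [Tags.AUDIO, Tags.ELEVENLABS, Tags.SPEECH]

# ... and ElevenLabsSTS extends the inherited UIProperties with task-specific ones.
sts_tags = base_tags + [Tags.SPEECH_TO_SPEECH, Tags.VOICE_CONVERSION]
print([tag.value for tag in sts_tags])
# ['audio', 'elevenlabs', 'speech', 'speech_to_speech', 'voice_conversion']
```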
sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py
ADDED

@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+import json
+
+from elevenlabs import Voice, VoiceSettings
+from elevenlabs.client import ElevenLabs
+from sinapsis_core.data_containers.data_packet import TextPacket
+from sinapsis_core.utils.logging_utils import sinapsis_logger
+
+
+def create_voice_settings(settings: VoiceSettings, as_json: bool = False) -> VoiceSettings | None | str:
+    """
+    Creates or updates a `VoiceSettings` object based on the provided settings.
+
+    Args:
+        settings (VoiceSettings | None): An instance of `VoiceSettings` containing the settings to be applied.
+            If `None`, the function returns the default settings.
+        as_json (bool): Whether to return the settings as JSON string.
+
+    Returns:
+        VoiceSettings | None | str: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
+            `None` is returned for default settings.
+    """
+    if not settings:
+        return None
+
+    if as_json:
+        return json.dumps(settings.model_dump(exclude_none=True))
+
+    return settings
+
+
+def get_voice_id(client: ElevenLabs, voice: str | Voice | None) -> str:
+    """
+    Resolves the voice ID for a given voice name or ID.
+
+    This function searches through available voices from the ElevenLabs API
+    to match the provided voice name or ID. If the specified voice is not found,
+    it logs the error and returns the first available voice ID as a fallback.
+
+    Args:
+        client (ElevenLabs): The ElevenLabs API client instance.
+        voice (str | Voice | None): The name or ID of the desired voice.
+
+    Returns:
+        str: The resolved voice ID.
+
+    Raises:
+        ValueError: If no voices are available to resolve.
+    """
+    if not voice:
+        return get_default_voice(client).voice_id
+
+    if isinstance(voice, Voice):
+        sinapsis_logger.debug(f"Voice object provided, using voice_id: {voice.voice_id}")
+        return voice.voice_id
+
+    try:
+        voices_response = client.voices.get_all()
+        voices = voices_response.voices
+
+        for v in voices:
+            if voice == v.name or voice == v.voice_id:
+                sinapsis_logger.debug(f"Voice {voice} resolved to ID: {v.voice_id}")
+                return v.voice_id
+
+        sinapsis_logger.error(f"Voice {voice} is not available.")
+        if voices:
+            sinapsis_logger.info(f"Returning default voice ID: {voices[0].voice_id}")
+            return voices[0].voice_id
+
+        raise ValueError("No available voices to resolve. Ensure the client is configured correctly.")
+    except Exception as e:
+        sinapsis_logger.error(f"Error resolving voice ID: {e}")
+        raise
+
+
+def get_default_voice(client: ElevenLabs) -> Voice:
+    """
+    Gets the first available voice as default.
+
+    Args:
+        client (ElevenLabs): The ElevenLabs API client instance.
+
+    Returns:
+        Voice: The default voice object.
+    """
+    try:
+        voices_response = client.voices.get_all()
+        voices = voices_response.voices
+        if voices:
+            return voices[0]
+        raise ValueError("No voices available")
+    except Exception as e:
+        sinapsis_logger.error(f"Error getting default voice: {e}")
+        raise
+
+
+def load_input_text(input_data: list[TextPacket]) -> str:
+    """Loads and concatenates the text content from a list of TextPacket objects."""
+    return "".join([item.content for item in input_data])
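The templates later in this diff call these helpers before hitting the ElevenLabs API. A hedged sketch of that flow (the API key and voice name are placeholders, and which `VoiceSettings` fields exist depends on the installed `elevenlabs` SDK version):

```python
# Illustrative only: shows how the helpers above are meant to be combined,
# mirroring the calls made by the templates in this diff.
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs

from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id

client = ElevenLabs(api_key="YOUR_ELEVENLABS_API_KEY")  # placeholder key

# Resolve a voice name or ID to a concrete voice_id; the helper falls back to the
# first available voice if the requested one is not found.
voice_id = get_voice_id(client, voice="Rachel")  # "Rachel" is an example voice name

# Serialize optional settings; as_json=True matches how the STS template passes them.
settings_json = create_voice_settings(
    VoiceSettings(stability=0.5, similarity_boost=0.75), as_json=True
)
```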
{sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py

@@ -7,6 +7,8 @@ _root_lib_path = "sinapsis_elevenlabs.templates"
 _template_lookup = {
     "ElevenLabsTTS": f"{_root_lib_path}.elevenlabs_tts",
     "ElevenLabsVoiceGeneration": f"{_root_lib_path}.elevenlabs_voice_generation",
+    "ElevenLabsVoiceClone": f"{_root_lib_path}.elevenlabs_voice_clone",
+    "ElevenLabsSTS": f"{_root_lib_path}.elevenlabs_sts",
 }

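The lookup maps template class names to their module paths. A hypothetical illustration of how such a table can be resolved lazily (this is not the sinapsis loader, just a sketch of the pattern the table supports):

```python
# Hypothetical resolver for a name -> module-path table like _template_lookup above;
# the real sinapsis machinery may differ.
import importlib

_root_lib_path = "sinapsis_elevenlabs.templates"
_template_lookup = {
    "ElevenLabsVoiceClone": f"{_root_lib_path}.elevenlabs_voice_clone",
    "ElevenLabsSTS": f"{_root_lib_path}.elevenlabs_sts",
}


def resolve_template(name: str):
    """Import the module lazily and return the template class of the same name."""
    module = importlib.import_module(_template_lookup[name])
    return getattr(module, name)
```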
{sinapsis_speech-0.3.5 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py

@@ -3,12 +3,11 @@

 import abc
 import os
-import
-from io import BytesIO
-from typing import IO, Iterator, Literal
+from typing import Generator, Iterable, Iterator, Literal

-
-from elevenlabs
+import numpy as np
+from elevenlabs import Voice, VoiceSettings
+from elevenlabs.client import ElevenLabs
 from elevenlabs.types import OutputFormat
 from pydantic import Field
 from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
@@ -19,9 +18,11 @@ from sinapsis_core.template_base.base_models import (
     UIPropertiesMetadata,
 )
 from sinapsis_core.template_base.template import Template
-from sinapsis_core.utils.env_var_keys import
+from sinapsis_core.utils.env_var_keys import WORKING_DIR
+from sinapsis_generic_data_tools.helpers.audio_encoder import audio_bytes_to_numpy

 from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
+from sinapsis_elevenlabs.helpers.tags import Tags

 RESPONSE_TYPE = Iterator[bytes] | list[bytes] | list[Iterator[bytes]] | None

@@ -51,9 +52,7 @@ class ElevenLabsBase(Template, abc.ABC):
         output_format (OutputFormat): The output audio format and quality. Options include:
             ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
             "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
-
-        stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
-        voice (VoiceId | VoiceName | Voice): The voice to use for speech synthesis. This can be a voice ID (str),
+        voice (str | Voice | None): The voice to use for speech synthesis. This can be a voice ID (str),
             a voice name (str) or an elevenlabs voice object (Voice).
         voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
             - stability (float)
@@ -74,17 +73,20 @@ class ElevenLabsBase(Template, abc.ABC):
         ] = "eleven_turbo_v2_5"
         output_file_name: str | None = None
         output_format: OutputFormat = "mp3_44100_128"
-        output_folder: str = os.path.join(
+        output_folder: str = os.path.join(WORKING_DIR, "elevenlabs", "audios")
         stream: bool = False
-        voice:
+        voice: str | Voice | None = None
         voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]

-    UIProperties = UIPropertiesMetadata(
+    UIProperties = UIPropertiesMetadata(
+        category="Elevenlabs",
+        output_type=OutputTypes.AUDIO,
+        tags=[Tags.AUDIO, Tags.ELEVENLABS, Tags.SPEECH],
+    )

     def __init__(self, attributes: TemplateAttributeType) -> None:
         """Initializes the ElevenLabs API client with the given attributes."""
         super().__init__(attributes)
-        os.makedirs(self.attributes.output_folder, exist_ok=True)
         self.client = self.init_elevenlabs_client()

     def init_elevenlabs_client(self) -> ElevenLabs:
@@ -92,44 +94,27 @@ class ElevenLabsBase(Template, abc.ABC):
         key = self.attributes.api_key if self.attributes.api_key else ELEVENLABS_API_KEY
         return ElevenLabs(api_key=key)

-    def reset_state(self) -> None:
+    def reset_state(self, template_name: str | None = None) -> None:
         """Resets state of model"""
+        _ = template_name
         self.client = self.init_elevenlabs_client()

     @abc.abstractmethod
     def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
         """Abstract method for ElevenLabs speech synthesis."""

-    def
-        """Saves the audio to a file and returns the file path."""
-        if self.attributes.output_file_name:
-            file_name = self.attributes.output_file_name + "_" + str(idx)
-        else:
-            file_name = uuid.uuid4()
-
-        output_file = os.path.join(self.attributes.output_folder, f"{file_name}.{file_format}")
-        try:
-            save(response, output_file)
-            self.logger.info(f"Audio saved to: {output_file}")
-            return output_file
-        except OSError as e:
-            self.logger.error(f"File system error while saving speech to file: {e}")
-            raise
-
-    def _generate_audio_stream(self, response: Iterator[bytes] | bytes) -> IO[bytes]:
+    def _generate_audio_stream(self, response: Iterable | bytes) -> bytes:
         """Generates and returns the audio stream."""
-
+
         try:
             if isinstance(response, Iterator):
-                for chunk in response
-                    if chunk:
-                        audio_stream.write(chunk)
+                audio_stream = b"".join(chunk for chunk in response)
             elif isinstance(response, bytes):
-                audio_stream
+                audio_stream = response
+
             else:
                 raise TypeError(f"Unsupported response type: {type(response)}")

-            audio_stream.seek(0)
             self.logger.info("Returning audio stream")
             return audio_stream
         except IOError as e:
@@ -139,14 +124,15 @@
             self.logger.error(f"Value error while processing audio chunks: {e}")
             raise

-    def _process_audio_output(self,
+    def _process_audio_output(self, response: Iterable | bytes) -> tuple[np.ndarray, int]:
         """Processes a single audio output (either stream or file)."""
-        if self.attributes.stream:
-            return self._generate_audio_stream(response)
-        file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
-        return self._save_audio(response, file_format, idx)

-
+        result = self._generate_audio_stream(response)
+        audio_np, sample_rate = audio_bytes_to_numpy(result)
+
+        return audio_np, sample_rate
+
+    def generate_speech(self, input_data: list[Packet]) -> list[tuple] | None:
         """Generates speech and saves it to a file."""
         responses: RESPONSE_TYPE = self.synthesize_speech(input_data)
         if not responses:
@@ -154,29 +140,29 @@

         if isinstance(responses, Iterator):
             responses = [responses]
-
-
+        elif isinstance(responses, Generator):
+            responses = list(responses)
+        audio_outputs = [self._process_audio_output(response) for response in responses]
         return audio_outputs

-    def _handle_streaming_output(self, audio_outputs: list[
+    def _handle_streaming_output(self, audio_outputs: list[tuple]) -> list[AudioPacket]:
         """Handles audio stream output by adding it to the container as AudioPackets."""
         generated_audios: list[AudioPacket] = []
-        sample_rate = int(self.attributes.output_format.split("_")[1])
+        # sample_rate = int(self.attributes.output_format.split("_")[1])
         for audio_output in audio_outputs:
+            audio = audio_output[0]
+            sample_rate = audio_output[1]
             audio_packet = AudioPacket(
-                content=
+                content=audio,
                 sample_rate=sample_rate,
             )
             generated_audios.append(audio_packet)
         return generated_audios

-    def _handle_audio_outputs(self, audio_outputs: list[
+    def _handle_audio_outputs(self, audio_outputs: list[tuple], container: DataContainer) -> None:
         """Handles the audio outputs by appending to the container based on the output type (stream or file)."""
-
-
-            container.audios.extend(self._handle_streaming_output(audio_outputs))
-        else:
-            self._set_generic_data(container, audio_outputs)
+        container.audios = container.audios or []
+        container.audios = self._handle_streaming_output(audio_outputs)

     def execute(self, container: DataContainer) -> DataContainer:
         """
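The net effect of these changes is that the ElevenLabs templates no longer write audio files: streamed byte chunks are joined, decoded to a NumPy array plus sample rate, and wrapped in an `AudioPacket`. A minimal sketch of the chunk-joining step, with the decode step shown only in comments because `audio_bytes_to_numpy` lives in `sinapsis_generic_data_tools` and is not part of this diff:

```python
# Sketch of the reworked audio path in ElevenLabsBase (names follow the diff above;
# this is an illustration, not the package's exact code).
from collections.abc import Iterable, Iterator


def collect_audio_bytes(response: Iterable | bytes) -> bytes:
    """Join streamed chunks into one bytes object, mirroring _generate_audio_stream."""
    if isinstance(response, Iterator):
        return b"".join(chunk for chunk in response)
    if isinstance(response, bytes):
        return response
    raise TypeError(f"Unsupported response type: {type(response)}")


# Downstream (per the diff), _process_audio_output decodes the joined bytes and the
# result becomes an AudioPacket:
#   audio_np, sample_rate = audio_bytes_to_numpy(collect_audio_bytes(response))
#   packet = AudioPacket(content=audio_np, sample_rate=sample_rate)
```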
sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py
ADDED

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+"""Speech-To-Speech template for ElevenLabs."""
+
+from typing import Callable, Iterator, Literal
+
+from sinapsis_core.data_containers.data_packet import AudioPacket
+
+from sinapsis_elevenlabs.helpers.tags import Tags
+from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id
+from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
+
+ElevenLabsSTSUIProperties = ElevenLabsBase.UIProperties
+ElevenLabsSTSUIProperties.tags.extend([Tags.SPEECH_TO_SPEECH, Tags.VOICE_CONVERSION])
+
+
+class ElevenLabsSTS(ElevenLabsBase):
+    """Template to interact with the ElevenLabs Speech-to-Speech API.
+
+    This template takes an input audio and converts it to a new voice using
+    the ElevenLabs Speech-to-Speech (STS) API.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+      - template_name: ElevenLabsSTS
+        class_name: ElevenLabsSTS
+        template_input: InputTemplate
+        attributes:
+          api_key: null
+          model: eleven_multilingual_sts_v2
+          output_file_name: null
+          output_format: mp3_44100_128
+          output_folder: <WORKING_DIR>/elevenlabs/audios
+          stream: false
+          voice: null
+          voice_settings:
+            stability: null
+            similarity_boost: null
+            style: null
+            use_speaker_boost: null
+            speed: null
+          streaming_latency: null
+
+    """
+
+    PACKET_TYPE_NAME: str = "audios"
+    UIProperties = ElevenLabsSTSUIProperties
+
+    class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
+        """Attributes specific to ElevenLabs STS API interaction.
+
+        Attributes:
+            model (Literal): The STS model to use. Options are "eleven_english_sts_v2" or "eleven_multilingual_sts_v2".
+            streaming_latency (int | None): Optional latency optimization for streaming. Defaults to None.
+        """
+
+        model: Literal["eleven_english_sts_v2", "eleven_multilingual_sts_v2"] = "eleven_multilingual_sts_v2"
+        streaming_latency: int | None = None
+
+    def synthesize_speech(self, input_data: list[AudioPacket]) -> Iterator[bytes]:
+        """Sends an audio input to the ElevenLabs API for speech-to-speech synthesis.
+
+        Args:
+            input_data (list[AudioPacket]): List of AudioPacket objects containing the audio to be converted.
+                Only the first AudioPacket in the list is used.
+
+        Returns:
+            Iterator[bytes]: An iterator yielding audio data chunks in the output format specified.
+
+        Raises:
+            ValueError: If there is a problem with the input data or parameters.
+            TypeError: If the input data or files are of incorrect type.
+            KeyError: If the expected key is missing in the API response.
+        """
+        try:
+            method: Callable[..., Iterator[bytes]] = self.client.speech_to_speech.stream # (
+
+            return method(
+                voice_id=get_voice_id(self.client, voice=self.attributes.voice),
+                audio=input_data[0].content,
+                model_id=self.attributes.model,
+                voice_settings=create_voice_settings(self.attributes.voice_settings, as_json=True),
+                output_format=self.attributes.output_format,
+                optimize_streaming_latency=self.attributes.streaming_latency,
+            )
+        except ValueError as e:
+            self.logger.error(f"Value error synthesizing speech: {e}")
+            raise
+        except TypeError as e:
+            self.logger.error(f"Type error in input data or parameters: {e}")
+            raise
+        except KeyError as e:
+            self.logger.error(f"Missing key in input data or settings: {e}")
+            raise