sinapsis-speech 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_speech-0.3.1/PKG-INFO +298 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/README.md +44 -13
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +2 -2
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +2 -1
- sinapsis_speech-0.3.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py +74 -0
- sinapsis_speech-0.3.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py +20 -0
- sinapsis_speech-0.3.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +149 -0
- sinapsis_speech-0.3.1/packages/sinapsis_speech.egg-info/PKG-INFO +298 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/SOURCES.txt +3 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/requires.txt +2 -1
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/top_level.txt +1 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +2 -2
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/pyproject.toml +8 -5
- sinapsis_speech-0.2.0/PKG-INFO +0 -926
- sinapsis_speech-0.2.0/packages/sinapsis_speech.egg-info/PKG-INFO +0 -926
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/LICENSE +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/dependency_links.txt +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +0 -0
- {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/setup.cfg +0 -0
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sinapsis-speech
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: Generate speech using various libraries.
|
|
5
|
+
Author-email: SinapsisAI <dev@sinapsis.tech>
|
|
6
|
+
Project-URL: Homepage, https://sinapsis.tech
|
|
7
|
+
Project-URL: Documentation, https://docs.sinapsis.tech/docs/sinapsis-speech
|
|
8
|
+
Project-URL: Tutorials, https://docs.sinapsis.tech/tutorials
|
|
9
|
+
Project-URL: Repository, https://github.com/Sinapsis-AI/sinapsis-speech.git
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: pip>=24.3.1
|
|
14
|
+
Requires-Dist: sinapsis>=0.2.2
|
|
15
|
+
Provides-Extra: all
|
|
16
|
+
Requires-Dist: sinapsis-elevenlabs[all]; extra == "all"
|
|
17
|
+
Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
|
|
18
|
+
Requires-Dist: sinapsis-kokoro[all]; extra == "all"
|
|
19
|
+
Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
|
|
20
|
+
Requires-Dist: sinapsis-zonos[all]; extra == "all"
|
|
21
|
+
Provides-Extra: gradio-app
|
|
22
|
+
Requires-Dist: sinapsis[webapp]>=0.2.3; extra == "gradio-app"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
<h1 align="center">
|
|
26
|
+
<br>
|
|
27
|
+
<a href="https://sinapsis.tech/">
|
|
28
|
+
<img
|
|
29
|
+
src="https://github.com/Sinapsis-AI/brand-resources/blob/main/sinapsis_logo/4x/logo.png?raw=true"
|
|
30
|
+
alt="" width="300">
|
|
31
|
+
</a><br>
|
|
32
|
+
Sinapsis Speech
|
|
33
|
+
<br>
|
|
34
|
+
</h1>
|
|
35
|
+
|
|
36
|
+
<h4 align="center"> Templates for a wide range of voice generation tasks.</h4>
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<a href="#installation">🐍 Installation</a> •
|
|
40
|
+
<a href="#packages">📦 Packages</a> •
|
|
41
|
+
<a href="#webapp">🌐 Webapps</a> •
|
|
42
|
+
<a href="#documentation">📙 Documentation</a> •
|
|
43
|
+
<a href="#license">🔍 License</a>
|
|
44
|
+
</p>
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
<h2 id="installation">🐍 Installation</h2>
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
> [!IMPORTANT]
|
|
51
|
+
> Sinapsis projects require Python 3.10 or higher.
|
|
52
|
+
>
|
|
53
|
+
|
|
54
|
+
This repo includes packages for performing speech synthesis using different tools:
|
|
55
|
+
|
|
56
|
+
* <code>sinapsis-elevenlabs</code>
|
|
57
|
+
* <code>sinapsis-f5-tts</code>
|
|
58
|
+
* <code>sinapsis-kokoro</code>
|
|
59
|
+
* <code>sinapsis-zonos</code>
|
|
60
|
+
|
|
61
|
+
Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
Install with <code>uv</code>:
|
|
65
|
+
```bash
|
|
66
|
+
uv pip install sinapsis-elevenlabs --extra-index-url https://pypi.sinapsis.tech
|
|
67
|
+
```
|
|
68
|
+
Or with raw <code>pip</code>:
|
|
69
|
+
```bash
|
|
70
|
+
pip install sinapsis-elevenlabs --extra-index-url https://pypi.sinapsis.tech
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**Replace `sinapsis-elevenlabs` with the name of the package you intend to install**.
|
|
74
|
+
|
|
75
|
+
> [!IMPORTANT]
|
|
76
|
+
> Templates in each package may require additional dependencies. For development, we recommend installing the package with all optional dependencies:
|
|
77
|
+
>
|
|
78
|
+
With <code>uv</code>:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
uv pip install sinapsis-elevenlabs[all] --extra-index-url https://pypi.sinapsis.tech
|
|
82
|
+
```
|
|
83
|
+
Or with raw <code>pip</code>:
|
|
84
|
+
```bash
|
|
85
|
+
pip install sinapsis-elevenlabs[all] --extra-index-url https://pypi.sinapsis.tech
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**Be sure to substitute `sinapsis-elevenlabs` with the appropriate package name**.
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
> [!TIP]
|
|
93
|
+
> You can also install all the packages within this project:
|
|
94
|
+
>
|
|
95
|
+
```bash
|
|
96
|
+
uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
<h2 id="packages">📦 Packages</h2>
|
|
101
|
+
|
|
102
|
+
This repository is organized into modular packages, each designed for integration with different text-to-speech tools. These packages provide ready-to-use templates for speech synthesis. Below is an overview of the available packages:
|
|
103
|
+
|
|
104
|
+
<details>
|
|
105
|
+
<summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
|
|
106
|
+
|
|
107
|
+
This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
|
|
108
|
+
|
|
109
|
+
- **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
|
|
110
|
+
|
|
111
|
+
- **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
|
|
112
|
+
|
|
113
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
|
|
114
|
+
|
|
115
|
+
</details>
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
<details>
|
|
119
|
+
<summary id="f5tts"><strong><span style="font-size: 1.4em;"> Sinapsis F5-TTS</span></strong></summary>
|
|
120
|
+
|
|
121
|
+
This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [F5TTS](https://github.com/SWivid/F5-TTS).
|
|
122
|
+
|
|
123
|
+
- **F5TTSInference**: Converts text to speech using the F5TTS model with voice cloning capabilities.
|
|
124
|
+
|
|
125
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_f5_tts/README.md).
|
|
126
|
+
|
|
127
|
+
</details>
|
|
128
|
+
<details>
|
|
129
|
+
<summary id="kokoro"><strong><span style="font-size: 1.4em;"> Sinapsis Kokoro</span></strong></summary>
|
|
130
|
+
|
|
131
|
+
This package provides a single template for integrating, configuring, and running text-to-speech (TTS) synthesis using the [Kokoro 82M v1.0](https://huggingface.co/hexgrad/Kokoro-82M) model.
|
|
132
|
+
|
|
133
|
+
- **KokoroTTS**: Converts text to speech using the Kokoro TTS model. The template processes text packets from the input container, generates corresponding audio using Kokoro, and adds the resulting audio packets to the container.
|
|
134
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_kokoro/README.md).
|
|
135
|
+
</details>
|
|
136
|
+
<details>
|
|
137
|
+
<summary id="zonos"><strong><span style="font-size: 1.4em;"> Sinapsis Zonos</span></strong></summary>
|
|
138
|
+
|
|
139
|
+
This package provides a single template for integrating, configuring, and running **text-to-speech (TTS)** and **voice cloning** functionalities powered by [Zonos](https://github.com/Zyphra/Zonos/tree/main).
|
|
140
|
+
|
|
141
|
+
- **ZonosTTS**: Template for converting text to speech or performing voice cloning based on the presence of an audio sample.
|
|
142
|
+
|
|
143
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_zonos/README.md).
|
|
144
|
+
|
|
145
|
+
</details>
|
|
146
|
+
|
|
147
|
+
<h2 id="webapp">🌐 Webapps</h2>
|
|
148
|
+
The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
|
|
149
|
+
|
|
150
|
+
> [!IMPORTANT]
|
|
151
|
+
> To run the app you first need to clone this repository:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
git clone git@github.com:Sinapsis-ai/sinapsis-speech.git
|
|
155
|
+
cd sinapsis-speech
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
> [!NOTE]
|
|
159
|
+
> If you'd like to enable external app sharing in Gradio, `export GRADIO_SHARE_APP=True`
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
> [!IMPORTANT]
|
|
163
|
+
> Elevenlabs requires an API key to run any inference. To get started, visit the [official website](https://elevenlabs.io) and create an account. If you already have an account, go to the [API keys page](https://elevenlabs.io/app/settings/api-keys) to generate a token.
|
|
164
|
+
|
|
165
|
+
> [!IMPORTANT]
|
|
166
|
+
> Set your env var using <code> export ELEVENLABS_API_KEY='your-api-key'</code>
|
|
167
|
+
|
|
168
|
+
> [!IMPORTANT]
|
|
169
|
+
> F5-TTS requires a reference audio file for voice cloning. Make sure you have a reference audio file in the artifacts directory.
|
|
170
|
+
|
|
171
|
+
> [!NOTE]
|
|
172
|
+
> Agent configuration can be changed through the `AGENT_CONFIG_PATH` env var. You can check the available configurations in each package configs folder.
|
|
173
|
+
|
|
174
|
+
<details>
|
|
175
|
+
<summary id="docker"><strong><span style="font-size: 1.4em;">🐳 Docker</span></strong></summary>
|
|
176
|
+
|
|
177
|
+
**IMPORTANT**: This Docker image depends on the `sinapsis-nvidia:base` image. For detailed instructions, please refer to the [Sinapsis README](https://github.com/Sinapsis-ai/sinapsis?tab=readme-ov-file#docker).
|
|
178
|
+
|
|
179
|
+
1. **Build the sinapsis-speech image**:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
docker compose -f docker/compose.yaml build
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
2. **Start the app container**:
|
|
187
|
+
|
|
188
|
+
- For ElevenLabs:
|
|
189
|
+
```bash
|
|
190
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-elevenlabs
|
|
191
|
+
```
|
|
192
|
+
- For F5-TTS:
|
|
193
|
+
```bash
|
|
194
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-f5_tts
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
- For Kokoro:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
- For Zonos:
|
|
204
|
+
```bash
|
|
205
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
3. **Check the logs**
|
|
209
|
+
|
|
210
|
+
- For ElevenLabs:
|
|
211
|
+
```bash
|
|
212
|
+
docker logs -f sinapsis-elevenlabs
|
|
213
|
+
```
|
|
214
|
+
- For F5-TTS:
|
|
215
|
+
```bash
|
|
216
|
+
docker logs -f sinapsis-f5tts
|
|
217
|
+
```
|
|
218
|
+
- For Kokoro:
|
|
219
|
+
```bash
|
|
220
|
+
docker logs -f sinapsis-kokoro
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
- For Zonos:
|
|
224
|
+
```bash
|
|
225
|
+
docker logs -f sinapsis-zonos
|
|
226
|
+
```
|
|
227
|
+
4. **The logs will display the URL to access the webapp (e.g.)**:
|
|
228
|
+
```bash
|
|
229
|
+
Running on local URL: http://127.0.0.1:7860
|
|
230
|
+
```
|
|
231
|
+
**NOTE**: The URL may be different; check the output of the logs.
|
|
232
|
+
5. **To stop the app**:
|
|
233
|
+
```bash
|
|
234
|
+
docker compose -f docker/compose_apps.yaml down
|
|
235
|
+
```
|
|
236
|
+
</details>
|
|
237
|
+
|
|
238
|
+
<details>
|
|
239
|
+
<summary id="virtual-environment"><strong><span style="font-size: 1.4em;">💻 UV</span></strong></summary>
|
|
240
|
+
|
|
241
|
+
To run the webapp using the <code>uv</code> package manager, follow these steps:
|
|
242
|
+
|
|
243
|
+
1. **Sync the virtual environment**:
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
uv sync --frozen
|
|
247
|
+
```
|
|
248
|
+
2. **Install the wheel**:
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
3. **Run the webapp**:
|
|
257
|
+
|
|
258
|
+
- For ElevenLabs:
|
|
259
|
+
```bash
|
|
260
|
+
uv run webapps/generic_tts_apps/elevenlabs_tts_app.py
|
|
261
|
+
```
|
|
262
|
+
- For F5-TTS:
|
|
263
|
+
```bash
|
|
264
|
+
uv run webapps/packet_tts_apps/f5_tts_app.py
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
- For Kokoro:
|
|
268
|
+
```bash
|
|
269
|
+
uv run webapps/packet_tts_apps/kokoro_tts_app.py
|
|
270
|
+
```
|
|
271
|
+
- For Zonos:
|
|
272
|
+
```bash
|
|
273
|
+
uv run webapps/generic_tts_apps/zonos_tts_app.py
|
|
274
|
+
```
|
|
275
|
+
4. **The terminal will display the URL to access the webapp (e.g.)**:
|
|
276
|
+
```bash
|
|
277
|
+
Running on local URL: http://127.0.0.1:7860
|
|
278
|
+
```
|
|
279
|
+
**NOTE**: The URL may vary; check the terminal output for the correct address.
|
|
280
|
+
|
|
281
|
+
</details>
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
<h2 id="documentation">📙 Documentation</h2>
|
|
286
|
+
|
|
287
|
+
Documentation is available on the [sinapsis website](https://docs.sinapsis.tech/docs)
|
|
288
|
+
|
|
289
|
+
Tutorials for different projects within sinapsis are available at [sinapsis tutorials page](https://docs.sinapsis.tech/tutorials)
|
|
290
|
+
|
|
291
|
+
<h2 id="license">🔍 License</h2>
|
|
292
|
+
|
|
293
|
+
This project is licensed under the AGPLv3 license, which encourages open collaboration and sharing. For more details, please refer to the [LICENSE](LICENSE) file.
|
|
294
|
+
|
|
295
|
+
For commercial use, please refer to our [official Sinapsis website](https://sinapsis.tech) for information on obtaining a commercial license.
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
|
|
@@ -31,6 +31,7 @@ This repo includes packages for performing speech synthesis using different tool
|
|
|
31
31
|
|
|
32
32
|
* <code>sinapsis-elevenlabs</code>
|
|
33
33
|
* <code>sinapsis-f5-tts</code>
|
|
34
|
+
* <code>sinapsis-kokoro</code>
|
|
34
35
|
* <code>sinapsis-zonos</code>
|
|
35
36
|
|
|
36
37
|
Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
|
|
@@ -100,7 +101,14 @@ This package provides a template for seamlessly integrating, configuring, and ru
|
|
|
100
101
|
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_f5_tts/README.md).
|
|
101
102
|
|
|
102
103
|
</details>
|
|
104
|
+
<details>
|
|
105
|
+
<summary id="kokoro"><strong><span style="font-size: 1.4em;"> Sinapsis Kokoro</span></strong></summary>
|
|
106
|
+
|
|
107
|
+
This package provides a single template for integrating, configuring, and running text-to-speech (TTS) synthesis using the [Kokoro 82M v1.0](https://huggingface.co/hexgrad/Kokoro-82M) model.
|
|
103
108
|
|
|
109
|
+
- **KokoroTTS**: Converts text to speech using the Kokoro TTS model. The template processes text packets from the input container, generates corresponding audio using Kokoro, and adds the resulting audio packets to the container.
|
|
110
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_kokoro/README.md).
|
|
111
|
+
</details>
|
|
104
112
|
<details>
|
|
105
113
|
<summary id="zonos"><strong><span style="font-size: 1.4em;"> Sinapsis Zonos</span></strong></summary>
|
|
106
114
|
|
|
@@ -139,41 +147,56 @@ cd sinapsis-speech
|
|
|
139
147
|
> [!NOTE]
|
|
140
148
|
> Agent configuration can be changed through the `AGENT_CONFIG_PATH` env var. You can check the available configurations in each package configs folder.
|
|
141
149
|
|
|
142
|
-
|
|
143
150
|
<details>
|
|
144
151
|
<summary id="docker"><strong><span style="font-size: 1.4em;">🐳 Docker</span></strong></summary>
|
|
145
152
|
|
|
146
153
|
**IMPORTANT**: This Docker image depends on the `sinapsis-nvidia:base` image. For detailed instructions, please refer to the [Sinapsis README](https://github.com/Sinapsis-ai/sinapsis?tab=readme-ov-file#docker).
|
|
147
154
|
|
|
148
155
|
1. **Build the sinapsis-speech image**:
|
|
156
|
+
|
|
149
157
|
```bash
|
|
150
158
|
docker compose -f docker/compose.yaml build
|
|
151
159
|
```
|
|
152
160
|
|
|
161
|
+
|
|
153
162
|
2. **Start the app container**:
|
|
154
|
-
|
|
163
|
+
|
|
164
|
+
- For ElevenLabs:
|
|
155
165
|
```bash
|
|
156
166
|
docker compose -f docker/compose_apps.yaml up -d sinapsis-elevenlabs
|
|
157
167
|
```
|
|
158
|
-
For F5-TTS:
|
|
168
|
+
- For F5-TTS:
|
|
159
169
|
```bash
|
|
160
170
|
docker compose -f docker/compose_apps.yaml up -d sinapsis-f5_tts
|
|
161
171
|
```
|
|
162
|
-
|
|
172
|
+
|
|
173
|
+
- For Kokoro:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
- For Zonos:
|
|
163
180
|
```bash
|
|
164
181
|
docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
|
|
165
182
|
```
|
|
166
183
|
|
|
167
184
|
3. **Check the logs**
|
|
168
|
-
|
|
185
|
+
|
|
186
|
+
- For ElevenLabs:
|
|
169
187
|
```bash
|
|
170
188
|
docker logs -f sinapsis-elevenlabs
|
|
171
189
|
```
|
|
172
|
-
For F5-TTS:
|
|
190
|
+
- For F5-TTS:
|
|
173
191
|
```bash
|
|
174
192
|
docker logs -f sinapsis-f5tts
|
|
175
193
|
```
|
|
176
|
-
For
|
|
194
|
+
- For Kokoro:
|
|
195
|
+
```bash
|
|
196
|
+
docker logs -f sinapsis-kokoro
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
- For Zonos:
|
|
177
200
|
```bash
|
|
178
201
|
docker logs -f sinapsis-zonos
|
|
179
202
|
```
|
|
@@ -204,18 +227,26 @@ uv sync --frozen
|
|
|
204
227
|
uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
|
|
205
228
|
```
|
|
206
229
|
|
|
230
|
+
|
|
231
|
+
|
|
207
232
|
3. **Run the webapp**:
|
|
208
|
-
|
|
233
|
+
|
|
234
|
+
- For ElevenLabs:
|
|
209
235
|
```bash
|
|
210
|
-
uv run webapps/
|
|
236
|
+
uv run webapps/generic_tts_apps/elevenlabs_tts_app.py
|
|
211
237
|
```
|
|
212
|
-
For F5-TTS:
|
|
238
|
+
- For F5-TTS:
|
|
239
|
+
```bash
|
|
240
|
+
uv run webapps/packet_tts_apps/f5_tts_app.py
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
- For Kokoro:
|
|
213
244
|
```bash
|
|
214
|
-
uv run webapps/
|
|
245
|
+
uv run webapps/packet_tts_apps/kokoro_tts_app.py
|
|
215
246
|
```
|
|
216
|
-
For Zonos:
|
|
247
|
+
- For Zonos:
|
|
217
248
|
```bash
|
|
218
|
-
uv run webapps/
|
|
249
|
+
uv run webapps/generic_tts_apps/zonos_tts_app.py
|
|
219
250
|
```
|
|
220
251
|
4. **The terminal will display the URL to access the webapp (e.g.)**:
|
|
221
252
|
```bash
|
|
@@ -12,11 +12,11 @@ from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
|
|
|
12
12
|
from elevenlabs.types import OutputFormat
|
|
13
13
|
from pydantic import Field
|
|
14
14
|
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
|
|
15
|
-
from sinapsis_core.template_base.
|
|
16
|
-
Template,
|
|
15
|
+
from sinapsis_core.template_base.base_models import (
|
|
17
16
|
TemplateAttributes,
|
|
18
17
|
TemplateAttributeType,
|
|
19
18
|
)
|
|
19
|
+
from sinapsis_core.template_base.template import Template
|
|
20
20
|
from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
|
|
21
21
|
|
|
22
22
|
from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
|
|
@@ -12,7 +12,8 @@ from sinapsis_core.data_containers.data_packet import (
|
|
|
12
12
|
AudioPacket,
|
|
13
13
|
DataContainer,
|
|
14
14
|
)
|
|
15
|
-
from sinapsis_core.template_base import Template
|
|
15
|
+
from sinapsis_core.template_base import Template
|
|
16
|
+
from sinapsis_core.template_base.base_models import TemplateAttributes
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
@dataclass
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from pydantic.dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
# Closed set of voice identifiers accepted by the Kokoro templates.
# Names follow the `<language letter><gender letter>_<name>` scheme; the first
# character is reused as the pipeline language code, so entries must match the
# upstream voice names exactly.
# NOTE: "af_jessica" and "af_kore" are two distinct voices. They were previously
# fused into the single invalid entry "af_jessicaaf_kore", which made both
# voices unselectable.
kokoro_voices = Literal[
    "af_heart",
    "af_alloy",
    "af_aoede",
    "af_bella",
    "af_jessica",
    "af_kore",
    "af_nicole",
    "af_nova",
    "af_river",
    "af_sarah",
    "af_sky",
    "am_adam",
    "am_echo",
    "am_eric",
    "am_fenrir",
    "am_liam",
    "am_michael",
    "am_onyx",
    "am_puck",
    "am_santa",
    "bf_alice",
    "bf_emma",
    "bf_isabella",
    "bf_lily",
    "bm_daniel",
    "bm_fable",
    "bm_george",
    "bm_lewis",
    "jf_alpha",
    "jf_gongitsune",
    "jf_nezumi",
    "jf_tebukuro",
    "jm_kumo",
    "zf_xiaobei",
    "zf_xiaoni",
    "zf_xiaoxiao",
    "zf_xiaoyi",
    "zm_yunjian",
    "zm_yunxi",
    "zm_yunxia",
    "zm_yunyang",
    "ef_dora",
    "em_alex",
    "em_santa",
    "ff_siwis",
    "hf_alpha",
    "hf_beta",
    "hm_omega",
    "hm_psi",
    "if_sara",
    "im_nicola",
    "pf_dora",
    "pm_alex",
    "pm_santa",
]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass(frozen=True)
class KokoroKeys:
    """
    Frozen container for the constant values shared by the Kokoro templates.

    It holds the repository id of the Kokoro model and the name of the voice
    that is used when no voice is configured explicitly.
    """

    # Model repository id handed to the Kokoro pipeline.
    repo_id: Literal["hexgrad/Kokoro-82M"] = "hexgrad/Kokoro-82M"
    # Voice name used as the default value of a template's `voice` attribute.
    default_voice: Literal["af_heart"] = "af_heart"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import importlib
|
|
3
|
+
from typing import Callable
|
|
4
|
+
|
|
5
|
+
# Dotted path of the package that contains the template modules.
_root_lib_path = "sinapsis_kokoro.templates"

# Maps each public template class name to the module that defines it; the
# module-level __getattr__ reads this table to import templates on first access.
_template_lookup = {
    "KokoroTTS": f"{_root_lib_path}.kokoro_tts",
}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def __getattr__(name: str) -> Callable:
    """
    Resolve a template class lazily when it is accessed on this package.

    Args:
        name (str): Attribute name requested from the package.

    Returns:
        Callable: The attribute called `name` taken from the module registered
            for it in `_template_lookup`.

    Raises:
        AttributeError: If `name` is not a registered template.
    """
    module_path = _template_lookup.get(name)
    if module_path is None:
        raise AttributeError(f"template `{name}` not found in {_root_lib_path}")

    # Import only the module that owns the requested template.
    return getattr(importlib.import_module(module_path), name)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Public API: exactly the template names that can be resolved lazily.
__all__ = list(_template_lookup.keys())
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from typing import Generator
|
|
3
|
+
from urllib.error import HTTPError
|
|
4
|
+
|
|
5
|
+
import torch
|
|
6
|
+
from kokoro import KPipeline
|
|
7
|
+
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
|
|
8
|
+
from sinapsis_core.template_base.base_models import (
|
|
9
|
+
TemplateAttributes,
|
|
10
|
+
TemplateAttributeType,
|
|
11
|
+
)
|
|
12
|
+
from sinapsis_core.template_base.template import Template
|
|
13
|
+
from sinapsis_core.utils.logging_utils import make_loguru
|
|
14
|
+
|
|
15
|
+
from sinapsis_kokoro.helpers.kokoro_utils import KokoroKeys, kokoro_voices
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class KokoroTTS(Template):
    """
    Template for text-to-speech (TTS) synthesis using the Kokoro 82M v1.0 model.

    The pipeline is built once when the template is constructed. On each
    execution the content of every text packet in the container is concatenated,
    passed through the pipeline, and one audio packet per generated chunk is
    appended to ``container.audios``.

    Usage example:

    agent:
      name: my_test_agent
    templates:
    - template_name: InputTemplate
      class_name: InputTemplate
      attributes: {}
    - template_name: KokoroTTS
      class_name: KokoroTTS
      template_input: InputTemplate
      attributes:
        speed: 1
        voice: af_heart
    """

    class AttributesBaseModel(TemplateAttributes):
        """
        Configuration attributes for the Kokoro TTS model.

        Args:
            speed (int | float): The speed at which the speech will be generated. Default is 1 (normal speed).
            split_pattern (str): The regular expression pattern used to split the input text into smaller chunks.
                Default is r"\\n+" (split on newlines).
            voice (kokoro_voices): The voice model to use for speech synthesis. Default is "af_heart".
                The first character of the voice name is also used as the pipeline language code.

        Notes:
            The list of languages and voices supported by Kokoro can be found at:
            https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
        """

        speed: int | float = 1
        split_pattern: str = r"\n+"
        voice: kokoro_voices = KokoroKeys.default_voice

    def __init__(self, attributes: TemplateAttributeType) -> None:
        """Initializes the Kokoro TTS pipeline with the provided attributes."""
        super().__init__(attributes)
        self.pipeline = self.init_pipeline()
        self.logger = make_loguru()

    def init_pipeline(self) -> KPipeline:
        """
        Initializes the Kokoro TTS pipeline with the language code and repository id.

        Returns:
            KPipeline: The initialized TTS pipeline for generating speech.
        """
        # The language code is the first letter of the configured voice name
        # (e.g. "a" for "af_heart").
        return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id)

    def _create_audio_packet(
        self,
        audio_data: torch.Tensor,
        sample_rate: int,
        container: DataContainer,
    ) -> None:
        """
        Creates an audio packet from the generated audio data and adds it to the container.

        Args:
            audio_data (torch.Tensor): The generated audio data (raw audio).
            sample_rate (int): The sample rate of the generated audio (typically 24000 Hz).
            container (DataContainer): The container to which the audio packet will be added.
        """
        audio_packet = AudioPacket(
            content=audio_data,
            source=self.instance_name,
            sample_rate=sample_rate,
        )
        container.audios.append(audio_packet)

    def _process_audio_chunks(self, generator: Generator, container: DataContainer) -> None:
        """
        Processes the audio chunks generated by the pipeline and creates audio packets.

        Args:
            generator (Generator): The generator that yields (text, phonemes, audio) tuples.
            container (DataContainer): The container that receives the generated audio packets.
        """
        for i, (gs, ps, audio) in enumerate(generator):
            self.logger.debug(f"Index: {i}")
            self.logger.debug(f"Text: {gs}")
            self.logger.debug(f"Phonemes: {ps}")
            if audio is not None:
                # Every chunk is tagged with a fixed 24000 Hz sample rate.
                self._create_audio_packet(audio, 24000, container)
            else:
                self.logger.warning(f"Audio is None for index {i}")

    def generate_speech(self, container: DataContainer) -> None:
        """
        Generates speech from the input text in the provided data container.

        HTTP errors raised while the generated chunks are consumed are logged and
        swallowed, so the container is left with whatever audio was produced
        before the failure.

        Args:
            container (DataContainer): The container holding the input text data to be converted into speech.
        """
        # NOTE(review): packets are concatenated without a separator, so the last
        # word of one packet runs into the first word of the next — confirm this
        # is intended before relying on multi-packet input.
        input_text = "".join(t.content for t in container.texts)
        generator = self.pipeline(
            input_text,
            voice=self.attributes.voice,
            speed=self.attributes.speed,
            split_pattern=self.attributes.split_pattern,
        )

        try:
            self._process_audio_chunks(generator, container)
        except HTTPError as e:
            self.logger.error(f"Unable to generate speech: {e}")

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Processes the input data and generates the corresponding speech output.

        Args:
            container (DataContainer): The container holding the input text data.

        Returns:
            DataContainer: The same container, with generated audio packets appended
                when it held any text packets; otherwise returned unchanged.
        """

        if not container.texts:
            self.logger.debug("No query to enter")
            return container

        self.generate_speech(container)

        return container
|