sinapsis-speech 0.2.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. sinapsis_speech-0.3.1/PKG-INFO +298 -0
  2. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/README.md +44 -13
  3. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +2 -2
  4. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +2 -1
  5. sinapsis_speech-0.3.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py +74 -0
  6. sinapsis_speech-0.3.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py +20 -0
  7. sinapsis_speech-0.3.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +149 -0
  8. sinapsis_speech-0.3.1/packages/sinapsis_speech.egg-info/PKG-INFO +298 -0
  9. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/SOURCES.txt +3 -0
  10. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/requires.txt +2 -1
  11. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/top_level.txt +1 -0
  12. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +2 -2
  13. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/pyproject.toml +8 -5
  14. sinapsis_speech-0.2.0/PKG-INFO +0 -926
  15. sinapsis_speech-0.2.0/packages/sinapsis_speech.egg-info/PKG-INFO +0 -926
  16. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/LICENSE +0 -0
  17. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
  18. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
  19. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +0 -0
  20. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +0 -0
  21. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +0 -0
  22. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +0 -0
  23. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +0 -0
  24. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
  25. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +0 -0
  26. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_speech.egg-info/dependency_links.txt +0 -0
  27. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
  28. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
  29. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +0 -0
  30. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +0 -0
  31. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +0 -0
  32. {sinapsis_speech-0.2.0 → sinapsis_speech-0.3.1}/setup.cfg +0 -0
@@ -0,0 +1,298 @@
1
+ Metadata-Version: 2.4
2
+ Name: sinapsis-speech
3
+ Version: 0.3.1
4
+ Summary: Generate speech using various libraries.
5
+ Author-email: SinapsisAI <dev@sinapsis.tech>
6
+ Project-URL: Homepage, https://sinapsis.tech
7
+ Project-URL: Documentation, https://docs.sinapsis.tech/docs/sinapsis-speech
8
+ Project-URL: Tutorials, https://docs.sinapsis.tech/tutorials
9
+ Project-URL: Repository, https://github.com/Sinapsis-AI/sinapsis-speech.git
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: pip>=24.3.1
14
+ Requires-Dist: sinapsis>=0.2.2
15
+ Provides-Extra: all
16
+ Requires-Dist: sinapsis-elevenlabs[all]; extra == "all"
17
+ Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
18
+ Requires-Dist: sinapsis-kokoro[all]; extra == "all"
19
+ Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
20
+ Requires-Dist: sinapsis-zonos[all]; extra == "all"
21
+ Provides-Extra: gradio-app
22
+ Requires-Dist: sinapsis[webapp]>=0.2.3; extra == "gradio-app"
23
+ Dynamic: license-file
24
+
25
+ <h1 align="center">
26
+ <br>
27
+ <a href="https://sinapsis.tech/">
28
+ <img
29
+ src="https://github.com/Sinapsis-AI/brand-resources/blob/main/sinapsis_logo/4x/logo.png?raw=true"
30
+ alt="" width="300">
31
+ </a><br>
32
+ Sinapsis Speech
33
+ <br>
34
+ </h1>
35
+
36
+ <h4 align="center"> Templates for a wide range of voice generation tasks.</h4>
37
+
38
+ <p align="center">
39
+ <a href="#installation">🐍 Installation</a> •
40
+ <a href="#packages">📦 Packages</a> •
41
+ <a href="#webapp">🌐 Webapps</a> •
42
+ <a href="#documentation">📙 Documentation</a> •
43
+ <a href="#license">🔍 License</a>
44
+ </p>
45
+
46
+
47
+ <h2 id="installation">🐍 Installation</h2>
48
+
49
+
50
+ > [!IMPORTANT]
51
+ > Sinapsis projects require Python 3.10 or higher.
52
+ >
53
+
54
+ This repo includes packages for performing speech synthesis using different tools:
55
+
56
+ * <code>sinapsis-elevenlabs</code>
57
+ * <code>sinapsis-f5-tts</code>
58
+ * <code>sinapsis-kokoro</code>
59
+ * <code>sinapsis-zonos</code>
60
+
61
+ Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
62
+
63
+
64
+ Install with <code>uv</code>:
65
+ ```bash
66
+ uv pip install sinapsis-elevenlabs --extra-index-url https://pypi.sinapsis.tech
67
+ ```
68
+ Or with raw <code>pip</code>:
69
+ ```bash
70
+ pip install sinapsis-elevenlabs --extra-index-url https://pypi.sinapsis.tech
71
+ ```
72
+
73
+ **Replace `sinapsis-elevenlabs` with the name of the package you intend to install**.
74
+
75
+ > [!IMPORTANT]
76
+ > Templates in each package may require additional dependencies. For development, we recommend installing the package with all optional dependencies:
77
+ >
78
+ With <code>uv</code>:
79
+
80
+ ```bash
81
+ uv pip install sinapsis-elevenlabs[all] --extra-index-url https://pypi.sinapsis.tech
82
+ ```
83
+ Or with raw <code>pip</code>:
84
+ ```bash
85
+ pip install sinapsis-elevenlabs[all] --extra-index-url https://pypi.sinapsis.tech
86
+ ```
87
+
88
+ **Be sure to substitute `sinapsis-elevenlabs` with the appropriate package name**.
89
+
90
+
91
+
92
+ > [!TIP]
93
+ > You can also install all the packages within this project:
94
+ >
95
+ ```bash
96
+ uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
97
+ ```
98
+
99
+
100
+ <h2 id="packages">📦 Packages</h2>
101
+
102
+ This repository is organized into modular packages, each designed for integration with different text-to-speech tools. These packages provide ready-to-use templates for speech synthesis. Below is an overview of the available packages:
103
+
104
+ <details>
105
+ <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
106
+
107
+ This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
108
+
109
+ - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
110
+
111
+ - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
112
+
113
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
114
+
115
+ </details>
116
+
117
+
118
+ <details>
119
+ <summary id="f5tts"><strong><span style="font-size: 1.4em;"> Sinapsis F5-TTS</span></strong></summary>
120
+
121
+ This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [F5TTS](https://github.com/SWivid/F5-TTS).
122
+
123
+ - **F5TTSInference**: Converts text to speech using the F5TTS model with voice cloning capabilities.
124
+
125
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_f5_tts/README.md).
126
+
127
+ </details>
128
+ <details>
129
+ <summary id="kokoro"><strong><span style="font-size: 1.4em;"> Sinapsis Kokoro</span></strong></summary>
130
+
131
+ This package provides a single template for integrating, configuring, and running text-to-speech (TTS) synthesis using the [Kokoro 82M v1.0](https://huggingface.co/hexgrad/Kokoro-82M) model.
132
+
133
+ - **KokoroTTS**: Converts text to speech using the Kokoro TTS model. The template processes text packets from the input container, generates corresponding audio using Kokoro, and adds the resulting audio packets to the container.
134
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_kokoro/README.md).
135
+ </details>
136
+ <details>
137
+ <summary id="zonos"><strong><span style="font-size: 1.4em;"> Sinapsis Zonos</span></strong></summary>
138
+
139
+ This package provides a single template for integrating, configuring, and running **text-to-speech (TTS)** and **voice cloning** functionalities powered by [Zonos](https://github.com/Zyphra/Zonos/tree/main).
140
+
141
+ - **ZonosTTS**: Template for converting text to speech or performing voice cloning based on the presence of an audio sample.
142
+
143
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_zonos/README.md).
144
+
145
+ </details>
146
+
147
+ <h2 id="webapp">🌐 Webapps</h2>
148
+ The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
149
+
150
+ > [!IMPORTANT]
151
+ > To run the app you first need to clone this repository:
152
+
153
+ ```bash
154
+ git clone git@github.com:Sinapsis-ai/sinapsis-speech.git
155
+ cd sinapsis-speech
156
+ ```
157
+
158
+ > [!NOTE]
159
+ > If you'd like to enable external app sharing in Gradio, `export GRADIO_SHARE_APP=True`
160
+
161
+
162
+ > [!IMPORTANT]
163
+ > Elevenlabs requires an API key to run any inference. To get started, visit the [official website](https://elevenlabs.io) and create an account. If you already have an account, go to the [API keys page](https://elevenlabs.io/app/settings/api-keys) to generate a token.
164
+
165
+ > [!IMPORTANT]
166
+ > Set your env var using <code> export ELEVENLABS_API_KEY='your-api-key'</code>
167
+
168
+ > [!IMPORTANT]
169
+ > F5-TTS requires a reference audio file for voice cloning. Make sure you have a reference audio file in the artifacts directory.
170
+
171
+ > [!NOTE]
172
+ > Agent configuration can be changed through the `AGENT_CONFIG_PATH` env var. You can check the available configurations in each package configs folder.
173
+
174
+ <details>
175
+ <summary id="docker"><strong><span style="font-size: 1.4em;">🐳 Docker</span></strong></summary>
176
+
177
+ **IMPORTANT**: This Docker image depends on the `sinapsis-nvidia:base` image. For detailed instructions, please refer to the [Sinapsis README](https://github.com/Sinapsis-ai/sinapsis?tab=readme-ov-file#docker).
178
+
179
+ 1. **Build the sinapsis-speech image**:
180
+
181
+ ```bash
182
+ docker compose -f docker/compose.yaml build
183
+ ```
184
+
185
+
186
+ 2. **Start the app container**:
187
+
188
+ - For ElevenLabs:
189
+ ```bash
190
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-elevenlabs
191
+ ```
192
+ - For F5-TTS:
193
+ ```bash
194
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-f5_tts
195
+ ```
196
+
197
+ - For Kokoro:
198
+
199
+ ```bash
200
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
201
+ ```
202
+
203
+ - For Zonos:
204
+ ```bash
205
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
206
+ ```
207
+
208
+ 3. **Check the logs**
209
+
210
+ - For ElevenLabs:
211
+ ```bash
212
+ docker logs -f sinapsis-elevenlabs
213
+ ```
214
+ - For F5-TTS:
215
+ ```bash
216
+ docker logs -f sinapsis-f5tts
217
+ ```
218
+ - For Kokoro:
219
+ ```bash
220
+ docker logs -f sinapsis-kokoro
221
+ ```
222
+
223
+ - For Zonos:
224
+ ```bash
225
+ docker logs -f sinapsis-zonos
226
+ ```
227
+ 4. **The logs will display the URL to access the webapp (e.g.)**:
228
+ ```bash
229
+ Running on local URL: http://127.0.0.1:7860
230
+ ```
231
+ **NOTE**: The URL may be different; check the log output for the correct address.
232
+ 5. **To stop the app**:
233
+ ```bash
234
+ docker compose -f docker/compose_apps.yaml down
235
+ ```
236
+ </details>
237
+
238
+ <details>
239
+ <summary id="virtual-environment"><strong><span style="font-size: 1.4em;">💻 UV</span></strong></summary>
240
+
241
+ To run the webapp using the <code>uv</code> package manager, follow these steps:
242
+
243
+ 1. **Sync the virtual environment**:
244
+
245
+ ```bash
246
+ uv sync --frozen
247
+ ```
248
+ 2. **Install the wheel**:
249
+
250
+ ```bash
251
+ uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
252
+ ```
253
+
254
+
255
+
256
+ 3. **Run the webapp**:
257
+
258
+ - For ElevenLabs:
259
+ ```bash
260
+ uv run webapps/generic_tts_apps/elevenlabs_tts_app.py
261
+ ```
262
+ - For F5-TTS:
263
+ ```bash
264
+ uv run webapps/packet_tts_apps/f5_tts_app.py
265
+ ```
266
+
267
+ - For Kokoro:
268
+ ```bash
269
+ uv run webapps/packet_tts_apps/kokoro_tts_app.py
270
+ ```
271
+ - For Zonos:
272
+ ```bash
273
+ uv run webapps/generic_tts_apps/zonos_tts_app.py
274
+ ```
275
+ 4. **The terminal will display the URL to access the webapp (e.g.)**:
276
+ ```bash
277
+ Running on local URL: http://127.0.0.1:7860
278
+ ```
279
+ **NOTE**: The URL may vary; check the terminal output for the correct address.
280
+
281
+ </details>
282
+
283
+
284
+
285
+ <h2 id="documentation">📙 Documentation</h2>
286
+
287
+ Documentation is available on the [sinapsis website](https://docs.sinapsis.tech/docs)
288
+
289
+ Tutorials for different projects within sinapsis are available at [sinapsis tutorials page](https://docs.sinapsis.tech/tutorials)
290
+
291
+ <h2 id="license">🔍 License</h2>
292
+
293
+ This project is licensed under the AGPLv3 license, which encourages open collaboration and sharing. For more details, please refer to the [LICENSE](LICENSE) file.
294
+
295
+ For commercial use, please refer to our [official Sinapsis website](https://sinapsis.tech) for information on obtaining a commercial license.
296
+
297
+
298
+
@@ -31,6 +31,7 @@ This repo includes packages for performing speech synthesis using different tool
31
31
 
32
32
  * <code>sinapsis-elevenlabs</code>
33
33
  * <code>sinapsis-f5-tts</code>
34
+ * <code>sinapsis-kokoro</code>
34
35
  * <code>sinapsis-zonos</code>
35
36
 
36
37
  Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
@@ -100,7 +101,14 @@ This package provides a template for seamlessly integrating, configuring, and ru
100
101
  For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_f5_tts/README.md).
101
102
 
102
103
  </details>
104
+ <details>
105
+ <summary id="kokoro"><strong><span style="font-size: 1.4em;"> Sinapsis Kokoro</span></strong></summary>
106
+
107
+ This package provides a single template for integrating, configuring, and running text-to-speech (TTS) synthesis using the [Kokoro 82M v1.0](https://huggingface.co/hexgrad/Kokoro-82M) model.
103
108
 
109
+ - **KokoroTTS**: Converts text to speech using the Kokoro TTS model. The template processes text packets from the input container, generates corresponding audio using Kokoro, and adds the resulting audio packets to the container.
110
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_kokoro/README.md).
111
+ </details>
104
112
  <details>
105
113
  <summary id="zonos"><strong><span style="font-size: 1.4em;"> Sinapsis Zonos</span></strong></summary>
106
114
 
@@ -139,41 +147,56 @@ cd sinapsis-speech
139
147
  > [!NOTE]
140
148
  > Agent configuration can be changed through the `AGENT_CONFIG_PATH` env var. You can check the available configurations in each package configs folder.
141
149
 
142
-
143
150
  <details>
144
151
  <summary id="docker"><strong><span style="font-size: 1.4em;">🐳 Docker</span></strong></summary>
145
152
 
146
153
  **IMPORTANT**: This Docker image depends on the `sinapsis-nvidia:base` image. For detailed instructions, please refer to the [Sinapsis README](https://github.com/Sinapsis-ai/sinapsis?tab=readme-ov-file#docker).
147
154
 
148
155
  1. **Build the sinapsis-speech image**:
156
+
149
157
  ```bash
150
158
  docker compose -f docker/compose.yaml build
151
159
  ```
152
160
 
161
+
153
162
  2. **Start the app container**:
154
- For ElevenLabs:
163
+
164
+ - For ElevenLabs:
155
165
  ```bash
156
166
  docker compose -f docker/compose_apps.yaml up -d sinapsis-elevenlabs
157
167
  ```
158
- For F5-TTS:
168
+ - For F5-TTS:
159
169
  ```bash
160
170
  docker compose -f docker/compose_apps.yaml up -d sinapsis-f5_tts
161
171
  ```
162
- For Zonos:
172
+
173
+ - For Kokoro:
174
+
175
+ ```bash
176
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
177
+ ```
178
+
179
+ - For Zonos:
163
180
  ```bash
164
181
  docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
165
182
  ```
166
183
 
167
184
  3. **Check the logs**
168
- For ElevenLabs:
185
+
186
+ - For ElevenLabs:
169
187
  ```bash
170
188
  docker logs -f sinapsis-elevenlabs
171
189
  ```
172
- For F5-TTS:
190
+ - For F5-TTS:
173
191
  ```bash
174
192
  docker logs -f sinapsis-f5tts
175
193
  ```
176
- For Zonos:
194
+ - For Kokoro:
195
+ ```bash
196
+ docker logs -f sinapsis-kokoro
197
+ ```
198
+
199
+ - For Zonos:
177
200
  ```bash
178
201
  docker logs -f sinapsis-zonos
179
202
  ```
@@ -204,18 +227,26 @@ uv sync --frozen
204
227
  uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
205
228
  ```
206
229
 
230
+
231
+
207
232
  3. **Run the webapp**:
208
- For ElevenLabs:
233
+
234
+ - For ElevenLabs:
209
235
  ```bash
210
- uv run webapps/elevenlabs/elevenlabs_tts_app.py
236
+ uv run webapps/generic_tts_apps/elevenlabs_tts_app.py
211
237
  ```
212
- For F5-TTS:
238
+ - For F5-TTS:
239
+ ```bash
240
+ uv run webapps/packet_tts_apps/f5_tts_app.py
241
+ ```
242
+
243
+ - For Kokoro:
213
244
  ```bash
214
- uv run webapps/f5-tts/f5_tts_app.py
245
+ uv run webapps/packet_tts_apps/kokoro_tts_app.py
215
246
  ```
216
- For Zonos:
247
+ - For Zonos:
217
248
  ```bash
218
- uv run webapps/zonos/zonos_tts_app.py
249
+ uv run webapps/generic_tts_apps/zonos_tts_app.py
219
250
  ```
220
251
  4. **The terminal will display the URL to access the webapp (e.g.)**:
221
252
  ```bash
@@ -12,11 +12,11 @@ from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
12
12
  from elevenlabs.types import OutputFormat
13
13
  from pydantic import Field
14
14
  from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
15
- from sinapsis_core.template_base.template import (
16
- Template,
15
+ from sinapsis_core.template_base.base_models import (
17
16
  TemplateAttributes,
18
17
  TemplateAttributeType,
19
18
  )
19
+ from sinapsis_core.template_base.template import Template
20
20
  from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
21
21
 
22
22
  from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
@@ -12,7 +12,8 @@ from sinapsis_core.data_containers.data_packet import (
12
12
  AudioPacket,
13
13
  DataContainer,
14
14
  )
15
- from sinapsis_core.template_base import Template, TemplateAttributes
15
+ from sinapsis_core.template_base import Template
16
+ from sinapsis_core.template_base.base_models import TemplateAttributes
16
17
 
17
18
 
18
19
  @dataclass
@@ -0,0 +1,74 @@
1
+ # -*- coding: utf-8 -*-
2
+ from typing import Literal
3
+
4
+ from pydantic.dataclasses import dataclass
5
+
6
# Closed set of voice identifiers accepted by the Kokoro template's `voice`
# attribute. Identifiers follow the upstream "<language><gender>_<name>" scheme;
# the KokoroTTS template passes the first character of the selected voice to the
# pipeline as its `lang_code`, so every entry must keep that prefix intact.
kokoro_voices = Literal[
    # American English
    "af_heart",
    "af_alloy",
    "af_aoede",
    "af_bella",
    "af_jessica",
    "af_kore",
    "af_nicole",
    "af_nova",
    "af_river",
    "af_sarah",
    "af_sky",
    "am_adam",
    "am_echo",
    "am_eric",
    "am_fenrir",
    "am_liam",
    "am_michael",
    "am_onyx",
    "am_puck",
    "am_santa",
    # British English
    "bf_alice",
    "bf_emma",
    "bf_isabella",
    "bf_lily",
    "bm_daniel",
    "bm_fable",
    "bm_george",
    "bm_lewis",
    # Japanese
    "jf_alpha",
    "jf_gongitsune",
    "jf_nezumi",
    "jf_tebukuro",
    "jm_kumo",
    # Mandarin Chinese
    "zf_xiaobei",
    "zf_xiaoni",
    "zf_xiaoxiao",
    "zf_xiaoyi",
    "zm_yunjian",
    "zm_yunxi",
    "zm_yunxia",
    "zm_yunyang",
    # Spanish
    "ef_dora",
    "em_alex",
    "em_santa",
    # French
    "ff_siwis",
    # Hindi
    "hf_alpha",
    "hf_beta",
    "hm_omega",
    "hm_psi",
    # Italian
    "if_sara",
    "im_nicola",
    # Brazilian Portuguese
    "pf_dora",
    "pm_alex",
    "pm_santa",
]
61
+
62
+
63
@dataclass(frozen=True)
class KokoroKeys:
    """
    Immutable constants shared by the Kokoro text-to-speech code.

    Attributes:
        repo_id: Identifier of the model repository handed to the Kokoro pipeline.
        default_voice: Voice identifier used when no voice is configured explicitly.
    """

    repo_id: Literal["hexgrad/Kokoro-82M"] = "hexgrad/Kokoro-82M"
    default_voice: Literal["af_heart"] = "af_heart"
@@ -0,0 +1,20 @@
1
+ # -*- coding: utf-8 -*-
2
+ import importlib
3
+ from typing import Callable
4
+
5
# Dotted path of the package whose templates are exposed lazily.
_root_lib_path = "sinapsis_kokoro.templates"

# Public template name -> module that defines it. The module is imported only
# when the name is first requested through the module-level `__getattr__` below.
_template_lookup = {
    "KokoroTTS": f"{_root_lib_path}.kokoro_tts",
}


def __getattr__(name: str) -> Callable:
    """
    Resolve a template class on first attribute access (module-level `__getattr__`).

    Args:
        name (str): Attribute requested on this package.

    Returns:
        Callable: The object called `name` taken from its defining module.

    Raises:
        AttributeError: If `name` is not a registered template.
    """
    module_path = _template_lookup.get(name)
    if module_path is None:
        raise AttributeError(f"template `{name}` not found in {_root_lib_path}")

    return getattr(importlib.import_module(module_path), name)
18
+
19
+
20
+ __all__ = list(_template_lookup.keys())
@@ -0,0 +1,149 @@
1
+ # -*- coding: utf-8 -*-
2
+ from typing import Generator
3
+ from urllib.error import HTTPError
4
+
5
+ import torch
6
+ from kokoro import KPipeline
7
+ from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
8
+ from sinapsis_core.template_base.base_models import (
9
+ TemplateAttributes,
10
+ TemplateAttributeType,
11
+ )
12
+ from sinapsis_core.template_base.template import Template
13
+ from sinapsis_core.utils.logging_utils import make_loguru
14
+
15
+ from sinapsis_kokoro.helpers.kokoro_utils import KokoroKeys, kokoro_voices
16
+
17
+
18
class KokoroTTS(Template):
    """
    Template for text-to-speech (TTS) synthesis using the Kokoro 82M v1.0 model.
    This class handles the initialization of the TTS pipeline, speech generation,
    and packaging the output audio in the desired format.

    Usage example:

    agent:
      name: my_test_agent
    templates:
    - template_name: InputTemplate
      class_name: InputTemplate
      attributes: {}
    - template_name: KokoroTTS
      class_name: KokoroTTS
      template_input: InputTemplate
      attributes:
        speed: 1
        voice: af_heart
    """

    # Sample rate (Hz) stamped on every generated AudioPacket.
    SAMPLE_RATE = 24000

    class AttributesBaseModel(TemplateAttributes):
        r"""
        Configuration attributes for the Kokoro TTS model.

        Args:
            speed (int | float): The speed at which the speech will be generated. Default is 1 (normal speed).
            split_pattern (str): The regular expression pattern used to split the input text into smaller chunks.
                Default is r"\n+" (split on newlines).
            voice (kokoro_voices): The voice model to use for speech synthesis. Default is "af_heart".

        Notes:
            The list of languages and voices supported by Kokoro can be found at:
            https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
        """

        speed: int | float = 1
        split_pattern: str = r"\n+"
        voice: kokoro_voices = KokoroKeys.default_voice

    def __init__(self, attributes: TemplateAttributeType) -> None:
        """Initializes the Kokoro TTS pipeline with the provided attributes."""
        super().__init__(attributes)
        self.pipeline = self.init_pipeline()
        self.logger = make_loguru()

    def init_pipeline(self) -> KPipeline:
        """
        Initializes the Kokoro TTS pipeline with the voice model and repository id.

        Returns:
            KPipeline: The initialized TTS pipeline for generating speech.
        """
        # The first character of a voice identifier is its language code
        # (e.g. "af_heart" -> "a"), which is what the pipeline is keyed on.
        language_code = self.attributes.voice[0]
        return KPipeline(lang_code=language_code, repo_id=KokoroKeys.repo_id)

    def _create_audio_packet(
        self,
        audio_data: torch.Tensor,
        sample_rate: int,
        container: DataContainer,
    ) -> None:
        """
        Creates an audio packet from the generated audio data and adds it to the container.

        Args:
            audio_data (torch.Tensor): The generated audio data (raw audio).
            sample_rate (int): The sample rate of the generated audio, in Hz.
            container (DataContainer): The container to which the audio packet will be added.
        """
        audio_packet = AudioPacket(
            content=audio_data,
            source=self.instance_name,
            sample_rate=sample_rate,
        )
        container.audios.append(audio_packet)

    def _process_audio_chunks(self, generator: Generator, container: DataContainer) -> None:
        """
        Processes the audio chunks generated by the pipeline and creates audio packets.

        Each chunk that carries audio becomes one AudioPacket; chunks without
        audio are logged and skipped.

        Args:
            generator (Generator): The generator that yields text, phonemes, and audio data.
            container (DataContainer): The container that receives the audio packets.
        """
        for index, (graphemes, phonemes, audio) in enumerate(generator):
            self.logger.debug(f"Index: {index}")
            self.logger.debug(f"Text: {graphemes}")
            self.logger.debug(f"Phonemes: {phonemes}")
            if audio is None:
                self.logger.warning(f"Audio is None for index {index}")
                continue
            self._create_audio_packet(audio, self.SAMPLE_RATE, container)

    def generate_speech(self, container: DataContainer) -> None:
        """
        Generates speech from the input text in the provided data container.

        The content of all text packets is concatenated (without a separator)
        and handed to the pipeline in a single call.

        Args:
            container (DataContainer): The container holding the input text data to be converted into speech.
        """
        input_text = "".join(t.content for t in container.texts)
        generator = self.pipeline(
            input_text,
            voice=self.attributes.voice,
            speed=self.attributes.speed,
            split_pattern=self.attributes.split_pattern,
        )

        # The pipeline result is consumed lazily, so failures surface while the
        # chunks are being iterated; that is why the iteration is what is guarded.
        try:
            self._process_audio_chunks(generator, container)
        except HTTPError as e:
            self.logger.error(f"Unable to generate speech: {e}")

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Processes the input data and generates the corresponding speech output.

        Args:
            container (DataContainer): The container holding the input text data.

        Returns:
            DataContainer: The updated container with the generated audio. The
                container is returned unchanged when it holds no text packets.
        """

        if not container.texts:
            self.logger.debug("No query to enter")
            return container

        self.generate_speech(container)

        return container