sinapsis-speech 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ # -*- coding: utf-8 -*-
2
+ from typing import Literal
3
+
4
+ from pydantic.dataclasses import dataclass
5
+
6
# Closed set of voice names accepted by the Kokoro TTS template's `voice`
# attribute. The upstream catalogue is published at
# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
#
# The first character of a voice name is what KokoroTTS passes to the pipeline
# as `lang_code`, so every entry must keep its two-letter prefix intact.
#
# "af_jessica" and "af_kore" are two separate voices; they were previously
# fused into the single invalid entry "af_jessicaaf_kore".
kokoro_voices = Literal[
    "af_heart",
    "af_alloy",
    "af_aoede",
    "af_bella",
    "af_jessica",
    "af_kore",
    "af_nicole",
    "af_nova",
    "af_river",
    "af_sarah",
    "af_sky",
    "am_adam",
    "am_echo",
    "am_eric",
    "am_fenrir",
    "am_liam",
    "am_michael",
    "am_onyx",
    "am_puck",
    "am_santa",
    "bf_alice",
    "bf_emma",
    "bf_isabella",
    "bf_lily",
    "bm_daniel",
    "bm_fable",
    "bm_george",
    "bm_lewis",
    "jf_alpha",
    "jf_gongitsune",
    "jf_nezumi",
    "jf_tebukuro",
    "jm_kumo",
    "zf_xiaobei",
    "zf_xiaoni",
    "zf_xiaoxiao",
    "zf_xiaoyi",
    "zm_yunjian",
    "zm_yunxi",
    "zm_yunxia",
    "zm_yunyang",
    "ef_dora",
    "em_alex",
    "em_santa",
    "ff_siwis",
    "hf_alpha",
    "hf_beta",
    "hm_omega",
    "hm_psi",
    "if_sara",
    "im_nicola",
    "pf_dora",
    "pm_alex",
    "pm_santa",
]
61
+
62
+
63
@dataclass(frozen=True)
class KokoroKeys:
    """
    Immutable constants shared by the Kokoro text-to-speech (TTS) template.

    Attributes:
        repo_id: Identifier of the Kokoro model repository
            ("hexgrad/Kokoro-82M"); the KokoroTTS template passes it to the
            pipeline constructor as `repo_id`.
        default_voice: Voice name used as the default value of the template's
            `voice` attribute.
    """

    repo_id: Literal["hexgrad/Kokoro-82M"] = "hexgrad/Kokoro-82M"
    default_voice: Literal["af_heart"] = "af_heart"
@@ -0,0 +1,20 @@
1
+ # -*- coding: utf-8 -*-
2
+ import importlib
3
+ from typing import Callable
4
+
5
# Dotted path of the package that contains the template modules.
_root_lib_path = "sinapsis_kokoro.templates"

# Maps each public template name to the module that defines it, so the module
# is only imported the first time the template is actually requested.
_template_lookup = {"KokoroTTS": f"{_root_lib_path}.kokoro_tts"}


def __getattr__(name: str) -> Callable:
    """
    Lazily resolve a template by name when it is accessed on this package.

    Args:
        name (str): Attribute name being looked up on the package.

    Returns:
        Callable: The object called `name` taken from the module registered
        for it in `_template_lookup`.

    Raises:
        AttributeError: If `name` is not a registered template.
    """
    module_path = _template_lookup.get(name)
    if module_path is None:
        raise AttributeError(f"template `{name}` not found in {_root_lib_path}")

    # Import happens here, on first access, rather than at package import time.
    return getattr(importlib.import_module(module_path), name)


# Export exactly the registered template names.
__all__ = [*_template_lookup]
@@ -0,0 +1,149 @@
1
+ # -*- coding: utf-8 -*-
2
+ from typing import Generator
3
+ from urllib.error import HTTPError
4
+
5
+ import torch
6
+ from kokoro import KPipeline
7
+ from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
8
+ from sinapsis_core.template_base.template import (
9
+ Template,
10
+ TemplateAttributes,
11
+ TemplateAttributeType,
12
+ )
13
+ from sinapsis_core.utils.logging_utils import make_loguru
14
+
15
+ from sinapsis_kokoro.helpers.kokoro_utils import KokoroKeys, kokoro_voices
16
+
17
+
18
class KokoroTTS(Template):
    """
    Template for text-to-speech (TTS) synthesis using the Kokoro 82M v1.0 model.
    This class handles the initialization of the TTS pipeline, speech generation,
    and packaging the output audio in the desired format.

    Usage example:

    agent:
      name: my_test_agent
    templates:
    - template_name: InputTemplate
      class_name: InputTemplate
      attributes: {}
    - template_name: KokoroTTS
      class_name: KokoroTTS
      template_input: InputTemplate
      attributes:
        speed: 1
        voice: af_heart
    """

    class AttributesBaseModel(TemplateAttributes):
        # Raw docstring so the regex example below is shown literally instead of
        # being turned into a real newline character.
        r"""
        Configuration attributes for the Kokoro TTS model.

        Args:
            speed (int | float): The speed at which the speech will be generated. Default is 1 (normal speed).
            split_pattern (str): The regular expression pattern used to split the input text into smaller chunks.
                Default is r"\n+" (split on newlines).
            voice (kokoro_voices): The voice model to use for speech synthesis. Default is "af_heart".

        Notes:
            The list of languages and voices supported by Kokoro can be found at:
            https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
        """

        speed: int | float = 1
        split_pattern: str = r"\n+"
        voice: kokoro_voices = KokoroKeys.default_voice

    def __init__(self, attributes: TemplateAttributeType) -> None:
        """Initializes the Kokoro TTS pipeline with the provided attributes."""
        super().__init__(attributes)
        self.pipeline = self.init_pipeline()
        self.logger = make_loguru()

    def init_pipeline(self) -> KPipeline:
        """
        Initializes the Kokoro TTS pipeline with the voice model and repository id.

        Returns:
            KPipeline: The initialized TTS pipeline for generating speech.
        """
        # The language code handed to the pipeline is the first character of the
        # configured voice name (e.g. "a" for "af_heart").
        return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id)

    def _create_audio_packet(
        self,
        audio_data: torch.Tensor,
        sample_rate: int,
        container: DataContainer,
    ) -> None:
        """
        Creates an audio packet from the generated audio data and adds it to the container.

        Args:
            audio_data (torch.Tensor): The generated audio data (raw audio).
            sample_rate (int): The sample rate of the generated audio (typically 24000 Hz).
            container (DataContainer): The container to which the audio packet will be added.
        """
        audio_packet = AudioPacket(
            content=audio_data,
            source=self.instance_name,
            sample_rate=sample_rate,
        )
        container.audios.append(audio_packet)

    def _process_audio_chunks(self, generator: Generator, container: DataContainer) -> None:
        """
        Processes the audio chunks generated by the pipeline and creates audio packets.

        One audio packet is appended to the container per chunk that carries audio;
        chunks without audio are logged and skipped.

        Args:
            generator: The generator that yields text, phonemes, and audio data.
            container (DataContainer): The container holding the input data.
        """
        for i, (gs, ps, audio) in enumerate(generator):
            self.logger.debug(f"Index: {i}")
            self.logger.debug(f"Text: {gs}")
            self.logger.debug(f"Phonemes: {ps}")
            if audio is not None:
                # Every packet is tagged with a fixed 24000 Hz sample rate.
                self._create_audio_packet(audio, 24000, container)
            else:
                self.logger.warning(f"Audio is None for index {i}")

    def generate_speech(self, container: DataContainer) -> None:
        """
        Generates speech from the input text in the provided data container.

        An HTTPError raised while the chunks are being produced is logged and
        swallowed, leaving the container with whatever audio was added so far.

        Args:
            container (DataContainer): The container holding the input text data to be converted into speech.
        """
        # NOTE(review): text packets are concatenated with no separator, so the
        # last word of one packet runs into the first word of the next. Confirm
        # this is intended before changing it.
        input_text = "".join(t.content for t in container.texts)
        generator = self.pipeline(
            input_text,
            voice=self.attributes.voice,
            speed=self.attributes.speed,
            split_pattern=self.attributes.split_pattern,
        )

        try:
            self._process_audio_chunks(generator, container)
        except HTTPError as e:
            self.logger.error(f"Unable to generate speech: {e}")

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Processes the input data and generates the corresponding speech output.

        Args:
            container (DataContainer): The container holding the input text data.

        Returns:
            DataContainer: The updated container with the generated audio, or the
            unchanged container when it holds no text packets.
        """

        if not container.texts:
            self.logger.debug("No query to enter")
            return container

        self.generate_speech(container)

        return container
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sinapsis-speech
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: Generate speech using various libraries.
5
5
  Author-email: SinapsisAI <dev@sinapsis-ai.com>
6
6
  Project-URL: Homepage, https://sinapsis.tech
@@ -15,6 +15,7 @@ Requires-Dist: sinapsis>=0.2.2
15
15
  Provides-Extra: all
16
16
  Requires-Dist: sinapsis-elevenlabs[all]; extra == "all"
17
17
  Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
18
+ Requires-Dist: sinapsis-kokoro[all]; extra == "all"
18
19
  Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
19
20
  Requires-Dist: sinapsis-zonos[all]; extra == "all"
20
21
  Provides-Extra: gradio-app
@@ -54,6 +55,7 @@ This repo includes packages for performing speech synthesis using different tool
54
55
 
55
56
  * <code>sinapsis-elevenlabs</code>
56
57
  * <code>sinapsis-f5-tts</code>
58
+ * <code>sinapsis-kokoro</code>
57
59
  * <code>sinapsis-zonos</code>
58
60
 
59
61
  Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
@@ -123,7 +125,14 @@ This package provides a template for seamlessly integrating, configuring, and ru
123
125
  For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_f5_tts/README.md).
124
126
 
125
127
  </details>
128
+ <details>
129
+ <summary id="kokoro"><strong><span style="font-size: 1.4em;"> Sinapsis Kokoro</span></strong></summary>
130
+
131
+ This package provides a single template for integrating, configuring, and running text-to-speech (TTS) synthesis using the [Kokoro 82M v1.0](https://huggingface.co/hexgrad/Kokoro-82M) model.
126
132
 
133
+ KokoroTTS: Converts text to speech using the Kokoro TTS model. The template processes text packets from the input container, generates corresponding audio using Kokoro, and adds the resulting audio packets to the container.
134
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_kokoro/README.md).
135
+ </details>
127
136
  <details>
128
137
  <summary id="zonos"><strong><span style="font-size: 1.4em;"> Sinapsis Zonos</span></strong></summary>
129
138
 
@@ -162,41 +171,56 @@ cd sinapsis-speech
162
171
  > [!NOTE]
163
172
  > Agent configuration can be changed through the `AGENT_CONFIG_PATH` env var. You can check the available configurations in each package configs folder.
164
173
 
165
-
166
174
  <details>
167
175
  <summary id="docker"><strong><span style="font-size: 1.4em;">🐳 Docker</span></strong></summary>
168
176
 
169
177
  **IMPORTANT**: This Docker image depends on the `sinapsis-nvidia:base` image. For detailed instructions, please refer to the [Sinapsis README](https://github.com/Sinapsis-ai/sinapsis?tab=readme-ov-file#docker).
170
178
 
171
179
  1. **Build the sinapsis-speech image**:
180
+
172
181
  ```bash
173
182
  docker compose -f docker/compose.yaml build
174
183
  ```
175
184
 
185
+
176
186
  2. **Start the app container**:
177
- For ElevenLabs:
187
+
188
+ - For ElevenLabs:
178
189
  ```bash
179
190
  docker compose -f docker/compose_apps.yaml up -d sinapsis-elevenlabs
180
191
  ```
181
- For F5-TTS:
192
+ - For F5-TTS:
182
193
  ```bash
183
194
  docker compose -f docker/compose_apps.yaml up -d sinapsis-f5_tts
184
195
  ```
185
- For Zonos:
196
+
197
+ - For Kokoro:
198
+
199
+ ```bash
200
+ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
201
+ ```
202
+
203
+ - For Zonos:
186
204
  ```bash
187
205
  docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
188
206
  ```
189
207
 
190
208
  3. **Check the logs**
191
- For ElevenLabs:
209
+
210
+ - For ElevenLabs:
192
211
  ```bash
193
212
  docker logs -f sinapsis-elevenlabs
194
213
  ```
195
- For F5-TTS:
214
+ - For F5-TTS:
196
215
  ```bash
197
216
  docker logs -f sinapsis-f5tts
198
217
  ```
199
- For Zonos:
218
+ - For Kokoro:
219
+ ```bash
220
+ docker logs -f sinapsis-kokoro
221
+ ```
222
+
223
+ - For Zonos:
200
224
  ```bash
201
225
  docker logs -f sinapsis-zonos
202
226
  ```
@@ -227,18 +251,26 @@ uv sync --frozen
227
251
  uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
228
252
  ```
229
253
 
254
+
255
+
230
256
  3. **Run the webapp**:
231
- For ElevenLabs:
257
+
258
+ - For ElevenLabs:
232
259
  ```bash
233
- uv run webapps/elevenlabs/elevenlabs_tts_app.py
260
+ uv run webapps/generic_tts_apps/elevenlabs_tts_app.py
234
261
  ```
235
- For F5-TTS:
262
+ - For F5-TTS:
263
+ ```bash
264
+ uv run webapps/packet_tts_apps/f5_tts_app.py
265
+ ```
266
+
267
+ - For Kokoro:
236
268
  ```bash
237
- uv run webapps/f5-tts/f5_tts_app.py
269
+ uv run webapps/packet_tts_apps/kokoro_tts_app.py
238
270
  ```
239
- For Zonos:
271
+ - For Zonos:
240
272
  ```bash
241
- uv run webapps/zonos/zonos_tts_app.py
273
+ uv run webapps/generic_tts_apps/zonos_tts_app.py
242
274
  ```
243
275
  4. **The terminal will display the URL to access the webapp (e.g.)**:
244
276
  ```bash
@@ -9,14 +9,17 @@ sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generatio
9
9
  sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py,sha256=28BOPAr9GG1jYcrXi45ZWO1n2FAZJOdDcmRkOXdEYmk,496
11
11
  sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py,sha256=7EBxw-tRthbPDz0zFopaLdBhv7DXwxyMGXam6F1MwGs,15802
12
- sinapsis_speech-0.2.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
12
+ sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py,sha256=2IMJuwURPKK7keIkgS-rpGD28REG5M1FwW0COGcm3nI,1573
13
+ sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py,sha256=aX25GCUNGzIBeY5kifomsB-nSzW-unfq0-aC2Rpnaws,485
14
+ sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py,sha256=17fAmVD-uLaM6zZHdBXjLcKEJbe5s0uDV9IYtmjC57Q,5259
15
+ sinapsis_speech-0.3.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
13
16
  sinapsis_zonos/src/sinapsis_zonos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
17
  sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
18
  sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py,sha256=m1GdOYfzP73JGmtxH30mNiqbNkzFsQl9o2QaT7QxSVU,2470
16
19
  sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py,sha256=8Tr2YgxjBfRqv_Hf6sw36X2pLzW7fdQWqa6QPBxNZK8,6419
17
20
  sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py,sha256=A-_F0K3hbEFqeWWAh4YftgU9CFX-WHrauSiCAww9yp8,482
18
21
  sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py,sha256=KsNuT8cFTTjTEqjfEWsIr4B-DjGhVacSw2SdPckuFvk,7507
19
- sinapsis_speech-0.2.2.dist-info/METADATA,sha256=dHZvwWrOxQAlvOYlqM96pazsQfT-Byw_EVDAU0innXc,8968
20
- sinapsis_speech-0.2.2.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
21
- sinapsis_speech-0.2.2.dist-info/top_level.txt,sha256=vQFjL84TMSRld2lKvEVMUNyY2b3AVluCT1Ijws7o7_c,51
22
- sinapsis_speech-0.2.2.dist-info/RECORD,,
22
+ sinapsis_speech-0.3.0.dist-info/METADATA,sha256=9fQtDUnhPIesfZg-FF8Rk6074yGwG0WHZDgNjrsGa24,10032
23
+ sinapsis_speech-0.3.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
24
+ sinapsis_speech-0.3.0.dist-info/top_level.txt,sha256=dd-bGAKXxelJCHcNxFZM4OTJ2mylgM2astOGPpj91yo,67
25
+ sinapsis_speech-0.3.0.dist-info/RECORD,,
@@ -1,3 +1,4 @@
1
1
  sinapsis_elevenlabs
2
2
  sinapsis_f5_tts
3
+ sinapsis_kokoro
3
4
  sinapsis_zonos