sinapsis-speech 0.4.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/PKG-INFO +31 -4
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/README.md +30 -3
- sinapsis_speech-0.4.1/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py +15 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +37 -51
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +8 -4
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +7 -4
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +7 -1
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +7 -1
- sinapsis_speech-0.4.1/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py +10 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +13 -1
- sinapsis_speech-0.4.1/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py +10 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +14 -3
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py +10 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +15 -3
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py +11 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +20 -1
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/PKG-INFO +31 -4
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/SOURCES.txt +12 -0
- sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
- sinapsis_speech-0.4.1/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py +11 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +1 -1
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +13 -13
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/pyproject.toml +3 -1
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/LICENSE +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
- {sinapsis_speech-0.4.0/packages/sinapsis_zonos/src/sinapsis_zonos → sinapsis_speech-0.4.1/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers}/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +0 -0
- {sinapsis_speech-0.4.0/packages/sinapsis_zonos/src/sinapsis_zonos/helpers → sinapsis_speech-0.4.1/packages/sinapsis_kokoro/src/sinapsis_kokoro}/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/dependency_links.txt +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/requires.txt +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/top_level.txt +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +0 -0
- {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sinapsis-speech
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Generate speech using various libraries.
|
|
5
5
|
Author-email: SinapsisAI <dev@sinapsis.tech>
|
|
6
6
|
Project-URL: Homepage, https://sinapsis.tech
|
|
@@ -35,7 +35,7 @@ Sinapsis Speech
|
|
|
35
35
|
<br>
|
|
36
36
|
</h1>
|
|
37
37
|
|
|
38
|
-
<h4 align="center">
|
|
38
|
+
<h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
|
|
39
39
|
|
|
40
40
|
<p align="center">
|
|
41
41
|
<a href="#installation">🐍 Installation</a> •
|
|
@@ -108,10 +108,14 @@ This repository is organized into modular packages, each designed for integratio
|
|
|
108
108
|
<details>
|
|
109
109
|
<summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
|
|
110
110
|
|
|
111
|
-
This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
|
|
111
|
+
This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
|
|
112
|
+
|
|
113
|
+
- **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
|
|
112
114
|
|
|
113
115
|
- **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
|
|
114
116
|
|
|
117
|
+
- **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
|
|
118
|
+
|
|
115
119
|
- **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
|
|
116
120
|
|
|
117
121
|
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
|
|
@@ -148,6 +152,30 @@ For specific instructions and further details, see the [README.md](https://githu
|
|
|
148
152
|
|
|
149
153
|
</details>
|
|
150
154
|
|
|
155
|
+
|
|
156
|
+
<details>
|
|
157
|
+
<summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
|
|
158
|
+
|
|
159
|
+
This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
|
|
160
|
+
|
|
161
|
+
- **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
|
|
162
|
+
|
|
163
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
|
|
164
|
+
|
|
165
|
+
</details>
|
|
166
|
+
|
|
167
|
+
<details>
|
|
168
|
+
<summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
|
|
169
|
+
|
|
170
|
+
This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
- **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
|
|
174
|
+
|
|
175
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
|
|
176
|
+
|
|
177
|
+
</details>
|
|
178
|
+
|
|
151
179
|
<h2 id="webapp">🌐 Webapps</h2>
|
|
152
180
|
The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
|
|
153
181
|
|
|
@@ -186,7 +214,6 @@ cd sinapsis-speech
|
|
|
186
214
|
docker compose -f docker/compose.yaml build
|
|
187
215
|
```
|
|
188
216
|
|
|
189
|
-
|
|
190
217
|
2. **Start the app container**:
|
|
191
218
|
|
|
192
219
|
- For ElevenLabs:
|
|
@@ -9,7 +9,7 @@ Sinapsis Speech
|
|
|
9
9
|
<br>
|
|
10
10
|
</h1>
|
|
11
11
|
|
|
12
|
-
<h4 align="center">
|
|
12
|
+
<h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
|
|
13
13
|
|
|
14
14
|
<p align="center">
|
|
15
15
|
<a href="#installation">🐍 Installation</a> •
|
|
@@ -82,10 +82,14 @@ This repository is organized into modular packages, each designed for integratio
|
|
|
82
82
|
<details>
|
|
83
83
|
<summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
|
|
84
84
|
|
|
85
|
-
This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
|
|
85
|
+
This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
|
|
86
|
+
|
|
87
|
+
- **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
|
|
86
88
|
|
|
87
89
|
- **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
|
|
88
90
|
|
|
91
|
+
- **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
|
|
92
|
+
|
|
89
93
|
- **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
|
|
90
94
|
|
|
91
95
|
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
|
|
@@ -122,6 +126,30 @@ For specific instructions and further details, see the [README.md](https://githu
|
|
|
122
126
|
|
|
123
127
|
</details>
|
|
124
128
|
|
|
129
|
+
|
|
130
|
+
<details>
|
|
131
|
+
<summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
|
|
132
|
+
|
|
133
|
+
This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
|
|
134
|
+
|
|
135
|
+
- **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
|
|
136
|
+
|
|
137
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
|
|
138
|
+
|
|
139
|
+
</details>
|
|
140
|
+
|
|
141
|
+
<details>
|
|
142
|
+
<summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
|
|
143
|
+
|
|
144
|
+
This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
- **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
|
|
148
|
+
|
|
149
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
|
|
150
|
+
|
|
151
|
+
</details>
|
|
152
|
+
|
|
125
153
|
<h2 id="webapp">🌐 Webapps</h2>
|
|
126
154
|
The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
|
|
127
155
|
|
|
@@ -160,7 +188,6 @@ cd sinapsis-speech
|
|
|
160
188
|
docker compose -f docker/compose.yaml build
|
|
161
189
|
```
|
|
162
190
|
|
|
163
|
-
|
|
164
191
|
2. **Start the app container**:
|
|
165
192
|
|
|
166
193
|
- For ElevenLabs:
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Tags(Enum):
|
|
6
|
+
AUDIO = "audio"
|
|
7
|
+
AUDIO_GENERATION = "audio_generation"
|
|
8
|
+
ELEVENLABS = "elevenlabs"
|
|
9
|
+
PROMPT = "prompt"
|
|
10
|
+
SPEECH = "speech"
|
|
11
|
+
SPEECH_TO_SPEECH = "speech_to_speech"
|
|
12
|
+
TEXT_TO_SPEECH = "text_to_speech"
|
|
13
|
+
VOICE_CONVERSION = "voice_conversion"
|
|
14
|
+
VOICE_CLONING = "voice_cloning"
|
|
15
|
+
VOICE_GENERATION = "voice_generation"
|
|
@@ -3,11 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
import abc
|
|
5
5
|
import os
|
|
6
|
-
import
|
|
7
|
-
from io import BytesIO
|
|
8
|
-
from typing import IO, Iterable, Iterator, Literal
|
|
6
|
+
from typing import Generator, Iterable, Iterator, Literal
|
|
9
7
|
|
|
10
|
-
|
|
8
|
+
import numpy as np
|
|
9
|
+
from elevenlabs import Voice, VoiceSettings
|
|
11
10
|
from elevenlabs.client import ElevenLabs
|
|
12
11
|
from elevenlabs.types import OutputFormat
|
|
13
12
|
from pydantic import Field
|
|
@@ -19,9 +18,11 @@ from sinapsis_core.template_base.base_models import (
|
|
|
19
18
|
UIPropertiesMetadata,
|
|
20
19
|
)
|
|
21
20
|
from sinapsis_core.template_base.template import Template
|
|
22
|
-
from sinapsis_core.utils.env_var_keys import
|
|
21
|
+
from sinapsis_core.utils.env_var_keys import WORKING_DIR
|
|
22
|
+
from sinapsis_generic_data_tools.helpers.audio_encoder import audio_bytes_to_numpy
|
|
23
23
|
|
|
24
24
|
from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
|
|
25
|
+
from sinapsis_elevenlabs.helpers.tags import Tags
|
|
25
26
|
|
|
26
27
|
RESPONSE_TYPE = Iterator[bytes] | list[bytes] | list[Iterator[bytes]] | None
|
|
27
28
|
|
|
@@ -51,8 +52,6 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
51
52
|
output_format (OutputFormat): The output audio format and quality. Options include:
|
|
52
53
|
["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
|
|
53
54
|
"mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
|
|
54
|
-
output_folder (str): The folder where generated audio files will be saved.
|
|
55
|
-
stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
|
|
56
55
|
voice (str | Voice | None): The voice to use for speech synthesis. This can be a voice ID (str),
|
|
57
56
|
a voice name (str) or an elevenlabs voice object (Voice).
|
|
58
57
|
voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
|
|
@@ -74,17 +73,20 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
74
73
|
] = "eleven_turbo_v2_5"
|
|
75
74
|
output_file_name: str | None = None
|
|
76
75
|
output_format: OutputFormat = "mp3_44100_128"
|
|
77
|
-
output_folder: str = os.path.join(
|
|
76
|
+
output_folder: str = os.path.join(WORKING_DIR, "elevenlabs", "audios")
|
|
78
77
|
stream: bool = False
|
|
79
78
|
voice: str | Voice | None = None
|
|
80
79
|
voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]
|
|
81
80
|
|
|
82
|
-
UIProperties = UIPropertiesMetadata(
|
|
81
|
+
UIProperties = UIPropertiesMetadata(
|
|
82
|
+
category="Elevenlabs",
|
|
83
|
+
output_type=OutputTypes.AUDIO,
|
|
84
|
+
tags=[Tags.AUDIO, Tags.ELEVENLABS, Tags.SPEECH],
|
|
85
|
+
)
|
|
83
86
|
|
|
84
87
|
def __init__(self, attributes: TemplateAttributeType) -> None:
|
|
85
88
|
"""Initializes the ElevenLabs API client with the given attributes."""
|
|
86
89
|
super().__init__(attributes)
|
|
87
|
-
os.makedirs(self.attributes.output_folder, exist_ok=True)
|
|
88
90
|
self.client = self.init_elevenlabs_client()
|
|
89
91
|
|
|
90
92
|
def init_elevenlabs_client(self) -> ElevenLabs:
|
|
@@ -92,44 +94,27 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
92
94
|
key = self.attributes.api_key if self.attributes.api_key else ELEVENLABS_API_KEY
|
|
93
95
|
return ElevenLabs(api_key=key)
|
|
94
96
|
|
|
95
|
-
def reset_state(self) -> None:
|
|
97
|
+
def reset_state(self, template_name: str | None = None) -> None:
|
|
96
98
|
"""Resets state of model"""
|
|
99
|
+
_ = template_name
|
|
97
100
|
self.client = self.init_elevenlabs_client()
|
|
98
101
|
|
|
99
102
|
@abc.abstractmethod
|
|
100
103
|
def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
|
|
101
104
|
"""Abstract method for ElevenLabs speech synthesis."""
|
|
102
105
|
|
|
103
|
-
def
|
|
104
|
-
"""Saves the audio to a file and returns the file path."""
|
|
105
|
-
if self.attributes.output_file_name:
|
|
106
|
-
file_name = self.attributes.output_file_name + "_" + str(idx)
|
|
107
|
-
else:
|
|
108
|
-
file_name = uuid.uuid4()
|
|
109
|
-
|
|
110
|
-
output_file = os.path.join(self.attributes.output_folder, f"{file_name}.{file_format}")
|
|
111
|
-
try:
|
|
112
|
-
save(response, output_file)
|
|
113
|
-
self.logger.info(f"Audio saved to: {output_file}")
|
|
114
|
-
return output_file
|
|
115
|
-
except OSError as e:
|
|
116
|
-
self.logger.error(f"File system error while saving speech to file: {e}")
|
|
117
|
-
raise
|
|
118
|
-
|
|
119
|
-
def _generate_audio_stream(self, response: Iterable | bytes) -> IO[bytes]:
|
|
106
|
+
def _generate_audio_stream(self, response: Iterable | bytes) -> bytes:
|
|
120
107
|
"""Generates and returns the audio stream."""
|
|
121
|
-
|
|
108
|
+
|
|
122
109
|
try:
|
|
123
110
|
if isinstance(response, Iterator):
|
|
124
|
-
for chunk in response
|
|
125
|
-
if chunk:
|
|
126
|
-
audio_stream.write(chunk)
|
|
111
|
+
audio_stream = b"".join(chunk for chunk in response)
|
|
127
112
|
elif isinstance(response, bytes):
|
|
128
|
-
audio_stream
|
|
113
|
+
audio_stream = response
|
|
114
|
+
|
|
129
115
|
else:
|
|
130
116
|
raise TypeError(f"Unsupported response type: {type(response)}")
|
|
131
117
|
|
|
132
|
-
audio_stream.seek(0)
|
|
133
118
|
self.logger.info("Returning audio stream")
|
|
134
119
|
return audio_stream
|
|
135
120
|
except IOError as e:
|
|
@@ -139,14 +124,15 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
139
124
|
self.logger.error(f"Value error while processing audio chunks: {e}")
|
|
140
125
|
raise
|
|
141
126
|
|
|
142
|
-
def _process_audio_output(self,
|
|
127
|
+
def _process_audio_output(self, response: Iterable | bytes) -> tuple[np.ndarray, int]:
|
|
143
128
|
"""Processes a single audio output (either stream or file)."""
|
|
144
|
-
if self.attributes.stream:
|
|
145
|
-
return self._generate_audio_stream(response)
|
|
146
|
-
file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
|
|
147
|
-
return self._save_audio(response, file_format, idx)
|
|
148
129
|
|
|
149
|
-
|
|
130
|
+
result = self._generate_audio_stream(response)
|
|
131
|
+
audio_np, sample_rate = audio_bytes_to_numpy(result)
|
|
132
|
+
|
|
133
|
+
return audio_np, sample_rate
|
|
134
|
+
|
|
135
|
+
def generate_speech(self, input_data: list[Packet]) -> list[tuple] | None:
|
|
150
136
|
"""Generates speech and saves it to a file."""
|
|
151
137
|
responses: RESPONSE_TYPE = self.synthesize_speech(input_data)
|
|
152
138
|
if not responses:
|
|
@@ -154,29 +140,29 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
154
140
|
|
|
155
141
|
if isinstance(responses, Iterator):
|
|
156
142
|
responses = [responses]
|
|
157
|
-
|
|
158
|
-
|
|
143
|
+
elif isinstance(responses, Generator):
|
|
144
|
+
responses = list(responses)
|
|
145
|
+
audio_outputs = [self._process_audio_output(response) for response in responses]
|
|
159
146
|
return audio_outputs
|
|
160
147
|
|
|
161
|
-
def _handle_streaming_output(self, audio_outputs: list[
|
|
148
|
+
def _handle_streaming_output(self, audio_outputs: list[tuple]) -> list[AudioPacket]:
|
|
162
149
|
"""Handles audio stream output by adding it to the container as AudioPackets."""
|
|
163
150
|
generated_audios: list[AudioPacket] = []
|
|
164
|
-
sample_rate = int(self.attributes.output_format.split("_")[1])
|
|
151
|
+
# sample_rate = int(self.attributes.output_format.split("_")[1])
|
|
165
152
|
for audio_output in audio_outputs:
|
|
153
|
+
audio = audio_output[0]
|
|
154
|
+
sample_rate = audio_output[1]
|
|
166
155
|
audio_packet = AudioPacket(
|
|
167
|
-
content=
|
|
156
|
+
content=audio,
|
|
168
157
|
sample_rate=sample_rate,
|
|
169
158
|
)
|
|
170
159
|
generated_audios.append(audio_packet)
|
|
171
160
|
return generated_audios
|
|
172
161
|
|
|
173
|
-
def _handle_audio_outputs(self, audio_outputs: list[
|
|
162
|
+
def _handle_audio_outputs(self, audio_outputs: list[tuple], container: DataContainer) -> None:
|
|
174
163
|
"""Handles the audio outputs by appending to the container based on the output type (stream or file)."""
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
container.audios.extend(self._handle_streaming_output(audio_outputs))
|
|
178
|
-
else:
|
|
179
|
-
self._set_generic_data(container, audio_outputs)
|
|
164
|
+
container.audios = container.audios or []
|
|
165
|
+
container.audios = self._handle_streaming_output(audio_outputs)
|
|
180
166
|
|
|
181
167
|
def execute(self, container: DataContainer) -> DataContainer:
|
|
182
168
|
"""
|
|
@@ -5,9 +5,13 @@ from typing import Callable, Iterator, Literal
|
|
|
5
5
|
|
|
6
6
|
from sinapsis_core.data_containers.data_packet import AudioPacket
|
|
7
7
|
|
|
8
|
+
from sinapsis_elevenlabs.helpers.tags import Tags
|
|
8
9
|
from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id
|
|
9
10
|
from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
|
|
10
11
|
|
|
12
|
+
ElevenLabsSTSUIProperties = ElevenLabsBase.UIProperties
|
|
13
|
+
ElevenLabsSTSUIProperties.tags.extend([Tags.SPEECH_TO_SPEECH, Tags.VOICE_CONVERSION])
|
|
14
|
+
|
|
11
15
|
|
|
12
16
|
class ElevenLabsSTS(ElevenLabsBase):
|
|
13
17
|
"""Template to interact with the ElevenLabs Speech-to-Speech API.
|
|
@@ -31,7 +35,7 @@ class ElevenLabsSTS(ElevenLabsBase):
|
|
|
31
35
|
model: eleven_multilingual_sts_v2
|
|
32
36
|
output_file_name: null
|
|
33
37
|
output_format: mp3_44100_128
|
|
34
|
-
output_folder:
|
|
38
|
+
output_folder: <WORKING_DIR>/elevenlabs/audios
|
|
35
39
|
stream: false
|
|
36
40
|
voice: null
|
|
37
41
|
voice_settings:
|
|
@@ -45,6 +49,7 @@ class ElevenLabsSTS(ElevenLabsBase):
|
|
|
45
49
|
"""
|
|
46
50
|
|
|
47
51
|
PACKET_TYPE_NAME: str = "audios"
|
|
52
|
+
UIProperties = ElevenLabsSTSUIProperties
|
|
48
53
|
|
|
49
54
|
class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
|
|
50
55
|
"""Attributes specific to ElevenLabs STS API interaction.
|
|
@@ -73,9 +78,8 @@ class ElevenLabsSTS(ElevenLabsBase):
|
|
|
73
78
|
KeyError: If the expected key is missing in the API response.
|
|
74
79
|
"""
|
|
75
80
|
try:
|
|
76
|
-
method: Callable[..., Iterator[bytes]] = (
|
|
77
|
-
|
|
78
|
-
)
|
|
81
|
+
method: Callable[..., Iterator[bytes]] = self.client.speech_to_speech.stream # (
|
|
82
|
+
|
|
79
83
|
return method(
|
|
80
84
|
voice_id=get_voice_id(self.client, voice=self.attributes.voice),
|
|
81
85
|
audio=input_data[0].content,
|
|
@@ -5,6 +5,7 @@ from typing import Callable, Iterator, Literal
|
|
|
5
5
|
|
|
6
6
|
from sinapsis_core.data_containers.data_packet import TextPacket
|
|
7
7
|
|
|
8
|
+
from sinapsis_elevenlabs.helpers.tags import Tags
|
|
8
9
|
from sinapsis_elevenlabs.helpers.voice_utils import (
|
|
9
10
|
create_voice_settings,
|
|
10
11
|
get_voice_id,
|
|
@@ -12,6 +13,9 @@ from sinapsis_elevenlabs.helpers.voice_utils import (
|
|
|
12
13
|
)
|
|
13
14
|
from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
|
|
14
15
|
|
|
16
|
+
ElevenLabsTTSUIProperties = ElevenLabsBase.UIProperties
|
|
17
|
+
ElevenLabsTTSUIProperties.tags.extend([Tags.TEXT_TO_SPEECH])
|
|
18
|
+
|
|
15
19
|
|
|
16
20
|
class ElevenLabsTTS(ElevenLabsBase):
|
|
17
21
|
"""Template to interact with ElevenLabs text-to-speech API.
|
|
@@ -36,7 +40,7 @@ class ElevenLabsTTS(ElevenLabsBase):
|
|
|
36
40
|
voice_settings: null
|
|
37
41
|
model: eleven_turbo_v2_5
|
|
38
42
|
output_format: mp3_44100_128
|
|
39
|
-
output_folder:
|
|
43
|
+
output_folder: <WORKING_DIR>/elevenlabs/audios
|
|
40
44
|
stream: false
|
|
41
45
|
|
|
42
46
|
"""
|
|
@@ -65,9 +69,8 @@ class ElevenLabsTTS(ElevenLabsBase):
|
|
|
65
69
|
"""
|
|
66
70
|
input_text: str = load_input_text(input_data)
|
|
67
71
|
try:
|
|
68
|
-
method: Callable[..., Iterator[bytes]] =
|
|
69
|
-
|
|
70
|
-
)
|
|
72
|
+
method: Callable[..., Iterator[bytes]] = self.client.text_to_speech.stream
|
|
73
|
+
|
|
71
74
|
return method(
|
|
72
75
|
text=input_text,
|
|
73
76
|
voice_id=get_voice_id(self.client, self.attributes.voice),
|
|
@@ -4,8 +4,12 @@
|
|
|
4
4
|
from elevenlabs import Voice
|
|
5
5
|
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
|
|
6
6
|
|
|
7
|
+
from sinapsis_elevenlabs.helpers.tags import Tags
|
|
7
8
|
from sinapsis_elevenlabs.templates.elevenlabs_tts import ElevenLabsTTS
|
|
8
9
|
|
|
10
|
+
ElevenLabsVoiceCloneUIProperties = ElevenLabsTTS.UIProperties
|
|
11
|
+
ElevenLabsVoiceCloneUIProperties.tags.extend([Tags.VOICE_CLONING])
|
|
12
|
+
|
|
9
13
|
|
|
10
14
|
class ElevenLabsVoiceClone(ElevenLabsTTS):
|
|
11
15
|
"""Template to clone a voice using the ElevenLabs API.
|
|
@@ -30,7 +34,7 @@ class ElevenLabsVoiceClone(ElevenLabsTTS):
|
|
|
30
34
|
model: eleven_turbo_v2_5
|
|
31
35
|
output_file_name: null
|
|
32
36
|
output_format: mp3_44100_128
|
|
33
|
-
output_folder:
|
|
37
|
+
output_folder: <WORKING_DIR>/elevenlabs/audios
|
|
34
38
|
stream: false
|
|
35
39
|
voice: null
|
|
36
40
|
voice_settings:
|
|
@@ -45,6 +49,8 @@ class ElevenLabsVoiceClone(ElevenLabsTTS):
|
|
|
45
49
|
|
|
46
50
|
"""
|
|
47
51
|
|
|
52
|
+
UIProperties = ElevenLabsVoiceCloneUIProperties
|
|
53
|
+
|
|
48
54
|
class AttributesBaseModel(ElevenLabsTTS.AttributesBaseModel):
|
|
49
55
|
"""Attributes specific to the ElevenLabsVoiceClone class.
|
|
50
56
|
|
|
@@ -5,9 +5,13 @@ import base64
|
|
|
5
5
|
|
|
6
6
|
from sinapsis_core.data_containers.data_packet import TextPacket
|
|
7
7
|
|
|
8
|
+
from sinapsis_elevenlabs.helpers.tags import Tags
|
|
8
9
|
from sinapsis_elevenlabs.helpers.voice_utils import load_input_text
|
|
9
10
|
from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
|
|
10
11
|
|
|
12
|
+
ElevenLabsVoiceGenerationUIProperties = ElevenLabsBase.UIProperties
|
|
13
|
+
ElevenLabsVoiceGenerationUIProperties.tags.extend([Tags.VOICE_GENERATION, Tags.PROMPT])
|
|
14
|
+
|
|
11
15
|
|
|
12
16
|
class ElevenLabsVoiceGeneration(ElevenLabsBase):
|
|
13
17
|
"""
|
|
@@ -33,12 +37,14 @@ class ElevenLabsVoiceGeneration(ElevenLabsBase):
|
|
|
33
37
|
voice_settings: null
|
|
34
38
|
model: eleven_turbo_v2_5
|
|
35
39
|
output_format: mp3_44100_128
|
|
36
|
-
output_folder:
|
|
40
|
+
output_folder: <WORKING_DIR>/elevenlabs/audios
|
|
37
41
|
stream: false
|
|
38
42
|
voice_description: An old British male with a raspy, deep voice. Professional,
|
|
39
43
|
relaxed and assertive
|
|
40
44
|
"""
|
|
41
45
|
|
|
46
|
+
UIProperties = ElevenLabsVoiceGenerationUIProperties
|
|
47
|
+
|
|
42
48
|
class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
|
|
43
49
|
"""
|
|
44
50
|
Attributes for voice generation in ElevenLabs API.
|
|
@@ -6,6 +6,7 @@ from typing import Any, Literal
|
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import soundfile as sf
|
|
9
|
+
import torch
|
|
9
10
|
from pydantic import Field
|
|
10
11
|
from pydantic.dataclasses import dataclass
|
|
11
12
|
from sinapsis_core.data_containers.data_packet import (
|
|
@@ -15,6 +16,8 @@ from sinapsis_core.data_containers.data_packet import (
|
|
|
15
16
|
from sinapsis_core.template_base import Template
|
|
16
17
|
from sinapsis_core.template_base.base_models import OutputTypes, TemplateAttributes, UIPropertiesMetadata
|
|
17
18
|
|
|
19
|
+
from sinapsis_f5_tts.helpers.tags import Tags
|
|
20
|
+
|
|
18
21
|
|
|
19
22
|
@dataclass
|
|
20
23
|
class F5CliKeys:
|
|
@@ -146,7 +149,11 @@ class F5TTSInference(Template):
|
|
|
146
149
|
"""
|
|
147
150
|
|
|
148
151
|
AttributesBaseModel = F5TTSInferenceAttributes
|
|
149
|
-
UIProperties = UIPropertiesMetadata(
|
|
152
|
+
UIProperties = UIPropertiesMetadata(
|
|
153
|
+
category="F5TTS",
|
|
154
|
+
output_type=OutputTypes.AUDIO,
|
|
155
|
+
tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.F5TTS, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
|
|
156
|
+
)
|
|
150
157
|
|
|
151
158
|
def _add_attribute_to_command(self, cli_command: list[str], field_name: str, field: Any) -> None:
|
|
152
159
|
"""
|
|
@@ -357,3 +364,8 @@ class F5TTSInference(Template):
|
|
|
357
364
|
)
|
|
358
365
|
|
|
359
366
|
return container
|
|
367
|
+
|
|
368
|
+
def reset_state(self, template_name: str | None = None) -> None:
|
|
369
|
+
if "cuda" in self.attributes.device:
|
|
370
|
+
torch.cuda.empty_cache()
|
|
371
|
+
super().reset_state(template_name)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
|
-
from typing import Generator
|
|
2
|
+
from typing import Generator, Literal
|
|
3
3
|
from urllib.error import HTTPError
|
|
4
4
|
|
|
5
5
|
import torch
|
|
@@ -15,6 +15,7 @@ from sinapsis_core.template_base.template import Template
|
|
|
15
15
|
from sinapsis_core.utils.logging_utils import make_loguru
|
|
16
16
|
|
|
17
17
|
from sinapsis_kokoro.helpers.kokoro_utils import KokoroKeys, kokoro_voices
|
|
18
|
+
from sinapsis_kokoro.helpers.tags import Tags
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
class KokoroTTS(Template):
|
|
@@ -39,7 +40,11 @@ class KokoroTTS(Template):
|
|
|
39
40
|
voice: af_heart
|
|
40
41
|
"""
|
|
41
42
|
|
|
42
|
-
UIProperties = UIPropertiesMetadata(
|
|
43
|
+
UIProperties = UIPropertiesMetadata(
|
|
44
|
+
category="Kokoro",
|
|
45
|
+
output_type=OutputTypes.AUDIO,
|
|
46
|
+
tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.KOKORO, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
|
|
47
|
+
)
|
|
43
48
|
|
|
44
49
|
class AttributesBaseModel(TemplateAttributes):
|
|
45
50
|
"""
|
|
@@ -56,6 +61,7 @@ class KokoroTTS(Template):
|
|
|
56
61
|
https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
|
|
57
62
|
"""
|
|
58
63
|
|
|
64
|
+
device: Literal["cpu", "cuda"] = "cpu"
|
|
59
65
|
speed: int | float = 1
|
|
60
66
|
split_pattern: str = r"\n+"
|
|
61
67
|
voice: kokoro_voices = KokoroKeys.default_voice
|
|
@@ -73,7 +79,7 @@ class KokoroTTS(Template):
|
|
|
73
79
|
Returns:
|
|
74
80
|
KPipeline: The initialized TTS pipeline for generating speech.
|
|
75
81
|
"""
|
|
76
|
-
return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id)
|
|
82
|
+
return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id, device=self.attributes.device)
|
|
77
83
|
|
|
78
84
|
def _create_audio_packet(
|
|
79
85
|
self,
|
|
@@ -151,3 +157,8 @@ class KokoroTTS(Template):
|
|
|
151
157
|
self.generate_speech(container)
|
|
152
158
|
|
|
153
159
|
return container
|
|
160
|
+
|
|
161
|
+
def reset_state(self, template_name: str | None = None) -> None:
|
|
162
|
+
if "cuda" in self.attributes.device:
|
|
163
|
+
torch.cuda.empty_cache()
|
|
164
|
+
super().reset_state(template_name)
|
|
File without changes
|
sinapsis_speech-0.4.1/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py
ADDED
|
File without changes
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
|
+
import torch
|
|
4
5
|
from llama_cpp import Llama
|
|
5
6
|
from orpheus_cpp import OrpheusCpp
|
|
6
7
|
from orpheus_cpp.model import TTSOptions
|
|
@@ -18,6 +19,7 @@ from sinapsis_core.template_base.base_models import (
|
|
|
18
19
|
)
|
|
19
20
|
from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
|
|
20
21
|
|
|
22
|
+
from sinapsis_orpheus_cpp.helpers.tags import Tags
|
|
21
23
|
from sinapsis_orpheus_cpp.thirdparty.helpers import download_model, setup_snac_session
|
|
22
24
|
|
|
23
25
|
|
|
@@ -129,7 +131,11 @@ class OrpheusTTS(Template):
|
|
|
129
131
|
"""
|
|
130
132
|
|
|
131
133
|
AttributesBaseModel = OrpheusTTSAttributes
|
|
132
|
-
UIProperties = UIPropertiesMetadata(
|
|
134
|
+
UIProperties = UIPropertiesMetadata(
|
|
135
|
+
category="TTS",
|
|
136
|
+
output_type=OutputTypes.AUDIO,
|
|
137
|
+
tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.ORPHEUS_CPP, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
|
|
138
|
+
)
|
|
133
139
|
|
|
134
140
|
def __init__(self, attributes: TemplateAttributeType) -> None:
|
|
135
141
|
super().__init__(attributes)
|
|
@@ -154,8 +160,9 @@ class OrpheusTTS(Template):
|
|
|
154
160
|
model_variant=self.attributes.model_variant,
|
|
155
161
|
cache_dir=self.attributes.cache_dir,
|
|
156
162
|
)
|
|
157
|
-
|
|
158
|
-
|
|
163
|
+
if model_file:
|
|
164
|
+
self._setup_llm(model_file)
|
|
165
|
+
self._setup_snac_session()
|
|
159
166
|
|
|
160
167
|
def _setup_llm(self, model_file: str) -> None:
|
|
161
168
|
"""Setup the Large Language Model component with specified parameters.
|
|
@@ -298,3 +305,8 @@ class OrpheusTTS(Template):
|
|
|
298
305
|
container.audios.append(audio_packet)
|
|
299
306
|
|
|
300
307
|
return container
|
|
308
|
+
|
|
309
|
+
def reset_state(self, template_name: str | None = None) -> None:
|
|
310
|
+
if torch.cuda.is_available():
|
|
311
|
+
torch.cuda.empty_cache()
|
|
312
|
+
super().reset_state(template_name)
|
|
File without changes
|
sinapsis_speech-0.4.1/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py
ADDED
|
File without changes
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
from typing import Any, Literal
|
|
4
4
|
|
|
5
5
|
import nemo.collections.asr as nemo_asr
|
|
6
|
+
import torch
|
|
6
7
|
from sinapsis_core.data_containers.data_packet import (
|
|
7
8
|
AudioPacket,
|
|
8
9
|
DataContainer,
|
|
@@ -15,6 +16,8 @@ from sinapsis_core.template_base.base_models import (
|
|
|
15
16
|
)
|
|
16
17
|
from sinapsis_core.template_base.template import Template
|
|
17
18
|
|
|
19
|
+
from sinapsis_parakeet_tdt.helpers.tags import Tags
|
|
20
|
+
|
|
18
21
|
|
|
19
22
|
class ParakeetTDTInferenceAttributes(TemplateAttributes):
|
|
20
23
|
"""
|
|
@@ -68,7 +71,18 @@ class ParakeetTDTInference(Template):
|
|
|
68
71
|
refresh_cache: False
|
|
69
72
|
"""
|
|
70
73
|
|
|
71
|
-
UIProperties = UIPropertiesMetadata(
|
|
74
|
+
UIProperties = UIPropertiesMetadata(
|
|
75
|
+
category="Parakeet TDT",
|
|
76
|
+
output_type=OutputTypes.TEXT,
|
|
77
|
+
tags=[
|
|
78
|
+
Tags.AUDIO,
|
|
79
|
+
Tags.SPEECH,
|
|
80
|
+
Tags.PARAKEET_TDT,
|
|
81
|
+
Tags.SPEECH_RECOGNITION,
|
|
82
|
+
Tags.SPEECH_TO_TEXT,
|
|
83
|
+
Tags.TRANSCRIPTION,
|
|
84
|
+
],
|
|
85
|
+
)
|
|
72
86
|
|
|
73
87
|
AttributesBaseModel = ParakeetTDTInferenceAttributes
|
|
74
88
|
|
|
@@ -268,3 +282,8 @@ class ParakeetTDTInference(Template):
|
|
|
268
282
|
container.texts.extend(text_packets)
|
|
269
283
|
|
|
270
284
|
return container
|
|
285
|
+
|
|
286
|
+
def reset_state(self, template_name: str | None = None) -> None:
|
|
287
|
+
if "cuda" in self.attributes.device:
|
|
288
|
+
torch.cuda.empty_cache()
|
|
289
|
+
super().reset_state(template_name)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sinapsis-speech
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Generate speech using various libraries.
|
|
5
5
|
Author-email: SinapsisAI <dev@sinapsis.tech>
|
|
6
6
|
Project-URL: Homepage, https://sinapsis.tech
|
|
@@ -35,7 +35,7 @@ Sinapsis Speech
|
|
|
35
35
|
<br>
|
|
36
36
|
</h1>
|
|
37
37
|
|
|
38
|
-
<h4 align="center">
|
|
38
|
+
<h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
|
|
39
39
|
|
|
40
40
|
<p align="center">
|
|
41
41
|
<a href="#installation">🐍 Installation</a> •
|
|
@@ -108,10 +108,14 @@ This repository is organized into modular packages, each designed for integratio
|
|
|
108
108
|
<details>
|
|
109
109
|
<summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
|
|
110
110
|
|
|
111
|
-
This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
|
|
111
|
+
This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
|
|
112
|
+
|
|
113
|
+
- **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
|
|
112
114
|
|
|
113
115
|
- **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
|
|
114
116
|
|
|
117
|
+
- **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
|
|
118
|
+
|
|
115
119
|
- **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
|
|
116
120
|
|
|
117
121
|
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
|
|
@@ -148,6 +152,30 @@ For specific instructions and further details, see the [README.md](https://githu
|
|
|
148
152
|
|
|
149
153
|
</details>
|
|
150
154
|
|
|
155
|
+
|
|
156
|
+
<details>
|
|
157
|
+
<summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
|
|
158
|
+
|
|
159
|
+
This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
|
|
160
|
+
|
|
161
|
+
- **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
|
|
162
|
+
|
|
163
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
|
|
164
|
+
|
|
165
|
+
</details>
|
|
166
|
+
|
|
167
|
+
<details>
|
|
168
|
+
<summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
|
|
169
|
+
|
|
170
|
+
This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
- **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
|
|
174
|
+
|
|
175
|
+
For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
|
|
176
|
+
|
|
177
|
+
</details>
|
|
178
|
+
|
|
151
179
|
<h2 id="webapp">🌐 Webapps</h2>
|
|
152
180
|
The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
|
|
153
181
|
|
|
@@ -186,7 +214,6 @@ cd sinapsis-speech
|
|
|
186
214
|
docker compose -f docker/compose.yaml build
|
|
187
215
|
```
|
|
188
216
|
|
|
189
|
-
|
|
190
217
|
2. **Start the app container**:
|
|
191
218
|
|
|
192
219
|
- For ElevenLabs:
|
{sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/SOURCES.txt
RENAMED
|
@@ -4,6 +4,7 @@ pyproject.toml
|
|
|
4
4
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py
|
|
5
5
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py
|
|
6
6
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py
|
|
7
|
+
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py
|
|
7
8
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py
|
|
8
9
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py
|
|
9
10
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py
|
|
@@ -12,14 +13,24 @@ packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py
|
|
|
12
13
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py
|
|
13
14
|
packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py
|
|
14
15
|
packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py
|
|
16
|
+
packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/__init__.py
|
|
17
|
+
packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py
|
|
15
18
|
packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py
|
|
16
19
|
packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py
|
|
20
|
+
packages/sinapsis_kokoro/src/sinapsis_kokoro/__init__.py
|
|
17
21
|
packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py
|
|
22
|
+
packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py
|
|
18
23
|
packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py
|
|
19
24
|
packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py
|
|
25
|
+
packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py
|
|
26
|
+
packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py
|
|
27
|
+
packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py
|
|
20
28
|
packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py
|
|
21
29
|
packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py
|
|
22
30
|
packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py
|
|
31
|
+
packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py
|
|
32
|
+
packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py
|
|
33
|
+
packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py
|
|
23
34
|
packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py
|
|
24
35
|
packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py
|
|
25
36
|
packages/sinapsis_speech.egg-info/PKG-INFO
|
|
@@ -29,6 +40,7 @@ packages/sinapsis_speech.egg-info/requires.txt
|
|
|
29
40
|
packages/sinapsis_speech.egg-info/top_level.txt
|
|
30
41
|
packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py
|
|
31
42
|
packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py
|
|
43
|
+
packages/sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py
|
|
32
44
|
packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py
|
|
33
45
|
packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py
|
|
34
46
|
packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py
|
|
File without changes
|
|
File without changes
|
|
@@ -3,7 +3,7 @@ from typing import Set
|
|
|
3
3
|
|
|
4
4
|
import torch
|
|
5
5
|
import torchaudio
|
|
6
|
-
from sinapsis_core.template_base.
|
|
6
|
+
from sinapsis_core.template_base.base_models import TemplateAttributeType
|
|
7
7
|
from sinapsis_core.utils.logging_utils import sinapsis_logger
|
|
8
8
|
from zonos.conditioning import make_cond_dict, supported_language_codes
|
|
9
9
|
from zonos.model import Zonos
|
|
@@ -1,14 +1,11 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
"""Base template for Zonos speech synthesis"""
|
|
3
3
|
|
|
4
|
-
import os
|
|
5
|
-
import uuid
|
|
6
4
|
from typing import Literal, Set
|
|
7
5
|
|
|
8
6
|
import torch
|
|
9
|
-
import torchaudio
|
|
10
7
|
from pydantic import Field
|
|
11
|
-
from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket
|
|
8
|
+
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, TextPacket
|
|
12
9
|
from sinapsis_core.template_base.base_models import (
|
|
13
10
|
OutputTypes,
|
|
14
11
|
TemplateAttributes,
|
|
@@ -16,10 +13,10 @@ from sinapsis_core.template_base.base_models import (
|
|
|
16
13
|
UIPropertiesMetadata,
|
|
17
14
|
)
|
|
18
15
|
from sinapsis_core.template_base.template import Template
|
|
19
|
-
from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
|
|
20
16
|
from zonos.model import Zonos
|
|
21
17
|
from zonos.utils import DEFAULT_DEVICE as device
|
|
22
18
|
|
|
19
|
+
from sinapsis_zonos.helpers.tags import Tags
|
|
23
20
|
from sinapsis_zonos.helpers.zonos_keys import EmotionsConfig, SamplingParams, TTSKeys
|
|
24
21
|
from sinapsis_zonos.helpers.zonos_tts_utils import (
|
|
25
22
|
get_audio_prefix_codes,
|
|
@@ -38,7 +35,11 @@ class ZonosTTS(Template):
|
|
|
38
35
|
and fine control over various speech attributes like pitch, speaking rate, and emotions.
|
|
39
36
|
"""
|
|
40
37
|
|
|
41
|
-
UIProperties = UIPropertiesMetadata(
|
|
38
|
+
UIProperties = UIPropertiesMetadata(
|
|
39
|
+
category="Zonos",
|
|
40
|
+
output_type=OutputTypes.AUDIO,
|
|
41
|
+
tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.ZONOS, Tags.SPEECH, Tags.TEXT_TO_SPEECH, Tags.VOICE_CLONING],
|
|
42
|
+
)
|
|
42
43
|
|
|
43
44
|
class AttributesBaseModel(TemplateAttributes):
|
|
44
45
|
"""
|
|
@@ -71,7 +72,7 @@ class ZonosTTS(Template):
|
|
|
71
72
|
fmax: float = 22050.0
|
|
72
73
|
language: str = TTSKeys.en_language
|
|
73
74
|
model: Literal["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"] = "Zyphra/Zonos-v0.1-transformer"
|
|
74
|
-
output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "zonos", "audios")
|
|
75
|
+
# output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "zonos", "audios")
|
|
75
76
|
pitch_std: float = 20.0
|
|
76
77
|
prefix_audio: str | None = None
|
|
77
78
|
randomized_seed: bool = True
|
|
@@ -85,7 +86,7 @@ class ZonosTTS(Template):
|
|
|
85
86
|
def __init__(self, attributes: TemplateAttributeType) -> None:
|
|
86
87
|
"""Initializes the Zonos model with the provided attributes."""
|
|
87
88
|
super().__init__(attributes)
|
|
88
|
-
os.makedirs(self.attributes.output_folder, exist_ok=True)
|
|
89
|
+
# os.makedirs(self.attributes.output_folder, exist_ok=True)
|
|
89
90
|
self.device = device
|
|
90
91
|
self.model = self._init_model()
|
|
91
92
|
init_seed(self.attributes)
|
|
@@ -112,8 +113,9 @@ class ZonosTTS(Template):
|
|
|
112
113
|
del self.model
|
|
113
114
|
torch.cuda.empty_cache()
|
|
114
115
|
|
|
115
|
-
def reset_state(self) -> None:
|
|
116
|
+
def reset_state(self, template_name: str | None = None) -> None:
|
|
116
117
|
"""Reinitialize the model and random seed."""
|
|
118
|
+
_ = template_name
|
|
117
119
|
self._del_model()
|
|
118
120
|
self.model = self._init_model()
|
|
119
121
|
init_seed(self.attributes)
|
|
@@ -154,10 +156,8 @@ class ZonosTTS(Template):
|
|
|
154
156
|
output_audio (torch.Tensor): The generated audio output tensor.
|
|
155
157
|
container (DataContainer): The container to store metadata.
|
|
156
158
|
"""
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
self._set_generic_data(container, [output_path])
|
|
160
|
-
self.logger.debug(f"Audio saved to: {output_path}")
|
|
159
|
+
audio_np = output_audio[0].cpu().numpy()
|
|
160
|
+
container.audios.append(AudioPacket(content=audio_np, sample_rate=self.model.autoencoder.sampling_rate))
|
|
161
161
|
|
|
162
162
|
def execute(self, container: DataContainer) -> DataContainer:
|
|
163
163
|
"""Processes the input data and generates a speech output."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "sinapsis-speech"
|
|
3
|
-
version = "0.4.0"
|
|
3
|
+
version = "0.4.1"
|
|
4
4
|
description = "Generate speech using various libraries."
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "SinapsisAI", email = "dev@sinapsis.tech"},
|
|
@@ -28,6 +28,7 @@ all = [
|
|
|
28
28
|
"sinapsis-zonos[all]",
|
|
29
29
|
"sinapsis-parakeet-tdt[all]",
|
|
30
30
|
"sinapsis-orpheus-cpp[all]",
|
|
31
|
+
|
|
31
32
|
]
|
|
32
33
|
gradio-app = [
|
|
33
34
|
"sinapsis[webapp]>=0.2.3",
|
|
@@ -50,6 +51,7 @@ sinapsis-zonos = { workspace = true }
|
|
|
50
51
|
sinapsis-speech = { workspace = true }
|
|
51
52
|
sinapsis-parakeet-tdt = { workspace = true }
|
|
52
53
|
sinapsis-orpheus-cpp = { workspace = true }
|
|
54
|
+
sinapsis-chatterbox = { workspace = true }
|
|
53
55
|
|
|
54
56
|
|
|
55
57
|
[[tool.uv.index]]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/requires.txt
RENAMED
|
File without changes
|
{sinapsis_speech-0.4.0 → sinapsis_speech-0.4.1}/packages/sinapsis_speech.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|