sinapsis-speech 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {sinapsis_speech-0.4.0/packages/sinapsis_speech.egg-info → sinapsis_speech-0.4.2}/PKG-INFO +31 -4
  2. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/README.md +30 -3
  3. sinapsis_speech-0.4.2/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py +15 -0
  4. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +37 -51
  5. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +8 -4
  6. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +7 -4
  7. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +7 -1
  8. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +7 -1
  9. sinapsis_speech-0.4.2/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py +10 -0
  10. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +13 -1
  11. sinapsis_speech-0.4.2/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py +10 -0
  12. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +14 -3
  13. sinapsis_speech-0.4.2/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py +0 -0
  14. sinapsis_speech-0.4.2/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py +0 -0
  15. sinapsis_speech-0.4.2/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py +10 -0
  16. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +15 -3
  17. sinapsis_speech-0.4.2/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py +0 -0
  18. sinapsis_speech-0.4.2/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py +0 -0
  19. sinapsis_speech-0.4.2/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py +11 -0
  20. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +20 -1
  21. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2/packages/sinapsis_speech.egg-info}/PKG-INFO +31 -4
  22. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_speech.egg-info/SOURCES.txt +12 -0
  23. sinapsis_speech-0.4.2/packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
  24. sinapsis_speech-0.4.2/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
  25. sinapsis_speech-0.4.2/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py +11 -0
  26. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +1 -1
  27. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +13 -13
  28. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/pyproject.toml +3 -1
  29. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/LICENSE +0 -0
  30. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py +0 -0
  31. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py +0 -0
  32. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +0 -0
  33. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +0 -0
  34. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +0 -0
  35. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
  36. {sinapsis_speech-0.4.0/packages/sinapsis_zonos/src/sinapsis_zonos → sinapsis_speech-0.4.2/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers}/__init__.py +0 -0
  37. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +0 -0
  38. {sinapsis_speech-0.4.0/packages/sinapsis_zonos/src/sinapsis_zonos/helpers → sinapsis_speech-0.4.2/packages/sinapsis_kokoro/src/sinapsis_kokoro}/__init__.py +0 -0
  39. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py +0 -0
  40. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py +0 -0
  41. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +0 -0
  42. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +0 -0
  43. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +0 -0
  44. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_speech.egg-info/dependency_links.txt +0 -0
  45. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_speech.egg-info/requires.txt +0 -0
  46. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_speech.egg-info/top_level.txt +0 -0
  47. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +0 -0
  48. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +0 -0
  49. {sinapsis_speech-0.4.0 → sinapsis_speech-0.4.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sinapsis-speech
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Generate speech using various libraries.
5
5
  Author-email: SinapsisAI <dev@sinapsis.tech>
6
6
  Project-URL: Homepage, https://sinapsis.tech
@@ -35,7 +35,7 @@ Sinapsis Speech
35
35
  <br>
36
36
  </h1>
37
37
 
38
- <h4 align="center"> Templates for a wide range of voice generation tasks.</h4>
38
+ <h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
39
39
 
40
40
  <p align="center">
41
41
  <a href="#installation">🐍 Installation</a> •
@@ -108,10 +108,14 @@ This repository is organized into modular packages, each designed for integratio
108
108
  <details>
109
109
  <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
110
110
 
111
- This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
111
+ This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
112
+
113
+ - **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
112
114
 
113
115
  - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
114
116
 
117
+ - **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
118
+
115
119
  - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
116
120
 
117
121
  For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
@@ -148,6 +152,30 @@ For specific instructions and further details, see the [README.md](https://githu
148
152
 
149
153
  </details>
150
154
 
155
+
156
+ <details>
157
+ <summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
158
+
159
+ This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
160
+
161
+ - **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
162
+
163
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
164
+
165
+ </details>
166
+
167
+ <details>
168
+ <summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
169
+
170
+ This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
171
+
172
+
173
+ - **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
174
+
175
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
176
+
177
+ </details>
178
+
151
179
  <h2 id="webapp">🌐 Webapps</h2>
152
180
  The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
153
181
 
@@ -186,7 +214,6 @@ cd sinapsis-speech
186
214
  docker compose -f docker/compose.yaml build
187
215
  ```
188
216
 
189
-
190
217
  2. **Start the app container**:
191
218
 
192
219
  - For ElevenLabs:
@@ -9,7 +9,7 @@ Sinapsis Speech
9
9
  <br>
10
10
  </h1>
11
11
 
12
- <h4 align="center"> Templates for a wide range of voice generation tasks.</h4>
12
+ <h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
13
13
 
14
14
  <p align="center">
15
15
  <a href="#installation">🐍 Installation</a> •
@@ -82,10 +82,14 @@ This repository is organized into modular packages, each designed for integratio
82
82
  <details>
83
83
  <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
84
84
 
85
- This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
85
+ This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
86
+
87
+ - **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
86
88
 
87
89
  - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
88
90
 
91
+ - **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
92
+
89
93
  - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
90
94
 
91
95
  For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
@@ -122,6 +126,30 @@ For specific instructions and further details, see the [README.md](https://githu
122
126
 
123
127
  </details>
124
128
 
129
+
130
+ <details>
131
+ <summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
132
+
133
+ This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
134
+
135
+ - **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
136
+
137
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
138
+
139
+ </details>
140
+
141
+ <details>
142
+ <summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
143
+
144
+ This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
145
+
146
+
147
+ - **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
148
+
149
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
150
+
151
+ </details>
152
+
125
153
  <h2 id="webapp">🌐 Webapps</h2>
126
154
  The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
127
155
 
@@ -160,7 +188,6 @@ cd sinapsis-speech
160
188
  docker compose -f docker/compose.yaml build
161
189
  ```
162
190
 
163
-
164
191
  2. **Start the app container**:
165
192
 
166
193
  - For ElevenLabs:
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ from enum import Enum
3
+
4
+
5
+ class Tags(Enum):
6
+ AUDIO = "audio"
7
+ AUDIO_GENERATION = "audio_generation"
8
+ ELEVENLABS = "elevenlabs"
9
+ PROMPT = "prompt"
10
+ SPEECH = "speech"
11
+ SPEECH_TO_SPEECH = "speech_to_speech"
12
+ TEXT_TO_SPEECH = "text_to_speech"
13
+ VOICE_CONVERSION = "voice_conversion"
14
+ VOICE_CLONING = "voice_cloning"
15
+ VOICE_GENERATION = "voice_generation"
@@ -3,11 +3,10 @@
3
3
 
4
4
  import abc
5
5
  import os
6
- import uuid
7
- from io import BytesIO
8
- from typing import IO, Iterable, Iterator, Literal
6
+ from typing import Generator, Iterable, Iterator, Literal
9
7
 
10
- from elevenlabs import Voice, VoiceSettings, save
8
+ import numpy as np
9
+ from elevenlabs import Voice, VoiceSettings
11
10
  from elevenlabs.client import ElevenLabs
12
11
  from elevenlabs.types import OutputFormat
13
12
  from pydantic import Field
@@ -19,9 +18,11 @@ from sinapsis_core.template_base.base_models import (
19
18
  UIPropertiesMetadata,
20
19
  )
21
20
  from sinapsis_core.template_base.template import Template
22
- from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
21
+ from sinapsis_core.utils.env_var_keys import WORKING_DIR
22
+ from sinapsis_generic_data_tools.helpers.audio_encoder import audio_bytes_to_numpy
23
23
 
24
24
  from sinapsis_elevenlabs.helpers.env_var_keys import ELEVENLABS_API_KEY
25
+ from sinapsis_elevenlabs.helpers.tags import Tags
25
26
 
26
27
  RESPONSE_TYPE = Iterator[bytes] | list[bytes] | list[Iterator[bytes]] | None
27
28
 
@@ -51,8 +52,6 @@ class ElevenLabsBase(Template, abc.ABC):
51
52
  output_format (OutputFormat): The output audio format and quality. Options include:
52
53
  ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
53
54
  "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
54
- output_folder (str): The folder where generated audio files will be saved.
55
- stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
56
55
  voice (str | Voice | None): The voice to use for speech synthesis. This can be a voice ID (str),
57
56
  a voice name (str) or an elevenlabs voice object (Voice).
58
57
  voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
@@ -74,17 +73,20 @@ class ElevenLabsBase(Template, abc.ABC):
74
73
  ] = "eleven_turbo_v2_5"
75
74
  output_file_name: str | None = None
76
75
  output_format: OutputFormat = "mp3_44100_128"
77
- output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
76
+ output_folder: str = os.path.join(WORKING_DIR, "elevenlabs", "audios")
78
77
  stream: bool = False
79
78
  voice: str | Voice | None = None
80
79
  voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]
81
80
 
82
- UIProperties = UIPropertiesMetadata(category="Elevenlabs", output_type=OutputTypes.AUDIO)
81
+ UIProperties = UIPropertiesMetadata(
82
+ category="Elevenlabs",
83
+ output_type=OutputTypes.AUDIO,
84
+ tags=[Tags.AUDIO, Tags.ELEVENLABS, Tags.SPEECH],
85
+ )
83
86
 
84
87
  def __init__(self, attributes: TemplateAttributeType) -> None:
85
88
  """Initializes the ElevenLabs API client with the given attributes."""
86
89
  super().__init__(attributes)
87
- os.makedirs(self.attributes.output_folder, exist_ok=True)
88
90
  self.client = self.init_elevenlabs_client()
89
91
 
90
92
  def init_elevenlabs_client(self) -> ElevenLabs:
@@ -92,44 +94,27 @@ class ElevenLabsBase(Template, abc.ABC):
92
94
  key = self.attributes.api_key if self.attributes.api_key else ELEVENLABS_API_KEY
93
95
  return ElevenLabs(api_key=key)
94
96
 
95
- def reset_state(self) -> None:
97
+ def reset_state(self, template_name: str | None = None) -> None:
96
98
  """Resets state of model"""
99
+ _ = template_name
97
100
  self.client = self.init_elevenlabs_client()
98
101
 
99
102
  @abc.abstractmethod
100
103
  def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
101
104
  """Abstract method for ElevenLabs speech synthesis."""
102
105
 
103
- def _save_audio(self, response: Iterable | bytes, file_format: str, idx: int) -> str:
104
- """Saves the audio to a file and returns the file path."""
105
- if self.attributes.output_file_name:
106
- file_name = self.attributes.output_file_name + "_" + str(idx)
107
- else:
108
- file_name = uuid.uuid4()
109
-
110
- output_file = os.path.join(self.attributes.output_folder, f"{file_name}.{file_format}")
111
- try:
112
- save(response, output_file)
113
- self.logger.info(f"Audio saved to: {output_file}")
114
- return output_file
115
- except OSError as e:
116
- self.logger.error(f"File system error while saving speech to file: {e}")
117
- raise
118
-
119
- def _generate_audio_stream(self, response: Iterable | bytes) -> IO[bytes]:
106
+ def _generate_audio_stream(self, response: Iterable | bytes) -> bytes:
120
107
  """Generates and returns the audio stream."""
121
- audio_stream = BytesIO()
108
+
122
109
  try:
123
110
  if isinstance(response, Iterator):
124
- for chunk in response:
125
- if chunk:
126
- audio_stream.write(chunk)
111
+ audio_stream = b"".join(chunk for chunk in response)
127
112
  elif isinstance(response, bytes):
128
- audio_stream.write(response)
113
+ audio_stream = response
114
+
129
115
  else:
130
116
  raise TypeError(f"Unsupported response type: {type(response)}")
131
117
 
132
- audio_stream.seek(0)
133
118
  self.logger.info("Returning audio stream")
134
119
  return audio_stream
135
120
  except IOError as e:
@@ -139,14 +124,15 @@ class ElevenLabsBase(Template, abc.ABC):
139
124
  self.logger.error(f"Value error while processing audio chunks: {e}")
140
125
  raise
141
126
 
142
- def _process_audio_output(self, idx: int, response: Iterable | bytes) -> str | IO[bytes]:
127
+ def _process_audio_output(self, response: Iterable | bytes) -> tuple[np.ndarray, int]:
143
128
  """Processes a single audio output (either stream or file)."""
144
- if self.attributes.stream:
145
- return self._generate_audio_stream(response)
146
- file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
147
- return self._save_audio(response, file_format, idx)
148
129
 
149
- def generate_speech(self, input_data: list[Packet]) -> list[str | IO[bytes]] | None:
130
+ result = self._generate_audio_stream(response)
131
+ audio_np, sample_rate = audio_bytes_to_numpy(result)
132
+
133
+ return audio_np, sample_rate
134
+
135
+ def generate_speech(self, input_data: list[Packet]) -> list[tuple] | None:
150
136
  """Generates speech and saves it to a file."""
151
137
  responses: RESPONSE_TYPE = self.synthesize_speech(input_data)
152
138
  if not responses:
@@ -154,29 +140,29 @@ class ElevenLabsBase(Template, abc.ABC):
154
140
 
155
141
  if isinstance(responses, Iterator):
156
142
  responses = [responses]
157
-
158
- audio_outputs = [self._process_audio_output(idx, response) for idx, response in enumerate(responses)]
143
+ elif isinstance(responses, Generator):
144
+ responses = list(responses)
145
+ audio_outputs = [self._process_audio_output(response) for response in responses]
159
146
  return audio_outputs
160
147
 
161
- def _handle_streaming_output(self, audio_outputs: list[str | IO[bytes]]) -> list[AudioPacket]:
148
+ def _handle_streaming_output(self, audio_outputs: list[tuple]) -> list[AudioPacket]:
162
149
  """Handles audio stream output by adding it to the container as AudioPackets."""
163
150
  generated_audios: list[AudioPacket] = []
164
- sample_rate = int(self.attributes.output_format.split("_")[1])
151
+ # sample_rate = int(self.attributes.output_format.split("_")[1])
165
152
  for audio_output in audio_outputs:
153
+ audio = audio_output[0]
154
+ sample_rate = audio_output[1]
166
155
  audio_packet = AudioPacket(
167
- content=audio_output,
156
+ content=audio,
168
157
  sample_rate=sample_rate,
169
158
  )
170
159
  generated_audios.append(audio_packet)
171
160
  return generated_audios
172
161
 
173
- def _handle_audio_outputs(self, audio_outputs: list[str | IO[bytes]], container: DataContainer) -> None:
162
+ def _handle_audio_outputs(self, audio_outputs: list[tuple], container: DataContainer) -> None:
174
163
  """Handles the audio outputs by appending to the container based on the output type (stream or file)."""
175
- if self.attributes.stream:
176
- container.audios = container.audios or []
177
- container.audios.extend(self._handle_streaming_output(audio_outputs))
178
- else:
179
- self._set_generic_data(container, audio_outputs)
164
+ container.audios = container.audios or []
165
+ container.audios = self._handle_streaming_output(audio_outputs)
180
166
 
181
167
  def execute(self, container: DataContainer) -> DataContainer:
182
168
  """
@@ -5,9 +5,13 @@ from typing import Callable, Iterator, Literal
5
5
 
6
6
  from sinapsis_core.data_containers.data_packet import AudioPacket
7
7
 
8
+ from sinapsis_elevenlabs.helpers.tags import Tags
8
9
  from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id
9
10
  from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
10
11
 
12
+ ElevenLabsSTSUIProperties = ElevenLabsBase.UIProperties
13
+ ElevenLabsSTSUIProperties.tags.extend([Tags.SPEECH_TO_SPEECH, Tags.VOICE_CONVERSION])
14
+
11
15
 
12
16
  class ElevenLabsSTS(ElevenLabsBase):
13
17
  """Template to interact with the ElevenLabs Speech-to-Speech API.
@@ -31,7 +35,7 @@ class ElevenLabsSTS(ElevenLabsBase):
31
35
  model: eleven_multilingual_sts_v2
32
36
  output_file_name: null
33
37
  output_format: mp3_44100_128
34
- output_folder: ~/.cache/sinapsis/elevenlabs/audios
38
+ output_folder: <WORKING_DIR>/elevenlabs/audios
35
39
  stream: false
36
40
  voice: null
37
41
  voice_settings:
@@ -45,6 +49,7 @@ class ElevenLabsSTS(ElevenLabsBase):
45
49
  """
46
50
 
47
51
  PACKET_TYPE_NAME: str = "audios"
52
+ UIProperties = ElevenLabsSTSUIProperties
48
53
 
49
54
  class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
50
55
  """Attributes specific to ElevenLabs STS API interaction.
@@ -73,9 +78,8 @@ class ElevenLabsSTS(ElevenLabsBase):
73
78
  KeyError: If the expected key is missing in the API response.
74
79
  """
75
80
  try:
76
- method: Callable[..., Iterator[bytes]] = (
77
- self.client.speech_to_speech.stream if self.attributes.stream else self.client.speech_to_speech.convert
78
- )
81
+ method: Callable[..., Iterator[bytes]] = self.client.speech_to_speech.stream # (
82
+
79
83
  return method(
80
84
  voice_id=get_voice_id(self.client, voice=self.attributes.voice),
81
85
  audio=input_data[0].content,
@@ -5,6 +5,7 @@ from typing import Callable, Iterator, Literal
5
5
 
6
6
  from sinapsis_core.data_containers.data_packet import TextPacket
7
7
 
8
+ from sinapsis_elevenlabs.helpers.tags import Tags
8
9
  from sinapsis_elevenlabs.helpers.voice_utils import (
9
10
  create_voice_settings,
10
11
  get_voice_id,
@@ -12,6 +13,9 @@ from sinapsis_elevenlabs.helpers.voice_utils import (
12
13
  )
13
14
  from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
14
15
 
16
+ ElevenLabsTTSUIProperties = ElevenLabsBase.UIProperties
17
+ ElevenLabsTTSUIProperties.tags.extend([Tags.TEXT_TO_SPEECH])
18
+
15
19
 
16
20
  class ElevenLabsTTS(ElevenLabsBase):
17
21
  """Template to interact with ElevenLabs text-to-speech API.
@@ -36,7 +40,7 @@ class ElevenLabsTTS(ElevenLabsBase):
36
40
  voice_settings: null
37
41
  model: eleven_turbo_v2_5
38
42
  output_format: mp3_44100_128
39
- output_folder: /sinapsis/cache/dir/elevenlabs/audios
43
+ output_folder: <WORKING_DIR>/elevenlabs/audios
40
44
  stream: false
41
45
 
42
46
  """
@@ -65,9 +69,8 @@ class ElevenLabsTTS(ElevenLabsBase):
65
69
  """
66
70
  input_text: str = load_input_text(input_data)
67
71
  try:
68
- method: Callable[..., Iterator[bytes]] = (
69
- self.client.text_to_speech.stream if self.attributes.stream else self.client.text_to_speech.convert
70
- )
72
+ method: Callable[..., Iterator[bytes]] = self.client.text_to_speech.stream
73
+
71
74
  return method(
72
75
  text=input_text,
73
76
  voice_id=get_voice_id(self.client, self.attributes.voice),
@@ -4,8 +4,12 @@
4
4
  from elevenlabs import Voice
5
5
  from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
6
6
 
7
+ from sinapsis_elevenlabs.helpers.tags import Tags
7
8
  from sinapsis_elevenlabs.templates.elevenlabs_tts import ElevenLabsTTS
8
9
 
10
+ ElevenLabsVoiceCloneUIProperties = ElevenLabsTTS.UIProperties
11
+ ElevenLabsVoiceCloneUIProperties.tags.extend([Tags.VOICE_CLONING])
12
+
9
13
 
10
14
  class ElevenLabsVoiceClone(ElevenLabsTTS):
11
15
  """Template to clone a voice using the ElevenLabs API.
@@ -30,7 +34,7 @@ class ElevenLabsVoiceClone(ElevenLabsTTS):
30
34
  model: eleven_turbo_v2_5
31
35
  output_file_name: null
32
36
  output_format: mp3_44100_128
33
- output_folder: ~/.cache/sinapsis/elevenlabs/audios
37
+ output_folder: <WORKING_DIR>/elevenlabs/audios
34
38
  stream: false
35
39
  voice: null
36
40
  voice_settings:
@@ -45,6 +49,8 @@ class ElevenLabsVoiceClone(ElevenLabsTTS):
45
49
 
46
50
  """
47
51
 
52
+ UIProperties = ElevenLabsVoiceCloneUIProperties
53
+
48
54
  class AttributesBaseModel(ElevenLabsTTS.AttributesBaseModel):
49
55
  """Attributes specific to the ElevenLabsVoiceClone class.
50
56
 
@@ -5,9 +5,13 @@ import base64
5
5
 
6
6
  from sinapsis_core.data_containers.data_packet import TextPacket
7
7
 
8
+ from sinapsis_elevenlabs.helpers.tags import Tags
8
9
  from sinapsis_elevenlabs.helpers.voice_utils import load_input_text
9
10
  from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
10
11
 
12
+ ElevenLabsVoiceGenerationUIProperties = ElevenLabsBase.UIProperties
13
+ ElevenLabsVoiceGenerationUIProperties.tags.extend([Tags.VOICE_GENERATION, Tags.PROMPT])
14
+
11
15
 
12
16
  class ElevenLabsVoiceGeneration(ElevenLabsBase):
13
17
  """
@@ -33,12 +37,14 @@ class ElevenLabsVoiceGeneration(ElevenLabsBase):
33
37
  voice_settings: null
34
38
  model: eleven_turbo_v2_5
35
39
  output_format: mp3_44100_128
36
- output_folder: /sinapsis/cache/dir/elevenlabs/audios
40
+ output_folder: <WORKING_DIR>/elevenlabs/audios
37
41
  stream: false
38
42
  voice_description: An old British male with a raspy, deep voice. Professional,
39
43
  relaxed and assertive
40
44
  """
41
45
 
46
+ UIProperties = ElevenLabsVoiceGenerationUIProperties
47
+
42
48
  class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
43
49
  """
44
50
  Attributes for voice generation in ElevenLabs API.
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ from enum import Enum
3
+
4
+
5
+ class Tags(Enum):
6
+ AUDIO = "audio"
7
+ AUDIO_GENERATION = "audio_generation"
8
+ F5TTS = "f5tts"
9
+ SPEECH = "speech"
10
+ TEXT_TO_SPEECH = "text_to_speech"
@@ -6,6 +6,7 @@ from typing import Any, Literal
6
6
 
7
7
  import numpy as np
8
8
  import soundfile as sf
9
+ import torch
9
10
  from pydantic import Field
10
11
  from pydantic.dataclasses import dataclass
11
12
  from sinapsis_core.data_containers.data_packet import (
@@ -15,6 +16,8 @@ from sinapsis_core.data_containers.data_packet import (
15
16
  from sinapsis_core.template_base import Template
16
17
  from sinapsis_core.template_base.base_models import OutputTypes, TemplateAttributes, UIPropertiesMetadata
17
18
 
19
+ from sinapsis_f5_tts.helpers.tags import Tags
20
+
18
21
 
19
22
  @dataclass
20
23
  class F5CliKeys:
@@ -146,7 +149,11 @@ class F5TTSInference(Template):
146
149
  """
147
150
 
148
151
  AttributesBaseModel = F5TTSInferenceAttributes
149
- UIProperties = UIPropertiesMetadata(category="F5TTS", output_type=OutputTypes.AUDIO)
152
+ UIProperties = UIPropertiesMetadata(
153
+ category="F5TTS",
154
+ output_type=OutputTypes.AUDIO,
155
+ tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.F5TTS, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
156
+ )
150
157
 
151
158
  def _add_attribute_to_command(self, cli_command: list[str], field_name: str, field: Any) -> None:
152
159
  """
@@ -357,3 +364,8 @@ class F5TTSInference(Template):
357
364
  )
358
365
 
359
366
  return container
367
+
368
+ def reset_state(self, template_name: str | None = None) -> None:
369
+ if torch.cuda.is_available():
370
+ torch.cuda.empty_cache()
371
+ super().reset_state(template_name)
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ from enum import Enum
3
+
4
+
5
+ class Tags(Enum):
6
+ AUDIO = "audio"
7
+ AUDIO_GENERATION = "audio_generation"
8
+ KOKORO = "kokoro"
9
+ SPEECH = "speech"
10
+ TEXT_TO_SPEECH = "text_to_speech"
@@ -1,5 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
- from typing import Generator
2
+ from typing import Generator, Literal
3
3
  from urllib.error import HTTPError
4
4
 
5
5
  import torch
@@ -15,6 +15,7 @@ from sinapsis_core.template_base.template import Template
15
15
  from sinapsis_core.utils.logging_utils import make_loguru
16
16
 
17
17
  from sinapsis_kokoro.helpers.kokoro_utils import KokoroKeys, kokoro_voices
18
+ from sinapsis_kokoro.helpers.tags import Tags
18
19
 
19
20
 
20
21
  class KokoroTTS(Template):
@@ -39,7 +40,11 @@ class KokoroTTS(Template):
39
40
  voice: af_heart
40
41
  """
41
42
 
42
- UIProperties = UIPropertiesMetadata(category="Kokoro", output_type=OutputTypes.AUDIO)
43
+ UIProperties = UIPropertiesMetadata(
44
+ category="Kokoro",
45
+ output_type=OutputTypes.AUDIO,
46
+ tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.KOKORO, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
47
+ )
43
48
 
44
49
  class AttributesBaseModel(TemplateAttributes):
45
50
  """
@@ -56,6 +61,7 @@ class KokoroTTS(Template):
56
61
  https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
57
62
  """
58
63
 
64
+ device: Literal["cpu", "cuda"] = "cpu"
59
65
  speed: int | float = 1
60
66
  split_pattern: str = r"\n+"
61
67
  voice: kokoro_voices = KokoroKeys.default_voice
@@ -73,7 +79,7 @@ class KokoroTTS(Template):
73
79
  Returns:
74
80
  KPipeline: The initialized TTS pipeline for generating speech.
75
81
  """
76
- return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id)
82
+ return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id, device=self.attributes.device)
77
83
 
78
84
  def _create_audio_packet(
79
85
  self,
@@ -151,3 +157,8 @@ class KokoroTTS(Template):
151
157
  self.generate_speech(container)
152
158
 
153
159
  return container
160
+
161
+ def reset_state(self, template_name: str | None = None) -> None:
162
+ if "cuda" in self.attributes.device:
163
+ torch.cuda.empty_cache()
164
+ super().reset_state(template_name)
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ from enum import Enum
3
+
4
+
5
+ class Tags(Enum):
6
+ AUDIO = "audio"
7
+ AUDIO_GENERATION = "audio_generation"
8
+ ORPHEUS_CPP = "orpheus_cpp"
9
+ SPEECH = "speech"
10
+ TEXT_TO_SPEECH = "text_to_speech"
@@ -1,6 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
2
 
3
3
  import numpy as np
4
+ import torch
4
5
  from llama_cpp import Llama
5
6
  from orpheus_cpp import OrpheusCpp
6
7
  from orpheus_cpp.model import TTSOptions
@@ -18,6 +19,7 @@ from sinapsis_core.template_base.base_models import (
18
19
  )
19
20
  from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
20
21
 
22
+ from sinapsis_orpheus_cpp.helpers.tags import Tags
21
23
  from sinapsis_orpheus_cpp.thirdparty.helpers import download_model, setup_snac_session
22
24
 
23
25
 
@@ -129,7 +131,11 @@ class OrpheusTTS(Template):
129
131
  """
130
132
 
131
133
  AttributesBaseModel = OrpheusTTSAttributes
132
- UIProperties = UIPropertiesMetadata(category="TTS", output_type=OutputTypes.AUDIO)
134
+ UIProperties = UIPropertiesMetadata(
135
+ category="TTS",
136
+ output_type=OutputTypes.AUDIO,
137
+ tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.ORPHEUS_CPP, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
138
+ )
133
139
 
134
140
  def __init__(self, attributes: TemplateAttributeType) -> None:
135
141
  super().__init__(attributes)
@@ -154,8 +160,9 @@ class OrpheusTTS(Template):
154
160
  model_variant=self.attributes.model_variant,
155
161
  cache_dir=self.attributes.cache_dir,
156
162
  )
157
- self._setup_llm(model_file)
158
- self._setup_snac_session()
163
+ if model_file:
164
+ self._setup_llm(model_file)
165
+ self._setup_snac_session()
159
166
 
160
167
  def _setup_llm(self, model_file: str) -> None:
161
168
  """Setup the Large Language Model component with specified parameters.
@@ -298,3 +305,8 @@ class OrpheusTTS(Template):
298
305
  container.audios.append(audio_packet)
299
306
 
300
307
  return container
308
+
309
+ def reset_state(self, template_name: str | None = None) -> None:
310
+ if torch.cuda.is_available():
311
+ torch.cuda.empty_cache()
312
+ super().reset_state(template_name)
@@ -0,0 +1,11 @@
1
+ # -*- coding: utf-8 -*-
2
+ from enum import Enum
3
+
4
+
5
+ class Tags(Enum):
6
+ AUDIO = "audio"
7
+ SPEECH = "speech"
8
+ SPEECH_RECOGNITION = "speech_recognition"
9
+ PARAKEET_TDT = "parakeet_tdt"
10
+ SPEECH_TO_TEXT = "speech_to_text"
11
+ TRANSCRIPTION = "transcription"
@@ -3,6 +3,7 @@ import os
3
3
  from typing import Any, Literal
4
4
 
5
5
  import nemo.collections.asr as nemo_asr
6
+ import torch
6
7
  from sinapsis_core.data_containers.data_packet import (
7
8
  AudioPacket,
8
9
  DataContainer,
@@ -15,6 +16,8 @@ from sinapsis_core.template_base.base_models import (
15
16
  )
16
17
  from sinapsis_core.template_base.template import Template
17
18
 
19
+ from sinapsis_parakeet_tdt.helpers.tags import Tags
20
+
18
21
 
19
22
  class ParakeetTDTInferenceAttributes(TemplateAttributes):
20
23
  """
@@ -68,7 +71,18 @@ class ParakeetTDTInference(Template):
68
71
  refresh_cache: False
69
72
  """
70
73
 
71
- UIProperties = UIPropertiesMetadata(category="Parakeet TDT", output_type=OutputTypes.TEXT)
74
+ UIProperties = UIPropertiesMetadata(
75
+ category="Parakeet TDT",
76
+ output_type=OutputTypes.TEXT,
77
+ tags=[
78
+ Tags.AUDIO,
79
+ Tags.SPEECH,
80
+ Tags.PARAKEET_TDT,
81
+ Tags.SPEECH_RECOGNITION,
82
+ Tags.SPEECH_TO_TEXT,
83
+ Tags.TRANSCRIPTION,
84
+ ],
85
+ )
72
86
 
73
87
  AttributesBaseModel = ParakeetTDTInferenceAttributes
74
88
 
@@ -268,3 +282,8 @@ class ParakeetTDTInference(Template):
268
282
  container.texts.extend(text_packets)
269
283
 
270
284
  return container
285
+
286
+ def reset_state(self, template_name: str | None = None) -> None:
287
+ if "cuda" in self.attributes.device:
288
+ torch.cuda.empty_cache()
289
+ super().reset_state(template_name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sinapsis-speech
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Generate speech using various libraries.
5
5
  Author-email: SinapsisAI <dev@sinapsis.tech>
6
6
  Project-URL: Homepage, https://sinapsis.tech
@@ -35,7 +35,7 @@ Sinapsis Speech
35
35
  <br>
36
36
  </h1>
37
37
 
38
- <h4 align="center"> Templates for a wide range of voice generation tasks.</h4>
38
+ <h4 align="center"> A monorepo housing multiple packages and templates for versatile voice generation, text-to-speech, speech-to-text, and beyond.</h4>
39
39
 
40
40
  <p align="center">
41
41
  <a href="#installation">🐍 Installation</a> •
@@ -108,10 +108,14 @@ This repository is organized into modular packages, each designed for integratio
108
108
  <details>
109
109
  <summary id="elevenlabs"><strong><span style="font-size: 1.4em;"> Sinapsis ElevenLabs </span></strong></summary>
110
110
 
111
- This package offers a suite of templates and utilities designed for effortless integrating, configuration, and execution of **text-to-speech (TTS)** and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
111
+ This package offers a suite of templates and utilities designed for effortless integration, configuration, and execution of **text-to-speech (TTS)**, **speech-to-speech (STS)**, **voice cloning**, and **voice generation** functionalities powered by [ElevenLabs](https://elevenlabs.io/).
112
+
113
+ - **ElevenLabsSTS**: Template for transforming a voice into a different character or style using the ElevenLabs Speech-to-Speech API.
112
114
 
113
115
  - **ElevenLabsTTS**: Template for converting text into speech using ElevenLabs' voice models.
114
116
 
117
+ - **ElevenLabsVoiceClone**: Template for creating a synthetic copy of an existing voice using the ElevenLabs API.
118
+
115
119
  - **ElevenLabsVoiceGeneration**: Template for generating custom synthetic voices based on user-provided descriptions.
116
120
 
117
121
  For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_elevenlabs/README.md).
@@ -148,6 +152,30 @@ For specific instructions and further details, see the [README.md](https://githu
148
152
 
149
153
  </details>
150
154
 
155
+
156
+ <details>
157
+ <summary id="orpheus-cpp"><strong><span style="font-size: 1.4em;"> Sinapsis Orpheus-CPP</span></strong></summary>
158
+
159
+ This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [Orpheus-TTS](https://github.com/canopyai/Orpheus-TTS).
160
+
161
+ - **OrpheusTTS**: Converts text to speech using the Orpheus TTS model with advanced neural voice synthesis. The template processes text packets from the input container, generates corresponding audio using Orpheus TTS, and adds the resulting audio packets to the container. Features graceful error handling for out-of-memory conditions.
162
+
163
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_orpheus_cpp/README.md).
164
+
165
+ </details>
166
+
167
+ <details>
168
+ <summary id="parakeet-tdt"><strong><span style="font-size: 1.4em;"> Sinapsis Parakeet-TDT</span></strong></summary>
169
+
170
+ This package provides a template for seamlessly integrating, configuring, and running **speech-to-text (STT)** functionalities powered by [NVIDIA's Parakeet TDT model](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2).
171
+
172
+
173
+ - **ParakeetTDTInference**: Converts speech to text using NVIDIA's Parakeet TDT 0.6B model. This template processes audio packets from the input container or specified file paths, performs transcription with optional timestamp prediction, and adds the resulting text packets to the container.
174
+
175
+ For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_parakeet_tdt/README.md).
176
+
177
+ </details>
178
+
151
179
  <h2 id="webapp">🌐 Webapps</h2>
152
180
  The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
153
181
 
@@ -186,7 +214,6 @@ cd sinapsis-speech
186
214
  docker compose -f docker/compose.yaml build
187
215
  ```
188
216
 
189
-
190
217
  2. **Start the app container**:
191
218
 
192
219
  - For ElevenLabs:
@@ -4,6 +4,7 @@ pyproject.toml
4
4
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py
5
5
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py
6
6
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py
7
+ packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py
7
8
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py
8
9
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py
9
10
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py
@@ -12,14 +13,24 @@ packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py
12
13
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py
13
14
  packages/sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py
14
15
  packages/sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py
16
+ packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/__init__.py
17
+ packages/sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py
15
18
  packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py
16
19
  packages/sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py
20
+ packages/sinapsis_kokoro/src/sinapsis_kokoro/__init__.py
17
21
  packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py
22
+ packages/sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py
18
23
  packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py
19
24
  packages/sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py
25
+ packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py
26
+ packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py
27
+ packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py
20
28
  packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py
21
29
  packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py
22
30
  packages/sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py
31
+ packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py
32
+ packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py
33
+ packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py
23
34
  packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py
24
35
  packages/sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py
25
36
  packages/sinapsis_speech.egg-info/PKG-INFO
@@ -29,6 +40,7 @@ packages/sinapsis_speech.egg-info/requires.txt
29
40
  packages/sinapsis_speech.egg-info/top_level.txt
30
41
  packages/sinapsis_zonos/src/sinapsis_zonos/__init__.py
31
42
  packages/sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py
43
+ packages/sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py
32
44
  packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py
33
45
  packages/sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py
34
46
  packages/sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py
@@ -0,0 +1,11 @@
1
+ # -*- coding: utf-8 -*-
2
+ from enum import Enum
3
+
4
+
5
+ class Tags(Enum):
6
+ AUDIO = "audio"
7
+ AUDIO_GENERATION = "audio_generation"
8
+ SPEECH = "speech"
9
+ TEXT_TO_SPEECH = "text_to_speech"
10
+ VOICE_CLONING = "voice_cloning"
11
+ ZONOS = "zonos"
@@ -3,7 +3,7 @@ from typing import Set
3
3
 
4
4
  import torch
5
5
  import torchaudio
6
- from sinapsis_core.template_base.template import TemplateAttributeType
6
+ from sinapsis_core.template_base.base_models import TemplateAttributeType
7
7
  from sinapsis_core.utils.logging_utils import sinapsis_logger
8
8
  from zonos.conditioning import make_cond_dict, supported_language_codes
9
9
  from zonos.model import Zonos
@@ -1,14 +1,11 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  """Base template for Zonos speech synthesis"""
3
3
 
4
- import os
5
- import uuid
6
4
  from typing import Literal, Set
7
5
 
8
6
  import torch
9
- import torchaudio
10
7
  from pydantic import Field
11
- from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket
8
+ from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, TextPacket
12
9
  from sinapsis_core.template_base.base_models import (
13
10
  OutputTypes,
14
11
  TemplateAttributes,
@@ -16,10 +13,10 @@ from sinapsis_core.template_base.base_models import (
16
13
  UIPropertiesMetadata,
17
14
  )
18
15
  from sinapsis_core.template_base.template import Template
19
- from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
20
16
  from zonos.model import Zonos
21
17
  from zonos.utils import DEFAULT_DEVICE as device
22
18
 
19
+ from sinapsis_zonos.helpers.tags import Tags
23
20
  from sinapsis_zonos.helpers.zonos_keys import EmotionsConfig, SamplingParams, TTSKeys
24
21
  from sinapsis_zonos.helpers.zonos_tts_utils import (
25
22
  get_audio_prefix_codes,
@@ -38,7 +35,11 @@ class ZonosTTS(Template):
38
35
  and fine control over various speech attributes like pitch, speaking rate, and emotions.
39
36
  """
40
37
 
41
- UIProperties = UIPropertiesMetadata(category="Zonos", output_type=OutputTypes.AUDIO)
38
+ UIProperties = UIPropertiesMetadata(
39
+ category="Zonos",
40
+ output_type=OutputTypes.AUDIO,
41
+ tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.ZONOS, Tags.SPEECH, Tags.TEXT_TO_SPEECH, Tags.VOICE_CLONING],
42
+ )
42
43
 
43
44
  class AttributesBaseModel(TemplateAttributes):
44
45
  """
@@ -71,7 +72,7 @@ class ZonosTTS(Template):
71
72
  fmax: float = 22050.0
72
73
  language: str = TTSKeys.en_language
73
74
  model: Literal["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"] = "Zyphra/Zonos-v0.1-transformer"
74
- output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "zonos", "audios")
75
+ # output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "zonos", "audios")
75
76
  pitch_std: float = 20.0
76
77
  prefix_audio: str | None = None
77
78
  randomized_seed: bool = True
@@ -85,7 +86,7 @@ class ZonosTTS(Template):
85
86
  def __init__(self, attributes: TemplateAttributeType) -> None:
86
87
  """Initializes the Zonos model with the provided attributes."""
87
88
  super().__init__(attributes)
88
- os.makedirs(self.attributes.output_folder, exist_ok=True)
89
+ # os.makedirs(self.attributes.output_folder, exist_ok=True)
89
90
  self.device = device
90
91
  self.model = self._init_model()
91
92
  init_seed(self.attributes)
@@ -112,8 +113,9 @@ class ZonosTTS(Template):
112
113
  del self.model
113
114
  torch.cuda.empty_cache()
114
115
 
115
- def reset_state(self) -> None:
116
+ def reset_state(self, template_name: str | None = None) -> None:
116
117
  """Reinitialize the model and random seed."""
118
+ _ = template_name
117
119
  self._del_model()
118
120
  self.model = self._init_model()
119
121
  init_seed(self.attributes)
@@ -154,10 +156,8 @@ class ZonosTTS(Template):
154
156
  output_audio (torch.Tensor): The generated audio output tensor.
155
157
  container (DataContainer): The container to store metadata.
156
158
  """
157
- output_path = os.path.join(self.attributes.output_folder, f"{uuid.uuid4()}.{TTSKeys.wav}")
158
- torchaudio.save(output_path, output_audio[0], self.model.autoencoder.sampling_rate)
159
- self._set_generic_data(container, [output_path])
160
- self.logger.debug(f"Audio saved to: {output_path}")
159
+ audio_np = output_audio[0].cpu().numpy()
160
+ container.audios.append(AudioPacket(content=audio_np, sample_rate=self.model.autoencoder.sampling_rate))
161
161
 
162
162
  def execute(self, container: DataContainer) -> DataContainer:
163
163
  """Processes the input data and generates a speech output."""
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sinapsis-speech"
3
- version = "0.4.0"
3
+ version = "0.4.2"
4
4
  description = "Generate speech using various libraries."
5
5
  authors = [
6
6
  {name = "SinapsisAI", email = "dev@sinapsis.tech"},
@@ -28,6 +28,7 @@ all = [
28
28
  "sinapsis-zonos[all]",
29
29
  "sinapsis-parakeet-tdt[all]",
30
30
  "sinapsis-orpheus-cpp[all]",
31
+
31
32
  ]
32
33
  gradio-app = [
33
34
  "sinapsis[webapp]>=0.2.3",
@@ -50,6 +51,7 @@ sinapsis-zonos = { workspace = true }
50
51
  sinapsis-speech = { workspace = true }
51
52
  sinapsis-parakeet-tdt = { workspace = true }
52
53
  sinapsis-orpheus-cpp = { workspace = true }
54
+ sinapsis-chatterbox = { workspace = true }
53
55
 
54
56
 
55
57
  [[tool.uv.index]]
File without changes