sinapsis-speech 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +50 -14
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +2 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +19 -13
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +95 -0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +8 -7
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +123 -0
- sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +20 -0
- sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +300 -0
- sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +69 -0
- sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +20 -0
- sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +270 -0
- {sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/METADATA +38 -2
- {sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/RECORD +16 -9
- {sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/WHEEL +1 -1
- {sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/top_level.txt +2 -0
- {sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/licenses/LICENSE +0 -0
sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py
@@ -1,29 +1,35 @@
 # -*- coding: utf-8 -*-
-
-
+import json
+
+from elevenlabs import Voice, VoiceSettings
+from elevenlabs.client import ElevenLabs
 from sinapsis_core.data_containers.data_packet import TextPacket
 from sinapsis_core.utils.logging_utils import sinapsis_logger
 
 
-def create_voice_settings(settings: VoiceSettings) -> VoiceSettings | None:
+def create_voice_settings(settings: VoiceSettings, as_json: bool = False) -> VoiceSettings | None | str:
     """
     Creates or updates a `VoiceSettings` object based on the provided settings.
 
     Args:
         settings (VoiceSettings | None): An instance of `VoiceSettings` containing the settings to be applied.
             If `None`, the function returns the default settings.
+        as_json (bool): Whether to return the settings as JSON string.
 
     Returns:
-        VoiceSettings: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
-        `
+        VoiceSettings | None | str: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
+        `None` is returned for default settings.
     """
     if not settings:
-        return
+        return None
+
+    if as_json:
+        return json.dumps(settings.model_dump(exclude_none=True))
 
     return settings
 
 
-def get_voice_id(client: ElevenLabs, voice:
+def get_voice_id(client: ElevenLabs, voice: str | Voice | None) -> str:
     """
     Resolves the voice ID for a given voice name or ID.
 
@@ -33,29 +39,59 @@ def get_voice_id(client: ElevenLabs, voice: VoiceId | VoiceName) -> VoiceId:
 
     Args:
         client (ElevenLabs): The ElevenLabs API client instance.
-        voice (
+        voice (str | Voice | None): The name or ID of the desired voice.
 
     Returns:
-
+        str: The resolved voice ID.
 
     Raises:
         ValueError: If no voices are available to resolve.
     """
+    if not voice:
+        return get_default_voice(client).voice_id
+
+    if isinstance(voice, Voice):
+        sinapsis_logger.debug(f"Voice object provided, using voice_id: {voice.voice_id}")
+        return voice.voice_id
+
     try:
-
+        voices_response = client.voices.get_all()
+        voices = voices_response.voices
+
         for v in voices:
             if voice == v.name or voice == v.voice_id:
-                sinapsis_logger.debug("Voice
+                sinapsis_logger.debug(f"Voice {voice} resolved to ID: {v.voice_id}")
                 return v.voice_id
 
-        sinapsis_logger.error("Voice
+        sinapsis_logger.error(f"Voice {voice} is not available.")
         if voices:
-            sinapsis_logger.info("Returning default voice ID:
+            sinapsis_logger.info(f"Returning default voice ID: {voices[0].voice_id}")
             return voices[0].voice_id
 
         raise ValueError("No available voices to resolve. Ensure the client is configured correctly.")
     except Exception as e:
-        sinapsis_logger.error("Error resolving voice ID:
+        sinapsis_logger.error(f"Error resolving voice ID: {e}")
+        raise
+
+
+def get_default_voice(client: ElevenLabs) -> Voice:
+    """
+    Gets the first available voice as default.
+
+    Args:
+        client (ElevenLabs): The ElevenLabs API client instance.
+
+    Returns:
+        Voice: The default voice object.
+    """
+    try:
+        voices_response = client.voices.get_all()
+        voices = voices_response.voices
+        if voices:
+            return voices[0]
+        raise ValueError("No voices available")
+    except Exception as e:
+        sinapsis_logger.error(f"Error getting default voice: {e}")
         raise
 
 
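Taken together, the new helpers give `None`, name, ID, and `Voice`-object inputs a single resolution path. A minimal sketch of how they compose, assuming a valid API key; the voice name is a placeholder:

```python
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs

from sinapsis_elevenlabs.helpers.voice_utils import (
    create_voice_settings,
    get_default_voice,
    get_voice_id,
)

client = ElevenLabs(api_key="YOUR_API_KEY")

# voice=None falls back to the first voice returned by client.voices.get_all().
default_id = get_voice_id(client, voice=None)
assert default_id == get_default_voice(client).voice_id

# A name or ID string is matched against each voice's .name / .voice_id.
named_id = get_voice_id(client, voice="Rachel")  # placeholder voice name

# as_json=True serializes the settings (dropping None fields) for endpoints
# that expect a JSON string, as the STS template below does.
settings = VoiceSettings(stability=0.5, similarity_boost=0.75)
print(create_voice_settings(settings, as_json=True))
```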
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py
@@ -7,6 +7,8 @@ _root_lib_path = "sinapsis_elevenlabs.templates"
 _template_lookup = {
     "ElevenLabsTTS": f"{_root_lib_path}.elevenlabs_tts",
     "ElevenLabsVoiceGeneration": f"{_root_lib_path}.elevenlabs_voice_generation",
+    "ElevenLabsVoiceClone": f"{_root_lib_path}.elevenlabs_voice_clone",
+    "ElevenLabsSTS": f"{_root_lib_path}.elevenlabs_sts",
 }
 
 
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py
@@ -5,10 +5,10 @@ import abc
 import os
 import uuid
 from io import BytesIO
-from typing import IO, Iterator, Literal
+from typing import IO, Iterable, Iterator, Literal
 
 from elevenlabs import Voice, VoiceSettings, save
-from elevenlabs.client import ElevenLabs
+from elevenlabs.client import ElevenLabs
 from elevenlabs.types import OutputFormat
 from pydantic import Field
 from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
@@ -46,12 +46,14 @@ class ElevenLabsBase(Template, abc.ABC):
     Args:
         api_key (str): The API used key to authenticate with ElevenLabs' API.
         model (Literal): The model identifier to use for speech synthesis.
+        output_file_name (str | None): Optional name for saved audio file.
+            If not provided a random UUI will be used as file name. Defaults to None.
         output_format (OutputFormat): The output audio format and quality. Options include:
             ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
             "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
         output_folder (str): The folder where generated audio files will be saved.
         stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
-        voice (
+        voice (str | Voice | None): The voice to use for speech synthesis. This can be a voice ID (str),
             a voice name (str) or an elevenlabs voice object (Voice).
         voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
             - stability (float)
@@ -70,11 +72,11 @@ class ElevenLabsBase(Template, abc.ABC):
         "eleven_english_sts_v2",
         "eleven_multilingual_sts_v2",
     ] = "eleven_turbo_v2_5"
+    output_file_name: str | None = None
     output_format: OutputFormat = "mp3_44100_128"
     output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
     stream: bool = False
-
-    voice: VoiceId | VoiceName | Voice = None
+    voice: str | Voice | None = None
     voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]
 
     UIProperties = UIPropertiesMetadata(category="Elevenlabs", output_type=OutputTypes.AUDIO)
@@ -98,9 +100,14 @@ class ElevenLabsBase(Template, abc.ABC):
     def synthesize_speech(self, input_data: list[Packet]) -> RESPONSE_TYPE:
         """Abstract method for ElevenLabs speech synthesis."""
 
-    def _save_audio(self, response:
+    def _save_audio(self, response: Iterable | bytes, file_format: str, idx: int) -> str:
         """Saves the audio to a file and returns the file path."""
-
+        if self.attributes.output_file_name:
+            file_name = self.attributes.output_file_name + "_" + str(idx)
+        else:
+            file_name = uuid.uuid4()
+
+        output_file = os.path.join(self.attributes.output_folder, f"{file_name}.{file_format}")
         try:
             save(response, output_file)
             self.logger.info(f"Audio saved to: {output_file}")
@@ -109,7 +116,7 @@ class ElevenLabsBase(Template, abc.ABC):
             self.logger.error(f"File system error while saving speech to file: {e}")
             raise
 
-    def _generate_audio_stream(self, response:
+    def _generate_audio_stream(self, response: Iterable | bytes) -> IO[bytes]:
         """Generates and returns the audio stream."""
         audio_stream = BytesIO()
         try:
@@ -132,13 +139,12 @@ class ElevenLabsBase(Template, abc.ABC):
             self.logger.error(f"Value error while processing audio chunks: {e}")
             raise
 
-    def _process_audio_output(self, response:
+    def _process_audio_output(self, idx: int, response: Iterable | bytes) -> str | IO[bytes]:
         """Processes a single audio output (either stream or file)."""
         if self.attributes.stream:
             return self._generate_audio_stream(response)
-        else
-
-            return self._save_audio(response, file_format)
+        file_format = "mp3" if "mp3" in self.attributes.output_format else "wav"
+        return self._save_audio(response, file_format, idx)
 
     def generate_speech(self, input_data: list[Packet]) -> list[str | IO[bytes]] | None:
         """Generates speech and saves it to a file."""
@@ -149,7 +155,7 @@ class ElevenLabsBase(Template, abc.ABC):
         if isinstance(responses, Iterator):
             responses = [responses]
 
-        audio_outputs = [self._process_audio_output(response) for response in responses]
+        audio_outputs = [self._process_audio_output(idx, response) for idx, response in enumerate(responses)]
         return audio_outputs
 
     def _handle_streaming_output(self, audio_outputs: list[str | IO[bytes]]) -> list[AudioPacket]:
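The new `output_file_name`/`idx` pair makes batch outputs deterministic instead of always random. A standalone sketch of the naming rule that `_save_audio` and `_process_audio_output` now apply (paths are illustrative):

```python
import os
import uuid

def output_path(output_folder: str, output_file_name: str | None, output_format: str, idx: int) -> str:
    # Mirror of _save_audio's naming: fixed prefix plus packet index, or a random UUID.
    file_name = f"{output_file_name}_{idx}" if output_file_name else uuid.uuid4()
    # Mirror of _process_audio_output's format inference from the output_format string.
    file_format = "mp3" if "mp3" in output_format else "wav"
    return os.path.join(output_folder, f"{file_name}.{file_format}")

print(output_path("/tmp/audios", "speech", "mp3_44100_128", 0))  # /tmp/audios/speech_0.mp3
print(output_path("/tmp/audios", None, "pcm_24000", 0))          # /tmp/audios/<uuid>.wav
```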
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+"""Speech-To-Speech template for ElevenLabs."""
+
+from typing import Callable, Iterator, Literal
+
+from sinapsis_core.data_containers.data_packet import AudioPacket
+
+from sinapsis_elevenlabs.helpers.voice_utils import create_voice_settings, get_voice_id
+from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
+
+
+class ElevenLabsSTS(ElevenLabsBase):
+    """Template to interact with the ElevenLabs Speech-to-Speech API.
+
+    This template takes an input audio and converts it to a new voice using
+    the ElevenLabs Speech-to-Speech (STS) API.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+      - template_name: ElevenLabsSTS
+        class_name: ElevenLabsSTS
+        template_input: InputTemplate
+        attributes:
+          api_key: null
+          model: eleven_multilingual_sts_v2
+          output_file_name: null
+          output_format: mp3_44100_128
+          output_folder: ~/.cache/sinapsis/elevenlabs/audios
+          stream: false
+          voice: null
+          voice_settings:
+            stability: null
+            similarity_boost: null
+            style: null
+            use_speaker_boost: null
+            speed: null
+          streaming_latency: null
+
+    """
+
+    PACKET_TYPE_NAME: str = "audios"
+
+    class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
+        """Attributes specific to ElevenLabs STS API interaction.
+
+        Attributes:
+            model (Literal): The STS model to use. Options are "eleven_english_sts_v2" or "eleven_multilingual_sts_v2".
+            streaming_latency (int | None): Optional latency optimization for streaming. Defaults to None.
+        """
+
+        model: Literal["eleven_english_sts_v2", "eleven_multilingual_sts_v2"] = "eleven_multilingual_sts_v2"
+        streaming_latency: int | None = None
+
+    def synthesize_speech(self, input_data: list[AudioPacket]) -> Iterator[bytes]:
+        """Sends an audio input to the ElevenLabs API for speech-to-speech synthesis.
+
+        Args:
+            input_data (list[AudioPacket]): List of AudioPacket objects containing the audio to be converted.
+                Only the first AudioPacket in the list is used.
+
+        Returns:
+            Iterator[bytes]: An iterator yielding audio data chunks in the output format specified.
+
+        Raises:
+            ValueError: If there is a problem with the input data or parameters.
+            TypeError: If the input data or files are of incorrect type.
+            KeyError: If the expected key is missing in the API response.
+        """
+        try:
+            method: Callable[..., Iterator[bytes]] = (
+                self.client.speech_to_speech.stream if self.attributes.stream else self.client.speech_to_speech.convert
+            )
+            return method(
+                voice_id=get_voice_id(self.client, voice=self.attributes.voice),
+                audio=input_data[0].content,
+                model_id=self.attributes.model,
+                voice_settings=create_voice_settings(self.attributes.voice_settings, as_json=True),
+                output_format=self.attributes.output_format,
+                optimize_streaming_latency=self.attributes.streaming_latency,
+            )
+        except ValueError as e:
+            self.logger.error(f"Value error synthesizing speech: {e}")
+            raise
+        except TypeError as e:
+            self.logger.error(f"Type error in input data or parameters: {e}")
+            raise
+        except KeyError as e:
+            self.logger.error(f"Missing key in input data or settings: {e}")
+            raise
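Stripped of the template plumbing, `synthesize_speech` reduces to one SDK call with the same keyword arguments shown above. A sketch of the non-streaming path, assuming a valid API key and a local `sample.wav`:

```python
from elevenlabs.client import ElevenLabs

client = ElevenLabs(api_key="YOUR_API_KEY")

with open("sample.wav", "rb") as audio_file:
    # convert() re-voices the input audio; the template resolves voice_id
    # via get_voice_id and serializes voice_settings with as_json=True.
    audio_chunks = client.speech_to_speech.convert(
        voice_id="VOICE_ID",  # placeholder
        audio=audio_file,
        model_id="eleven_multilingual_sts_v2",
        output_format="mp3_44100_128",
    )

with open("converted.mp3", "wb") as out:
    for chunk in audio_chunks:
        out.write(chunk)
```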
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py
@@ -1,12 +1,13 @@
 # -*- coding: utf-8 -*-
 """Text-To-Speech template for ElevenLabs"""
 
-from typing import Iterator, Literal
+from typing import Callable, Iterator, Literal
 
 from sinapsis_core.data_containers.data_packet import TextPacket
 
 from sinapsis_elevenlabs.helpers.voice_utils import (
     create_voice_settings,
+    get_voice_id,
     load_input_text,
 )
 from sinapsis_elevenlabs.templates.elevenlabs_base import ElevenLabsBase
@@ -64,16 +65,16 @@ class ElevenLabsTTS(ElevenLabsBase):
         """
         input_text: str = load_input_text(input_data)
         try:
-
+            method: Callable[..., Iterator[bytes]] = (
+                self.client.text_to_speech.stream if self.attributes.stream else self.client.text_to_speech.convert
+            )
+            return method(
                 text=input_text,
-
-
+                voice_id=get_voice_id(self.client, self.attributes.voice),
+                model_id=self.attributes.model,
                 voice_settings=create_voice_settings(self.attributes.voice_settings),
                 output_format=self.attributes.output_format,
-                stream=self.attributes.stream,
             )
-
-            return response
         except ValueError as e:
             self.logger.error(f"Value error synthesizing speech: {e}")
             raise
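The rewritten `try` block replaces the removed `stream=` keyword with method selection: both endpoints accept the same keyword arguments, so one call site serves both modes. The pattern in isolation:

```python
from typing import Callable, Iterator

from elevenlabs.client import ElevenLabs

def pick_endpoint(client: ElevenLabs, stream: bool) -> Callable[..., Iterator[bytes]]:
    # convert returns the full audio response; stream yields chunks as they
    # arrive. Either way the caller passes text / voice_id / model_id /
    # voice_settings / output_format unchanged.
    return client.text_to_speech.stream if stream else client.text_to_speech.convert
```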
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+"""Text-To-Speech template for ElevenLabs Voice Cloning."""
+
+from elevenlabs import Voice
+from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
+
+from sinapsis_elevenlabs.templates.elevenlabs_tts import ElevenLabsTTS
+
+
+class ElevenLabsVoiceClone(ElevenLabsTTS):
+    """Template to clone a voice using the ElevenLabs API.
+
+    This template allows you to create a new custom voice in ElevenLabs by providing
+    one or more audio samples. The cloned voice can then be used for subsequent
+    text-to-speech synthesis within the Sinapsis pipeline.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+      - template_name: ElevenLabsVoiceClone
+        class_name: ElevenLabsVoiceClone
+        template_input: InputTemplate
+        attributes:
+          api_key: null
+          model: eleven_turbo_v2_5
+          output_file_name: null
+          output_format: mp3_44100_128
+          output_folder: ~/.cache/sinapsis/elevenlabs/audios
+          stream: false
+          voice: null
+          voice_settings:
+            stability: null
+            similarity_boost: null
+            style: null
+            use_speaker_boost: null
+            speed: null
+          name: null
+          description: null
+          remove_background_noise: false
+
+    """
+
+    class AttributesBaseModel(ElevenLabsTTS.AttributesBaseModel):
+        """Attributes specific to the ElevenLabsVoiceClone class.
+
+        Attributes:
+            name (str | None): Name for the cloned voice. If None, a default name may be used.
+            description (str | None): Description for the cloned voice. Optional.
+            remove_background_noise (bool): Whether to remove background noise from samples. Defaults to False.
+        """
+
+        name: str | None = None
+        description: str | None = None
+        remove_background_noise: bool = False
+
+    def clone_voice(self, input_data: list[AudioPacket]) -> Voice:
+        """Clones a voice using the provided audio files.
+
+        Args:
+            input_data (list[AudioPacket]): List of AudioPacket objects containing the audio samples
+                to be used for voice cloning. Each AudioPacket's `content` should be a file-like object
+                or bytes representing the audio data.
+                **NOTE:** All provided audio packets are used as reference for a single cloned voice.
+
+        Returns:
+            Voice: The cloned Voice object as returned by the ElevenLabs API.
+
+        Raises:
+            ValueError: If there is a problem with the input data or parameters.
+            TypeError: If the input data or files are of incorrect type.
+            KeyError: If the expected key is missing in the API response.
+        """
+        files = [audio.content for audio in input_data]
+        try:
+            clone_response = self.client.voices.ivc.create(
+                name=self.attributes.name,
+                files=files,
+                description=self.attributes.description,
+                remove_background_noise=self.attributes.remove_background_noise,
+            )
+            cloned_voice = self.client.voices.get(clone_response.voice_id)
+            self.logger.info(f"Voice cloned successfully with IVC: {cloned_voice.name}")
+            return cloned_voice
+        except ValueError as e:
+            self.logger.error(f"Value error in input data or parameters: {e}")
+            raise
+        except TypeError as e:
+            self.logger.error(f"Type error with input data or files: {e}")
+            raise
+        except KeyError as e:
+            self.logger.error(f"Missing expected key in API response: {e}")
+            raise
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        """Executes the voice cloning process and generates the speech output.
+
+        Args:
+            container (DataContainer): The input DataContainer, expected to contain
+                one or more AudioPacket objects in the `audios` attribute.
+
+        Returns:
+            DataContainer: The updated DataContainer. If cloning is successful,
+            the cloned voice is set in `self.attributes.voice` and the parent
+            TTS execution is performed using the new voice.
+
+        Side Effects:
+            - Updates `self.attributes.voice` with the cloned Voice object.
+            - May log errors or info messages.
+        """
+        audios = container.audios
+        if not audios:
+            self.logger.debug("No audios provided to clone voice")
+            return container
+        self.attributes.voice = self.clone_voice(audios)
+
+        container = super().execute(container)
+
+        return container
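The template's flow is instant voice cloning (IVC), fetching the full `Voice`, then synthesizing with it. The same flow with the raw SDK, as a sketch assuming a valid API key and local reference clips:

```python
from elevenlabs.client import ElevenLabs

client = ElevenLabs(api_key="YOUR_API_KEY")

# Create the clone from one or more reference samples, then fetch the Voice.
clone = client.voices.ivc.create(
    name="my-cloned-voice",  # placeholder name
    files=[open("ref_1.wav", "rb"), open("ref_2.wav", "rb")],
    remove_background_noise=False,
)
voice = client.voices.get(clone.voice_id)

# The template stores `voice` in self.attributes.voice and delegates to the
# parent ElevenLabsTTS.execute, so all subsequent synthesis uses the clone.
```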
sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+import importlib
+from typing import Callable
+
+_root_lib_path = "sinapsis_orpheus_cpp.templates"
+
+_template_lookup = {
+    "OrpheusTTS": f"{_root_lib_path}.orpheus_tts",
+}
+
+
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+
+    raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+
+
+__all__ = list(_template_lookup.keys())
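Both new packages reuse the PEP 562 lazy-export pattern already used by the ElevenLabs templates: nothing heavy is imported until a template name is first looked up.

```python
# Importing the package itself is cheap; llama_cpp / orpheus_cpp are only
# pulled in when the attribute access below routes through the module-level
# __getattr__, which runs
# importlib.import_module("sinapsis_orpheus_cpp.templates.orpheus_tts").
from sinapsis_orpheus_cpp.templates import OrpheusTTS
```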
sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py
@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+from llama_cpp import Llama
+from orpheus_cpp import OrpheusCpp
+from orpheus_cpp.model import TTSOptions
+from pydantic import TypeAdapter
+from sinapsis_core.data_containers.data_packet import (
+    AudioPacket,
+    DataContainer,
+)
+from sinapsis_core.template_base import Template
+from sinapsis_core.template_base.base_models import (
+    OutputTypes,
+    TemplateAttributes,
+    TemplateAttributeType,
+    UIPropertiesMetadata,
+)
+from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
+
+from sinapsis_orpheus_cpp.thirdparty.helpers import download_model, setup_snac_session
+
+
+class OrpheusTTSAttributes(TemplateAttributes):
+    """Attributes configuration for Orpheus TTS Template.
+
+    This class defines all configurable parameters for the Orpheus TTS model,
+    including model configuration, GPU settings, and audio generation parameters.
+
+    Attributes:
+        n_gpu_layers (int): Number of model layers to offload to GPU.
+            -1 means use all available layers on GPU for maximum performance.
+            0 means use CPU only. Default: -1.
+        n_threads (int): Number of CPU threads to use for model inference.
+            0 means auto-detect optimal thread count. Default: 0.
+        n_ctx (int): Context window size (maximum number of tokens).
+            0 means use the model's maximum trained context size.
+            Larger values require more GPU/RAM memory. Default: 8192.
+        model_id (str): Hugging Face model repository ID.
+            Must be a valid repository containing GGUF model files.
+            Required parameter with no default.
+        model_variant (str | None): Specific GGUF file to download from the repository.
+            If None, will auto-detect based on model_id naming convention.
+            Use this to specify exact quantization (e.g., "model-q4_k_m.gguf").
+            Default: None.
+        cache_dir (str): Directory to store downloaded models and cache files.
+            Default: SINAPSIS_CACHE_DIR environment variable.
+        verbose (bool): Enable verbose logging for model operations.
+            Shows detailed model loading and inference information. Default: False.
+        voice_id (str): Voice identifier for speech synthesis.
+            Must be a valid voice supported by the Orpheus model.
+            Available voices depend on the specific model variant.
+            Required parameter with no default.
+        batch_size (int): Batch size for model inference.
+            Higher values may improve throughput but require more memory.
+            Default: 1.
+        max_tokens (int): Maximum number of tokens to generate for speech.
+            Controls the length of generated audio sequences. Default: 2048.
+        temperature (float): Sampling temperature for token generation.
+            Higher values (>1.0) make output more random, lower values (<1.0)
+            make it more deterministic. Default: 0.8.
+        top_p (float): Nucleus sampling probability threshold.
+            Only tokens with cumulative probability <= top_p are considered.
+            Range: 0.0-1.0. Default: 0.95.
+        top_k (int): Top-k sampling parameter.
+            Only the top k most likely tokens are considered for sampling.
+            Default: 40.
+        min_p (float): Minimum probability threshold for token selection.
+            Tokens with probability below this threshold are filtered out.
+            Range: 0.0-1.0. Default: 0.05.
+        pre_buffer_size (float): Duration in seconds of audio to generate
+            before yielding the first chunk during streaming.
+            Larger values provide smoother audio but higher latency.
+            Default: 1.5.
+    """
+
+    n_gpu_layers: int = -1
+    n_threads: int = 0
+    n_ctx: int = 8192
+    model_id: str
+    model_variant: str | None = None
+    cache_dir: str = SINAPSIS_CACHE_DIR
+    verbose: bool = False
+    voice_id: str
+    batch_size: int = 1
+    max_tokens: int = 2048
+    temperature: float = 0.8
+    top_p: float = 0.95
+    top_k: int = 40
+    min_p: float = 0.05
+    pre_buffer_size: float = 1.5
+
+
+class OrpheusTTS(Template):
+    """Text-to-Speech template using Orpheus model for speech synthesis.
+
+    This template converts text input into high-quality speech audio using
+    the Orpheus neural TTS model. It handles model downloading, initialization,
+    and audio generation with configurable voice parameters.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+      - template_name: OrpheusTTS
+        class_name: OrpheusTTS
+        template_input: InputTemplate
+        attributes:
+          n_gpu_layers: -1
+          n_threads: 0
+          n_ctx: 8192
+          model_id: '`replace_me:<class ''str''>`'
+          model_variant: null
+          cache_dir: ~/sinapsis
+          verbose: false
+          voice_id: '`replace_me:<class ''str''>`'
+          batch_size: 1
+          max_tokens: 2048
+          temperature: 0.8
+          top_p: 0.95
+          top_k: 40
+          min_p: 0.05
+          pre_buffer_size: 1.5
+
+    """
+
+    AttributesBaseModel = OrpheusTTSAttributes
+    UIProperties = UIPropertiesMetadata(category="TTS", output_type=OutputTypes.AUDIO)
+
+    def __init__(self, attributes: TemplateAttributeType) -> None:
+        super().__init__(attributes)
+        self._engine: OrpheusCpp
+        self._llm_available: bool = False
+        self._initialize_engine()
+
+    def _initialize_engine(self) -> None:
+        """Initialize the OrpheusCpp engine with downloaded model.
+
+        Creates a new OrpheusCpp instance without calling its constructor
+        to avoid default parameter conflicts, then manually configures
+        both the LLM and SNAC session components.
+
+        Raises:
+            ValueError: If model download fails.
+            RuntimeError: If engine initialization fails.
+        """
+        self._engine = OrpheusCpp.__new__(OrpheusCpp)
+        model_file = download_model(
+            model_id=self.attributes.model_id,
+            model_variant=self.attributes.model_variant,
+            cache_dir=self.attributes.cache_dir,
+        )
+        self._setup_llm(model_file)
+        self._setup_snac_session()
+
+    def _setup_llm(self, model_file: str) -> None:
+        """Setup the Large Language Model component with specified parameters.
+
+        Initializes the Llama model with custom configuration parameters.
+        Implements graceful error handling for Out-of-Memory conditions
+        by setting the LLM as unavailable instead of crashing.
+
+        Args:
+            model_file (str): Path to the downloaded GGUF model file.
+
+        Raises:
+            ValueError: For non-OOM related model initialization errors.
+
+        Note:
+            If a "Failed to create llama_context" error occurs (typically OOM),
+            the method logs the error and disables TTS functionality instead
+            of terminating the program.
+        """
+        try:
+            self._engine._llm = Llama(
+                model_path=model_file,
+                n_ctx=self.attributes.n_ctx,
+                verbose=self.attributes.verbose,
+                n_gpu_layers=self.attributes.n_gpu_layers,
+                n_threads=self.attributes.n_threads,
+                batch_size=self.attributes.batch_size,
+            )
+            self._llm_available = True
+        except ValueError as e:
+            if "Failed to create llama_context" in str(e):
+                error_msg = (
+                    f"Failed to create llama_context - Out of Memory (OOM) issue. "
+                    f"Current n_ctx: {self.attributes.n_ctx}, n_gpu_layers: {self.attributes.n_gpu_layers}. "
+                    f"Try reducing n_ctx or "
+                    f"reduce n_gpu_layers if using GPU. "
+                )
+                self.logger.error(error_msg)
+                self._engine._llm = None
+                self._llm_available = False
+            else:
+                raise
+
+    def _setup_snac_session(self) -> None:
+        """
+        Initializes the SNAC (Streaming Neural Audio Codec) session required
+        for converting model tokens to audio waveforms. Only sets up the session
+        if the LLM was successfully initialized.
+
+        Note:
+            SNAC session is only created when LLM is available to avoid
+            unnecessary resource allocation when TTS is disabled.
+        """
+        if self._llm_available:
+            self._engine._snac_session = setup_snac_session(self.attributes.cache_dir)
+        else:
+            self._engine._snac_session = None
+
+    def _create_tts_options(self) -> TTSOptions:
+        """
+        Dynamically builds a TTSOptions dictionary by filtering template attributes
+        to include only those that are valid TTSOptions parameters.
+
+        Returns:
+            TTSOptions: Dictionary containing TTS generation parameters.
+        """
+        tts_option_fields = TypeAdapter(TTSOptions)
+        attributes_dict = self.attributes.model_dump()
+        return tts_option_fields.validate_python(attributes_dict)
+
+    def generate_speech(self, text: str) -> tuple[int, np.ndarray] | None:
+        """
+        Converts text to speech using the Orpheus TTS model with configured
+        voice and generation parameters.
+
+        Args:
+            text (str): Input text to convert to speech.
+
+        Returns:
+            tuple[int, np.ndarray] | None: Tuple of (sample_rate, audio_array)
+            if generation succeeds, None if LLM is unavailable.
+
+        Note:
+            Returns None when LLM is not available (e.g., due to OOM errors)
+            instead of raising an exception, allowing graceful degradation.
+        """
+        if not self._llm_available:
+            return None
+        return self._engine.tts(text, options=self._create_tts_options())
+
+    def create_audio_packet(self, text: str, source: str | None = None) -> AudioPacket | None:
+        """
+        Generates speech from text and wraps the result in a
+        `AudioPacket` for data pipeline compatibility.
+
+        Args:
+            text (str): Input text to convert to speech.
+            source (str | None): Optional source identifier for traceability.
+
+        Returns:
+            AudioPacket | None: Audio packet containing generated speech,
+            or None if speech generation fails or is unavailable.
+        """
+        speech_result = self.generate_speech(text)
+        if speech_result is None:
+            return None
+
+        sample_rate, audio_data = speech_result
+        return AudioPacket(
+            content=audio_data,
+            source=source,
+            sample_rate=sample_rate,
+        )
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        """
+        Processes all text packets in the input container and generates
+        corresponding audio packets using the Orpheus TTS model.
+
+        Args:
+            container (DataContainer): Input container with text packets to process.
+
+        Returns:
+            DataContainer: Updated container with generated audio packets added.
+
+        Note:
+            When LLM is unavailable (due to initialization failures), the method
+            logs a warning and returns the container without modifications rather
+            than raising an exception.
+        """
+        if not container.texts:
+            return container
+
+        if not self._llm_available:
+            return container
+
+        for text_packet in container.texts:
+            audio_packet = self.create_audio_packet(text=text_packet.content, source=text_packet.source)
+            if audio_packet is not None:
+                container.audios.append(audio_packet)
+
+        return container
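`_create_tts_options` leans on pydantic to do the filtering: validating a plain dict against a `TypedDict` keeps only the declared keys. A self-contained sketch of that behavior with a stand-in `TypedDict` (the template validates against `orpheus_cpp.model.TTSOptions`, and the values here are illustrative):

```python
from typing import TypedDict

from pydantic import TypeAdapter

class DemoOptions(TypedDict, total=False):
    # Stand-in for TTSOptions: only these keys survive validation.
    voice_id: str
    max_tokens: int
    temperature: float

attrs = {"voice_id": "tara", "max_tokens": 2048, "temperature": 0.8, "n_ctx": 8192}
print(TypeAdapter(DemoOptions).validate_python(attrs))
# {'voice_id': 'tara', 'max_tokens': 2048, 'temperature': 0.8} — n_ctx is dropped,
# which is how engine-only attributes stay out of the generation options.
```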
sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+"""
+Themethods declared in this file are inspired in the following source:
+
+https://github.com/freddyaboulton/orpheus-cpp
+
+which is Licensed under the MIT License.
+
+"""
+
+import onnxruntime
+from huggingface_hub import hf_hub_download
+from sinapsis_core.utils.logging_utils import sinapsis_logger
+
+
+def download_model(cache_dir: str, model_id: str, model_variant: str | None = None) -> str | None:
+    """
+    Download a model from Hugging Face Hub.
+
+    Args:
+        model_id: The model ID on Hugging Face Hub.
+        model_variant: The specific model variant file to download.
+        cache_dir: Directory to store downloaded models.
+
+    Returns:
+        Path to the downloaded model file or None if download fails.
+    """
+    if model_variant:
+        filename = model_variant
+    elif model_id.endswith(("-GGUF", "-gguf")):
+        filename = model_id.split("/")[-1].lower().replace("-gguf", ".gguf")
+    else:
+        filename = f"{model_id.split('/')[-1]}.gguf"
+
+    sinapsis_logger.info(f"Downloading model {model_id} with filename {filename}")
+
+    model_file = hf_hub_download(
+        repo_id=model_id,
+        filename=filename,
+        cache_dir=cache_dir,
+    )
+
+    sinapsis_logger.info(f"Successfully downloaded model to {model_file}")
+    return model_file
+
+
+def setup_snac_session(cache_dir: str) -> onnxruntime.InferenceSession:
+    """
+    Download and setup the SNAC ONNX session for audio processing.
+
+    Args:
+        cache_dir: Directory to store downloaded models.
+
+    Returns:
+        Configured ONNX inference session.
+    """
+    repo_id = "onnx-community/snac_24khz-ONNX"
+    snac_model_file = "decoder_model.onnx"
+    snac_model_path = hf_hub_download(
+        repo_id,
+        subfolder="onnx",
+        filename=snac_model_file,
+        cache_dir=cache_dir,
+    )
+
+    return onnxruntime.InferenceSession(
+        snac_model_path,
+        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+    )
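The filename fallback in `download_model` follows the common `<name>-GGUF` repository naming convention. Traced by hand with an illustrative repo ID:

```python
model_id = "someuser/orpheus-3b-Q4_K_M-GGUF"  # illustrative repo id
# endswith(("-GGUF", "-gguf")) matches, so the repo name is lowercased and the
# trailing "-gguf" suffix is rewritten as the ".gguf" file extension:
filename = model_id.split("/")[-1].lower().replace("-gguf", ".gguf")
print(filename)  # orpheus-3b-q4_k_m.gguf
```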
sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+import importlib
+from typing import Callable
+
+_root_lib_path = "sinapsis_parakeet_tdt.templates"
+
+_template_lookup = {
+    "ParakeetTDTInference": f"{_root_lib_path}.parakeet_tdt",
+}
+
+
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+
+    raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+
+
+__all__ = list(_template_lookup.keys())
sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py
@@ -0,0 +1,270 @@
+# -*- coding: utf-8 -*-
+import os
+from typing import Any, Literal
+
+import nemo.collections.asr as nemo_asr
+from sinapsis_core.data_containers.data_packet import (
+    AudioPacket,
+    DataContainer,
+    TextPacket,
+)
+from sinapsis_core.template_base.base_models import (
+    OutputTypes,
+    TemplateAttributes,
+    UIPropertiesMetadata,
+)
+from sinapsis_core.template_base.template import Template
+
+
+class ParakeetTDTInferenceAttributes(TemplateAttributes):
+    """
+    Attributes for the ParakeetTDT model.
+
+    Attributes:
+        model_name (str): Name or path of the Parakeet TDT model. Defaults to "nvidia/parakeet-tdt-0.6b-v2".
+        audio_paths (list[str]): Optional list of audio file paths to transcribe. If empty, audio will be
+            taken from the `AudioPackets` in the `DataContainer`. Defaults to an empty list.
+        enable_timestamps (bool): Whether to generate timestamps for the transcription. Defaults to False.
+        timestamp_level (Literal["char", "word", "segment"]): Level of timestamp detail. Defaults to "word".
+        device (Literal["cpu", "cuda"]): Device to run the model on. Defaults to "cuda".
+        refresh_cache (bool): Whether to refresh the cache when downloading the model. Defaults to False.
+            This is useful if the model has been updated and you want to ensure you have the latest version.
+    """
+
+    model_name: str = "nvidia/parakeet-tdt-0.6b-v2"
+    audio_paths: list[str] | None = None
+    enable_timestamps: bool = False
+    timestamp_level: Literal["char", "word", "segment"] = "word"
+    device: Literal["cpu", "cuda"] = "cuda"
+    refresh_cache: bool = False
+
+
+class ParakeetTDTInference(Template):
+    """Template for NVIDIA Parakeet TDT 0.6B speech recognition.
+
+    This template uses NVIDIA's Parakeet TDT 0.6B automatic speech recognition (ASR) model
+    to transcribe audio. It can read audio directly from AudioPackets in the DataContainer
+    or from specified file paths. The model supports punctuation, capitalization, and
+    timestamp prediction.
+
+    Usage example:
+
+    agent:
+      name: my_transcription_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+
+      - template_name: ParakeetTDT
+        class_name: ParakeetTDTInference
+        template_input: InputTemplate
+        attributes:
+          model_name: "nvidia/parakeet-tdt-0.6b-v2"
+          audio_paths: ['/path/to/file.wav']
+          enable_timestamps: True
+          timestamp_level: "word"
+          device: "cuda"
+          refresh_cache: False
+    """
+
+    UIProperties = UIPropertiesMetadata(category="Parakeet TDT", output_type=OutputTypes.TEXT)
+
+    AttributesBaseModel = ParakeetTDTInferenceAttributes
+
+    def __init__(self, attributes: TemplateAttributes) -> None:
+        super().__init__(attributes)
+        self._load_model()
+
+    def _load_model(self) -> None:
+        """
+        Load the ASR model from pretrained source.
+
+        This method initializes the NeMo ASR model using the specified model name
+        and device configuration from the template attributes.
+        """
+        self.model = nemo_asr.models.ASRModel.from_pretrained(
+            model_name=self.attributes.model_name,
+            map_location=self.attributes.device,
+        )
+
+    def get_sources_from_packets(self, audio_packets: list[AudioPacket]) -> list[str]:
+        """
+        Extract valid audio file paths from AudioPackets.
+
+        Args:
+            audio_packets: List of audio packets to extract source paths from.
+
+        Returns:
+            list[str]: List of valid audio file paths extracted from the packets.
+        """
+        sources = []
+        for audio_packet in audio_packets:
+            if not audio_packet or not audio_packet.source:
+                self.logger.warning(f"Invalid or nonexistent audio source: {audio_packet.source}")
+            sources.append(audio_packet.source)
+        return sources
+
+    def get_sources_from_paths(self, paths: list[str]) -> list[str]:
+        """
+        Extract valid audio file paths from a list of paths.
+
+        Args:
+            paths: List of file paths to validate and extract.
+
+        Returns:
+            list[str]: List of valid audio file paths.
+        """
+        sources = []
+        for path in paths:
+            if not os.path.exists(path):
+                self.logger.warning(f"Audio file not found: {path}")
+            sources.append(path)
+        return sources
+
+    def get_audio_sources(self, container: DataContainer) -> list[str]:
+        """
+        Get audio sources from container or attributes.
+
+        This method first attempts to extract audio sources from the DataContainer's
+        audio packets. If no sources are found, it falls back to using the audio paths
+        specified in the template attributes.
+
+        Args:
+            container: DataContainer containing possible audio packets.
+
+        Returns:
+            list[str]: List of audio file paths to be transcribed.
+        """
+        sources = []
+        if container.audios:
+            sources = self.get_sources_from_packets(container.audios)
+
+        if not sources and self.attributes.audio_paths:
+            sources = self.get_sources_from_paths(self.attributes.audio_paths)
+
+        return sources
+
+    @staticmethod
+    def _process_transcription_result(result: Any) -> str:
+        """
+        Extract text from transcription result.
+
+        Args:
+            result: Transcription result object from the ASR model.
+
+        Returns:
+            str: The extracted text content.
+        """
+        return result.text if hasattr(result, "text") else str(result)
+
+    def _extract_timestamps(self, result: Any) -> dict | None:
+        """
+        Extract timestamps from result if available.
+
+        Args:
+            result: Transcription result object from the ASR model.
+
+        Returns:
+            dict | None: Dictionary containing timestamp information at the specified
+            level (char, word, or segment) or None if no timestamps are available.
+        """
+        if not result.timestamp:
+            return None
+
+        return result.timestamp.get(self.attributes.timestamp_level, [])
+
+    @staticmethod
+    def create_text_packet(text: str, source: str) -> TextPacket:
+        """
+        Create text packet from transcription data.
+
+        Args:
+            text: Transcribed text content.
+            source: Source identifier for the text packet.
+
+        Returns:
+            TextPacket: A text packet containing the transcription.
+        """
+        text_packet = TextPacket(
+            content=text,
+            source=source,
+        )
+
+        return text_packet
+
+    def transcribe_sources(self, sources: list[str]) -> list[Any]:
+        """
+        Transcribe audio sources and return results.
+
+        This method passes the audio sources to the ASR model for transcription,
+        with timestamp generation enabled based on the template attributes.
+
+        Args:
+            sources: List of audio file paths to transcribe.
+
+        Returns:
+            list[Any]: List of transcription results from the ASR model.
+        """
+        return self.model.transcribe(
+            sources,
+            timestamps=self.attributes.enable_timestamps,
+        )
+
+    def process_results(
+        self,
+        transcription_results: list[Any],
+        sources: list[str],
+        container: DataContainer,
+    ) -> list[TextPacket]:
+        """
+        Process transcription results into text packets.
+
+        This method extracts text and timestamps from the transcription results
+        and creates corresponding text packets. Timestamps are stored in the
+        container's generic data dictionary.
+
+        Args:
+            transcription_results: List of transcription results from the ASR model.
+            sources: List of audio file paths that were transcribed.
+            container: DataContainer to store additional data.
+
+        Returns:
+            list[TextPacket]: List of text packets containing the transcriptions.
+        """
+        text_packets = []
+
+        for i, result in enumerate(transcription_results):
+            source = sources[i] if i < len(sources) else f"transcription_{i}"
+            text = self._process_transcription_result(result)
+            timestamps = self._extract_timestamps(result)
+            text_packet = self.create_text_packet(text, source)
+            text_packets.append(text_packet)
+            if timestamps:
+                self._set_generic_data(container, {f"timestamps_{source}": timestamps})
+
+        return text_packets
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        """
+        Transcribe audio to text using the Parakeet TDT model.
+
+        Args:
+            container: DataContainer with audio packets to transcribe.
+
+        Returns:
+            DataContainer: The same container with added text packets containing transcriptions.
+        """
+        sources = self.get_audio_sources(container)
+
+        if not sources:
+            self.logger.info("No audio sources found for transcription")
+            return container
+
+        self.logger.info(f"Transcribing {len(sources)} audio files")
+        transcription_results = self.transcribe_sources(sources)
+        text_packets = self.process_results(transcription_results, sources, container)
+        container.texts = container.texts or []
+        container.texts.extend(text_packets)
+
+        return container
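Outside the template, the NeMo calls it wraps look like this; a sketch assuming NeMo's ASR collection is installed and `/path/to/file.wav` exists:

```python
import nemo.collections.asr as nemo_asr

# Same call the template makes in _load_model.
model = nemo_asr.models.ASRModel.from_pretrained(
    model_name="nvidia/parakeet-tdt-0.6b-v2",
    map_location="cuda",
)

# Same call as transcribe_sources; timestamps=True mirrors enable_timestamps.
results = model.transcribe(["/path/to/file.wav"], timestamps=True)
first = results[0]
print(first.text)                       # transcription with punctuation/capitalization
print(first.timestamp.get("word", []))  # word-level timestamps, as _extract_timestamps reads them
```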
{sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sinapsis-speech
-Version: 0.3.4
+Version: 0.4.0
 Summary: Generate speech using various libraries.
 Author-email: SinapsisAI <dev@sinapsis.tech>
 Project-URL: Homepage, https://sinapsis.tech
@@ -18,6 +18,8 @@ Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
 Requires-Dist: sinapsis-kokoro[all]; extra == "all"
 Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
 Requires-Dist: sinapsis-zonos[all]; extra == "all"
+Requires-Dist: sinapsis-parakeet-tdt[all]; extra == "all"
+Requires-Dist: sinapsis-orpheus-cpp[all]; extra == "all"
 Provides-Extra: gradio-app
 Requires-Dist: sinapsis[webapp]>=0.2.3; extra == "gradio-app"
 Dynamic: license-file
@@ -55,8 +57,10 @@ This repo includes packages for performing speech synthesis using different tool
 
 * <code>sinapsis-elevenlabs</code>
 * <code>sinapsis-f5-tts</code>
-* 
+* <code>sinapsis-kokoro</code>
 * <code>sinapsis-zonos</code>
+* <code>sinapsis-orpheus-cpp</code>
+* <code>sinapsis-parakeet</code>
 
 Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
 
@@ -205,6 +209,16 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
 docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
 ```
 
+- For Orpheus-CPP:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
+```
+
+- For Parakeet:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
+```
+
 3. **Check the logs**
 
 - For ElevenLabs:
@@ -224,6 +238,17 @@ docker logs -f sinapsis-kokoro
 ```bash
 docker logs -f sinapsis-zonos
 ```
+
+- For Orpheus-CPP:
+```bash
+docker logs -f sinapsis-orpheus-tts
+```
+
+- For Parakeet:
+```bash
+docker logs -f sinapsis-parakeet
+```
+
 4. **The logs will display the URL to access the webapp, e.g.,:**:
 ```bash
 Running on local URL: http://127.0.0.1:7860
@@ -240,6 +265,17 @@ docker compose -f docker/compose_apps.yaml down
 
 To run the webapp using the <code>uv</code> package manager, follow these steps:
 
+
+> [!IMPORTANT]
+> If you're using sinapsis-orpheus-cpp, you need to export cuda environment variables:
+
+
+```bash
+export CMAKE_ARGS="-DGGML_CUDA=on"
+export FORCE_CMAKE="1"
+export CUDACXX=$(command -v nvcc)
+```
+
 1. **Sync the virtual environment**:
 
 ```bash
{sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/RECORD
@@ -1,10 +1,12 @@
 sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py,sha256=j8J64iplBNaff1WvmfJ03eJozE1f5SdqtqQeldV2vPY,998
-sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py,sha256=
-sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py,sha256=
-sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py,sha256=
-sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/
+sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py,sha256=2Ym4suCk8wy-Nj2Hmk0uu3_-3nu1QlSs_KubDydm5wY,3383
+sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py,sha256=UG35_hown3HITVR42iK_e3yVsUbuq2oYTLpCGwJ89L4,708
+sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py,sha256=Dyxv_VAsjjuCGpdmW0anG5gqON8GS0oQWptgwX2pB44,8832
+sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py,sha256=O6BK05Noc-HffWNXY-4ow-RQ1xLK9f3jZ-8w71TpDAY,3659
+sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py,sha256=ll1l17VIgAsJJukZvQBmHO1-77XcpVzNCZr7frnZQEY,2982
+sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py,sha256=oZKUuG6TrNFHG8vUCTkUbR0IFFp7ZoPE-6nDKhsD0Yg,4738
 sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py,sha256=bKo7zhfsiZwsn-qZx_MCVAIx_MmaKnaP3lc-07AwAaY,2819
 sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py,sha256=28BOPAr9GG1jYcrXi45ZWO1n2FAZJOdDcmRkOXdEYmk,496
@@ -12,14 +14,19 @@ sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py,sha256=lEkcimV
 sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py,sha256=2IMJuwURPKK7keIkgS-rpGD28REG5M1FwW0COGcm3nI,1573
 sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py,sha256=aX25GCUNGzIBeY5kifomsB-nSzW-unfq0-aC2Rpnaws,485
 sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py,sha256=eRSEpH1HAUR3sy9Eb7ZRWhrk1IPZ7Z-ymS34ONFmxOg,5440
-
+sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py,sha256=XC7cqr1xC-0_yiKsNeob7CzEcHuduBFqvoXQBWVBKtI,492
+sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py,sha256=I9aw_QmsCj6r4XXmPHL85rqjvtfyU2F33jCFFES-i9E,11715
+sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py,sha256=5Kat4QDPf1g-lC8CUF8T6yRHWyLQ4VoV7ELKazFMmRA,1964
+sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py,sha256=3LppgbS6v70Rmx__yXXQgnoZ2ZBHcXkXeWZYQQf6Zwg,504
+sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py,sha256=PNE_SQkN72-kzfqvCit-GSEVP5NE8HjvTwxWd6wcem8,9456
+sinapsis_speech-0.4.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
 sinapsis_zonos/src/sinapsis_zonos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py,sha256=m1GdOYfzP73JGmtxH30mNiqbNkzFsQl9o2QaT7QxSVU,2470
 sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py,sha256=8Tr2YgxjBfRqv_Hf6sw36X2pLzW7fdQWqa6QPBxNZK8,6419
 sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py,sha256=A-_F0K3hbEFqeWWAh4YftgU9CFX-WHrauSiCAww9yp8,482
 sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py,sha256=Zz0hcXVevPyho7d0q3Q2Zl9yDTPl_XhtueerxmzY_Jc,7687
-sinapsis_speech-0.
-sinapsis_speech-0.
-sinapsis_speech-0.
-sinapsis_speech-0.
+sinapsis_speech-0.4.0.dist-info/METADATA,sha256=s0-CB1Db680tyrtrEbiajzlCP1-Khte1z4hAei-9KJg,10751
+sinapsis_speech-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sinapsis_speech-0.4.0.dist-info/top_level.txt,sha256=KvdwXupt5wnqb_4XGRcuJaL9Glgdw-DBvRkNzhgl_Ds,110
+sinapsis_speech-0.4.0.dist-info/RECORD,,
{sinapsis_speech-0.3.4.dist-info → sinapsis_speech-0.4.0.dist-info}/licenses/LICENSE
File without changes.
|