sinapsis-speech 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +1 -1
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +7 -23
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +13 -23
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +4 -1
- sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
- sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +20 -0
- sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +357 -0
- {sinapsis_speech-0.1.0.dist-info → sinapsis_speech-0.2.0.dist-info}/METADATA +117 -63
- sinapsis_speech-0.2.0.dist-info/RECORD +21 -0
- {sinapsis_speech-0.1.0.dist-info → sinapsis_speech-0.2.0.dist-info}/WHEEL +1 -1
- sinapsis_speech-0.2.0.dist-info/top_level.txt +3 -0
- sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
- sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
- sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +67 -0
- sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +153 -0
- sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +20 -0
- sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +172 -0
- sinapsis_speech-0.1.0.dist-info/RECORD +0 -13
- sinapsis_speech-0.1.0.dist-info/licenses/LICENSE +0 -661
- sinapsis_speech-0.1.0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Base template for Zonos speech synthesis"""
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Literal, Set
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import torchaudio
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket
|
|
12
|
+
from sinapsis_core.template_base.template import (
|
|
13
|
+
Template,
|
|
14
|
+
TemplateAttributes,
|
|
15
|
+
TemplateAttributeType,
|
|
16
|
+
)
|
|
17
|
+
from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
|
|
18
|
+
from zonos.model import Zonos
|
|
19
|
+
from zonos.utils import DEFAULT_DEVICE as device
|
|
20
|
+
|
|
21
|
+
from sinapsis_zonos.helpers.zonos_keys import EmotionsConfig, SamplingParams, TTSKeys
|
|
22
|
+
from sinapsis_zonos.helpers.zonos_tts_utils import (
|
|
23
|
+
get_audio_prefix_codes,
|
|
24
|
+
get_conditioning,
|
|
25
|
+
get_sampling_params,
|
|
26
|
+
init_seed,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ZonosTTS(Template):
    """
    Base template for speech synthesis using the Zonos model.

    This template is designed to generate high-quality, expressive text-to-speech (TTS) outputs
    using the Zonos TTS model, which supports multilingual speech generation, speaker cloning,
    and fine control over various speech attributes like pitch, speaking rate, and emotions.

    Per call to `execute`, the texts in the container are concatenated, synthesized into a
    waveform, written to a uniquely named audio file inside `output_folder`, and the file path
    is stored in the container's generic data.
    """

    class AttributesBaseModel(TemplateAttributes):
        """
        Attributes for Zonos TTS model configuration.

        The numeric ranges listed below are the documented expectations for the model; the
        fields are declared as plain values and no range validation is declared in this class.

        Args:
            cfg_scale (float): Controls randomness in speech (default: 2.0, range: 1-5).
            denoised_speaker (bool): If True, applies denoising to speaker embedding (default: False).
            dnsmos (float): Denoising strength for hybrid models (default: 4.0, range: 1-5).
            emotions (EmotionsConfig): Emotions to apply to generated speech, fine-tuning emotional tone
                (default: built from an empty dict).
            fmax (float): Max frequency for speech in Hz (default: 22050, range: 0-24000).
            language (str): Language for speech synthesis (default: TTSKeys.en_language).
            model (str): Model identifier for Zonos. Options:
                "Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid" (default: "Zyphra/Zonos-v0.1-transformer").
            output_folder (str): Folder for saving generated audio files (default: SINAPSIS_CACHE_DIR/zonos/audios).
            pitch_std (float): Standard deviation for pitch variation (default: 20.0, range: 0-300).
            prefix_audio (str | None): Path to an audio file for prefix conditioning (e.g., whispering)
                (default: None).
            randomized_seed (bool): If True, the seed is randomized (default: True).
            sampling_params (SamplingParams): Controls sampling behavior, including `top_p`, `top_k`, `min_p`,
                `linear`, `conf`, and `quad` parameters for sampling (default: built from an empty dict).
            seed (int): Seed value made available to `init_seed` (default: 420). How it interacts with
                `randomized_seed` is decided inside `init_seed`, which is defined in the helpers module.
            speaker_audio (str | None): Path to an audio file for creating a speaker embedding for voice cloning
                (default: None).
            speaking_rate (float): Rate of speech (default: 15.0, range: 5-30).
            unconditional_keys (Set[str]): Keys for conditioning speech without speaker embedding
                (default: {TTSKeys.vqscore_8, TTSKeys.dnsmos_ovrl}).
            vq_score (float): VQ score for hybrid models (default: 0.7, range: 0.5-0.8).
        """

        cfg_scale: float = 2.0
        denoised_speaker: bool = False
        dnsmos: float = 4.0
        emotions: EmotionsConfig = Field(default_factory=dict)  # type: ignore[arg-type]
        fmax: float = 22050.0
        language: str = TTSKeys.en_language
        model: Literal["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"] = "Zyphra/Zonos-v0.1-transformer"
        output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "zonos", "audios")
        pitch_std: float = 20.0
        prefix_audio: str | None = None
        randomized_seed: bool = True
        sampling_params: SamplingParams = Field(default_factory=dict)  # type: ignore[arg-type]
        seed: int = 420
        speaker_audio: str | None = None
        speaking_rate: float = 15.0
        unconditional_keys: Set[str] = Field(default={TTSKeys.vqscore_8, TTSKeys.dnsmos_ovrl})
        vq_score: float = 0.7

    def __init__(self, attributes: TemplateAttributeType) -> None:
        """
        Initializes the Zonos model with the provided attributes.

        Creates the output folder if it does not exist, loads the configured model onto the
        default Zonos device and initializes the random seed through `init_seed`.

        Args:
            attributes (TemplateAttributeType): Attributes used to configure the template.
        """
        super().__init__(attributes)
        os.makedirs(self.attributes.output_folder, exist_ok=True)
        self.device = device
        self.model = self._init_model()
        init_seed(self.attributes)
        self.logger.debug(f"Model {self.attributes.model} initalized\nSeed: {self.attributes.seed}")

    def _init_model(self) -> Zonos:
        """
        Initialize and load the specified Zonos model.

        Returns:
            Zonos: The loaded and prepared Zonos model, set to evaluation mode with gradients disabled.
        """
        model = Zonos.from_pretrained(self.attributes.model, device=self.device)
        model.requires_grad_(False).eval()
        return model

    def _del_model(self) -> None:
        """
        Delete the current model instance and clear CUDA cache.

        Frees GPU memory by deleting the model and explicitly emptying the CUDA cache.
        Safe to call when the model attribute is already absent (for example after a
        previous deletion whose re-initialization failed).
        """
        # Look the attribute up defensively: after `del self.model` the attribute no longer
        # exists, and a plain `self.model` access would raise AttributeError.
        if getattr(self, "model", None) is not None:
            del self.model
        torch.cuda.empty_cache()

    def reset_state(self) -> None:
        """Reinitialize the model and random seed."""
        self._del_model()
        self.model = self._init_model()
        init_seed(self.attributes)
        self.logger.debug(f"Model {self.attributes.model} reset\nSeed: {self.attributes.seed}")

    def generate_speech(self, input_data: list[TextPacket]) -> torch.Tensor:
        """
        Generates speech for the input text data.

        The contents of all packets are concatenated into a single string, which is then
        synthesized in one generation call with a batch size of 1.

        Args:
            input_data (list[TextPacket]): A list of `TextPacket` objects containing the text data.

        Returns:
            torch.Tensor: The generated speech audio tensor, detached and moved to the CPU.
        """
        # NOTE(review): packets are joined with no separator, so adjacent packets run together
        # unless their contents carry their own spacing -- confirm against the producers.
        input_text = "".join(t.content for t in input_data)
        conditioning = get_conditioning(self.attributes, self.model, input_text, self.device)
        prefix_codes = get_audio_prefix_codes(
            self.attributes.prefix_audio,
            self.model,
        )

        codes = self.model.generate(
            prefix_conditioning=conditioning,
            audio_prefix_codes=prefix_codes,
            cfg_scale=self.attributes.cfg_scale,
            batch_size=1,
            sampling_params=get_sampling_params(self.attributes.sampling_params),
        )
        # Decode the generated codes to a waveform and bring it to the CPU for saving.
        output_wav = self.model.autoencoder.decode(codes).cpu().detach()
        return output_wav

    def save_audio_output(self, output_audio: torch.Tensor, container: DataContainer) -> None:
        """
        Saves a single generated audio output to the specified folder.

        The file is named with a random UUID and the `TTSKeys.wav` extension, and its path is
        stored in the container's generic data as a one-element list.

        Args:
            output_audio (torch.Tensor): The generated audio output tensor.
            container (DataContainer): The container to store metadata.
        """
        output_path = os.path.join(self.attributes.output_folder, f"{uuid.uuid4()}.{TTSKeys.wav}")
        # Only the first entry along dim 0 is written; presumably this is the single batch
        # item produced with batch_size=1 in `generate_speech` -- TODO confirm tensor layout.
        torchaudio.save(output_path, output_audio[0], self.model.autoencoder.sampling_rate)
        self._set_generic_data(container, [output_path])
        self.logger.debug(f"Audio saved to: {output_path}")

    def execute(self, container: DataContainer) -> DataContainer:
        """
        Processes the input data and generates a speech output.

        Args:
            container (DataContainer): Container whose `texts` are synthesized.

        Returns:
            DataContainer: The same container; when audio was generated, the path of the saved
                file has been added to its generic data. It is returned unchanged when there
                are no texts or when no audio was produced.
        """

        if not container.texts:
            self.logger.debug("No query to enter")
            return container

        audio_output = self.generate_speech(container.texts)
        # `generate_speech` as defined here always returns a tensor; the check is kept as a
        # guard for overriding implementations that may return None.
        if audio_output is None:
            self.logger.error("Unable to generate speech")
            return container

        self.save_audio_output(audio_output, container)

        return container
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py,sha256=caUrXvOG0VFqsm7VuuiogF9hju6jKCkxiipUswu7MyY,997
|
|
4
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py,sha256=L5mbkMsvFmIjFCs0OfuxRG4bLvrqJn7KB9MbXQ7ix0w,3377
|
|
5
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py,sha256=pyTWPBLN_P6sxFTF1QqfL7iTZd9E0EaggpfwB0qLLHI,579
|
|
6
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py,sha256=qL4i-fF49JDBbzqxCTGg-a1Ixf8wNB-YmU0dvdugY-g,8464
|
|
7
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py,sha256=WVTROfB2ODAksHmWwV5RKcub3Hoc29OM_eAw75c9yio,2847
|
|
8
|
-
sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py,sha256=aOK35KvHUgFf-8-RTFIu6aOIHG_yL_lGkzXSDhCLEk4,2800
|
|
9
|
-
sinapsis_speech-0.1.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
|
10
|
-
sinapsis_speech-0.1.0.dist-info/METADATA,sha256=61Jo48emEIp5apO8TGru2VBxS5giOMZhVXxClWIEw-8,46349
|
|
11
|
-
sinapsis_speech-0.1.0.dist-info/WHEEL,sha256=tTnHoFhvKQHCh4jz3yCn0WPTYIy7wXx3CJtJ7SJGV7c,91
|
|
12
|
-
sinapsis_speech-0.1.0.dist-info/top_level.txt,sha256=g4-HsOOymejdXrBMi_5g3IJqPvJixiO8_XYbMlHtAEc,20
|
|
13
|
-
sinapsis_speech-0.1.0.dist-info/RECORD,,
|