sinapsis-speech 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,172 @@
+ # -*- coding: utf-8 -*-
+ """Base template for Zonos speech synthesis"""
+
+ import os
+ import uuid
+ from typing import Literal, Set
+
+ import torch
+ import torchaudio
+ from pydantic import Field
+ from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket
+ from sinapsis_core.template_base.template import (
+     Template,
+     TemplateAttributes,
+     TemplateAttributeType,
+ )
+ from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
+ from zonos.model import Zonos
+ from zonos.utils import DEFAULT_DEVICE as device
+
+ from sinapsis_zonos.helpers.zonos_keys import EmotionsConfig, SamplingParams, TTSKeys
+ from sinapsis_zonos.helpers.zonos_tts_utils import (
+     get_audio_prefix_codes,
+     get_conditioning,
+     get_sampling_params,
+     init_seed,
+ )
+
+
+ class ZonosTTS(Template):
+     """
+     Base template for speech synthesis using the Zonos model.
+
+     This template is designed to generate high-quality, expressive text-to-speech (TTS) outputs
+     using the Zonos TTS model, which supports multilingual speech generation, speaker cloning,
+     and fine control over various speech attributes like pitch, speaking rate, and emotions.
+     """
+
+     class AttributesBaseModel(TemplateAttributes):
+         """
+         Attributes for Zonos TTS model configuration.
+         Args:
+             model (str): Model identifier for Zonos. Options:
+                 "Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid" (default: "Zyphra/Zonos-v0.1-transformer").
+             language (str): Language for speech synthesis (default: 'en-us').
+             emotions (EmotionsConfig): Emotions to apply to the generated speech, fine-tuning its emotional tone.
+             fmax (float): Max frequency for speech in Hz (default: 22050, range: 0-24000).
+             pitch_std (float): Standard deviation for pitch variation (default: 20.0, range: 0-300).
+             speaking_rate (float): Rate of speech (default: 15.0, range: 5-30).
+             dnsmos (float): Target DNSMOS overall speech-quality score for hybrid models (default: 4.0, range: 1-5).
+             vq_score (float): VQ score for hybrid models (default: 0.7, range: 0.5-0.8).
+             cfg_scale (float): Classifier-free guidance scale; higher values follow the conditioning more closely
+                 (default: 2.0, range: 1-5).
+             sampling_params (SamplingParams): Controls sampling behavior, including the `top_p`, `top_k`, `min_p`,
+                 `linear`, `conf`, and `quad` parameters.
+             randomized_seed (bool): If True, the seed is randomized (default: True).
+             denoised_speaker (bool): If True, applies denoising to the speaker embedding.
+             unconditional_keys (Set[str]): Conditioning keys left unconditioned during generation.
+             prefix_audio (str | None): Path to an audio file used for prefix conditioning (e.g., whispering).
+             speaker_audio (str | None): Path to an audio file used to build a speaker embedding for voice cloning.
+             output_folder (str): Folder for saving generated audio files (default: SINAPSIS_CACHE_DIR/zonos/audios).
+         """
+
+         cfg_scale: float = 2.0
+         denoised_speaker: bool = False
+         dnsmos: float = 4.0
+         emotions: EmotionsConfig = Field(default_factory=dict)  # type: ignore[arg-type]
+         fmax: float = 22050.0
+         language: str = TTSKeys.en_language
+         model: Literal["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"] = "Zyphra/Zonos-v0.1-transformer"
+         output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "zonos", "audios")
+         pitch_std: float = 20.0
+         prefix_audio: str | None = None
+         randomized_seed: bool = True
+         sampling_params: SamplingParams = Field(default_factory=dict)  # type: ignore[arg-type]
+         seed: int = 420
+         speaker_audio: str | None = None
+         speaking_rate: float = 15.0
+         unconditional_keys: Set[str] = Field(default={TTSKeys.vqscore_8, TTSKeys.dnsmos_ovrl})
+         vq_score: float = 0.7
+
+     def __init__(self, attributes: TemplateAttributeType) -> None:
+         """Initializes the Zonos model with the provided attributes."""
+         super().__init__(attributes)
+         os.makedirs(self.attributes.output_folder, exist_ok=True)
+         self.device = device
+         self.model = self._init_model()
+         init_seed(self.attributes)
+         self.logger.debug(f"Model {self.attributes.model} initialized\nSeed: {self.attributes.seed}")
+
+     def _init_model(self) -> Zonos:
+         """
+         Initialize and load the specified Zonos model.
+
+         Returns:
+             Zonos: The loaded and prepared Zonos model, set to evaluation mode with gradients disabled.
+         """
+         model = Zonos.from_pretrained(self.attributes.model, device=self.device)
+         model.requires_grad_(False).eval()
+         return model
+
+     def _del_model(self) -> None:
+         """
+         Delete the current model instance and clear CUDA cache.
+
+         Frees GPU memory by deleting the model and explicitly emptying the CUDA cache.
+         """
+         if self.model:
+             del self.model
+             torch.cuda.empty_cache()
+
+     def reset_state(self) -> None:
+         """Reinitialize the model and random seed."""
+         self._del_model()
+         self.model = self._init_model()
+         init_seed(self.attributes)
+         self.logger.debug(f"Model {self.attributes.model} reset\nSeed: {self.attributes.seed}")
+
+     def generate_speech(self, input_data: list[TextPacket]) -> torch.Tensor:
+         """
+         Generates speech for the input text data.
+
+         Args:
+             input_data (list[TextPacket]): A list of `TextPacket` objects containing the text data.
+
+         Returns:
+             torch.Tensor: The generated speech audio tensor.
+         """
+         input_text = "".join(t.content for t in input_data)
+         conditioning = get_conditioning(self.attributes, self.model, input_text, self.device)
+         prefix_codes = get_audio_prefix_codes(
+             self.attributes.prefix_audio,
+             self.model,
+         )
+
+         codes = self.model.generate(
+             prefix_conditioning=conditioning,
+             audio_prefix_codes=prefix_codes,
+             cfg_scale=self.attributes.cfg_scale,
+             batch_size=1,
+             sampling_params=get_sampling_params(self.attributes.sampling_params),
+         )
+         output_wav = self.model.autoencoder.decode(codes).cpu().detach()
+         return output_wav
+
+     def save_audio_output(self, output_audio: torch.Tensor, container: DataContainer) -> None:
+         """
+         Saves a single generated audio output to the specified folder.
+
+         Args:
+             output_audio (torch.Tensor): The generated audio output tensor.
+             container (DataContainer): The container to store metadata.
+         """
+         output_path = os.path.join(self.attributes.output_folder, f"{uuid.uuid4()}.{TTSKeys.wav}")
+         torchaudio.save(output_path, output_audio[0], self.model.autoencoder.sampling_rate)
+         self._set_generic_data(container, [output_path])
+         self.logger.debug(f"Audio saved to: {output_path}")
+
+     def execute(self, container: DataContainer) -> DataContainer:
+         """Processes the input data and generates a speech output."""
+
+         if not container.texts:
+             self.logger.debug("No input text to synthesize")
+             return container
+
+         audio_output = self.generate_speech(container.texts)
+         if audio_output is None:
+             self.logger.error("Unable to generate speech")
+             return container
+
+         self.save_audio_output(audio_output, container)
+
+         return container
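
The added template above is driven entirely through its attributes and the execute(container) entry point, so a short usage sketch may help when reading this diff. It is not shipped in the wheel: the module path, the attribute-dict construction, and the DataContainer/TextPacket keyword arguments are assumptions; only the attribute names and the execute flow come from the code above.

# Hypothetical usage sketch (not part of the package). The import path and
# constructor arguments are assumptions; attribute names mirror AttributesBaseModel.
from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket
from sinapsis_zonos.templates.zonos_tts import ZonosTTS  # assumed module path

# Build the template from a plain attribute dict; unset attributes keep the
# defaults declared in AttributesBaseModel (any extra attributes required by
# the TemplateAttributes base class are omitted here).
tts = ZonosTTS(
    {
        "model": "Zyphra/Zonos-v0.1-transformer",
        "language": "en-us",
        "speaking_rate": 15.0,
        "speaker_audio": "reference_voice.wav",  # optional clip for voice cloning
    }
)

# Wrap the text to synthesize in a DataContainer and run the template; execute()
# writes a .wav under output_folder and records its path in the container.
container = DataContainer(texts=[TextPacket(content="Hello from Zonos.")])
container = tts.execute(container)
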
@@ -1,13 +0,0 @@
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py,sha256=caUrXvOG0VFqsm7VuuiogF9hju6jKCkxiipUswu7MyY,997
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py,sha256=L5mbkMsvFmIjFCs0OfuxRG4bLvrqJn7KB9MbXQ7ix0w,3377
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py,sha256=pyTWPBLN_P6sxFTF1QqfL7iTZd9E0EaggpfwB0qLLHI,579
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py,sha256=qL4i-fF49JDBbzqxCTGg-a1Ixf8wNB-YmU0dvdugY-g,8464
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py,sha256=WVTROfB2ODAksHmWwV5RKcub3Hoc29OM_eAw75c9yio,2847
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py,sha256=aOK35KvHUgFf-8-RTFIu6aOIHG_yL_lGkzXSDhCLEk4,2800
- sinapsis_speech-0.1.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
- sinapsis_speech-0.1.0.dist-info/METADATA,sha256=61Jo48emEIp5apO8TGru2VBxS5giOMZhVXxClWIEw-8,46349
- sinapsis_speech-0.1.0.dist-info/WHEEL,sha256=tTnHoFhvKQHCh4jz3yCn0WPTYIy7wXx3CJtJ7SJGV7c,91
- sinapsis_speech-0.1.0.dist-info/top_level.txt,sha256=g4-HsOOymejdXrBMi_5g3IJqPvJixiO8_XYbMlHtAEc,20
- sinapsis_speech-0.1.0.dist-info/RECORD,,