sinapsis-speech 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ class _ElevenlabsKeys(BaseModel):
12
12
 
13
13
  ELEVENLABS_API_KEY: EnvVarEntry = EnvVarEntry(
14
14
  var_name="ELEVENLABS_API_KEY",
15
- default_value=" ",
15
+ default_value=None,
16
16
  allowed_values=None,
17
17
  description="set api key for Elevenlabs",
18
18
  )
@@ -9,34 +9,18 @@ def create_voice_settings(settings: VoiceSettings) -> VoiceSettings | None:
9
9
  """
10
10
  Creates or updates a `VoiceSettings` object based on the provided settings.
11
11
 
12
- This function attempts to create or update a `VoiceSettings` object using the provided
13
- `VoiceSettings` instance. If any of the fields in the settings contain `None`,
14
- the corresponding field is populated with a default value from `DEFAULT_VOICE.settings`.
15
- If all fields are valid (i.e., none are `None`), the provided `settings` object is returned unchanged.
16
-
17
- If the settings argument is `None` or if no valid settings are provided, the function
18
- returns `None`.
19
-
20
12
  Args:
21
- settings (VoiceSettings): An instance of `VoiceSettings` containing the settings to be applied.
22
- This object may have fields with `None` values that should be replaced with default values.
13
+ settings (VoiceSettings | None): An instance of `VoiceSettings` containing the settings to be applied.
14
+ If `None`, the function returns the default settings.
23
15
 
24
16
  Returns:
25
- VoiceSettings: A `VoiceSettings` object created or updated with the provided settings. If any field
26
- was `None`, it is updated with default values. If the settings are invalid or empty,
27
- `None` is returned.
17
+ VoiceSettings: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
18
+ `DEFAULT_VOICE.settings` is returned.
28
19
  """
29
- if settings:
30
- settings_dict = settings.model_dump()
31
- if any(value is None for value in settings_dict.values()):
32
- for field, value in settings_dict.items():
33
- if value is None:
34
- settings_dict[field] = getattr(DEFAULT_VOICE.settings, field)
20
+ if not settings:
21
+ return DEFAULT_VOICE.settings
35
22
 
36
- return VoiceSettings(**settings_dict)
37
- else:
38
- return settings
39
- return None
23
+ return settings
40
24
 
41
25
 
42
26
  def get_voice_id(client: ElevenLabs, voice: VoiceId | VoiceName) -> VoiceId:
@@ -9,6 +9,8 @@ from typing import IO, Iterator, Literal
9
9
 
10
10
  from elevenlabs import Voice, VoiceSettings, save
11
11
  from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
12
+ from elevenlabs.types import OutputFormat
13
+ from pydantic import Field
12
14
  from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
13
15
  from sinapsis_core.template_base.template import (
14
16
  Template,
@@ -40,25 +42,23 @@ class ElevenLabsBase(Template, abc.ABC):
40
42
  """
41
43
  Attributes for ElevenLabs Base Class.
42
44
  Args:
43
- api_key (str): The API key to authenticate with ElevenLabs' API.
44
- voice (str|elevenlabs.Voice): The voice to use for speech synthesis. This can be a voice ID (str),
45
+ api_key (str): The API key used to authenticate with ElevenLabs' API.
46
+ model (Literal): The model identifier to use for speech synthesis.
47
+ output_format (OutputFormat): The output audio format and quality. Options include:
48
+ ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
49
+ "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
50
+ output_folder (str): The folder where generated audio files will be saved.
51
+ stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
52
+ voice (VoiceId | VoiceName | Voice): The voice to use for speech synthesis. This can be a voice ID (str),
45
53
  a voice name (str) or an elevenlabs voice object (Voice).
46
54
  voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
47
55
  - stability (float)
48
56
  - similarity_boost (float)
49
57
  - style (float)
50
58
  - use_speaker_boost (bool)
51
- model (Literal): The model identifier to use for speech synthesis.
52
- output_format (Literal): The output audio format and quality. Options include:
53
- ["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
54
- "mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
55
- output_folder (str): The folder where generated audio files will be saved.
56
- stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
57
59
  """
58
60
 
59
61
  api_key: str | None = None
60
- voice: VoiceId | VoiceName | Voice = None
61
- voice_settings: VoiceSettings | None = None
62
62
  model: Literal[
63
63
  "eleven_turbo_v2_5",
64
64
  "eleven_multilingual_v2",
@@ -68,21 +68,11 @@ class ElevenLabsBase(Template, abc.ABC):
68
68
  "eleven_english_sts_v2",
69
69
  "eleven_multilingual_sts_v2",
70
70
  ] = "eleven_turbo_v2_5"
71
- output_format: Literal[
72
- "mp3_22050_32",
73
- "mp3_44100_32",
74
- "mp3_44100_64",
75
- "mp3_44100_96",
76
- "mp3_44100_128",
77
- "mp3_44100_192",
78
- "pcm_16000",
79
- "pcm_22050",
80
- "pcm_24000",
81
- "pcm_44100",
82
- "ulaw_8000",
83
- ] = "mp3_44100_128"
71
+ output_format: OutputFormat = "mp3_44100_128"
84
72
  output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
85
73
  stream: bool = False
74
+ voice: VoiceId | VoiceName | Voice = None
75
+ voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]
86
76
 
87
77
  def __init__(self, attributes: TemplateAttributeType) -> None:
88
78
  """Initializes the ElevenLabs API client with the given attributes."""
@@ -42,9 +42,12 @@ class ElevenLabsVoiceGeneration(ElevenLabsBase):
42
42
  class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
43
43
  """
44
44
  Attributes for voice generation in ElevenLabs API.
45
+
46
+ Args:
47
+ voice_description (str): A description of the voice to be used for synthesis.
45
48
  """
46
49
 
47
- voice_description: str = "An old British male with a raspy, deep voice. Professional, relaxed and assertive"
50
+ voice_description: str
48
51
 
49
52
  def synthesize_speech(self, input_data: list[TextPacket]) -> list[bytes] | None:
50
53
  """
File without changes
@@ -0,0 +1,20 @@
1
# -*- coding: utf-8 -*-
import importlib
from typing import Callable

_root_lib_path = "sinapsis_f5_tts.templates"

# Maps each public template name to the module that defines it; consulted by
# __getattr__ so templates are imported lazily on first attribute access.
_template_lookup = {
    "F5TTSInference": f"{_root_lib_path}.f5_tts_inference",
}


def __getattr__(name: str) -> Callable:
    """Resolve *name* lazily from its backing module (module-level PEP 562 hook).

    Raises:
        AttributeError: If *name* is not a registered template.
    """
    if name not in _template_lookup:
        raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
    return getattr(importlib.import_module(_template_lookup[name]), name)


__all__ = list(_template_lookup.keys())
@@ -0,0 +1,357 @@
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ import subprocess
4
+ import tempfile
5
+ from typing import Any, Literal
6
+
7
+ import numpy as np
8
+ import soundfile as sf
9
+ from pydantic import Field
10
+ from pydantic.dataclasses import dataclass
11
+ from sinapsis_core.data_containers.data_packet import (
12
+ AudioPacket,
13
+ DataContainer,
14
+ )
15
+ from sinapsis_core.template_base import Template, TemplateAttributes
16
+
17
+
18
@dataclass
class F5CliKeys:
    """Metadata keys used in ``Field(json_schema_extra=...)`` to map attributes to CLI arguments.

    ``cli_param`` marks attributes emitted as ``<param> <value>`` pairs, while
    ``cli_flag`` marks boolean attributes emitted as bare ``--flag`` switches
    (see ``F5TTSInference._add_attribute_to_command``).
    """

    cli_flag: str = "cli_flag"
    cli_param: str = "cli_param"
+
23
+
24
class F5TTSInferenceAttributes(TemplateAttributes):
    """Configuration attributes for the F5TTS text-to-speech inference template.

    Attributes:
        model (str): The model name to use for synthesis. Options include 'F5TTS_v1_Base',
            'F5TTS_Base', 'E2TTS_Base', etc. Default is 'F5TTS_v1_Base'.
        model_cfg (str | None): Path to the F5-TTS model config file (.yaml). If None,
            the default configuration will be used.
        ckpt_file (str | None): Path to model checkpoint file (.pt). If None, the default
            checkpoint will be used.
        vocab_file (str | None): Path to vocabulary file (.txt). If None, the default
            vocabulary will be used.
        ref_audio (str): Path to the reference audio file. This is required to clone the voice
            characteristics.
        ref_text (str): The transcript/subtitle for the reference audio. Default is a space character.
            When left empty, the system will attempt to extract text from the audio automatically.
            It's recommended to leave this empty for automatic extraction.
        vocoder_name (Literal["vocos", "bigvgan"]): The vocoder to use for audio generation.
            Options are 'vocos' or 'bigvgan'. Default is 'vocos'.
        load_vocoder_from_local (bool): Whether to load the vocoder from a local directory
            (default: ../checkpoints/vocos-mel-24khz) instead of downloading it. Default is False.
        nfe_step (int): The number of function evaluation steps (denoising steps) to perform
            during inference. Higher values may produce better quality at the cost of speed. Default is 32.
        cfg_strength (float): Classifier-free guidance strength. Controls how closely the output
            follows the reference voice. Default is 2.0.
        cross_fade_duration (float): Duration of cross-fade between audio segments in seconds.
            Used when generating longer audio that requires multiple segments. Default is 0.15.
        speed (float): The speed of the generated audio. Values > 1.0 speed up the audio,
            values < 1.0 slow it down. Default is 1.0.
        sway_sampling_coef (float): Sway Sampling coefficient for controlling variability
            in the generated speech. Default is -1.0.
        target_rms (float | None): Target output speech loudness normalization value.
            Controls the volume of the output. Default is None.
        fix_duration (float | None): Fix the total duration (reference and generated audios)
            in seconds. Default is None.
        remove_silence (bool): Whether to remove long silence found in the output. Default is False.
        save_chunk (bool): Whether to save each audio chunk during inference. Useful for
            debugging or analyzing the generation process. Default is False.
        device (str | None): Specify the device to run inference on (e.g., 'cuda:0', 'cpu').
            Default is None, which uses the system's default device.
    """

    # Each field's ``json_schema_extra`` records how the attribute maps onto the
    # ``f5-tts_infer-cli`` command line: ``F5CliKeys.cli_param`` entries are
    # rendered as "<param> <value>" pairs, ``F5CliKeys.cli_flag`` entries as
    # bare switches (see F5TTSInference._add_attribute_to_command).
    model: str = Field(default="F5TTS_v1_Base", json_schema_extra={F5CliKeys.cli_param: "-m"})

    model_cfg: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "-mc"})

    ckpt_file: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "-p"})

    vocab_file: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "-v"})

    ref_audio: str = Field(json_schema_extra={F5CliKeys.cli_param: "-r"})

    ref_text: str = Field(default=" ", json_schema_extra={F5CliKeys.cli_param: "-s"})

    vocoder_name: Literal["vocos", "bigvgan"] = Field(
        default="vocos", json_schema_extra={F5CliKeys.cli_param: "--vocoder_name"}
    )

    load_vocoder_from_local: bool = Field(
        default=False, json_schema_extra={F5CliKeys.cli_flag: "--load_vocoder_from_local"}
    )

    nfe_step: int = Field(default=32, json_schema_extra={F5CliKeys.cli_param: "--nfe_step"})

    cfg_strength: float = Field(default=2.0, json_schema_extra={F5CliKeys.cli_param: "--cfg_strength"})

    cross_fade_duration: float = Field(default=0.15, json_schema_extra={F5CliKeys.cli_param: "--cross_fade_duration"})

    speed: float = Field(default=1.0, json_schema_extra={F5CliKeys.cli_param: "--speed"})

    sway_sampling_coef: float = Field(default=-1.0, json_schema_extra={F5CliKeys.cli_param: "--sway_sampling_coef"})

    target_rms: float | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "--target_rms"})

    fix_duration: float | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "--fix_duration"})

    remove_silence: bool = Field(default=False, json_schema_extra={F5CliKeys.cli_flag: "--remove_silence"})

    save_chunk: bool = Field(default=False, json_schema_extra={F5CliKeys.cli_flag: "--save_chunk"})

    device: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "--device"})
105
+
106
+
107
class F5TTSInference(Template):
    """Template for performing text-to-speech synthesis using the F5TTS model.

    Speech is produced by shelling out to the ``f5-tts_infer-cli`` tool: every
    text packet found in the incoming container is synthesized into audio and
    the result is appended to the container as an ``AudioPacket``.

    Usage example:

    agent:
      name: my_test_agent
      templates:
      - template_name: InputTemplate
        class_name: InputTemplate
        attributes: {}
      - template_name: F5TTSInference
        class_name: F5TTSInference
        template_input: InputTemplate
        attributes:
          model: F5TTS_v1_Base
          model_cfg: null
          ckpt_file: null
          vocab_file: null
          ref_audio: '`replace_me:<class ''str''>`'
          ref_text: ' '
          vocoder_name: vocos
          load_vocoder_from_local: false
          nfe_step: 32
          cfg_strength: 2.0
          cross_fade_duration: 0.15
          speed: 1.0
          sway_sampling_coef: -1.0
          target_rms: null
          fix_duration: null
          remove_silence: false
          save_chunk: false
          device: null

    """

    AttributesBaseModel = F5TTSInferenceAttributes

    def _add_attribute_to_command(self, cli_command: list[str], field_name: str, field: Any) -> None:
        """Append the CLI rendering of one attribute to the command being built.

        The field's ``json_schema_extra`` metadata decides the rendering:
        ``cli_param`` entries become ``<param> <value>`` pairs (skipped when the
        value is ``None``) and ``cli_flag`` entries become bare switches
        (skipped when the value is falsy). Fields without metadata are ignored.

        Args:
            cli_command (list[str]): Command under construction; extended in-place.
            field_name (str): Name of the attribute field to read from the
                template's attributes.
            field (Any): Field definition carrying the CLI metadata.
        """
        extra = field.json_schema_extra
        if extra is None:
            return

        value = getattr(self.attributes, field_name)
        if value is not None and F5CliKeys.cli_param in extra:
            cli_command.extend((extra[F5CliKeys.cli_param], str(value)))
        if value and F5CliKeys.cli_flag in extra:
            cli_command.append(extra[F5CliKeys.cli_flag])

    @staticmethod
    def _add_io_parameters(cli_command: list[str], input_text: str, output_file_path: str) -> None:
        """Append input/output arguments to the command being built.

        Adds the output directory (``-o``, derived from the output file's
        parent), the text to synthesize (``-t``) and the output file (``-w``).

        Args:
            cli_command (list[str]): Command under construction; extended in-place.
            input_text (str): Text to synthesize into speech.
            output_file_path (str): Destination path for the generated audio.
        """
        cli_command.extend(["-o", os.path.dirname(output_file_path)])
        cli_command.extend(["-t", input_text, "-w", output_file_path])

    def _build_cli_command(self, input_text: str, output_file_path: str) -> list[str]:
        """Assemble the full ``f5-tts_infer-cli`` invocation.

        The command consists of the base executable, every attribute that has
        CLI metadata, and the input/output parameters.

        Args:
            input_text (str): Text to synthesize into speech.
            output_file_path (str): Destination path for the generated audio.

        Returns:
            list[str]: Argument vector suitable for ``subprocess.run``.
        """
        command = ["f5-tts_infer-cli"]
        for name, field in self.AttributesBaseModel.model_fields.items():
            self._add_attribute_to_command(command, name, field)
        self._add_io_parameters(command, input_text, output_file_path)
        return command

    def _run_cli_command(self, cli_command: list[str]) -> bool:
        """Execute the CLI command and log its output.

        Args:
            cli_command (list[str]): Argument vector produced by
                ``_build_cli_command``.

        Returns:
            bool: True on a zero exit status, False if the subprocess failed.
        """
        try:
            completed = subprocess.run(cli_command, capture_output=True, text=True, check=True)
            self.logger.info(f"Command output: {completed.stdout}")
            if completed.stderr:
                self.logger.info(f"Command stderr: {completed.stderr}")
            return True
        except subprocess.CalledProcessError as exc:
            self.logger.error(f"CLI error: {exc.stderr}")
            return False

    def _load_audio_file(self, file_path: str) -> tuple[np.ndarray, int] | None:
        """Read an audio file with soundfile, logging failures.

        Args:
            file_path (str): Path to the audio file to load.

        Returns:
            tuple[np.ndarray, int] | None: ``(samples, sample_rate)`` on
            success, or None when the file is missing or unreadable.
        """
        if not os.path.exists(file_path):
            self.logger.error(f"Output file not found: {file_path}")
            return None

        try:
            return sf.read(file_path)
        except (ValueError, RuntimeError, IOError) as exc:
            self.logger.error(f"Error reading audio file: {exc!s}")
            return None

    def _generate_speech(self, input_text: str) -> tuple[np.ndarray, int] | None:
        """Synthesize *input_text* via the F5TTS CLI and load the result.

        A temporary ``.wav`` path is allocated, the CLI is invoked to write the
        audio there, the file is loaded, and the temporary file is removed in
        all cases (including failures).

        Args:
            input_text (str): Text to synthesize into speech.

        Returns:
            tuple[np.ndarray, int] | None: ``(samples, sample_rate)`` on
            success, or None if synthesis or loading failed.
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_path = handle.name

        try:
            command = self._build_cli_command(input_text, output_path)
            if not self._run_cli_command(command):
                return None
            return self._load_audio_file(output_path)
        finally:
            # Best-effort cleanup of the temp file, whether or not synthesis worked.
            if os.path.exists(output_path):
                os.remove(output_path)

    def _create_audio_packet(
        self,
        audio_data: np.ndarray,
        sample_rate: int,
        container: DataContainer,
    ) -> None:
        """Wrap generated audio in an ``AudioPacket`` and attach it to the container.

        Args:
            audio_data (np.ndarray): Audio samples returned by ``sf.read``.
            sample_rate (int): Sample rate of the audio in Hz.
            container (DataContainer): Container whose ``audios`` list receives
                the new packet; ``source`` is set to this template instance.
        """
        container.audios.append(
            AudioPacket(
                content=audio_data,
                source=self.instance_name,
                sample_rate=sample_rate,
            )
        )

    def execute(self, container: DataContainer) -> DataContainer:
        """Synthesize speech for every text packet in the container.

        Args:
            container (DataContainer): Container whose text packets are
                synthesized one by one.

        Returns:
            DataContainer: The same container, with one audio packet appended
            per successfully synthesized text. Texts whose synthesis failed are
            skipped; an empty container is returned unchanged.
        """
        if not container.texts:
            return container

        for text_packet in container.texts:
            result = self._generate_speech(text_packet.content)
            if result:
                samples, rate = result
                self._create_audio_packet(
                    audio_data=samples,
                    sample_rate=rate,
                    container=container,
                )

        return container