sinapsis-speech 0.1.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py +1 -1
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +7 -23
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +13 -23
- sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +4 -1
- sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py +0 -0
- sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py +20 -0
- sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +357 -0
- sinapsis_speech-0.2.2.dist-info/METADATA +266 -0
- sinapsis_speech-0.2.2.dist-info/RECORD +22 -0
- {sinapsis_speech-0.1.0.dist-info → sinapsis_speech-0.2.2.dist-info}/WHEEL +1 -1
- sinapsis_speech-0.2.2.dist-info/top_level.txt +3 -0
- sinapsis_zonos/src/sinapsis_zonos/__init__.py +0 -0
- sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py +0 -0
- sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py +67 -0
- sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +153 -0
- sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py +20 -0
- sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +172 -0
- sinapsis_speech-0.1.0.dist-info/METADATA +0 -872
- sinapsis_speech-0.1.0.dist-info/RECORD +0 -13
- sinapsis_speech-0.1.0.dist-info/top_level.txt +0 -1
- {sinapsis_speech-0.1.0.dist-info → sinapsis_speech-0.2.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -9,34 +9,18 @@ def create_voice_settings(settings: VoiceSettings) -> VoiceSettings | None:
|
|
|
9
9
|
"""
|
|
10
10
|
Creates or updates a `VoiceSettings` object based on the provided settings.
|
|
11
11
|
|
|
12
|
-
This function attempts to create or update a `VoiceSettings` object using the provided
|
|
13
|
-
`VoiceSettings` instance. If any of the fields in the settings contain `None`,
|
|
14
|
-
the corresponding field is populated with a default value from `DEFAULT_VOICE.settings`.
|
|
15
|
-
If all fields are valid (i.e., none are `None`), the provided `settings` object is returned unchanged.
|
|
16
|
-
|
|
17
|
-
If the settings argument is `None` or if no valid settings are provided, the function
|
|
18
|
-
returns `None`.
|
|
19
|
-
|
|
20
12
|
Args:
|
|
21
|
-
settings (VoiceSettings): An instance of `VoiceSettings` containing the settings to be applied.
|
|
22
|
-
|
|
13
|
+
settings (VoiceSettings | None): An instance of `VoiceSettings` containing the settings to be applied.
|
|
14
|
+
If `None`, the function returns the default settings.
|
|
23
15
|
|
|
24
16
|
Returns:
|
|
25
|
-
VoiceSettings:
|
|
26
|
-
|
|
27
|
-
`None` is returned.
|
|
17
|
+
VoiceSettings: The provided `VoiceSettings` object if `settings` is not `None`. Otherwise,
|
|
18
|
+
`DEFAULT_VOICE.settings` is returned.
|
|
28
19
|
"""
|
|
29
|
-
if settings:
|
|
30
|
-
|
|
31
|
-
if any(value is None for value in settings_dict.values()):
|
|
32
|
-
for field, value in settings_dict.items():
|
|
33
|
-
if value is None:
|
|
34
|
-
settings_dict[field] = getattr(DEFAULT_VOICE.settings, field)
|
|
20
|
+
if not settings:
|
|
21
|
+
return DEFAULT_VOICE.settings
|
|
35
22
|
|
|
36
|
-
|
|
37
|
-
else:
|
|
38
|
-
return settings
|
|
39
|
-
return None
|
|
23
|
+
return settings
|
|
40
24
|
|
|
41
25
|
|
|
42
26
|
def get_voice_id(client: ElevenLabs, voice: VoiceId | VoiceName) -> VoiceId:
|
|
@@ -9,6 +9,8 @@ from typing import IO, Iterator, Literal
|
|
|
9
9
|
|
|
10
10
|
from elevenlabs import Voice, VoiceSettings, save
|
|
11
11
|
from elevenlabs.client import ElevenLabs, VoiceId, VoiceName
|
|
12
|
+
from elevenlabs.types import OutputFormat
|
|
13
|
+
from pydantic import Field
|
|
12
14
|
from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, Packet
|
|
13
15
|
from sinapsis_core.template_base.template import (
|
|
14
16
|
Template,
|
|
@@ -40,25 +42,23 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
40
42
|
"""
|
|
41
43
|
Attributes for ElevenLabs Base Class.
|
|
42
44
|
Args:
|
|
43
|
-
api_key (str): The API key to authenticate with ElevenLabs' API.
|
|
44
|
-
|
|
45
|
+
api_key (str): The API key used to authenticate with ElevenLabs' API.
|
|
46
|
+
model (Literal): The model identifier to use for speech synthesis.
|
|
47
|
+
output_format (OutputFormat): The output audio format and quality. Options include:
|
|
48
|
+
["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
|
|
49
|
+
"mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
|
|
50
|
+
output_folder (str): The folder where generated audio files will be saved.
|
|
51
|
+
stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
|
|
52
|
+
voice (VoiceId | VoiceName | Voice): The voice to use for speech synthesis. This can be a voice ID (str),
|
|
45
53
|
a voice name (str) or an elevenlabs voice object (Voice).
|
|
46
54
|
voice_settings (VoiceSettings): A dictionary of settings that control the behavior of the voice.
|
|
47
55
|
- stability (float)
|
|
48
56
|
- similarity_boost (float)
|
|
49
57
|
- style (float)
|
|
50
58
|
- use_speaker_boost (bool)
|
|
51
|
-
model (Literal): The model identifier to use for speech synthesis.
|
|
52
|
-
output_format (Literal): The output audio format and quality. Options include:
|
|
53
|
-
["mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96", "mp3_44100_128",
|
|
54
|
-
"mp3_44100_192", "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100", "ulaw_8000"]
|
|
55
|
-
output_folder (str): The folder where generated audio files will be saved.
|
|
56
|
-
stream (bool): If True, the audio is returned as a stream; otherwise, saved to a file.
|
|
57
59
|
"""
|
|
58
60
|
|
|
59
61
|
api_key: str | None = None
|
|
60
|
-
voice: VoiceId | VoiceName | Voice = None
|
|
61
|
-
voice_settings: VoiceSettings | None = None
|
|
62
62
|
model: Literal[
|
|
63
63
|
"eleven_turbo_v2_5",
|
|
64
64
|
"eleven_multilingual_v2",
|
|
@@ -68,21 +68,11 @@ class ElevenLabsBase(Template, abc.ABC):
|
|
|
68
68
|
"eleven_english_sts_v2",
|
|
69
69
|
"eleven_multilingual_sts_v2",
|
|
70
70
|
] = "eleven_turbo_v2_5"
|
|
71
|
-
output_format:
|
|
72
|
-
"mp3_22050_32",
|
|
73
|
-
"mp3_44100_32",
|
|
74
|
-
"mp3_44100_64",
|
|
75
|
-
"mp3_44100_96",
|
|
76
|
-
"mp3_44100_128",
|
|
77
|
-
"mp3_44100_192",
|
|
78
|
-
"pcm_16000",
|
|
79
|
-
"pcm_22050",
|
|
80
|
-
"pcm_24000",
|
|
81
|
-
"pcm_44100",
|
|
82
|
-
"ulaw_8000",
|
|
83
|
-
] = "mp3_44100_128"
|
|
71
|
+
output_format: OutputFormat = "mp3_44100_128"
|
|
84
72
|
output_folder: str = os.path.join(SINAPSIS_CACHE_DIR, "elevenlabs", "audios")
|
|
85
73
|
stream: bool = False
|
|
74
|
+
voice: VoiceId | VoiceName | Voice = None
|
|
75
|
+
voice_settings: VoiceSettings = Field(default_factory=dict) # type: ignore[arg-type]
|
|
86
76
|
|
|
87
77
|
def __init__(self, attributes: TemplateAttributeType) -> None:
|
|
88
78
|
"""Initializes the ElevenLabs API client with the given attributes."""
|
|
@@ -42,9 +42,12 @@ class ElevenLabsVoiceGeneration(ElevenLabsBase):
|
|
|
42
42
|
class AttributesBaseModel(ElevenLabsBase.AttributesBaseModel):
|
|
43
43
|
"""
|
|
44
44
|
Attributes for voice generation in ElevenLabs API.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
voice_description (str): A description of the voice to be used for synthesis.
|
|
45
48
|
"""
|
|
46
49
|
|
|
47
|
-
voice_description: str
|
|
50
|
+
voice_description: str
|
|
48
51
|
|
|
49
52
|
def synthesize_speech(self, input_data: list[TextPacket]) -> list[bytes] | None:
|
|
50
53
|
"""
|
|
File without changes
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import importlib
|
|
3
|
+
from typing import Callable
|
|
4
|
+
|
|
5
|
+
_root_lib_path = "sinapsis_f5_tts.templates"
|
|
6
|
+
|
|
7
|
+
_template_lookup = {
|
|
8
|
+
"F5TTSInference": f"{_root_lib_path}.f5_tts_inference",
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def __getattr__(name: str) -> Callable:
    """Lazily resolve a template class the first time it is requested from this package.

    The module that defines the template is only imported when the attribute
    is actually accessed.

    Args:
        name (str): Name of the attribute being looked up on the package.

    Returns:
        Callable: The object called `name` taken from the module registered
            for it in `_template_lookup`.

    Raises:
        AttributeError: If `name` is not a registered template.
    """
    module_path = _template_lookup.get(name)
    if module_path is None:
        raise AttributeError(f"template `{name}` not found in {_root_lib_path}")

    return getattr(importlib.import_module(module_path), name)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
__all__ = list(_template_lookup.keys())
|
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import tempfile
|
|
5
|
+
from typing import Any, Literal
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import soundfile as sf
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
from pydantic.dataclasses import dataclass
|
|
11
|
+
from sinapsis_core.data_containers.data_packet import (
|
|
12
|
+
AudioPacket,
|
|
13
|
+
DataContainer,
|
|
14
|
+
)
|
|
15
|
+
from sinapsis_core.template_base import Template, TemplateAttributes
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class F5CliKeys:
    """Names of the `json_schema_extra` keys that describe how an attribute maps to the F5-TTS CLI."""

    # Metadata key for boolean switches: the option is appended without a value when the attribute is truthy.
    cli_flag: str = "cli_flag"
    # Metadata key for valued options: the option is emitted followed by the attribute's value.
    cli_param: str = "cli_param"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class F5TTSInferenceAttributes(TemplateAttributes):
    """Configuration attributes for the F5TTS text-to-speech inference template.

    Every field carries `json_schema_extra` metadata keyed by either
    `F5CliKeys.cli_param` (an option that takes a value) or `F5CliKeys.cli_flag`
    (a boolean switch). `F5TTSInference` reads that metadata to assemble the
    `f5-tts_infer-cli` command line, iterating the fields in declaration order.

    Attributes:
        model (str): The model name to use for synthesis. Options include 'F5TTS_v1_Base',
            'F5TTS_Base', 'E2TTS_Base', etc. Default is 'F5TTS_v1_Base'.
        model_cfg (str | None): Path to the F5-TTS model config file (.yaml). If None,
            the default configuration will be used.
        ckpt_file (str | None): Path to model checkpoint file (.pt). If None, the default
            checkpoint will be used.
        vocab_file (str | None): Path to vocabulary file (.txt). If None, the default
            vocabulary will be used.
        ref_audio (str): Path to the reference audio file. This is required to clone the voice
            characteristics.
        ref_text (str): The transcript/subtitle for the reference audio. Default is a space character.
            When left empty, the system will attempt to extract text from the audio automatically.
            It's recommended to leave this empty for automatic extraction.
        vocoder_name (Literal["vocos", "bigvgan"]): The vocoder to use for audio generation.
            Options are 'vocos' or 'bigvgan'. Default is 'vocos'.
        load_vocoder_from_local (bool): Whether to load the vocoder from a local directory
            (default: ../checkpoints/vocos-mel-24khz) instead of downloading it. Default is False.
        nfe_step (int): The number of function evaluation steps (denoising steps) to perform
            during inference. Higher values may produce better quality at the cost of speed. Default is 32.
        cfg_strength (float): Classifier-free guidance strength. Controls how closely the output
            follows the reference voice. Default is 2.0.
        cross_fade_duration (float): Duration of cross-fade between audio segments in seconds.
            Used when generating longer audio that requires multiple segments. Default is 0.15.
        speed (float): The speed of the generated audio. Values > 1.0 speed up the audio,
            values < 1.0 slow it down. Default is 1.0.
        sway_sampling_coef (float): Sway Sampling coefficient for controlling variability
            in the generated speech. Default is -1.0.
        target_rms (float | None): Target output speech loudness normalization value.
            Controls the volume of the output. Default is None.
        fix_duration (float | None): Fix the total duration (reference and generated audios)
            in seconds. Default is None.
        remove_silence (bool): Whether to remove long silence found in the output. Default is False.
        save_chunk (bool): Whether to save each audio chunk during inference. Useful for
            debugging or analyzing the generation process. Default is False.
        device (str | None): Specify the device to run inference on (e.g., 'cuda:0', 'cpu').
            Default is None, which uses the system's default device.
    """

    model: str = Field(default="F5TTS_v1_Base", json_schema_extra={F5CliKeys.cli_param: "-m"})

    model_cfg: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "-mc"})

    ckpt_file: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "-p"})

    vocab_file: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "-v"})

    # No default is given, so this attribute must always be supplied.
    ref_audio: str = Field(json_schema_extra={F5CliKeys.cli_param: "-r"})

    ref_text: str = Field(default=" ", json_schema_extra={F5CliKeys.cli_param: "-s"})

    vocoder_name: Literal["vocos", "bigvgan"] = Field(
        default="vocos", json_schema_extra={F5CliKeys.cli_param: "--vocoder_name"}
    )

    load_vocoder_from_local: bool = Field(
        default=False, json_schema_extra={F5CliKeys.cli_flag: "--load_vocoder_from_local"}
    )

    nfe_step: int = Field(default=32, json_schema_extra={F5CliKeys.cli_param: "--nfe_step"})

    cfg_strength: float = Field(default=2.0, json_schema_extra={F5CliKeys.cli_param: "--cfg_strength"})

    cross_fade_duration: float = Field(default=0.15, json_schema_extra={F5CliKeys.cli_param: "--cross_fade_duration"})

    speed: float = Field(default=1.0, json_schema_extra={F5CliKeys.cli_param: "--speed"})

    sway_sampling_coef: float = Field(default=-1.0, json_schema_extra={F5CliKeys.cli_param: "--sway_sampling_coef"})

    # None-valued parameters are omitted from the command line entirely.
    target_rms: float | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "--target_rms"})

    fix_duration: float | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "--fix_duration"})

    remove_silence: bool = Field(default=False, json_schema_extra={F5CliKeys.cli_flag: "--remove_silence"})

    save_chunk: bool = Field(default=False, json_schema_extra={F5CliKeys.cli_flag: "--save_chunk"})

    device: str | None = Field(default=None, json_schema_extra={F5CliKeys.cli_param: "--device"})
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class F5TTSInference(Template):
    """Template for performing text-to-speech synthesis using the F5TTS model.

    This template uses the F5TTS CLI tool to generate speech from text input.
    It processes text packets from the input container, generates corresponding
    audio using F5TTS, and adds the resulting audio packets to the container.

    Usage example:

    agent:
      name: my_test_agent
    templates:
    - template_name: InputTemplate
      class_name: InputTemplate
      attributes: {}
    - template_name: F5TTSInference
      class_name: F5TTSInference
      template_input: InputTemplate
      attributes:
        model: F5TTS_v1_Base
        model_cfg: null
        ckpt_file: null
        vocab_file: null
        ref_audio: '`replace_me:<class ''str''>`'
        ref_text: ' '
        vocoder_name: vocos
        load_vocoder_from_local: false
        nfe_step: 32
        cfg_strength: 2.0
        cross_fade_duration: 0.15
        speed: 1.0
        sway_sampling_coef: -1.0
        target_rms: null
        fix_duration: null
        remove_silence: false
        save_chunk: false
        device: null

    """

    AttributesBaseModel = F5TTSInferenceAttributes

    def _add_attribute_to_command(self, cli_command: list[str], field_name: str, field: Any) -> None:
        """Append the CLI option for one attribute, if the attribute maps to one.

        The field's `json_schema_extra` metadata decides how the attribute is
        rendered: a `cli_param` entry produces `<option> <value>` and is skipped
        when the value is None; a `cli_flag` entry produces a bare `<option>`
        and is only emitted when the value is truthy.

        Args:
            cli_command (list[str]): The command list being built, modified in-place.
            field_name (str): Name of the attribute to read from `self.attributes`.
            field (Any): Field definition exposing a `json_schema_extra` attribute.
        """
        attribute_value = getattr(self.attributes, field_name)
        json_schema_extra = field.json_schema_extra

        # Only dict metadata can name a CLI option; None (no metadata) and
        # callable schema hooks are not CLI mappings and are ignored.
        if not isinstance(json_schema_extra, dict):
            return

        if F5CliKeys.cli_param in json_schema_extra and attribute_value is not None:
            cli_param = json_schema_extra[F5CliKeys.cli_param]
            cli_command.extend([cli_param, str(attribute_value)])

        if F5CliKeys.cli_flag in json_schema_extra and attribute_value:
            cli_flag = json_schema_extra[F5CliKeys.cli_flag]
            cli_command.append(cli_flag)

    @staticmethod
    def _add_io_parameters(cli_command: list[str], input_text: str, output_file_path: str) -> None:
        """Append the output directory, text to synthesize and output file options.

        Args:
            cli_command (list[str]): The command list being built, modified in-place.
            input_text (str): The text to synthesize, passed with `-t`.
            output_file_path (str): Path of the audio file to produce, passed with `-w`.
                Its parent directory is passed with `-o`.
        """
        output_dir = os.path.dirname(output_file_path)
        cli_command.extend(["-o", output_dir])
        cli_command.extend(["-t", input_text, "-w", output_file_path])

    def _build_cli_command(self, input_text: str, output_file_path: str) -> list[str]:
        """Build the complete F5TTS CLI command for speech synthesis.

        The command consists of the executable name, one option per applicable
        template attribute (in field declaration order), and finally the
        input/output options.

        Args:
            input_text (str): The text to synthesize into speech.
            output_file_path (str): Path where the generated audio file will be saved.

        Returns:
            list[str]: Argument list suitable for `subprocess.run`.
        """
        cli_command = ["f5-tts_infer-cli"]

        for field_name, field in self.AttributesBaseModel.model_fields.items():
            self._add_attribute_to_command(cli_command, field_name, field)

        self._add_io_parameters(cli_command, input_text, output_file_path)
        return cli_command

    def _run_cli_command(self, cli_command: list[str]) -> bool:
        """Run the CLI command as a subprocess and log its captured output.

        Args:
            cli_command (list[str]): The complete command list to execute, as
                produced by `_build_cli_command()`.

        Returns:
            bool: True if the command exited with return code 0. False if it
                exited with a non-zero code or could not be started at all.
        """
        try:
            process_result = subprocess.run(cli_command, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as error:
            self.logger.error(f"CLI error: {error.stderr}")
            return False
        except OSError as error:
            # Raised before the process starts, e.g. FileNotFoundError when the
            # executable is not installed; report it instead of crashing the caller.
            self.logger.error(f"Failed to launch CLI command: {error!s}")
            return False

        self.logger.info(f"Command output: {process_result.stdout}")
        if process_result.stderr:
            self.logger.info(f"Command stderr: {process_result.stderr}")
        return True

    def _load_audio_file(self, file_path: str) -> tuple[np.ndarray, int] | None:
        """Load audio data from a file using soundfile.

        Args:
            file_path (str): Path to the audio file to load.

        Returns:
            tuple[np.ndarray, int] | None: Whatever `sf.read` returns for the file
                (audio samples and sample rate), or None if the file does not
                exist or could not be read.
        """
        if not os.path.exists(file_path):
            self.logger.error(f"Output file not found: {file_path}")
            return None

        try:
            return sf.read(file_path)
        except (ValueError, RuntimeError, IOError) as error:
            self.logger.error(f"Error reading audio file: {error!s}")
            return None

    def _generate_speech(self, input_text: str) -> tuple[np.ndarray, int] | None:
        """Generate speech audio from the input text using F5TTS.

        Reserves a temporary `.wav` path, runs the CLI against it, loads the
        resulting audio and always removes the temporary file afterwards.

        Args:
            input_text (str): The text to synthesize into speech.

        Returns:
            tuple[np.ndarray, int] | None: The generated audio samples and sample
                rate, or None if the CLI failed or the output could not be loaded.
        """
        # delete=False: only the path is needed here. The handle is closed on
        # leaving the `with` block and the file is removed explicitly below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_file_path = temp_file.name

        try:
            cli_command = self._build_cli_command(input_text, temp_file_path)
            if not self._run_cli_command(cli_command):
                return None

            return self._load_audio_file(temp_file_path)
        finally:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

    def _create_audio_packet(
        self,
        audio_data: np.ndarray,
        sample_rate: int,
        container: DataContainer,
    ) -> None:
        """Wrap generated audio in an `AudioPacket` and append it to the container.

        The packet's source is set to this template's `instance_name`.

        Args:
            audio_data (np.ndarray): The audio samples.
            sample_rate (int): The sample rate of the audio in Hz.
            container (DataContainer): The container whose `audios` list receives the packet.
        """
        audio_packet = AudioPacket(
            content=audio_data,
            source=self.instance_name,
            sample_rate=sample_rate,
        )
        container.audios.append(audio_packet)

    def execute(self, container: DataContainer) -> DataContainer:
        """Synthesize speech for every text packet in the container.

        Args:
            container (DataContainer): The data container with text packets to process.

        Returns:
            DataContainer: The same container, with one audio packet appended per
                text packet whose synthesis succeeded. Texts whose synthesis
                failed are skipped; if there are no texts the container is
                returned unchanged.
        """
        if not container.texts:
            return container

        for text_packet in container.texts:
            speech_result = self._generate_speech(text_packet.content)
            # Failures are already logged by the helpers; move on to the next text.
            if speech_result is None:
                continue

            audio_data, sample_rate = speech_result
            self._create_audio_packet(
                audio_data=audio_data,
                sample_rate=sample_rate,
                container=container,
            )

        return container
|