lollms-client 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lollms-client might be problematic; consult the registry's advisory page for more details.
- examples/generate_and_speak/generate_and_speak.py +251 -0
- examples/generate_game_sfx/generate_game_fx.py +240 -0
- examples/text_2_image.py +0 -1
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/llamacpp/__init__.py +561 -734
- lollms_client/lollms_core.py +49 -29
- lollms_client/lollms_stt_binding.py +3 -15
- lollms_client/lollms_tti_binding.py +5 -29
- lollms_client/lollms_ttm_binding.py +5 -28
- lollms_client/lollms_tts_binding.py +4 -28
- lollms_client/lollms_ttv_binding.py +4 -28
- lollms_client/stt_bindings/lollms/__init__.py +5 -4
- lollms_client/stt_bindings/whisper/__init__.py +304 -0
- lollms_client/stt_bindings/whispercpp/__init__.py +380 -0
- lollms_client/tti_bindings/lollms/__init__.py +4 -6
- lollms_client/ttm_bindings/audiocraft/__init__.py +281 -0
- lollms_client/ttm_bindings/bark/__init__.py +339 -0
- lollms_client/tts_bindings/bark/__init__.py +336 -0
- lollms_client/tts_bindings/piper_tts/__init__.py +343 -0
- lollms_client/tts_bindings/xtts/__init__.py +317 -0
- lollms_client-0.16.0.dist-info/METADATA +183 -0
- {lollms_client-0.15.1.dist-info → lollms_client-0.16.0.dist-info}/RECORD +25 -16
- lollms_client-0.15.1.dist-info/METADATA +0 -192
- {lollms_client-0.15.1.dist-info → lollms_client-0.16.0.dist-info}/WHEEL +0 -0
- {lollms_client-0.15.1.dist-info → lollms_client-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-0.15.1.dist-info → lollms_client-0.16.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
# lollms_client/tts_bindings/piper/__init__.py
|
|
2
|
+
import io
|
|
3
|
+
import os
|
|
4
|
+
import wave # Standard Python library for WAV files
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, List, Union, Dict, Any
|
|
8
|
+
|
|
9
|
+
from ascii_colors import trace_exception, ASCIIColors
|
|
10
|
+
|
|
11
|
+
# --- Package Management and Conditional Imports ---
|
|
12
|
+
_piper_tts_installed = False
|
|
13
|
+
_piper_tts_installation_error = ""
|
|
14
|
+
try:
|
|
15
|
+
import pipmaster as pm
|
|
16
|
+
# piper-tts should handle onnxruntime, but ensure it's there if needed
|
|
17
|
+
# We might need specific onnxruntime for CUDA/DirectML later if we extend device support
|
|
18
|
+
pm.ensure_packages(["piper-tts", "onnxruntime"])
|
|
19
|
+
|
|
20
|
+
from piper import PiperVoice
|
|
21
|
+
import numpy as np # For converting audio samples if needed
|
|
22
|
+
|
|
23
|
+
_piper_tts_installed = True
|
|
24
|
+
except Exception as e:
|
|
25
|
+
_piper_tts_installation_error = str(e)
|
|
26
|
+
PiperVoice = None
|
|
27
|
+
np = None # Piper often returns bytes, but numpy can be handy for sample rate conversion if needed
|
|
28
|
+
# --- End Package Management ---
|
|
29
|
+
|
|
30
|
+
from lollms_client.lollms_tts_binding import LollmsTTSBinding
|
|
31
|
+
|
|
32
|
+
BindingName = "PiperTTSBinding"
|
|
33
|
+
|
|
34
|
+
# Example of a known good voice URL prefix from rhasspy.github.io/piper-voices/
|
|
35
|
+
PIPER_VOICES_BASE_URL = "https://huggingface.co/rhasspy/piper-voices/resolve/main/"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class PiperTTSBinding(LollmsTTSBinding):
    """Local text-to-speech binding built on Piper ONNX voice models.

    A Piper "voice" is a pair of files: ``<name>.onnx`` (the model) and
    ``<name>.onnx.json`` (its config, including sample rate). Voices can be
    addressed by absolute path, by filename inside ``piper_voices_dir``, or by
    relative path. Synthesis runs fully offline through onnxruntime (CPU by
    default) and returns a complete WAV file as bytes.
    """

    def __init__(self,
                 default_voice_model_path: Optional[Union[str, Path]] = None, # Path to .onnx file
                 piper_voices_dir: Optional[Union[str, Path]] = None, # Directory to scan for voices
                 # Standard LollmsTTSBinding args (host, service_key, verify_ssl are not used for local Piper)
                 host_address: Optional[str] = None,
                 service_key: Optional[str] = None,
                 verify_ssl_certificate: bool = True,
                 **kwargs): # Catch-all for future params or Piper-specific init options
        """Initialize the binding and optionally pre-load a default voice.

        Args:
            default_voice_model_path: Optional path to a ``.onnx`` voice to load now.
            piper_voices_dir: Optional directory scanned for ``.onnx``/``.onnx.json``
                pairs by ``list_voices`` and used to resolve voice filenames.
            host_address, service_key, verify_ssl_certificate: accepted only for
                interface compatibility with remote bindings; Piper runs locally
                and does not use them.

        Raises:
            ImportError: if piper-tts / onnxruntime could not be imported.
            RuntimeError: if a default voice was given but failed to load
                (raised from _load_piper_voice).
        """
        super().__init__(binding_name="piper")

        if not _piper_tts_installed:
            raise ImportError(f"Piper TTS binding dependencies not met. Error: {_piper_tts_installation_error}")

        # A missing voices directory is downgraded to a warning (not an error)
        # so a caller who only ever passes absolute voice paths still works.
        self.piper_voices_dir = Path(piper_voices_dir).resolve() if piper_voices_dir else None
        if self.piper_voices_dir and not self.piper_voices_dir.is_dir():
            ASCIIColors.warning(f"Piper voices directory does not exist: {self.piper_voices_dir}. Voice listing will be limited.")
            self.piper_voices_dir = None

        # Loaded-voice state; all three are set/cleared together by _load_piper_voice.
        self.current_voice_model_path: Optional[Path] = None
        self.piper_voice: Optional[PiperVoice] = None
        self.voice_config: Optional[Dict] = None # To store sample rate, channels etc.

        if default_voice_model_path:
            self._load_piper_voice(default_voice_model_path)
        else:
            ASCIIColors.info("No default_voice_model_path provided for Piper. Load a voice via generate_audio or ensure piper_voices_dir is set.")


    def _load_piper_voice(self, voice_model_identifier: Union[str, Path]):
        """
        Loads a Piper voice model.
        identifier can be a full path to .onnx or a filename to be found in piper_voices_dir.

        Resolution order (first match wins):
          1. absolute path ending in ``.onnx`` that exists;
          2. filename found inside ``piper_voices_dir``;
          3. relative path ending in ``.onnx`` that exists (resolved to absolute).
        The matching ``.onnx.json`` config must exist next to the model.

        Raises:
            FileNotFoundError: if the .onnx model or its .json config is missing.
            RuntimeError: if PiperVoice fails to load the files.
        """
        voice_model_path_onnx: Optional[Path] = None
        voice_model_path_json: Optional[Path] = None

        potential_path = Path(voice_model_identifier)

        if potential_path.is_absolute() and potential_path.suffix == ".onnx" and potential_path.exists():
            voice_model_path_onnx = potential_path
            voice_model_path_json = potential_path.with_suffix(".onnx.json")
        elif self.piper_voices_dir and (self.piper_voices_dir / voice_model_identifier).exists():
            # Assume voice_model_identifier is a filename like "en_US-ryan-medium.onnx"
            p = self.piper_voices_dir / voice_model_identifier
            if p.suffix == ".onnx":
                voice_model_path_onnx = p
                voice_model_path_json = p.with_suffix(".onnx.json")
        elif potential_path.suffix == ".onnx" and potential_path.exists(): # Relative path
            voice_model_path_onnx = potential_path.resolve()
            voice_model_path_json = voice_model_path_onnx.with_suffix(".onnx.json")


        if not voice_model_path_onnx or not voice_model_path_onnx.exists():
            raise FileNotFoundError(f"Piper ONNX voice model not found: {voice_model_identifier}")
        if not voice_model_path_json or not voice_model_path_json.exists():
            raise FileNotFoundError(f"Piper voice JSON config not found for {voice_model_path_onnx} (expected: {voice_model_path_json})")

        # Skip the reload when the exact same model path is already active.
        if self.piper_voice and self.current_voice_model_path == voice_model_path_onnx:
            ASCIIColors.info(f"Piper voice '{voice_model_path_onnx.name}' already loaded.")
            return

        ASCIIColors.info(f"Loading Piper voice: {voice_model_path_onnx.name}...")
        try:
            # Piper documentation often shows use_cuda=True for GPU with onnxruntime.
            # For simplicity and Piper's primary CPU strength, we'll omit it for now.
            # onnxruntime will use CPU by default.
            # To enable GPU: user needs onnxruntime-gpu and then `PiperVoice.from_files(..., use_cuda=True)`
            self.piper_voice = PiperVoice.from_files(
                onnx_path=str(voice_model_path_onnx),
                config_path=str(voice_model_path_json)
                # use_cuda=True # if onnxruntime-gpu is installed and desired
            )
            # Keep the raw JSON config around; generate_audio reads the sample rate from it.
            with open(voice_model_path_json, 'r', encoding='utf-8') as f:
                self.voice_config = json.load(f)

            self.current_voice_model_path = voice_model_path_onnx
            ASCIIColors.green(f"Piper voice '{voice_model_path_onnx.name}' loaded successfully.")
        except Exception as e:
            # Clear all voice state on failure so the binding is never half-loaded.
            self.piper_voice = None
            self.current_voice_model_path = None
            self.voice_config = None
            ASCIIColors.error(f"Failed to load Piper voice '{voice_model_path_onnx.name}': {e}"); trace_exception(e)
            raise RuntimeError(f"Failed to load Piper voice '{voice_model_path_onnx.name}'") from e

    def generate_audio(self,
                       text: str,
                       voice: Optional[Union[str, Path]] = None, # Filename or path to .onnx
                       **kwargs) -> bytes: # kwargs can include Piper synthesis options
        """Synthesize ``text`` and return a complete WAV file as bytes.

        Args:
            text: Text to speak.
            voice: Optional voice to switch to first (filename or path to .onnx).
                If switching fails but a voice is already loaded, that previous
                voice is kept and used (best-effort fallback by design).
            **kwargs: Optional Piper synthesis knobs: ``length_scale``,
                ``noise_scale``, ``noise_w`` (coerced to float).

        Returns:
            bytes: mono 16-bit PCM WAV data.

        Raises:
            RuntimeError: if no voice is loaded or synthesis fails/returns empty audio.
        """
        if voice:
            try:
                self._load_piper_voice(voice) # Attempt to switch voice
            except Exception as e_load:
                ASCIIColors.error(f"Failed to switch to Piper voice '{voice}': {e_load}. Using previously loaded voice if available.")
                if not self.piper_voice: # If no voice was previously loaded either
                    raise RuntimeError("No Piper voice loaded and failed to switch.") from e_load

        if not self.piper_voice or not self.voice_config:
            raise RuntimeError("Piper voice model not loaded. Cannot generate audio.")

        ASCIIColors.info(f"Generating speech with Piper voice '{self.current_voice_model_path.name}': '{text[:60]}...'")

        try:
            # Piper's synthesize returns raw audio bytes (PCM s16le)
            # Piper can also stream with synthesize_stream_raw if needed for very long texts
            # For simplicity, using synthesize which returns all bytes at once.

            # synthesis_kwargs: length_scale, noise_scale, noise_w
            piper_synthesis_kwargs = {}
            if 'length_scale' in kwargs: piper_synthesis_kwargs['length_scale'] = float(kwargs['length_scale'])
            if 'noise_scale' in kwargs: piper_synthesis_kwargs['noise_scale'] = float(kwargs['noise_scale'])
            if 'noise_w' in kwargs: piper_synthesis_kwargs['noise_w'] = float(kwargs['noise_w'])


            audio_bytes_iterable = self.piper_voice.synthesize_stream_raw(text, **piper_synthesis_kwargs)

            # Accumulate bytes from the stream
            pcm_s16le_data = b"".join(audio_bytes_iterable)

            if not pcm_s16le_data:
                raise RuntimeError("Piper synthesize_stream_raw returned empty audio data.")

            # Now package these raw PCM bytes into a WAV container
            buffer = io.BytesIO()
            sample_rate = self.voice_config.get("audio", {}).get("sample_rate", 22050) # Default if not in config
            num_channels = 1 # Piper voices are typically mono
            sample_width = 2 # 16-bit audio means 2 bytes per sample

            with wave.open(buffer, 'wb') as wf:
                wf.setnchannels(num_channels)
                wf.setsampwidth(sample_width)
                wf.setframerate(sample_rate)
                wf.writeframes(pcm_s16le_data)

            wav_bytes = buffer.getvalue()
            buffer.close()

            ASCIIColors.green("Piper TTS audio generation successful.")
            return wav_bytes
        except Exception as e:
            ASCIIColors.error(f"Piper TTS audio generation failed: {e}"); trace_exception(e)
            raise RuntimeError(f"Piper TTS audio generation error: {e}") from e

    def list_voices(self, **kwargs) -> List[str]:
        """
        Lists available Piper voice models found in the piper_voices_dir.
        Returns a list of .onnx filenames.

        Only ``.onnx`` files with a matching ``.onnx.json`` config are reported.
        If the directory yields nothing but a voice is currently loaded, that
        voice's filename is returned so the caller always sees something usable.
        """
        voices = []
        if self.piper_voices_dir and self.piper_voices_dir.is_dir():
            for item in self.piper_voices_dir.iterdir():
                if item.is_file() and item.suffix == ".onnx":
                    json_config_path = item.with_suffix(".onnx.json")
                    if json_config_path.exists():
                        voices.append(item.name) # Return just the filename

        if not voices and not self.current_voice_model_path:
            ASCIIColors.warning("No voices found in piper_voices_dir and no default voice loaded.")
            ASCIIColors.info(f"Download Piper voices (e.g., from {PIPER_VOICES_BASE_URL} or https://rhasspy.github.io/piper-voices/) "
                             "and place the .onnx and .onnx.json files into your voices directory.")
        elif not voices and self.current_voice_model_path:
            voices.append(self.current_voice_model_path.name) # Add the default loaded one if dir is empty

        return sorted(list(set(voices))) # Ensure unique and sorted

    def __del__(self):
        """Release the loaded voice. Safe to call on partially constructed instances."""
        # PiperVoice objects don't have an explicit close/del, Python's GC should handle C extensions
        if hasattr(self, 'piper_voice') and self.piper_voice is not None:
            del self.piper_voice
            self.piper_voice = None
            ASCIIColors.info(f"PiperTTSBinding voice '{getattr(self, 'current_voice_model_path', 'N/A')}' resources released.")
|
|
210
|
+
|
|
211
|
+
# --- Main Test Block ---
# Smoke test: downloads one or two Piper voices (network required on first run),
# synthesizes a few sentences, and writes the WAVs to ./test_piper_tts_output.
if __name__ == '__main__':
    if not _piper_tts_installed:
        print(f"{ASCIIColors.RED}Piper TTS binding dependencies not met. Skipping tests. Error: {_piper_tts_installation_error}{ASCIIColors.RESET}")
        exit()

    ASCIIColors.yellow("--- PiperTTSBinding Test ---")

    # --- USER CONFIGURATION FOR TEST ---
    # 1. Create a directory to store Piper voices, e.g., "./test_piper_voices"
    TEST_PIPER_VOICES_DIR = Path("./test_piper_voices")
    TEST_PIPER_VOICES_DIR.mkdir(exist_ok=True)

    # 2. Download at least one voice model (ONNX + JSON files) into that directory.
    #    From: https://rhasspy.github.io/piper-voices/
    #    Example: Download en_US-lessac-medium.onnx and en_US-lessac-medium.onnx.json
    #    and place them in TEST_PIPER_VOICES_DIR
    #    Or find direct links on Hugging Face: e.g., from https://huggingface.co/rhasspy/piper-voices/tree/main/en/en_US/lessac/medium
    # Let's pick a common English voice for testing.
    DEFAULT_TEST_VOICE_FILENAME = "en_US-lessac-medium.onnx" # Ensure this (and .json) is in TEST_PIPER_VOICES_DIR
    DEFAULT_TEST_VOICE_ONNX_URL = f"{PIPER_VOICES_BASE_URL}en/en_US/lessac/medium/en_US-lessac-medium.onnx"
    DEFAULT_TEST_VOICE_JSON_URL = f"{PIPER_VOICES_BASE_URL}en/en_US/lessac/medium/en_US-lessac-medium.onnx.json"

    # Function to download test voice if missing
    def ensure_test_voice(voices_dir: Path, voice_filename: str, onnx_url: str, json_url: str):
        """Download the .onnx/.onnx.json pair if absent. Returns True when both files are present."""
        onnx_path = voices_dir / voice_filename
        # voice_filename already ends in ".onnx", so this yields "<name>.onnx.json"
        json_path = voices_dir / f"{voice_filename}.json"
        if not onnx_path.exists() or not json_path.exists():
            ASCIIColors.info(f"Test voice '{voice_filename}' not found. Attempting to download...")
            try:
                import requests
                # Download ONNX
                if not onnx_path.exists():
                    ASCIIColors.info(f"Downloading {onnx_url} to {onnx_path}")
                    r_onnx = requests.get(onnx_url, stream=True)
                    r_onnx.raise_for_status()
                    with open(onnx_path, 'wb') as f:
                        for chunk in r_onnx.iter_content(chunk_size=8192): f.write(chunk)
                # Download JSON
                if not json_path.exists():
                    ASCIIColors.info(f"Downloading {json_url} to {json_path}")
                    r_json = requests.get(json_url)
                    r_json.raise_for_status()
                    with open(json_path, 'w', encoding='utf-8') as f: f.write(r_json.text)
                ASCIIColors.green(f"Test voice '{voice_filename}' downloaded successfully.")
            except Exception as e_download:
                # Best-effort: a failed download is reported, not raised, so the
                # caller can decide whether the test can continue.
                ASCIIColors.error(f"Failed to download test voice '{voice_filename}': {e_download}")
                ASCIIColors.warning(f"Please manually download '{voice_filename}' and '{voice_filename}.json' "
                                    f"from {PIPER_VOICES_BASE_URL} (or rhasspy.github.io/piper-voices/) "
                                    f"and place them in {voices_dir.resolve()}")
                return False
        return True

    if not ensure_test_voice(TEST_PIPER_VOICES_DIR, DEFAULT_TEST_VOICE_FILENAME, DEFAULT_TEST_VOICE_ONNX_URL, DEFAULT_TEST_VOICE_JSON_URL):
        ASCIIColors.error("Cannot proceed with test without a default voice model.")
        exit(1)

    # Optional: Download a second voice for testing voice switching
    SECOND_TEST_VOICE_FILENAME = "de_DE-thorsten-medium.onnx" # Example German voice
    SECOND_TEST_VOICE_ONNX_URL = f"{PIPER_VOICES_BASE_URL}de/de_DE/thorsten/medium/de_DE-thorsten-medium.onnx"
    SECOND_TEST_VOICE_JSON_URL = f"{PIPER_VOICES_BASE_URL}de/de_DE/thorsten/medium/de_DE-thorsten-medium.onnx.json"
    # Failure here is tolerated: the German test case is simply skipped below.
    ensure_test_voice(TEST_PIPER_VOICES_DIR, SECOND_TEST_VOICE_FILENAME, SECOND_TEST_VOICE_ONNX_URL, SECOND_TEST_VOICE_JSON_URL)


    test_output_dir = Path("./test_piper_tts_output")
    test_output_dir.mkdir(exist_ok=True)
    tts_binding = None
    # --- END USER CONFIGURATION FOR TEST ---

    try:
        ASCIIColors.cyan(f"\n--- Initializing PiperTTSBinding ---")
        # Initialize with the path to the ONNX file for the default voice
        tts_binding = PiperTTSBinding(
            default_voice_model_path = TEST_PIPER_VOICES_DIR / DEFAULT_TEST_VOICE_FILENAME,
            piper_voices_dir = TEST_PIPER_VOICES_DIR
        )

        ASCIIColors.cyan("\n--- Listing available Piper voices ---")
        voices = tts_binding.list_voices();
        if voices: print(f"Available voices in '{TEST_PIPER_VOICES_DIR}': {voices}")
        else: ASCIIColors.warning(f"No voices found in {TEST_PIPER_VOICES_DIR}. Check paths and ensure .onnx/.json pairs exist.")


        # (name, text[, voice_filename]) — the optional third element selects a
        # non-default voice for that case.
        texts_to_synthesize = [
            ("english_hello", "Hello world, this is a test of the Piper text to speech binding."),
            ("english_question", "Can you generate speech quickly and efficiently? Let's find out!"),
        ]
        if (TEST_PIPER_VOICES_DIR / SECOND_TEST_VOICE_FILENAME).exists():
            texts_to_synthesize.append(
                ("german_greeting", "Hallo Welt, wie geht es Ihnen heute?", SECOND_TEST_VOICE_FILENAME)
            )


        for name, text, *voice_file_arg in texts_to_synthesize:
            voice_to_use_filename = voice_file_arg[0] if voice_file_arg else None # Filename like "en_US-lessac-medium.onnx"

            ASCIIColors.cyan(f"\n--- Synthesizing TTS for: '{name}' (Voice file: {voice_to_use_filename or DEFAULT_TEST_VOICE_FILENAME}) ---")
            print(f"Text: {text}")
            try:
                # Example of passing Piper-specific synthesis parameters
                synthesis_kwargs = {"length_scale": 1.0} # Default is 1.0. Smaller is faster, larger is slower.
                if "question" in name:
                    synthesis_kwargs["length_scale"] = 0.9 # Slightly faster for questions

                audio_bytes = tts_binding.generate_audio(text, voice=voice_to_use_filename, **synthesis_kwargs)
                if audio_bytes:
                    output_filename = f"tts_piper_{name}.wav"
                    output_path = test_output_dir / output_filename
                    with open(output_path, "wb") as f: f.write(audio_bytes)
                    ASCIIColors.green(f"TTS for '{name}' saved to: {output_path} ({len(audio_bytes) / 1024:.2f} KB)")
                else: ASCIIColors.error(f"TTS generation for '{name}' returned empty bytes.")
            except Exception as e_gen: ASCIIColors.error(f"Failed to generate TTS for '{name}': {e_gen}")

    except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}")
    except FileNotFoundError as e_fnf: ASCIIColors.error(f"File not found error during init/load: {e_fnf}")
    except RuntimeError as e_rt: ASCIIColors.error(f"Runtime error: {e_rt}")
    except Exception as e: ASCIIColors.error(f"Unexpected error: {e}"); trace_exception(e)
    finally:
        if tts_binding: del tts_binding
        ASCIIColors.info(f"Test TTS audio (if any) are in: {test_output_dir.resolve()}")
        print(f"{ASCIIColors.YELLOW}Check the audio files in '{test_output_dir.resolve()}'!{ASCIIColors.RESET}")
        # Optional: Clean up downloaded test voices
        # if input("Clean up downloaded test voices? (y/N): ").lower() == 'y':
        #     for f_name in [DEFAULT_TEST_VOICE_FILENAME, SECOND_TEST_VOICE_FILENAME]:
        #         onnx_p = TEST_PIPER_VOICES_DIR / f_name
        #         json_p = TEST_PIPER_VOICES_DIR / f"{f_name}.json"
        #         if onnx_p.exists(): onnx_p.unlink()
        #         if json_p.exists(): json_p.unlink()
        #     if not any(TEST_PIPER_VOICES_DIR.iterdir()): TEST_PIPER_VOICES_DIR.rmdir()
        #     ASCIIColors.info("Cleaned up test voices.")


    ASCIIColors.yellow("\n--- PiperTTSBinding Test Finished ---")
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
# lollms_client/tts_bindings/xtts/__init__.py
|
|
2
|
+
import io
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, List, Union, Dict, Any
|
|
6
|
+
|
|
7
|
+
from ascii_colors import trace_exception, ASCIIColors
|
|
8
|
+
|
|
9
|
+
# --- Package Management and Conditional Imports ---
|
|
10
|
+
_xtts_deps_installed_with_correct_torch = False
|
|
11
|
+
_xtts_installation_error = ""
|
|
12
|
+
try:
|
|
13
|
+
import pipmaster as pm
|
|
14
|
+
import platform
|
|
15
|
+
|
|
16
|
+
preferred_torch_device_for_install = "cpu"
|
|
17
|
+
if platform.system() == "Linux" or platform.system() == "Windows":
|
|
18
|
+
preferred_torch_device_for_install = "cuda"
|
|
19
|
+
elif platform.system() == "Darwin":
|
|
20
|
+
preferred_torch_device_for_install = "mps"
|
|
21
|
+
|
|
22
|
+
torch_pkgs = ["torch", "torchaudio"] # TTS often needs torchaudio
|
|
23
|
+
# Coqui-TTS has specific version requirements sometimes, ensure_packages handles this
|
|
24
|
+
xtts_core_pkgs = ["TTS"]
|
|
25
|
+
other_deps = ["scipy", "numpy", "soundfile"] # soundfile is often a TTS dependency
|
|
26
|
+
|
|
27
|
+
torch_index_url = None
|
|
28
|
+
if preferred_torch_device_for_install == "cuda":
|
|
29
|
+
torch_index_url = "https://download.pytorch.org/whl/cu126"
|
|
30
|
+
ASCIIColors.info(f"Attempting to ensure PyTorch with CUDA support (target index: {torch_index_url}) for XTTS binding.")
|
|
31
|
+
pm.ensure_packages(torch_pkgs, index_url=torch_index_url)
|
|
32
|
+
pm.ensure_packages(xtts_core_pkgs + other_deps)
|
|
33
|
+
else:
|
|
34
|
+
ASCIIColors.info("Ensuring PyTorch, Coqui-TTS, and dependencies using default PyPI index for XTTS binding.")
|
|
35
|
+
pm.ensure_packages(torch_pkgs + xtts_core_pkgs + other_deps)
|
|
36
|
+
|
|
37
|
+
import torch
|
|
38
|
+
from TTS.api import TTS # Main Coqui TTS class
|
|
39
|
+
import scipy.io.wavfile
|
|
40
|
+
import numpy as np
|
|
41
|
+
import soundfile as sf # For reading speaker_wav if not in standard wav
|
|
42
|
+
|
|
43
|
+
_xtts_deps_installed_with_correct_torch = True
|
|
44
|
+
except ImportError as e_imp: # Catch ImportError specifically if TTS itself fails
|
|
45
|
+
_xtts_installation_error = f"ImportError: {e_imp}. Coqui TTS (TTS lib) might not be installed correctly or has missing dependencies."
|
|
46
|
+
TTS, torch, scipy, np, sf = None, None, None, None, None
|
|
47
|
+
except Exception as e:
|
|
48
|
+
_xtts_installation_error = str(e)
|
|
49
|
+
TTS, torch, scipy, np, sf = None, None, None, None, None
|
|
50
|
+
# --- End Package Management ---
|
|
51
|
+
|
|
52
|
+
from lollms_client.lollms_tts_binding import LollmsTTSBinding
|
|
53
|
+
|
|
54
|
+
BindingName = "XTTSBinding"
|
|
55
|
+
|
|
56
|
+
# Common XTTS model IDs from Coqui on Hugging Face
|
|
57
|
+
# The primary one is usually "coqui/XTTS-v2" or similar official releases.
|
|
58
|
+
# Users might also point to fine-tuned versions or local paths.
|
|
59
|
+
XTTS_MODELS = [
|
|
60
|
+
"tts_models/multilingual/multi-dataset/xtts_v2", # Standard XTTS v2 model string for Coqui TTS lib
|
|
61
|
+
# "coqui/XTTS-v2" # This is the HF repo ID, TTS lib might map it or expect the above format
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
# Supported languages by XTTS v2 (example, check latest Coqui docs)
|
|
65
|
+
XTTS_SUPPORTED_LANGUAGES = [
|
|
66
|
+
"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko", "hi"
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
class XTTSBinding(LollmsTTSBinding):
|
|
70
|
+
def __init__(self,
|
|
71
|
+
model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2", # Coqui TTS model identifier
|
|
72
|
+
default_speaker_wav: Optional[Union[str, Path]] = None, # Path to a reference WAV for default voice
|
|
73
|
+
default_language: str = "en",
|
|
74
|
+
device: Optional[str] = None,
|
|
75
|
+
# Standard LollmsTTSBinding args
|
|
76
|
+
host_address: Optional[str] = None,
|
|
77
|
+
service_key: Optional[str] = None,
|
|
78
|
+
verify_ssl_certificate: bool = True,
|
|
79
|
+
**kwargs): # Catch-all for future TTS API changes or specific params
|
|
80
|
+
|
|
81
|
+
super().__init__(binding_name="xtts")
|
|
82
|
+
|
|
83
|
+
if not _xtts_deps_installed_with_correct_torch:
|
|
84
|
+
raise ImportError(f"XTTS binding dependencies not met. Error: {_xtts_installation_error}")
|
|
85
|
+
|
|
86
|
+
self.device = device
|
|
87
|
+
if self.device is None:
|
|
88
|
+
if torch.cuda.is_available(): self.device = "cuda"; ASCIIColors.info("CUDA device detected by PyTorch for XTTS.")
|
|
89
|
+
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): self.device = "mps"; ASCIIColors.info("MPS device detected for XTTS.")
|
|
90
|
+
else: self.device = "cpu"; ASCIIColors.info("No GPU (CUDA/MPS) by PyTorch, using CPU for XTTS.")
|
|
91
|
+
elif self.device == "cuda" and not torch.cuda.is_available(): self.device = "cpu"; ASCIIColors.warning("CUDA req, not avail. CPU for XTTS.")
|
|
92
|
+
elif self.device == "mps" and not (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()): self.device = "cpu"; ASCIIColors.warning("MPS req, not avail. CPU for XTTS.")
|
|
93
|
+
|
|
94
|
+
ASCIIColors.info(f"XTTSBinding: Using device '{self.device}'.")
|
|
95
|
+
|
|
96
|
+
self.xtts_model_id_or_path = model_name # Store the model identifier passed by user
|
|
97
|
+
self.loaded_xtts_model_id = None
|
|
98
|
+
self.tts_model: Optional[TTS] = None
|
|
99
|
+
self.default_speaker_wav = str(default_speaker_wav) if default_speaker_wav else None
|
|
100
|
+
self.default_language = default_language
|
|
101
|
+
|
|
102
|
+
if self.default_speaker_wav and not Path(self.default_speaker_wav).exists():
|
|
103
|
+
ASCIIColors.warning(f"Default speaker WAV not found: {self.default_speaker_wav}. Voice cloning will require a speaker_wav per call.")
|
|
104
|
+
self.default_speaker_wav = None # Invalidate if not found
|
|
105
|
+
|
|
106
|
+
self._load_xtts_model(self.xtts_model_id_or_path)
|
|
107
|
+
|
|
108
|
+
def _load_xtts_model(self, model_id_to_load: str):
|
|
109
|
+
if self.tts_model is not None and self.loaded_xtts_model_id == model_id_to_load:
|
|
110
|
+
ASCIIColors.info(f"XTTS model '{model_id_to_load}' already loaded.")
|
|
111
|
+
return
|
|
112
|
+
|
|
113
|
+
ASCIIColors.info(f"Loading XTTS model: '{model_id_to_load}' on device '{self.device}'...")
|
|
114
|
+
try:
|
|
115
|
+
# TTS class handles model downloading from Hugging Face or loading from local path.
|
|
116
|
+
# It also manages moving to the specified device.
|
|
117
|
+
self.tts_model = TTS(model_name=model_id_to_load, progress_bar=True).to(self.device)
|
|
118
|
+
self.loaded_xtts_model_id = model_id_to_load
|
|
119
|
+
ASCIIColors.green(f"XTTS model '{model_id_to_load}' loaded successfully.")
|
|
120
|
+
except Exception as e:
|
|
121
|
+
self.tts_model = None; self.loaded_xtts_model_id = None
|
|
122
|
+
ASCIIColors.error(f"Failed to load XTTS model '{model_id_to_load}': {e}"); trace_exception(e)
|
|
123
|
+
raise RuntimeError(f"Failed to load XTTS model '{model_id_to_load}'") from e
|
|
124
|
+
|
|
125
|
+
def generate_audio(self,
|
|
126
|
+
text: str,
|
|
127
|
+
voice: Optional[Union[str, Path]] = None, # Path to speaker WAV for XTTS
|
|
128
|
+
language: Optional[str] = None,
|
|
129
|
+
# XTTS specific parameters (can be passed via kwargs)
|
|
130
|
+
# speed: float = 1.0, # Not directly in XTTS v2 tts() method's main signature
|
|
131
|
+
**kwargs) -> bytes:
|
|
132
|
+
if self.tts_model is None:
|
|
133
|
+
raise RuntimeError("XTTS model not loaded.")
|
|
134
|
+
|
|
135
|
+
speaker_wav_path = voice if voice is not None else self.default_speaker_wav
|
|
136
|
+
effective_language = language if language is not None else self.default_language
|
|
137
|
+
|
|
138
|
+
if not speaker_wav_path:
|
|
139
|
+
raise ValueError("XTTS requires a 'speaker_wav' path for voice cloning. Provide it in the 'voice' argument or set 'default_speaker_wav' during initialization.")
|
|
140
|
+
|
|
141
|
+
speaker_wav_p = Path(speaker_wav_path)
|
|
142
|
+
if not speaker_wav_p.exists():
|
|
143
|
+
raise FileNotFoundError(f"Speaker WAV file not found: {speaker_wav_path}")
|
|
144
|
+
|
|
145
|
+
if effective_language not in XTTS_SUPPORTED_LANGUAGES:
|
|
146
|
+
ASCIIColors.warning(f"Language '{effective_language}' might not be officially supported by XTTS v2. "
|
|
147
|
+
f"Known supported: {XTTS_SUPPORTED_LANGUAGES}. Attempting anyway.")
|
|
148
|
+
|
|
149
|
+
ASCIIColors.info(f"Generating speech with XTTS: '{text[:60]}...' (Speaker: {speaker_wav_p.name}, Lang: {effective_language})")
|
|
150
|
+
|
|
151
|
+
try:
|
|
152
|
+
# The tts() method returns a NumPy array (waveform)
|
|
153
|
+
# It expects speaker_wav and language as direct arguments.
|
|
154
|
+
# Other TTS generation parameters might be available via model's config or specific methods.
|
|
155
|
+
# For XTTS, common ones like speed are handled internally or via config.
|
|
156
|
+
# We can pass other kwargs if the TTS library might pick them up for specific models.
|
|
157
|
+
|
|
158
|
+
# XTTS's tts() returns list of ints (scaled PCM), not float numpy array directly
|
|
159
|
+
wav_array_int_list = self.tts_model.tts(
|
|
160
|
+
text=text,
|
|
161
|
+
speaker_wav=str(speaker_wav_path), # Must be a string path
|
|
162
|
+
language=effective_language,
|
|
163
|
+
# split_sentences=True, # Default True, good for longer texts
|
|
164
|
+
**kwargs # Pass other potential TTS lib args
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
if not wav_array_int_list: # Check if list is empty
|
|
168
|
+
raise RuntimeError("XTTS model returned empty audio data (list of ints was empty).")
|
|
169
|
+
|
|
170
|
+
# Convert list of ints to a NumPy array of int16
|
|
171
|
+
# The TTS library usually returns samples scaled appropriately for int16.
|
|
172
|
+
audio_array_np = np.array(wav_array_int_list, dtype=np.int16)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
if audio_array_np.ndim == 0 or audio_array_np.size == 0: # Double check after conversion
|
|
176
|
+
raise RuntimeError("XTTS model resulted in empty NumPy audio array.")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
buffer = io.BytesIO()
|
|
180
|
+
# Get sample rate from the loaded TTS model's config
|
|
181
|
+
sample_rate = self.tts_model.synthesizer.output_sample_rate if hasattr(self.tts_model, 'synthesizer') and hasattr(self.tts_model.synthesizer, 'output_sample_rate') else 24000 # XTTS v2 default is 24kHz
|
|
182
|
+
|
|
183
|
+
scipy.io.wavfile.write(buffer, rate=sample_rate, data=audio_array_np)
|
|
184
|
+
audio_bytes = buffer.getvalue()
|
|
185
|
+
buffer.close()
|
|
186
|
+
|
|
187
|
+
ASCIIColors.green("XTTS audio generation successful.")
|
|
188
|
+
return audio_bytes
|
|
189
|
+
except Exception as e:
|
|
190
|
+
ASCIIColors.error(f"XTTS audio generation failed: {e}"); trace_exception(e)
|
|
191
|
+
if "out of memory" in str(e).lower() and self.device == "cuda":
|
|
192
|
+
ASCIIColors.yellow("CUDA out of memory. Ensure GPU has sufficient VRAM for XTTS (can be several GB).")
|
|
193
|
+
raise RuntimeError(f"XTTS audio generation error: {e}") from e
|
|
194
|
+
|
|
195
|
+
def list_voices(self, **kwargs) -> List[str]:
    """
    Report that XTTS exposes no predefined voice catalogue.

    XTTS clones whatever voice is supplied through the `speaker_wav`
    reference file at generation time, so there is nothing to enumerate.
    Two informational messages are printed and an empty list is returned.

    Returns:
        An empty list — voices are dynamic, not enumerable.
    """
    for info_msg in (
        "XTTS voices are dynamic and determined by the 'speaker_wav' file provided during generation.",
        "You can provide a path to any reference WAV file for voice cloning.",
    ):
        ASCIIColors.info(info_msg)
    return []
|
|
205
|
+
|
|
206
|
+
def get_xtts_model_ids(self) -> List[str]:
    """Return a fresh list of the XTTS model identifiers known to the Coqui TTS library."""
    # list(...) yields a new shallow copy, equivalent to XTTS_MODELS.copy(),
    # so callers cannot mutate the module-level registry.
    return list(XTTS_MODELS)
|
|
209
|
+
|
|
210
|
+
def get_supported_languages(self) -> List[str]:
    """Return a fresh list of the language codes known to be supported by XTTS v2."""
    # Shallow copy so callers cannot mutate the module-level constant.
    return list(XTTS_SUPPORTED_LANGUAGES)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def __del__(self):
    """Best-effort cleanup: release the loaded XTTS model and free cached CUDA memory."""
    # hasattr guard: __del__ can run on a partially-initialized instance
    # if __init__ raised before tts_model was ever assigned.
    if hasattr(self, 'tts_model') and self.tts_model is not None:
        del self.tts_model; self.tts_model = None
        # torch may be None/absent if the optional dependency failed to import,
        # hence the truthiness check before touching torch.cuda.
        if torch and hasattr(torch, 'cuda') and torch.cuda.is_available():
            # Return freed GPU memory to the driver; XTTS models occupy several GB.
            torch.cuda.empty_cache()
    # getattr with default: the attribute may not exist if init failed early.
    loaded_name = getattr(self, 'loaded_xtts_model_id', None)
    msg = f"XTTSBinding for model '{loaded_name}' destroyed." if loaded_name else "XTTSBinding destroyed."
    ASCIIColors.info(msg)
|
|
223
|
+
|
|
224
|
+
# --- Main Test Block ---
# Smoke-test for XTTSBinding: loads the XTTS v2 model, lists voices/models/
# languages, synthesizes a couple of sample texts, and writes WAV files to
# ./test_xtts_output. Requires a 'speaker_ref.wav' next to this file (a crude
# sine-wave dummy is generated if missing — unsuitable for real cloning).
if __name__ == '__main__':
    if not _xtts_deps_installed_with_correct_torch:
        print(f"{ASCIIColors.RED}XTTS binding dependencies not met. Skipping tests. Error: {_xtts_installation_error}{ASCIIColors.RESET}")
        exit()

    ASCIIColors.yellow("--- XTTSBinding Test ---")
    # For XTTS, model_name is the Coqui TTS model string or HF repo ID if supported by TTS lib directly
    test_xtts_model_id = "tts_models/multilingual/multi-dataset/xtts_v2"
    test_output_dir = Path("./test_xtts_output")
    test_output_dir.mkdir(exist_ok=True)

    # --- IMPORTANT: Create or provide a speaker reference WAV file ---
    # For this test to work, you need a short (~5-15 seconds) clean audio file of a voice.
    # Name it 'speaker_ref.wav' and place it in the same directory as this script,
    # or update the path below.
    default_speaker_wav_path = Path(__file__).parent / "speaker_ref.wav"  # Assumes it's next to this __init__.py

    # Track whether this test synthesized a throwaway reference file, so the
    # cleanup in the finally-block only deletes files this test itself created.
    created_dummy_speaker = False
    if not default_speaker_wav_path.exists():
        ASCIIColors.warning(f"Reference speaker WAV file not found: {default_speaker_wav_path}")
        ASCIIColors.warning("Please create/place a 'speaker_ref.wav' (clean, ~5-15s audio) in the "
                            f"'{default_speaker_wav_path.parent}' directory for the test to run properly.")
        # Attempt to create a very basic dummy if scipy available, NOT suitable for good cloning
        try:
            import numpy as np; import scipy.io.wavfile
            samplerate = 22050; duration = 2; frequency = 440
            t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
            data = (np.iinfo(np.int16).max * 0.1 * np.sin(2. * np.pi * frequency * t)).astype(np.int16)
            scipy.io.wavfile.write(default_speaker_wav_path, samplerate, data)
            created_dummy_speaker = True
            ASCIIColors.info(f"Created a VERY BASIC dummy 'speaker_ref.wav'. Replace with a real voice sample for good results.")
        except Exception as e_dummy_spk:
            ASCIIColors.error(f"Could not create dummy speaker_ref.wav: {e_dummy_spk}. Test will likely fail or use no speaker.")
            default_speaker_wav_path = None # Ensure it's None if creation failed

    tts_binding = None
    try:
        ASCIIColors.cyan(f"\n--- Initializing XTTSBinding (XTTS Model: '{test_xtts_model_id}') ---")
        tts_binding = XTTSBinding(
            model_name=test_xtts_model_id,
            default_speaker_wav=str(default_speaker_wav_path) if default_speaker_wav_path else None,
            default_language="en"
        )

        ASCIIColors.cyan("\n--- Listing XTTS 'voices' (dynamic, requires speaker_wav) ---")
        voices = tts_binding.list_voices()  # This will print an informational message

        ASCIIColors.cyan("\n--- Listing known XTTS model IDs for Coqui TTS library ---")
        xtts_models = tts_binding.get_xtts_model_ids(); print(f"Known XTTS model IDs: {xtts_models}")
        ASCIIColors.cyan("\n--- Listing known XTTS supported languages ---")
        langs = tts_binding.get_supported_languages(); print(f"Supported languages (example): {langs[:5]}...")

        # Each entry: (label, text, language[, optional speaker wav override]).
        texts_to_synthesize = [
            ("english_greeting", "Hello, this is a test of the XTTS voice synthesis system. I hope you like my voice!", "en"),
            ("spanish_question", "¿Cómo estás hoy? Espero que tengas un día maravilloso.", "es"),
            # ("short_custom_voice", "This voice should sound like your reference audio.", "en", "path/to/your/custom_speaker.wav"), # Example for custom
        ]
        if not default_speaker_wav_path: # If no default speaker, we can't run text loop as is
            ASCIIColors.error("No default_speaker_wav available. Skipping synthesis loop.")
            texts_to_synthesize = []

        for name, text, lang, *speaker_override_list in texts_to_synthesize:
            speaker_to_use = speaker_override_list[0] if speaker_override_list else None # Uses binding default if None

            ASCIIColors.cyan(f"\n--- Synthesizing TTS for: '{name}' (Lang: {lang}, Speaker: {speaker_to_use or tts_binding.default_speaker_wav}) ---")
            print(f"Text: {text}")
            try:
                # XTTS tts() doesn't have as many direct generation params as Bark's generate()
                # Control is more via the model config or specific methods if available.
                audio_bytes = tts_binding.generate_audio(text, voice=speaker_to_use, language=lang)
                if audio_bytes:
                    output_filename = f"tts_{name}_{tts_binding.loaded_xtts_model_id.replace('/','_')}.wav"
                    output_path = test_output_dir / output_filename
                    with open(output_path, "wb") as f: f.write(audio_bytes)
                    ASCIIColors.green(f"TTS for '{name}' saved to: {output_path} ({len(audio_bytes) / 1024:.2f} KB)")
                else: ASCIIColors.error(f"TTS generation for '{name}' returned empty bytes.")
            except Exception as e_gen: ASCIIColors.error(f"Failed to generate TTS for '{name}': {e_gen}")

    except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}")
    except RuntimeError as e_rt: ASCIIColors.error(f"Runtime error: {e_rt}")
    except Exception as e: ASCIIColors.error(f"Unexpected error: {e}"); trace_exception(e)
    finally:
        if tts_binding: del tts_binding
        ASCIIColors.info(f"Test TTS audio (if any) are in: {test_output_dir.resolve()}")
        print(f"{ASCIIColors.YELLOW}Check the audio files in '{test_output_dir.resolve()}'!{ASCIIColors.RESET}")
        # Clean up dummy speaker_ref.wav if we created it.
        # BUGFIX: the previous guard additionally required "dummy" to appear in
        # the file path, which never matches ('speaker_ref.wav'), so generated
        # dummy files were never removed. The explicit created_dummy_speaker
        # flag (set only on successful dummy creation) is reliable.
        if created_dummy_speaker and default_speaker_wav_path and default_speaker_wav_path.exists():
            try: default_speaker_wav_path.unlink(); ASCIIColors.info("Removed dummy speaker_ref.wav")
            except Exception: pass  # best-effort cleanup; never mask the test outcome

    ASCIIColors.yellow("\n--- XTTSBinding Test Finished ---")
|