lollms-client 1.3.3__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lollms-client might be problematic. Click here for more details.

@@ -1,317 +1,111 @@
1
- # lollms_client/tts_bindings/xtts/__init__.py
2
- import io
3
- import os
4
- from pathlib import Path
5
- from typing import Optional, List, Union, Dict, Any
6
-
7
- from ascii_colors import trace_exception, ASCIIColors
8
-
9
- # --- Package Management and Conditional Imports ---
10
- _xtts_deps_installed_with_correct_torch = False
11
- _xtts_installation_error = ""
12
- try:
13
- import pipmaster as pm
14
- import platform
15
-
16
- preferred_torch_device_for_install = "cpu"
17
- if platform.system() == "Linux" or platform.system() == "Windows":
18
- preferred_torch_device_for_install = "cuda"
19
- elif platform.system() == "Darwin":
20
- preferred_torch_device_for_install = "mps"
21
-
22
- torch_pkgs = ["torch", "torchaudio"] # TTS often needs torchaudio
23
- # Coqui-TTS has specific version requirements sometimes, ensure_packages handles this
24
- xtts_core_pkgs = ["TTS"]
25
- other_deps = ["scipy", "numpy", "soundfile"] # soundfile is often a TTS dependency
26
-
27
- torch_index_url = None
28
- if preferred_torch_device_for_install == "cuda":
29
- torch_index_url = "https://download.pytorch.org/whl/cu126"
30
- ASCIIColors.info(f"Attempting to ensure PyTorch with CUDA support (target index: {torch_index_url}) for XTTS binding.")
31
- pm.ensure_packages(torch_pkgs, index_url=torch_index_url)
32
- pm.ensure_packages(xtts_core_pkgs + other_deps)
33
- else:
34
- ASCIIColors.info("Ensuring PyTorch, Coqui-TTS, and dependencies using default PyPI index for XTTS binding.")
35
- pm.ensure_packages(torch_pkgs + xtts_core_pkgs + other_deps)
36
-
37
- import torch
38
- from TTS.api import TTS # Main Coqui TTS class
39
- import scipy.io.wavfile
40
- import numpy as np
41
- import soundfile as sf # For reading speaker_wav if not in standard wav
42
-
43
- _xtts_deps_installed_with_correct_torch = True
44
- except ImportError as e_imp: # Catch ImportError specifically if TTS itself fails
45
- _xtts_installation_error = f"ImportError: {e_imp}. Coqui TTS (TTS lib) might not be installed correctly or has missing dependencies."
46
- TTS, torch, scipy, np, sf = None, None, None, None, None
47
- except Exception as e:
48
- _xtts_installation_error = str(e)
49
- TTS, torch, scipy, np, sf = None, None, None, None, None
50
- # --- End Package Management ---
51
-
1
+ # File: lollms_client/tts_bindings/xtts/__init__.py
52
2
  from lollms_client.lollms_tts_binding import LollmsTTSBinding
53
-
54
- BindingName = "XTTSBinding"
55
-
56
- # Common XTTS model IDs from Coqui on Hugging Face
57
- # The primary one is usually "coqui/XTTS-v2" or similar official releases.
58
- # Users might also point to fine-tuned versions or local paths.
59
- XTTS_MODELS = [
60
- "tts_models/multilingual/multi-dataset/xtts_v2", # Standard XTTS v2 model string for Coqui TTS lib
61
- # "coqui/XTTS-v2" # This is the HF repo ID, TTS lib might map it or expect the above format
62
- ]
63
-
64
- # Supported languages by XTTS v2 (example, check latest Coqui docs)
65
- XTTS_SUPPORTED_LANGUAGES = [
66
- "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko", "hi"
67
- ]
68
-
69
- class XTTSBinding(LollmsTTSBinding):
70
- def __init__(self,
71
- model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2", # Coqui TTS model identifier
72
- default_speaker_wav: Optional[Union[str, Path]] = None, # Path to a reference WAV for default voice
73
- default_language: str = "en",
74
- device: Optional[str] = None,
75
- # Standard LollmsTTSBinding args
76
- host_address: Optional[str] = None,
77
- service_key: Optional[str] = None,
78
- verify_ssl_certificate: bool = True,
79
- **kwargs): # Catch-all for future TTS API changes or specific params
80
-
81
- super().__init__(binding_name="xtts")
82
-
83
- if not _xtts_deps_installed_with_correct_torch:
84
- raise ImportError(f"XTTS binding dependencies not met. Error: {_xtts_installation_error}")
85
-
86
- self.device = device
87
- if self.device is None:
88
- if torch.cuda.is_available(): self.device = "cuda"; ASCIIColors.info("CUDA device detected by PyTorch for XTTS.")
89
- elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): self.device = "mps"; ASCIIColors.info("MPS device detected for XTTS.")
90
- else: self.device = "cpu"; ASCIIColors.info("No GPU (CUDA/MPS) by PyTorch, using CPU for XTTS.")
91
- elif self.device == "cuda" and not torch.cuda.is_available(): self.device = "cpu"; ASCIIColors.warning("CUDA req, not avail. CPU for XTTS.")
92
- elif self.device == "mps" and not (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()): self.device = "cpu"; ASCIIColors.warning("MPS req, not avail. CPU for XTTS.")
93
-
94
- ASCIIColors.info(f"XTTSBinding: Using device '{self.device}'.")
95
-
96
- self.xtts_model_id_or_path = model_name # Store the model identifier passed by user
97
- self.loaded_xtts_model_id = None
98
- self.tts_model: Optional[TTS] = None
99
- self.default_speaker_wav = str(default_speaker_wav) if default_speaker_wav else None
100
- self.default_language = default_language
3
+ from typing import Optional, List
4
+ from pathlib import Path
5
+ import requests
6
+ import subprocess
7
+ import sys
8
+ import time
9
+ import pipmaster as pm
10
+
11
+ BindingName = "XTTSClientBinding"
12
+
13
+ class XTTSClientBinding(LollmsTTSBinding):
14
+ def __init__(self,
15
+ host: str = "localhost",
16
+ port: int = 8081,
17
+ auto_start_server: bool = True,
18
+ **kwargs):
101
19
 
102
- if self.default_speaker_wav and not Path(self.default_speaker_wav).exists():
103
- ASCIIColors.warning(f"Default speaker WAV not found: {self.default_speaker_wav}. Voice cloning will require a speaker_wav per call.")
104
- self.default_speaker_wav = None # Invalidate if not found
105
-
106
- self._load_xtts_model(self.xtts_model_id_or_path)
107
-
108
- def _load_xtts_model(self, model_id_to_load: str):
109
- if self.tts_model is not None and self.loaded_xtts_model_id == model_id_to_load:
110
- ASCIIColors.info(f"XTTS model '{model_id_to_load}' already loaded.")
111
- return
112
-
113
- ASCIIColors.info(f"Loading XTTS model: '{model_id_to_load}' on device '{self.device}'...")
114
- try:
115
- # TTS class handles model downloading from Hugging Face or loading from local path.
116
- # It also manages moving to the specified device.
117
- self.tts_model = TTS(model_name=model_id_to_load, progress_bar=True).to(self.device)
118
- self.loaded_xtts_model_id = model_id_to_load
119
- ASCIIColors.green(f"XTTS model '{model_id_to_load}' loaded successfully.")
120
- except Exception as e:
121
- self.tts_model = None; self.loaded_xtts_model_id = None
122
- ASCIIColors.error(f"Failed to load XTTS model '{model_id_to_load}': {e}"); trace_exception(e)
123
- raise RuntimeError(f"Failed to load XTTS model '{model_id_to_load}'") from e
124
-
125
- def generate_audio(self,
126
- text: str,
127
- voice: Optional[Union[str, Path]] = None, # Path to speaker WAV for XTTS
128
- language: Optional[str] = None,
129
- # XTTS specific parameters (can be passed via kwargs)
130
- # speed: float = 1.0, # Not directly in XTTS v2 tts() method's main signature
131
- **kwargs) -> bytes:
132
- if self.tts_model is None:
133
- raise RuntimeError("XTTS model not loaded.")
134
-
135
- speaker_wav_path = voice if voice is not None else self.default_speaker_wav
136
- effective_language = language if language is not None else self.default_language
137
-
138
- if not speaker_wav_path:
139
- raise ValueError("XTTS requires a 'speaker_wav' path for voice cloning. Provide it in the 'voice' argument or set 'default_speaker_wav' during initialization.")
20
+ binding_name = "xtts"
21
+ super().__init__(binding_name=binding_name, **kwargs)
22
+ self.host = host
23
+ self.port = port
24
+ self.auto_start_server = auto_start_server
25
+ self.server_process = None
26
+ self.base_url = f"http://{self.host}:{self.port}"
27
+
28
+ if self.auto_start_server:
29
+ self.start_server()
30
+
31
+ def start_server(self):
32
+ print("XTTS Client: Starting dedicated server...")
33
+ binding_root = Path(__file__).parent
34
+ server_dir = binding_root / "server"
35
+ requirements_file = server_dir / "requirements.txt"
36
+ server_script = server_dir / "main.py"
37
+
38
+ # 1. Ensure a virtual environment and dependencies
39
+ venv_path = server_dir / "venv"
40
+ pm_v = pm.PackageManager(venv_path=venv_path)
41
+ pm_v.ensure_requirements(str(requirements_file))
42
+
43
+ # 2. Get the python executable from the venv
44
+ if sys.platform == "win32":
45
+ python_executable = venv_path / "Scripts" / "python.exe"
46
+ else:
47
+ python_executable = venv_path / "bin" / "python"
48
+
49
+ # 3. Launch the server as a subprocess with stdout/stderr forwarded to console
50
+ command = [
51
+ str(python_executable),
52
+ str(server_script),
53
+ "--host", self.host,
54
+ "--port", str(self.port)
55
+ ]
140
56
 
141
- speaker_wav_p = Path(speaker_wav_path)
142
- if not speaker_wav_p.exists():
143
- raise FileNotFoundError(f"Speaker WAV file not found: {speaker_wav_path}")
57
+ # Forward stdout and stderr to the parent process console
58
+ self.server_process = subprocess.Popen(
59
+ command,
60
+ stdout=None, # Inherit parent's stdout (shows in console)
61
+ stderr=None, # Inherit parent's stderr (shows in console)
62
+ )
144
63
 
145
- if effective_language not in XTTS_SUPPORTED_LANGUAGES:
146
- ASCIIColors.warning(f"Language '{effective_language}' might not be officially supported by XTTS v2. "
147
- f"Known supported: {XTTS_SUPPORTED_LANGUAGES}. Attempting anyway.")
64
+ # 4. Wait for the server to be ready
65
+ self._wait_for_server()
148
66
 
149
- ASCIIColors.info(f"Generating speech with XTTS: '{text[:60]}...' (Speaker: {speaker_wav_p.name}, Lang: {effective_language})")
67
+ def _wait_for_server(self, timeout=60):
68
+ start_time = time.time()
69
+ while time.time() - start_time < timeout:
70
+ try:
71
+ response = requests.get(f"{self.base_url}/status")
72
+ if response.status_code == 200 and response.json().get("status") == "running":
73
+ print("XTTS Server is up and running.")
74
+ return
75
+ except requests.ConnectionError:
76
+ time.sleep(1)
150
77
 
151
- try:
152
- # The tts() method returns a NumPy array (waveform)
153
- # It expects speaker_wav and language as direct arguments.
154
- # Other TTS generation parameters might be available via model's config or specific methods.
155
- # For XTTS, common ones like speed are handled internally or via config.
156
- # We can pass other kwargs if the TTS library might pick them up for specific models.
157
-
158
- # XTTS's tts() returns list of ints (scaled PCM), not float numpy array directly
159
- wav_array_int_list = self.tts_model.tts(
160
- text=text,
161
- speaker_wav=str(speaker_wav_path), # Must be a string path
162
- language=effective_language,
163
- # split_sentences=True, # Default True, good for longer texts
164
- **kwargs # Pass other potential TTS lib args
165
- )
166
-
167
- if not wav_array_int_list: # Check if list is empty
168
- raise RuntimeError("XTTS model returned empty audio data (list of ints was empty).")
169
-
170
- # Convert list of ints to a NumPy array of int16
171
- # The TTS library usually returns samples scaled appropriately for int16.
172
- audio_array_np = np.array(wav_array_int_list, dtype=np.int16)
173
-
174
-
175
- if audio_array_np.ndim == 0 or audio_array_np.size == 0: # Double check after conversion
176
- raise RuntimeError("XTTS model resulted in empty NumPy audio array.")
177
-
178
-
179
- buffer = io.BytesIO()
180
- # Get sample rate from the loaded TTS model's config
181
- sample_rate = self.tts_model.synthesizer.output_sample_rate if hasattr(self.tts_model, 'synthesizer') and hasattr(self.tts_model.synthesizer, 'output_sample_rate') else 24000 # XTTS v2 default is 24kHz
182
-
183
- scipy.io.wavfile.write(buffer, rate=sample_rate, data=audio_array_np)
184
- audio_bytes = buffer.getvalue()
185
- buffer.close()
186
-
187
- ASCIIColors.green("XTTS audio generation successful.")
188
- return audio_bytes
189
- except Exception as e:
190
- ASCIIColors.error(f"XTTS audio generation failed: {e}"); trace_exception(e)
191
- if "out of memory" in str(e).lower() and self.device == "cuda":
192
- ASCIIColors.yellow("CUDA out of memory. Ensure GPU has sufficient VRAM for XTTS (can be several GB).")
193
- raise RuntimeError(f"XTTS audio generation error: {e}") from e
194
-
195
- def list_voices(self, **kwargs) -> List[str]:
196
- """
197
- For XTTS, voices are determined by the `speaker_wav` file.
198
- This method returns a message or an empty list, as there are no predefined voices.
199
- Optionally, one could implement scanning a user-defined directory of speaker WAVs.
200
- """
201
- # return ["Dynamic (provide 'speaker_wav' path to generate_audio)"]
202
- ASCIIColors.info("XTTS voices are dynamic and determined by the 'speaker_wav' file provided during generation.")
203
- ASCIIColors.info("You can provide a path to any reference WAV file for voice cloning.")
204
- return [] # Or provide a helper message as above in a different way
205
-
206
- def get_xtts_model_ids(self) -> List[str]:
207
- """Helper to list known XTTS model identifiers for Coqui TTS library."""
208
- return XTTS_MODELS.copy()
78
+ self.stop_server()
79
+ raise RuntimeError("Failed to start the XTTS server in the specified timeout.")
80
+
81
+ def stop_server(self):
82
+ if self.server_process:
83
+ print("XTTS Client: Stopping dedicated server...")
84
+ self.server_process.terminate()
85
+ self.server_process.wait()
86
+ self.server_process = None
87
+ print("Server stopped.")
209
88
 
210
- def get_supported_languages(self) -> List[str]:
211
- """Helper to list known supported languages for XTTS v2."""
212
- return XTTS_SUPPORTED_LANGUAGES.copy()
213
-
214
-
215
89
  def __del__(self):
216
- if hasattr(self, 'tts_model') and self.tts_model is not None:
217
- del self.tts_model; self.tts_model = None
218
- if torch and hasattr(torch, 'cuda') and torch.cuda.is_available():
219
- torch.cuda.empty_cache()
220
- loaded_name = getattr(self, 'loaded_xtts_model_id', None)
221
- msg = f"XTTSBinding for model '{loaded_name}' destroyed." if loaded_name else "XTTSBinding destroyed."
222
- ASCIIColors.info(msg)
90
+ # Ensure the server is stopped when the object is destroyed
91
+ self.stop_server()
223
92
 
224
- # --- Main Test Block ---
225
- if __name__ == '__main__':
226
- if not _xtts_deps_installed_with_correct_torch:
227
- print(f"{ASCIIColors.RED}XTTS binding dependencies not met. Skipping tests. Error: {_xtts_installation_error}{ASCIIColors.RESET}")
228
- exit()
93
+ def generate_audio(self, text: str, voice: Optional[str] = None, **kwargs) -> bytes:
94
+ """Generate audio by calling the server's API"""
95
+ payload = {"text": text, "voice": voice, **kwargs}
96
+ response = requests.post(f"{self.base_url}/generate_audio", json=payload)
97
+ response.raise_for_status()
98
+ return response.content
229
99
 
230
- ASCIIColors.yellow("--- XTTSBinding Test ---")
231
- # For XTTS, model_name is the Coqui TTS model string or HF repo ID if supported by TTS lib directly
232
- test_xtts_model_id = "tts_models/multilingual/multi-dataset/xtts_v2"
233
- test_output_dir = Path("./test_xtts_output")
234
- test_output_dir.mkdir(exist_ok=True)
235
-
236
- # --- IMPORTANT: Create or provide a speaker reference WAV file ---
237
- # For this test to work, you need a short (~5-15 seconds) clean audio file of a voice.
238
- # Name it 'speaker_ref.wav' and place it in the same directory as this script,
239
- # or update the path below.
240
- default_speaker_wav_path = Path(__file__).parent / "speaker_ref.wav" # Assumes it's next to this __init__.py
241
-
242
- if not default_speaker_wav_path.exists():
243
- ASCIIColors.warning(f"Reference speaker WAV file not found: {default_speaker_wav_path}")
244
- ASCIIColors.warning("Please create/place a 'speaker_ref.wav' (clean, ~5-15s audio) in the "
245
- f"'{default_speaker_wav_path.parent}' directory for the test to run properly.")
246
- # Attempt to create a very basic dummy if scipy available, NOT suitable for good cloning
247
- try:
248
- import numpy as np; import scipy.io.wavfile
249
- samplerate = 22050; duration = 2; frequency = 440
250
- t = np.linspace(0., duration, int(samplerate * duration), endpoint=False)
251
- data = (np.iinfo(np.int16).max * 0.1 * np.sin(2. * np.pi * frequency * t)).astype(np.int16)
252
- scipy.io.wavfile.write(default_speaker_wav_path, samplerate, data)
253
- ASCIIColors.info(f"Created a VERY BASIC dummy 'speaker_ref.wav'. Replace with a real voice sample for good results.")
254
- except Exception as e_dummy_spk:
255
- ASCIIColors.error(f"Could not create dummy speaker_ref.wav: {e_dummy_spk}. Test will likely fail or use no speaker.")
256
- default_speaker_wav_path = None # Ensure it's None if creation failed
257
-
258
- tts_binding = None
259
- try:
260
- ASCIIColors.cyan(f"\n--- Initializing XTTSBinding (XTTS Model: '{test_xtts_model_id}') ---")
261
- tts_binding = XTTSBinding(
262
- model_name=test_xtts_model_id,
263
- default_speaker_wav=str(default_speaker_wav_path) if default_speaker_wav_path else None,
264
- default_language="en"
265
- )
266
-
267
- ASCIIColors.cyan("\n--- Listing XTTS 'voices' (dynamic, requires speaker_wav) ---")
268
- voices = tts_binding.list_voices(); # This will print an informational message
269
-
270
- ASCIIColors.cyan("\n--- Listing known XTTS model IDs for Coqui TTS library ---")
271
- xtts_models = tts_binding.get_xtts_model_ids(); print(f"Known XTTS model IDs: {xtts_models}")
272
- ASCIIColors.cyan("\n--- Listing known XTTS supported languages ---")
273
- langs = tts_binding.get_supported_languages(); print(f"Supported languages (example): {langs[:5]}...")
274
-
275
-
276
- texts_to_synthesize = [
277
- ("english_greeting", "Hello, this is a test of the XTTS voice synthesis system. I hope you like my voice!", "en"),
278
- ("spanish_question", "¿Cómo estás hoy? Espero que tengas un día maravilloso.", "es"),
279
- # ("short_custom_voice", "This voice should sound like your reference audio.", "en", "path/to/your/custom_speaker.wav"), # Example for custom
280
- ]
281
- if not default_speaker_wav_path: # If no default speaker, we can't run text loop as is
282
- ASCIIColors.error("No default_speaker_wav available. Skipping synthesis loop.")
283
- texts_to_synthesize = []
284
-
285
-
286
- for name, text, lang, *speaker_override_list in texts_to_synthesize:
287
- speaker_to_use = speaker_override_list[0] if speaker_override_list else None # Uses binding default if None
288
-
289
- ASCIIColors.cyan(f"\n--- Synthesizing TTS for: '{name}' (Lang: {lang}, Speaker: {speaker_to_use or tts_binding.default_speaker_wav}) ---")
290
- print(f"Text: {text}")
291
- try:
292
- # XTTS tts() doesn't have as many direct generation params as Bark's generate()
293
- # Control is more via the model config or specific methods if available.
294
- audio_bytes = tts_binding.generate_audio(text, voice=speaker_to_use, language=lang)
295
- if audio_bytes:
296
- output_filename = f"tts_{name}_{tts_binding.loaded_xtts_model_id.replace('/','_')}.wav"
297
- output_path = test_output_dir / output_filename
298
- with open(output_path, "wb") as f: f.write(audio_bytes)
299
- ASCIIColors.green(f"TTS for '{name}' saved to: {output_path} ({len(audio_bytes) / 1024:.2f} KB)")
300
- else: ASCIIColors.error(f"TTS generation for '{name}' returned empty bytes.")
301
- except Exception as e_gen: ASCIIColors.error(f"Failed to generate TTS for '{name}': {e_gen}")
302
-
303
- except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}")
304
- except RuntimeError as e_rt: ASCIIColors.error(f"Runtime error: {e_rt}")
305
- except Exception as e: ASCIIColors.error(f"Unexpected error: {e}"); trace_exception(e)
306
- finally:
307
- if tts_binding: del tts_binding
308
- ASCIIColors.info(f"Test TTS audio (if any) are in: {test_output_dir.resolve()}")
309
- print(f"{ASCIIColors.YELLOW}Check the audio files in '{test_output_dir.resolve()}'!{ASCIIColors.RESET}")
310
- # Clean up dummy speaker_ref.wav if we created it
311
- if "samplerate" in locals() and default_speaker_wav_path and default_speaker_wav_path.name == "speaker_ref.wav" and "dummy" in str(default_speaker_wav_path).lower():
312
- if default_speaker_wav_path.exists():
313
- try: default_speaker_wav_path.unlink(); ASCIIColors.info("Removed dummy speaker_ref.wav")
314
- except: pass
100
+ def list_voices(self, **kwargs) -> List[str]:
101
+ """Get available voices from the server"""
102
+ response = requests.get(f"{self.base_url}/list_voices")
103
+ response.raise_for_status()
104
+ return response.json().get("voices", [])
315
105
 
106
+ def list_models(self, **kwargs) -> List[str]:
107
+ """Get available models from the server"""
108
+ response = requests.get(f"{self.base_url}/list_models")
109
+ response.raise_for_status()
110
+ return response.json().get("models", [])
316
111
 
317
- ASCIIColors.yellow("\n--- XTTSBinding Test Finished ---")