abstractvoice-0.5.1-py3-none-any.whl → abstractvoice-0.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2403 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.1.dist-info/METADATA +213 -0
- abstractvoice-0.6.1.dist-info/RECORD +52 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.1.dist-info/METADATA +0 -1458
- abstractvoice-0.5.1.dist-info/RECORD +0 -23
- abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/voice_manager.py
CHANGED
@@ -1,1065 +1,10 @@
-"""
+"""Public `VoiceManager` façade.
 
-
-
-
-try:
-    from .tts import TTSEngine
-    return TTSEngine
-except ImportError as e:
-    if "TTS" in str(e) or "torch" in str(e) or "librosa" in str(e):
-        raise ImportError(
-            "TTS functionality requires optional dependencies. Install with:\n"
-            " pip install abstractvoice[tts] # For TTS only\n"
-            " pip install abstractvoice[all] # For all features\n"
-            f"Original error: {e}"
-        ) from e
-    raise
+Implementation is split into small focused modules under `abstractvoice/vm/`
+to keep files readable and responsibilities clear.
+"""
 
-
-"""Import VoiceRecognizer with helpful error message if dependencies missing."""
-try:
-    from .recognition import VoiceRecognizer
-    return VoiceRecognizer
-except ImportError as e:
-    if "whisper" in str(e) or "tiktoken" in str(e):
-        raise ImportError(
-            "Speech recognition functionality requires optional dependencies. Install with:\n"
-            " pip install abstractvoice[stt] # For speech recognition only\n"
-            " pip install abstractvoice[all] # For all features\n"
-            f"Original error: {e}"
-        ) from e
-    raise
+from .vm.manager import VoiceManager
 
+__all__ = ["VoiceManager"]
 
-class VoiceManager:
-    """Main class for voice interaction capabilities with multilingual support."""
-
-    # Smart language configuration - high quality stable defaults
-    LANGUAGES = {
-        'en': {
-            'default': 'tts_models/en/ljspeech/tacotron2-DDC', # Reliable, compatible voice
-            'premium': 'tts_models/en/ljspeech/vits', # High quality (requires espeak)
-            'name': 'English'
-        },
-        'fr': {
-            'default': 'tts_models/fr/css10/vits', # High quality cleaner audio
-            'premium': 'tts_models/fr/css10/vits', # Use same stable model
-            'name': 'French'
-        },
-        'es': {
-            'default': 'tts_models/es/mai/tacotron2-DDC', # Keep stable Spanish model
-            'premium': 'tts_models/es/mai/tacotron2-DDC', # Same model (reliable)
-            'name': 'Spanish'
-        },
-        'de': {
-            'default': 'tts_models/de/thorsten/vits', # High quality German
-            'premium': 'tts_models/de/thorsten/vits', # Use same stable model
-            'name': 'German'
-        },
-        'it': {
-            'default': 'tts_models/it/mai_male/vits', # Use slower male voice as default
-            'premium': 'tts_models/it/mai_male/vits', # Same stable model
-            'name': 'Italian'
-        }
-    }
-
-    # Universal safe fallback
-    SAFE_FALLBACK = 'tts_models/en/ljspeech/fast_pitch'
-
-    # Complete voice catalog with metadata
-    VOICE_CATALOG = {
-        'en': {
-            'tacotron2': {
-                'model': 'tts_models/en/ljspeech/tacotron2-DDC',
-                'quality': 'good',
-                'gender': 'female',
-                'accent': 'US English',
-                'license': 'Open source (LJSpeech)',
-                'requires': 'none'
-            },
-            'jenny': {
-                'model': 'tts_models/en/jenny/jenny',
-                'quality': 'excellent',
-                'gender': 'female',
-                'accent': 'US English',
-                'license': 'Open source (Jenny)',
-                'requires': 'none'
-            },
-            'ek1': {
-                'model': 'tts_models/en/ek1/tacotron2',
-                'quality': 'excellent',
-                'gender': 'male',
-                'accent': 'British English',
-                'license': 'Open source (EK1)',
-                'requires': 'none'
-            },
-            'sam': {
-                'model': 'tts_models/en/sam/tacotron-DDC',
-                'quality': 'good',
-                'gender': 'male',
-                'accent': 'US English',
-                'license': 'Open source (Sam)',
-                'requires': 'none'
-            },
-            'fast_pitch': {
-                'model': 'tts_models/en/ljspeech/fast_pitch',
-                'quality': 'good',
-                'gender': 'female',
-                'accent': 'US English',
-                'license': 'Open source (LJSpeech)',
-                'requires': 'none'
-            },
-            'vits': {
-                'model': 'tts_models/en/ljspeech/vits',
-                'quality': 'premium',
-                'gender': 'female',
-                'accent': 'US English',
-                'license': 'Open source (LJSpeech)',
-                'requires': 'espeak-ng'
-            }
-        },
-        'fr': {
-            'css10_vits': {
-                'model': 'tts_models/fr/css10/vits',
-                'quality': 'premium',
-                'gender': 'male',
-                'accent': 'France French',
-                'license': 'Apache 2.0 (CSS10/LibriVox)',
-                'requires': 'espeak-ng'
-            },
-            'mai_tacotron': {
-                'model': 'tts_models/fr/mai/tacotron2-DDC',
-                'quality': 'good',
-                'gender': 'female',
-                'accent': 'France French',
-                'license': 'Permissive (M-AILABS/LibriVox)',
-                'requires': 'none'
-            }
-        },
-        'es': {
-            'mai_tacotron': {
-                'model': 'tts_models/es/mai/tacotron2-DDC',
-                'quality': 'good',
-                'gender': 'female',
-                'accent': 'Spain Spanish',
-                'license': 'Permissive (M-AILABS)',
-                'requires': 'none'
-            }
-        },
-        'de': {
-            'thorsten_vits': {
-                'model': 'tts_models/de/thorsten/vits',
-                'quality': 'premium',
-                'gender': 'male',
-                'accent': 'Standard German',
-                'license': 'Open source (Thorsten)',
-                'requires': 'espeak-ng'
-            },
-            'thorsten_tacotron': {
-                'model': 'tts_models/de/thorsten/tacotron2-DDC',
-                'quality': 'good',
-                'gender': 'male',
-                'accent': 'Standard German',
-                'license': 'Open source (Thorsten)',
-                'requires': 'none'
-            }
-        },
-        'it': {
-            'mai_male_vits': {
-                'model': 'tts_models/it/mai_male/vits',
-                'quality': 'premium',
-                'gender': 'male',
-                'accent': 'Standard Italian',
-                'license': 'Permissive (M-AILABS)',
-                'requires': 'espeak-ng',
-                'speed': 0.8 # Slow down to fix pace issues
-            },
-            'mai_female_vits': {
-                'model': 'tts_models/it/mai_female/vits',
-                'quality': 'premium',
-                'gender': 'female',
-                'accent': 'Standard Italian',
-                'license': 'Permissive (M-AILABS)',
-                'requires': 'espeak-ng',
-                'speed': 0.8 # Slow down to fix pace issues
-            }
-        }
-    }
-
-    def __init__(self, language='en', tts_model=None, whisper_model="tiny", debug_mode=False):
-        """Initialize the Voice Manager with language support.
-
-        Args:
-            language: Language code ('en', 'fr', 'es', 'de', 'it')
-            tts_model: Specific TTS model name or None for language default
-            whisper_model: Whisper model name to use
-            debug_mode: Enable debug logging
-        """
-        self.debug_mode = debug_mode
-        self.speed = 1.0
-
-        # Validate and set language
-        language = language.lower()
-        if language not in self.LANGUAGES:
-            if debug_mode:
-                available = ', '.join(self.LANGUAGES.keys())
-                print(f"⚠️ Unsupported language '{language}', using English. Available: {available}")
-            language = 'en'
-        self.language = language
-
-        # Select TTS model with smart detection
-        if tts_model is None:
-            tts_model = self._select_best_model(self.language)
-            if debug_mode:
-                lang_name = self.LANGUAGES[self.language]['name']
-                print(f"🌍 Using {lang_name} voice: {tts_model}")
-
-        # Initialize TTS engine with instant setup for new users
-        from .instant_setup import ensure_instant_tts, get_instant_model, is_model_cached
-
-        # If using default VITS model but it's not cached, use instant setup
-        if tts_model == "tts_models/en/ljspeech/vits" and not is_model_cached(tts_model):
-            if debug_mode:
-                print("🚀 First-time setup: ensuring instant TTS availability...")
-
-            # Try instant setup with lightweight model
-            if ensure_instant_tts():
-                tts_model = get_instant_model() # Use fast_pitch instead
-                if debug_mode:
-                    print(f"✅ Using essential model: {tts_model}")
-
-        # Initialize TTS engine using lazy import
-        TTSEngine = _import_tts_engine()
-        self.tts_engine = TTSEngine(
-            model_name=tts_model,
-            debug_mode=debug_mode
-        )
-
-        # Set up callbacks to pause/resume voice recognition during TTS playback
-        # This prevents the system from interrupting its own speech
-        self.tts_engine.on_playback_start = self._on_tts_start
-        self.tts_engine.on_playback_end = self._on_tts_end
-
-        # NEW: Enhanced audio lifecycle callbacks (v0.5.1)
-        self.on_audio_start = None # Called when first audio sample plays
-        self.on_audio_end = None # Called when last audio sample finishes
-        self.on_audio_pause = None # Called when audio is paused
-        self.on_audio_resume = None # Called when audio is resumed
-
-        # Wire callbacks directly to audio player (skip TTSEngine layer)
-        self.tts_engine.audio_player.on_audio_start = self._on_audio_start
-        self.tts_engine.audio_player.on_audio_end = self._on_audio_end
-        self.tts_engine.audio_player.on_audio_pause = self._on_audio_pause
-        self.tts_engine.audio_player.on_audio_resume = self._on_audio_resume
-
-        # Voice recognizer is initialized on demand
-        self.voice_recognizer = None
-        self.whisper_model = whisper_model
-
-        # State tracking
-        self._transcription_callback = None
-        self._stop_callback = None
-        self._voice_mode = "full" # full, wait, stop, ptt
-
-    def _on_tts_start(self):
-        """Called when TTS playback starts - handle based on voice mode."""
-        if not self.voice_recognizer:
-            return
-
-        if self._voice_mode == "full":
-            # Full mode: Keep listening but pause interrupt capability
-            self.voice_recognizer.pause_tts_interrupt()
-        elif self._voice_mode in ["wait", "stop", "ptt"]:
-            # Wait/Stop/PTT modes: Pause listening entirely during TTS
-            self.voice_recognizer.pause_listening()
-
-    def _on_tts_end(self):
-        """Called when TTS playback ends - handle based on voice mode."""
-        if not self.voice_recognizer:
-            return
-
-        if self._voice_mode == "full":
-            # Full mode: Resume interrupt capability
-            self.voice_recognizer.resume_tts_interrupt()
-        elif self._voice_mode in ["wait", "stop", "ptt"]:
-            # Wait/Stop/PTT modes: Resume listening
-            self.voice_recognizer.resume_listening()
-
-    def speak(self, text, speed=1.0, callback=None):
-        """Convert text to speech and play audio.
-
-        Args:
-            text: Text to convert to speech
-            speed: Speech speed (0.5-2.0)
-            callback: Function to call when speech completes
-
-        Returns:
-            True if speech started, False otherwise
-        """
-        sp = 1.0
-        if speed != 1.0:
-            sp = speed
-        else:
-            sp = self.speed
-
-        return self.tts_engine.speak(text, sp, callback)
-
-    def stop_speaking(self):
-        """Stop current speech playback.
-
-        Returns:
-            True if stopped, False if no playback was active
-        """
-        return self.tts_engine.stop()
-
-    def pause_speaking(self):
-        """Pause current speech playback.
-
-        Pauses at chunk boundaries in streaming mode. Can be resumed with resume_speaking().
-
-        Returns:
-            True if paused, False if no playback was active
-        """
-        return self.tts_engine.pause()
-
-    def resume_speaking(self):
-        """Resume paused speech playback.
-
-        Returns:
-            True if resumed, False if not paused or no playback active
-        """
-        return self.tts_engine.resume()
-
-    def is_paused(self):
-        """Check if TTS is currently paused.
-
-        Returns:
-            True if paused, False otherwise
-        """
-        return self.tts_engine.is_paused()
-
-    def is_speaking(self):
-        """Check if TTS is currently active.
-
-        Returns:
-            True if speaking, False otherwise
-        """
-        return self.tts_engine.is_active()
-
-    def listen(self, on_transcription, on_stop=None):
-        """Start listening for speech with callbacks.
-
-        Args:
-            on_transcription: Callback for transcribed text
-            on_stop: Callback when 'stop' command detected
-
-        Returns:
-            True if started, False if already listening
-        """
-        # Store callbacks
-        self._transcription_callback = on_transcription
-        self._stop_callback = on_stop
-
-        # Initialize recognizer if not already done
-        if not self.voice_recognizer:
-            def _transcription_handler(text):
-                if self._transcription_callback:
-                    self._transcription_callback(text)
-
-            def _stop_handler():
-                # Stop listening
-                self.stop_listening()
-                # Call user's stop callback if provided
-                if self._stop_callback:
-                    self._stop_callback()
-
-            # Use lazy import for VoiceRecognizer
-            VoiceRecognizer = _import_voice_recognizer()
-            self.voice_recognizer = VoiceRecognizer(
-                transcription_callback=_transcription_handler,
-                stop_callback=_stop_handler,
-                whisper_model=self.whisper_model,
-                debug_mode=self.debug_mode
-            )
-
-        # Start with TTS interrupt capability
-        return self.voice_recognizer.start(
-            tts_interrupt_callback=self.stop_speaking
-        )
-
-    def stop_listening(self):
-        """Stop listening for speech.
-
-        Returns:
-            True if stopped, False if not listening
-        """
-        if self.voice_recognizer:
-            return self.voice_recognizer.stop()
-        return False
-
-    def is_listening(self):
-        """Check if currently listening for speech.
-
-        Returns:
-            True if listening, False otherwise
-        """
-        return self.voice_recognizer and self.voice_recognizer.is_running
-
-    def set_voice_mode(self, mode):
-        """Set the voice mode (full, wait, stop, ptt).
-
-        Args:
-            mode: Voice mode to use
-
-        Returns:
-            True if successful
-        """
-        if mode in ["full", "wait", "stop", "ptt"]:
-            self._voice_mode = mode
-            return True
-        return False
-
-    def set_speed(self, speed):
-        """Set the TTS speed.
-
-        Args:
-            speed: Speech speed multiplier (0.5-2.0)
-
-        Returns:
-            True if successful
-        """
-        self.speed = speed
-        return True
-
-    def get_speed(self):
-        """Get the TTS speed.
-
-        Returns:
-            Current TTS speed multiplier
-        """
-        return self.speed
-
-    def set_tts_model(self, model_name):
-        """Change the TTS model safely without memory conflicts.
-
-        Available models (all pure Python, cross-platform):
-        - "tts_models/en/ljspeech/fast_pitch" (default, recommended)
-        - "tts_models/en/ljspeech/glow-tts" (alternative)
-        - "tts_models/en/ljspeech/tacotron2-DDC" (legacy)
-
-        Args:
-            model_name: TTS model name to use
-
-        Returns:
-            True if successful
-
-        Example:
-            vm.set_tts_model("tts_models/en/ljspeech/glow-tts")
-        """
-        # Stop any current speech
-        self.stop_speaking()
-
-        # CRITICAL: Crash-safe cleanup of old TTS engine
-        if hasattr(self, 'tts_engine') and self.tts_engine:
-            try:
-                # Stop all audio and cleanup player
-                if hasattr(self.tts_engine, 'audio_player') and self.tts_engine.audio_player:
-                    # Try stop method if available
-                    if hasattr(self.tts_engine.audio_player, 'stop'):
-                        self.tts_engine.audio_player.stop()
-                    self.tts_engine.audio_player.cleanup()
-
-                # Force cleanup of TTS object and release GPU memory
-                if hasattr(self.tts_engine, 'tts') and self.tts_engine.tts:
-                    # Clear CUDA cache if using GPU
-                    try:
-                        import torch
-                        if torch.cuda.is_available():
-                            torch.cuda.empty_cache()
-                    except:
-                        pass
-
-                    del self.tts_engine.tts
-
-                # Clear the engine itself
-                del self.tts_engine
-                self.tts_engine = None
-
-                # Force garbage collection to prevent memory leaks
-                import gc
-                gc.collect()
-
-            except Exception as e:
-                if self.debug_mode:
-                    print(f"Warning: TTS cleanup issue: {e}")
-                # Force clear even if cleanup failed
-                self.tts_engine = None
-
-        # Reinitialize TTS engine with new model using lazy import
-        TTSEngine = _import_tts_engine()
-        self.tts_engine = TTSEngine(
-            model_name=model_name,
-            debug_mode=self.debug_mode
-        )
-
-        # Restore callbacks
-        self.tts_engine.on_playback_start = self._on_tts_start
-        self.tts_engine.on_playback_end = self._on_tts_end
-
-        return True
-
-    def set_whisper(self, model_name):
-        """Set the Whisper model.
-
-        Args:
-            whisper_model: Whisper model name (tiny, base, etc.)
-
-        Returns:
-            True if successful
-        """
-        self.whisper_model = model_name
-        if self.voice_recognizer:
-            return self.voice_recognizer.change_whisper_model(model_name)
-
-    def get_whisper(self):
-        """Get the Whisper model.
-
-        Returns:
-            Current Whisper model name
-        """
-        return self.whisper_model
-
-    def set_language(self, language):
-        """Set the voice language.
-
-        Args:
-            language: Language code ('en', 'fr', 'es', 'de', 'it')
-
-        Returns:
-            True if successful, False otherwise
-        """
-        # Validate language
-        language = language.lower()
-        if language not in self.LANGUAGES:
-            if self.debug_mode:
-                available = ', '.join(self.LANGUAGES.keys())
-                print(f"⚠️ Unsupported language '{language}'. Available: {available}")
-            return False
-
-        # Skip if already using this language
-        if language == self.language:
-            if self.debug_mode:
-                print(f"✓ Already using {self.LANGUAGES[language]['name']} voice")
-            return True
-
-        # Stop any current operations
-        self.stop_speaking()
-        if self.voice_recognizer:
-            self.voice_recognizer.stop()
-
-        # Select best model for this language
-        selected_model = self._select_best_model(language)
-
-        # CRITICAL FIX: Check if model is available, download if not
-        from .instant_setup import is_model_cached
-        from .simple_model_manager import download_model
-
-        if not is_model_cached(selected_model):
-            if self.debug_mode:
-                print(f"📥 Model {selected_model} not cached, downloading...")
-
-            # Try to download the model
-            success = download_model(selected_model)
-            if not success:
-                if self.debug_mode:
-                    print(f"❌ Failed to download {selected_model}")
-                # If download fails and it's not English, we have a problem
-                if language != 'en':
-                    print(f"❌ Cannot switch to {self.LANGUAGES[language]['name']}: Model download failed")
-                    print(f" Try: abstractvoice download-models --language {language}")
-                    return False
-
-        models_to_try = [selected_model]
-
-        # Only add fallback if it's different from selected
-        if selected_model != self.SAFE_FALLBACK:
-            models_to_try.append(self.SAFE_FALLBACK)
-
-        for model_name in models_to_try:
-            try:
-                if self.debug_mode:
-                    lang_name = self.LANGUAGES[language]['name']
-                    print(f"🌍 Loading {lang_name} voice: {model_name}")
-
-                # Reinitialize TTS engine
-                TTSEngine = _import_tts_engine()
-                self.tts_engine = TTSEngine(model_name=model_name, debug_mode=self.debug_mode)
-
-                # Restore callbacks
-                self.tts_engine.on_playback_start = self._on_tts_start
-                self.tts_engine.on_playback_end = self._on_tts_end
-
-                # Update language and set appropriate speed for Italian voices
-                self.language = language
-
-                # Set language-specific speed adjustments
-                if language == 'it':
-                    self.speed = 0.8 # Slow down Italian voices to fix pace issues
-                    if self.debug_mode:
-                        print(f" Speed: {self.speed} (adjusted for optimal Italian pace)")
-                else:
-                    self.speed = 1.0 # Default speed for other languages
-
-                return True
-
-            except Exception as e:
-                if self.debug_mode:
-                    print(f"⚠️ Model {model_name} failed to load: {e}")
-                # Don't silently continue - report the failure
-                if model_name == selected_model and language != 'en':
-                    print(f"❌ Failed to load {lang_name} voice model")
-                    print(f" The model might be corrupted. Try:")
-                    print(f" abstractvoice download-models --language {language}")
-                continue
-
-        # All models failed
-        print(f"❌ Cannot switch to {self.LANGUAGES[language]['name']}: No working models")
-        return False
-
-    def get_language(self):
-        """Get the current voice language.
-
-        Returns:
-            Current language code
-        """
-        return self.language
-
-    def get_supported_languages(self):
-        """Get list of supported language codes.
-
-        Returns:
-            List of supported language codes
-        """
-        return list(self.LANGUAGES.keys())
-
-    def get_language_name(self, language_code=None):
-        """Get the display name for a language.
-
-        Args:
-            language_code: Language code (defaults to current language)
-
-        Returns:
-            Language display name
-        """
-        lang = language_code or self.language
-        return self.LANGUAGES.get(lang, {}).get('name', lang)
-
-    def _select_best_model(self, language):
-        """Select the best available TTS model for a language.
-
-        Try premium model first (higher quality), fallback to default (reliable).
-
-        Args:
-            language: Language code
-
-        Returns:
-            Model name string
-        """
-        if language not in self.LANGUAGES:
-            return self.SAFE_FALLBACK
-
-        lang_config = self.LANGUAGES[language]
-
-        # Try premium model first (better quality)
-        if 'premium' in lang_config:
-            try:
-                premium_model = lang_config['premium']
-                # Quick test to see if this model type works
-                if self._test_model_compatibility(premium_model):
-                    if self.debug_mode:
-                        print(f"✨ Using premium quality model: {premium_model}")
-                    return premium_model
-                elif self.debug_mode:
-                    print(f"⚠️ Premium model not compatible, using default")
-            except Exception:
-                if self.debug_mode:
-                    print(f"⚠️ Premium model failed, using default")
-
-        # Use reliable default model
-        default_model = lang_config.get('default', self.SAFE_FALLBACK)
-        if self.debug_mode:
-            print(f"🔧 Using reliable default model: {default_model}")
-        return default_model
-
-    def _test_model_compatibility(self, model_name):
-        """Quick test if a model is compatible with current system.
-
-        Args:
-            model_name: TTS model name
-
-        Returns:
-            True if compatible, False otherwise
-        """
-        # For VITS models, check if espeak-ng is available
-        if 'vits' in model_name.lower():
-            try:
-                import subprocess
-                result = subprocess.run(['espeak-ng', '--version'],
-                                        capture_output=True, timeout=2)
-                return result.returncode == 0
-            except (FileNotFoundError, subprocess.TimeoutExpired, subprocess.SubprocessError):
-                return False
-
-        # For other models, assume they work (they're more compatible)
-        return True
-
-    def set_voice_variant(self, language, variant):
-        """Set a specific voice variant for a language.
-
-        Args:
-            language: Language code ('fr', 'it')
-            variant: Variant name ('female', 'alternative', etc.)
-
-        Returns:
-            True if successful, False otherwise
-
-        Examples:
-            vm.set_voice_variant('it', 'female') # Use female Italian voice
-            vm.set_voice_variant('fr', 'alternative') # Use original French model
-        """
-        if language not in self.ALTERNATIVE_MODELS:
-            if self.debug_mode:
-                available_langs = ', '.join(self.ALTERNATIVE_MODELS.keys())
-                print(f"⚠️ No variants available for '{language}'. Languages with variants: {available_langs}")
-            return False
-
-        if variant not in self.ALTERNATIVE_MODELS[language]:
-            if self.debug_mode:
-                available_variants = ', '.join(self.ALTERNATIVE_MODELS[language].keys())
-                print(f"⚠️ Variant '{variant}' not available for {language}. Available: {available_variants}")
-            return False
-
-        # Get the specific model for this variant
-        model_name = self.ALTERNATIVE_MODELS[language][variant]
-
-        if self.debug_mode:
-            lang_name = self.LANGUAGES[language]['name']
-            print(f"🎭 Switching to {lang_name} {variant} voice: {model_name}")
-
-        # Set the specific model
-        return self.set_tts_model(model_name)
-
-    def get_model_info(self):
-        """Get information about currently loaded models and system capabilities.
-
-        Returns:
-            Dict with model information and system capabilities
-        """
-        info = {
-            'current_language': self.language,
-            'language_name': self.get_language_name(),
-            'espeak_available': self._test_model_compatibility('test_vits'),
-            'supported_languages': self.get_supported_languages()
-        }
-
-        # Add model recommendations for each language
-        info['models'] = {}
-        for lang in self.get_supported_languages():
-            selected_model = self._select_best_model(lang)
-            lang_config = self.LANGUAGES[lang]
-            is_premium = selected_model == lang_config.get('premium', '')
-
-            info['models'][lang] = {
-                'name': lang_config['name'],
-                'selected_model': selected_model,
-                'quality': 'premium' if is_premium else 'default',
-                'default_available': lang_config.get('default', ''),
-                'premium_available': lang_config.get('premium', '')
-            }
-
-        return info
-
-    def browse_voices(self, language=None, quality=None, gender=None):
-        """Browse available voices with filtering options.
-
-        Args:
-            language: Language code ('en', 'fr', etc.) or None for all
-            quality: 'premium', 'good', or None for all
-            gender: 'male', 'female', 'multiple', or None for all
-
-        Returns:
-            Dict of available voices with metadata
-        """
-        voices = {}
-
-        # Get languages to check
-        languages_to_check = [language] if language else self.VOICE_CATALOG.keys()
-
-        for lang in languages_to_check:
-            if lang not in self.VOICE_CATALOG:
-                continue
-
-            lang_voices = {}
-            for voice_id, voice_info in self.VOICE_CATALOG[lang].items():
-                # Apply filters
-                if quality and voice_info['quality'] != quality:
-                    continue
-                if gender and voice_info['gender'] != gender:
-                    continue
-
-                # Check if voice is compatible with current system
-                compatible = True
-                if voice_info['requires'] == 'espeak-ng':
-                    compatible = self._test_model_compatibility(voice_info['model'])
-
-                # Add compatibility info
-                voice_data = voice_info.copy()
-                voice_data['compatible'] = compatible
-                lang_voices[voice_id] = voice_data
-
-            if lang_voices:
-                voices[lang] = lang_voices
-
-        return voices
-
-    def list_voices(self, language=None):
-        """List available voices in a user-friendly format.
-
-        Args:
-            language: Language code or None for all languages
-        """
-        voices = self.browse_voices(language)
-
-        if not voices:
-            print("No voices found matching criteria.")
-            return
-
-        # License links mapping
-        license_links = {
-            'CSS10': 'https://github.com/Kyubyong/CSS10',
-            'M-AILABS': 'https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/',
-            'LJSpeech': 'https://keithito.com/LJ-Speech-Dataset/',
-            'VCTK': 'https://datashare.ed.ac.uk/handle/10283/3443',
-            'Thorsten': 'https://www.thorsten-voice.de/en/'
-        }
-
-        for lang, lang_voices in voices.items():
-            lang_name = self.LANGUAGES.get(lang, {}).get('name', lang)
-            print(f"\n🌍 {lang_name} ({lang}) - {len(lang_voices)} voices available:")
-
-            for voice_id, voice_info in lang_voices.items():
-                quality_icon = "✨" if voice_info['quality'] == 'premium' else "🔧"
-                compat_icon = "✅" if voice_info['compatible'] else "⚠️"
-                gender_icon = {"male": "👨", "female": "👩", "multiple": "👥"}.get(voice_info['gender'], "🗣️")
-
-                # Show full format: language.voice_id
-                full_voice_id = f"{lang}.{voice_id}"
-                print(f" {compat_icon} {quality_icon} {gender_icon} {full_voice_id}")
-                print(f" {voice_info['accent']} - {voice_info['gender']} voice")
-
-                # Extract license name and add link if available
-                license_text = voice_info['license']
-                license_with_link = license_text
-                for dataset_name, link in license_links.items():
-                    if dataset_name in license_text:
-                        license_with_link = f"{license_text} - {link}"
-                        break
-
-                print(f" License: {license_with_link}")
-                if not voice_info['compatible'] and voice_info['requires'] == 'espeak-ng':
-                    print(f" ⚠️ Requires: espeak-ng (install for premium quality)")
-
-    def set_voice(self, language, voice_id):
-        """Set a specific voice by ID.
-
-        Args:
-            language: Language code
-            voice_id: Voice ID from voice catalog
-
-        Returns:
-            True if successful
-
-        Example:
-            vm.set_voice('fr', 'css10_vits') # Use CSS10 French VITS voice
-            vm.set_voice('it', 'mai_female_vits') # Use female Italian VITS voice
-        """
-        if language not in self.VOICE_CATALOG:
-            if self.debug_mode:
-                print(f"⚠️ Language '{language}' not available")
-            return False
-
-        if voice_id not in self.VOICE_CATALOG[language]:
-            if self.debug_mode:
-                available = ', '.join(self.VOICE_CATALOG[language].keys())
-                print(f"⚠️ Voice '{voice_id}' not available for {language}. Available: {available}")
-            return False
-
-        voice_info = self.VOICE_CATALOG[language][voice_id]
-        model_name = voice_info['model']
-
-        # CRITICAL FIX: Download model if not cached
-        from .instant_setup import is_model_cached
-        from .simple_model_manager import download_model
-
-        if not is_model_cached(model_name):
-            print(f"📥 Voice model '{voice_id}' not cached, downloading...")
-            success = download_model(model_name)
-            if not success:
-                print(f"❌ Failed to download voice '{voice_id}'")
-                print(f" Check your internet connection and try again")
-                return False
-            print(f"✅ Voice model '{voice_id}' downloaded successfully")
-
-        # Check compatibility after download
-        if voice_info['requires'] == 'espeak-ng' and not self._test_model_compatibility(model_name):
-            if self.debug_mode:
-                print(f"⚠️ Voice '{voice_id}' requires espeak-ng. Install it for premium quality.")
-            # Don't fail - try to load anyway
-            # return False
-
-        # Set the specific voice
-        if self.debug_mode:
-            print(f"🎭 Setting {language} voice to: {voice_id}")
-            print(f" Model: {model_name}")
-            print(f" Quality: {voice_info['quality']} | Gender: {voice_info['gender']}")
-            print(f" Accent: {voice_info['accent']}")
-
-        # Switch to the language and specific model
-        self.language = language
-
-        # Set voice-specific speed if available
-        if 'speed' in voice_info:
-            self.speed = voice_info['speed']
-            if self.debug_mode:
-                print(f" Speed: {voice_info['speed']} (adjusted for optimal pace)")
-        else:
-            self.speed = 1.0 # Default speed
-
-        return self.set_tts_model(model_name)
-
-    def change_vad_aggressiveness(self, aggressiveness):
-        """Change VAD aggressiveness.
-
-        Args:
-            aggressiveness: New aggressiveness level (0-3)
-
-        Returns:
-            True if changed, False otherwise
-        """
-        if self.voice_recognizer:
-            return self.voice_recognizer.change_vad_aggressiveness(aggressiveness)
-        return False
-
-    # ===== SIMPLE MODEL MANAGEMENT METHODS =====
-    # Clean, simple APIs for both CLI and third-party applications
-
-    def list_available_models(self, language: str = None) -> dict:
-        """Get available models with metadata.
-
-        Args:
-            language: Optional language filter
-
-        Returns:
-            dict: Model information with cache status
-
-        Example:
-            >>> vm = VoiceManager()
-            >>> models = vm.list_available_models('en')
-            >>> print(json.dumps(models, indent=2))
-        """
-        from .simple_model_manager import get_model_manager
-        manager = get_model_manager(self.debug_mode)
-        return manager.list_available_models(language)
-
-    def download_model(self, model_name: str, progress_callback=None) -> bool:
-        """Download a specific model.
-
-        Args:
-            model_name: Model name or voice ID (e.g., 'en.vits' or full model path)
-            progress_callback: Optional function(model_name, success)
-
-        Returns:
-            bool: True if successful
-
-        Example:
-            >>> vm = VoiceManager()
-            >>> vm.download_model('en.vits') # or 'tts_models/en/ljspeech/vits'
-        """
-        from .simple_model_manager import download_model
-        return download_model(model_name, progress_callback)
-
-    def is_model_ready(self) -> bool:
-        """Check if essential model is ready for immediate use.
-
-        Returns:
-            bool: True if can speak immediately without download
-        """
-        from .simple_model_manager import is_ready
-        return is_ready()
-
-    def ensure_ready(self, auto_download: bool = True) -> bool:
-        """Ensure TTS is ready for immediate use.
-
-        Args:
-            auto_download: Whether to download essential model if needed
-
-        Returns:
-            bool: True if TTS is ready
-
-        Example:
-            >>> vm = VoiceManager()
-            >>> if vm.ensure_ready():
-            ...     vm.speak("Ready to go!")
-        """
-        if self.is_model_ready():
-            return True
-
-        if not auto_download:
-            return False
-
-        from .simple_model_manager import get_model_manager
-        manager = get_model_manager(self.debug_mode)
-        return manager.download_essential_model()
-
-    def get_cache_status(self) -> dict:
-        """Get model cache status.
-
-        Returns:
-            dict: Cache information including total models, sizes, etc.
-        """
-        from .simple_model_manager import get_model_manager
-        manager = get_model_manager(self.debug_mode)
-        return manager.get_status()
-
-    def cleanup(self):
-        """Clean up resources.
-
-        Returns:
-            True if cleanup successful
-        """
-        if self.voice_recognizer:
-            self.voice_recognizer.stop()
-
-        self.stop_speaking()
-        return True
-
-    def _on_audio_start(self):
-        """Called when audio actually starts playing."""
-        if self.on_audio_start:
-            self.on_audio_start()
-
-    def _on_audio_end(self):
-        """Called when audio actually finishes playing."""
-        if self.on_audio_end:
-            self.on_audio_end()
-
-    def _on_audio_pause(self):
-        """Called when audio is paused."""
-        if self.on_audio_pause:
-            self.on_audio_pause()
-
-    def _on_audio_resume(self):
-        """Called when audio is resumed."""
-        if self.on_audio_resume:
-            self.on_audio_resume()