abstractvoice-0.5.1-py3-none-any.whl → abstractvoice-0.6.1-py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
Files changed (51)
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/voice_manager.py
@@ -1,1065 +1,10 @@
- """Main Voice Manager class for coordinating TTS and STT components."""
+ """Public `VoiceManager` façade.

- # Lazy imports - heavy dependencies are only imported when needed
- def _import_tts_engine():
-     """Import TTSEngine with helpful error message if dependencies missing."""
-     try:
-         from .tts import TTSEngine
-         return TTSEngine
-     except ImportError as e:
-         if "TTS" in str(e) or "torch" in str(e) or "librosa" in str(e):
-             raise ImportError(
-                 "TTS functionality requires optional dependencies. Install with:\n"
-                 " pip install abstractvoice[tts] # For TTS only\n"
-                 " pip install abstractvoice[all] # For all features\n"
-                 f"Original error: {e}"
-             ) from e
-         raise
+ Implementation is split into small focused modules under `abstractvoice/vm/`
+ to keep files readable and responsibilities clear.
+ """

- def _import_voice_recognizer():
-     """Import VoiceRecognizer with helpful error message if dependencies missing."""
-     try:
-         from .recognition import VoiceRecognizer
-         return VoiceRecognizer
-     except ImportError as e:
-         if "whisper" in str(e) or "tiktoken" in str(e):
-             raise ImportError(
-                 "Speech recognition functionality requires optional dependencies. Install with:\n"
-                 " pip install abstractvoice[stt] # For speech recognition only\n"
-                 " pip install abstractvoice[all] # For all features\n"
-                 f"Original error: {e}"
-             ) from e
-         raise
+ from .vm.manager import VoiceManager

+ __all__ = ["VoiceManager"]

- class VoiceManager:
-     """Main class for voice interaction capabilities with multilingual support."""
-
-     # Smart language configuration - high quality stable defaults
-     LANGUAGES = {
-         'en': {
-             'default': 'tts_models/en/ljspeech/tacotron2-DDC', # Reliable, compatible voice
-             'premium': 'tts_models/en/ljspeech/vits', # High quality (requires espeak)
-             'name': 'English'
-         },
-         'fr': {
-             'default': 'tts_models/fr/css10/vits', # High quality cleaner audio
-             'premium': 'tts_models/fr/css10/vits', # Use same stable model
-             'name': 'French'
-         },
-         'es': {
-             'default': 'tts_models/es/mai/tacotron2-DDC', # Keep stable Spanish model
-             'premium': 'tts_models/es/mai/tacotron2-DDC', # Same model (reliable)
-             'name': 'Spanish'
-         },
-         'de': {
-             'default': 'tts_models/de/thorsten/vits', # High quality German
-             'premium': 'tts_models/de/thorsten/vits', # Use same stable model
-             'name': 'German'
-         },
-         'it': {
-             'default': 'tts_models/it/mai_male/vits', # Use slower male voice as default
-             'premium': 'tts_models/it/mai_male/vits', # Same stable model
-             'name': 'Italian'
-         }
-     }
-
-     # Universal safe fallback
-     SAFE_FALLBACK = 'tts_models/en/ljspeech/fast_pitch'
-
-     # Complete voice catalog with metadata
-     VOICE_CATALOG = {
-         'en': {
-             'tacotron2': {
-                 'model': 'tts_models/en/ljspeech/tacotron2-DDC',
-                 'quality': 'good',
-                 'gender': 'female',
-                 'accent': 'US English',
-                 'license': 'Open source (LJSpeech)',
-                 'requires': 'none'
-             },
-             'jenny': {
-                 'model': 'tts_models/en/jenny/jenny',
-                 'quality': 'excellent',
-                 'gender': 'female',
-                 'accent': 'US English',
-                 'license': 'Open source (Jenny)',
-                 'requires': 'none'
-             },
-             'ek1': {
-                 'model': 'tts_models/en/ek1/tacotron2',
-                 'quality': 'excellent',
-                 'gender': 'male',
-                 'accent': 'British English',
-                 'license': 'Open source (EK1)',
-                 'requires': 'none'
-             },
-             'sam': {
-                 'model': 'tts_models/en/sam/tacotron-DDC',
-                 'quality': 'good',
-                 'gender': 'male',
-                 'accent': 'US English',
-                 'license': 'Open source (Sam)',
-                 'requires': 'none'
-             },
-             'fast_pitch': {
-                 'model': 'tts_models/en/ljspeech/fast_pitch',
-                 'quality': 'good',
-                 'gender': 'female',
-                 'accent': 'US English',
-                 'license': 'Open source (LJSpeech)',
-                 'requires': 'none'
-             },
-             'vits': {
-                 'model': 'tts_models/en/ljspeech/vits',
-                 'quality': 'premium',
-                 'gender': 'female',
-                 'accent': 'US English',
-                 'license': 'Open source (LJSpeech)',
-                 'requires': 'espeak-ng'
-             }
-         },
-         'fr': {
-             'css10_vits': {
-                 'model': 'tts_models/fr/css10/vits',
-                 'quality': 'premium',
-                 'gender': 'male',
-                 'accent': 'France French',
-                 'license': 'Apache 2.0 (CSS10/LibriVox)',
-                 'requires': 'espeak-ng'
-             },
-             'mai_tacotron': {
-                 'model': 'tts_models/fr/mai/tacotron2-DDC',
-                 'quality': 'good',
-                 'gender': 'female',
-                 'accent': 'France French',
-                 'license': 'Permissive (M-AILABS/LibriVox)',
-                 'requires': 'none'
-             }
-         },
-         'es': {
-             'mai_tacotron': {
-                 'model': 'tts_models/es/mai/tacotron2-DDC',
-                 'quality': 'good',
-                 'gender': 'female',
-                 'accent': 'Spain Spanish',
-                 'license': 'Permissive (M-AILABS)',
-                 'requires': 'none'
-             }
-         },
-         'de': {
-             'thorsten_vits': {
-                 'model': 'tts_models/de/thorsten/vits',
-                 'quality': 'premium',
-                 'gender': 'male',
-                 'accent': 'Standard German',
-                 'license': 'Open source (Thorsten)',
-                 'requires': 'espeak-ng'
-             },
-             'thorsten_tacotron': {
-                 'model': 'tts_models/de/thorsten/tacotron2-DDC',
-                 'quality': 'good',
-                 'gender': 'male',
-                 'accent': 'Standard German',
-                 'license': 'Open source (Thorsten)',
-                 'requires': 'none'
-             }
-         },
-         'it': {
-             'mai_male_vits': {
-                 'model': 'tts_models/it/mai_male/vits',
-                 'quality': 'premium',
-                 'gender': 'male',
-                 'accent': 'Standard Italian',
-                 'license': 'Permissive (M-AILABS)',
-                 'requires': 'espeak-ng',
-                 'speed': 0.8 # Slow down to fix pace issues
-             },
-             'mai_female_vits': {
-                 'model': 'tts_models/it/mai_female/vits',
-                 'quality': 'premium',
-                 'gender': 'female',
-                 'accent': 'Standard Italian',
-                 'license': 'Permissive (M-AILABS)',
-                 'requires': 'espeak-ng',
-                 'speed': 0.8 # Slow down to fix pace issues
-             }
-         }
-     }
-
-     def __init__(self, language='en', tts_model=None, whisper_model="tiny", debug_mode=False):
-         """Initialize the Voice Manager with language support.
-
-         Args:
-             language: Language code ('en', 'fr', 'es', 'de', 'it')
-             tts_model: Specific TTS model name or None for language default
-             whisper_model: Whisper model name to use
-             debug_mode: Enable debug logging
-         """
-         self.debug_mode = debug_mode
-         self.speed = 1.0
-
-         # Validate and set language
-         language = language.lower()
-         if language not in self.LANGUAGES:
-             if debug_mode:
-                 available = ', '.join(self.LANGUAGES.keys())
-                 print(f"⚠️ Unsupported language '{language}', using English. Available: {available}")
-             language = 'en'
-         self.language = language
-
-         # Select TTS model with smart detection
-         if tts_model is None:
-             tts_model = self._select_best_model(self.language)
-             if debug_mode:
-                 lang_name = self.LANGUAGES[self.language]['name']
-                 print(f"🌍 Using {lang_name} voice: {tts_model}")
-
-         # Initialize TTS engine with instant setup for new users
-         from .instant_setup import ensure_instant_tts, get_instant_model, is_model_cached
-
-         # If using default VITS model but it's not cached, use instant setup
-         if tts_model == "tts_models/en/ljspeech/vits" and not is_model_cached(tts_model):
-             if debug_mode:
-                 print("🚀 First-time setup: ensuring instant TTS availability...")
-
-             # Try instant setup with lightweight model
-             if ensure_instant_tts():
-                 tts_model = get_instant_model() # Use fast_pitch instead
-                 if debug_mode:
-                     print(f"✅ Using essential model: {tts_model}")
-
-         # Initialize TTS engine using lazy import
-         TTSEngine = _import_tts_engine()
-         self.tts_engine = TTSEngine(
-             model_name=tts_model,
-             debug_mode=debug_mode
-         )
-
-         # Set up callbacks to pause/resume voice recognition during TTS playback
-         # This prevents the system from interrupting its own speech
-         self.tts_engine.on_playback_start = self._on_tts_start
-         self.tts_engine.on_playback_end = self._on_tts_end
-
-         # NEW: Enhanced audio lifecycle callbacks (v0.5.1)
-         self.on_audio_start = None # Called when first audio sample plays
-         self.on_audio_end = None # Called when last audio sample finishes
-         self.on_audio_pause = None # Called when audio is paused
-         self.on_audio_resume = None # Called when audio is resumed
-
-         # Wire callbacks directly to audio player (skip TTSEngine layer)
-         self.tts_engine.audio_player.on_audio_start = self._on_audio_start
-         self.tts_engine.audio_player.on_audio_end = self._on_audio_end
-         self.tts_engine.audio_player.on_audio_pause = self._on_audio_pause
-         self.tts_engine.audio_player.on_audio_resume = self._on_audio_resume
-
-         # Voice recognizer is initialized on demand
-         self.voice_recognizer = None
-         self.whisper_model = whisper_model
-
-         # State tracking
-         self._transcription_callback = None
-         self._stop_callback = None
-         self._voice_mode = "full" # full, wait, stop, ptt
-
-     def _on_tts_start(self):
-         """Called when TTS playback starts - handle based on voice mode."""
-         if not self.voice_recognizer:
-             return
-
-         if self._voice_mode == "full":
-             # Full mode: Keep listening but pause interrupt capability
-             self.voice_recognizer.pause_tts_interrupt()
-         elif self._voice_mode in ["wait", "stop", "ptt"]:
-             # Wait/Stop/PTT modes: Pause listening entirely during TTS
-             self.voice_recognizer.pause_listening()
-
-     def _on_tts_end(self):
-         """Called when TTS playback ends - handle based on voice mode."""
-         if not self.voice_recognizer:
-             return
-
-         if self._voice_mode == "full":
-             # Full mode: Resume interrupt capability
-             self.voice_recognizer.resume_tts_interrupt()
-         elif self._voice_mode in ["wait", "stop", "ptt"]:
-             # Wait/Stop/PTT modes: Resume listening
-             self.voice_recognizer.resume_listening()
-
-     def speak(self, text, speed=1.0, callback=None):
-         """Convert text to speech and play audio.
-
-         Args:
-             text: Text to convert to speech
-             speed: Speech speed (0.5-2.0)
-             callback: Function to call when speech completes
-
-         Returns:
-             True if speech started, False otherwise
-         """
-         sp = 1.0
-         if speed != 1.0:
-             sp = speed
-         else:
-             sp = self.speed
-
-         return self.tts_engine.speak(text, sp, callback)
-
-     def stop_speaking(self):
-         """Stop current speech playback.
-
-         Returns:
-             True if stopped, False if no playback was active
-         """
-         return self.tts_engine.stop()
-
-     def pause_speaking(self):
-         """Pause current speech playback.
-
-         Pauses at chunk boundaries in streaming mode. Can be resumed with resume_speaking().
-
-         Returns:
-             True if paused, False if no playback was active
-         """
-         return self.tts_engine.pause()
-
-     def resume_speaking(self):
-         """Resume paused speech playback.
-
-         Returns:
-             True if resumed, False if not paused or no playback active
-         """
-         return self.tts_engine.resume()
-
-     def is_paused(self):
-         """Check if TTS is currently paused.
-
-         Returns:
-             True if paused, False otherwise
-         """
-         return self.tts_engine.is_paused()
-
-     def is_speaking(self):
-         """Check if TTS is currently active.
-
-         Returns:
-             True if speaking, False otherwise
-         """
-         return self.tts_engine.is_active()
-
-     def listen(self, on_transcription, on_stop=None):
-         """Start listening for speech with callbacks.
-
-         Args:
-             on_transcription: Callback for transcribed text
-             on_stop: Callback when 'stop' command detected
-
-         Returns:
-             True if started, False if already listening
-         """
-         # Store callbacks
-         self._transcription_callback = on_transcription
-         self._stop_callback = on_stop
-
-         # Initialize recognizer if not already done
-         if not self.voice_recognizer:
-             def _transcription_handler(text):
-                 if self._transcription_callback:
-                     self._transcription_callback(text)
-
-             def _stop_handler():
-                 # Stop listening
-                 self.stop_listening()
-                 # Call user's stop callback if provided
-                 if self._stop_callback:
-                     self._stop_callback()
-
-             # Use lazy import for VoiceRecognizer
-             VoiceRecognizer = _import_voice_recognizer()
-             self.voice_recognizer = VoiceRecognizer(
-                 transcription_callback=_transcription_handler,
-                 stop_callback=_stop_handler,
-                 whisper_model=self.whisper_model,
-                 debug_mode=self.debug_mode
-             )
-
-         # Start with TTS interrupt capability
-         return self.voice_recognizer.start(
-             tts_interrupt_callback=self.stop_speaking
-         )
-
-     def stop_listening(self):
-         """Stop listening for speech.
-
-         Returns:
-             True if stopped, False if not listening
-         """
-         if self.voice_recognizer:
-             return self.voice_recognizer.stop()
-         return False
-
-     def is_listening(self):
-         """Check if currently listening for speech.
-
-         Returns:
-             True if listening, False otherwise
-         """
-         return self.voice_recognizer and self.voice_recognizer.is_running
-
-     def set_voice_mode(self, mode):
-         """Set the voice mode (full, wait, stop, ptt).
-
-         Args:
-             mode: Voice mode to use
-
-         Returns:
-             True if successful
-         """
-         if mode in ["full", "wait", "stop", "ptt"]:
-             self._voice_mode = mode
-             return True
-         return False
-
-     def set_speed(self, speed):
-         """Set the TTS speed.
-
-         Args:
-             speed: Speech speed multiplier (0.5-2.0)
-
-         Returns:
-             True if successful
-         """
-         self.speed = speed
-         return True
-
-     def get_speed(self):
-         """Get the TTS speed.
-
-         Returns:
-             Current TTS speed multiplier
-         """
-         return self.speed
-
-     def set_tts_model(self, model_name):
-         """Change the TTS model safely without memory conflicts.
-
-         Available models (all pure Python, cross-platform):
-         - "tts_models/en/ljspeech/fast_pitch" (default, recommended)
-         - "tts_models/en/ljspeech/glow-tts" (alternative)
-         - "tts_models/en/ljspeech/tacotron2-DDC" (legacy)
-
-         Args:
-             model_name: TTS model name to use
-
-         Returns:
-             True if successful
-
-         Example:
-             vm.set_tts_model("tts_models/en/ljspeech/glow-tts")
-         """
-         # Stop any current speech
-         self.stop_speaking()
-
-         # CRITICAL: Crash-safe cleanup of old TTS engine
-         if hasattr(self, 'tts_engine') and self.tts_engine:
-             try:
-                 # Stop all audio and cleanup player
-                 if hasattr(self.tts_engine, 'audio_player') and self.tts_engine.audio_player:
-                     # Try stop method if available
-                     if hasattr(self.tts_engine.audio_player, 'stop'):
-                         self.tts_engine.audio_player.stop()
-                     self.tts_engine.audio_player.cleanup()
-
-                 # Force cleanup of TTS object and release GPU memory
-                 if hasattr(self.tts_engine, 'tts') and self.tts_engine.tts:
-                     # Clear CUDA cache if using GPU
-                     try:
-                         import torch
-                         if torch.cuda.is_available():
-                             torch.cuda.empty_cache()
-                     except:
-                         pass
-
-                     del self.tts_engine.tts
-
-                 # Clear the engine itself
-                 del self.tts_engine
-                 self.tts_engine = None
-
-                 # Force garbage collection to prevent memory leaks
-                 import gc
-                 gc.collect()
-
-             except Exception as e:
-                 if self.debug_mode:
-                     print(f"Warning: TTS cleanup issue: {e}")
-                 # Force clear even if cleanup failed
-                 self.tts_engine = None
-
-         # Reinitialize TTS engine with new model using lazy import
-         TTSEngine = _import_tts_engine()
-         self.tts_engine = TTSEngine(
-             model_name=model_name,
-             debug_mode=self.debug_mode
-         )
-
-         # Restore callbacks
-         self.tts_engine.on_playback_start = self._on_tts_start
-         self.tts_engine.on_playback_end = self._on_tts_end
-
-         return True
-
-     def set_whisper(self, model_name):
-         """Set the Whisper model.
-
-         Args:
-             whisper_model: Whisper model name (tiny, base, etc.)
-
-         Returns:
-             True if successful
-         """
-         self.whisper_model = model_name
-         if self.voice_recognizer:
-             return self.voice_recognizer.change_whisper_model(model_name)
-
-     def get_whisper(self):
-         """Get the Whisper model.
-
-         Returns:
-             Current Whisper model name
-         """
-         return self.whisper_model
-
-     def set_language(self, language):
-         """Set the voice language.
-
-         Args:
-             language: Language code ('en', 'fr', 'es', 'de', 'it')
-
-         Returns:
-             True if successful, False otherwise
-         """
-         # Validate language
-         language = language.lower()
-         if language not in self.LANGUAGES:
-             if self.debug_mode:
-                 available = ', '.join(self.LANGUAGES.keys())
-                 print(f"⚠️ Unsupported language '{language}'. Available: {available}")
-             return False
-
-         # Skip if already using this language
-         if language == self.language:
-             if self.debug_mode:
-                 print(f"✓ Already using {self.LANGUAGES[language]['name']} voice")
-             return True
-
-         # Stop any current operations
-         self.stop_speaking()
-         if self.voice_recognizer:
-             self.voice_recognizer.stop()
-
-         # Select best model for this language
-         selected_model = self._select_best_model(language)
-
-         # CRITICAL FIX: Check if model is available, download if not
-         from .instant_setup import is_model_cached
-         from .simple_model_manager import download_model
-
-         if not is_model_cached(selected_model):
-             if self.debug_mode:
-                 print(f"📥 Model {selected_model} not cached, downloading...")
-
-             # Try to download the model
-             success = download_model(selected_model)
-             if not success:
-                 if self.debug_mode:
-                     print(f"❌ Failed to download {selected_model}")
-                 # If download fails and it's not English, we have a problem
-                 if language != 'en':
-                     print(f"❌ Cannot switch to {self.LANGUAGES[language]['name']}: Model download failed")
-                     print(f" Try: abstractvoice download-models --language {language}")
-                     return False
-
-         models_to_try = [selected_model]
-
-         # Only add fallback if it's different from selected
-         if selected_model != self.SAFE_FALLBACK:
-             models_to_try.append(self.SAFE_FALLBACK)
-
-         for model_name in models_to_try:
-             try:
-                 if self.debug_mode:
-                     lang_name = self.LANGUAGES[language]['name']
-                     print(f"🌍 Loading {lang_name} voice: {model_name}")
-
-                 # Reinitialize TTS engine
-                 TTSEngine = _import_tts_engine()
-                 self.tts_engine = TTSEngine(model_name=model_name, debug_mode=self.debug_mode)
-
-                 # Restore callbacks
-                 self.tts_engine.on_playback_start = self._on_tts_start
-                 self.tts_engine.on_playback_end = self._on_tts_end
-
-                 # Update language and set appropriate speed for Italian voices
-                 self.language = language
-
-                 # Set language-specific speed adjustments
-                 if language == 'it':
-                     self.speed = 0.8 # Slow down Italian voices to fix pace issues
-                     if self.debug_mode:
-                         print(f" Speed: {self.speed} (adjusted for optimal Italian pace)")
-                 else:
-                     self.speed = 1.0 # Default speed for other languages
-
-                 return True
-
-             except Exception as e:
-                 if self.debug_mode:
-                     print(f"⚠️ Model {model_name} failed to load: {e}")
-                 # Don't silently continue - report the failure
-                 if model_name == selected_model and language != 'en':
-                     print(f"❌ Failed to load {lang_name} voice model")
-                     print(f" The model might be corrupted. Try:")
-                     print(f" abstractvoice download-models --language {language}")
-                 continue
-
-         # All models failed
-         print(f"❌ Cannot switch to {self.LANGUAGES[language]['name']}: No working models")
-         return False
-
-     def get_language(self):
-         """Get the current voice language.
-
-         Returns:
-             Current language code
-         """
-         return self.language
-
-     def get_supported_languages(self):
-         """Get list of supported language codes.
-
-         Returns:
-             List of supported language codes
-         """
-         return list(self.LANGUAGES.keys())
-
-     def get_language_name(self, language_code=None):
-         """Get the display name for a language.
-
-         Args:
-             language_code: Language code (defaults to current language)
-
-         Returns:
-             Language display name
-         """
-         lang = language_code or self.language
-         return self.LANGUAGES.get(lang, {}).get('name', lang)
-
-     def _select_best_model(self, language):
-         """Select the best available TTS model for a language.
-
-         Try premium model first (higher quality), fallback to default (reliable).
-
-         Args:
-             language: Language code
-
-         Returns:
-             Model name string
-         """
-         if language not in self.LANGUAGES:
-             return self.SAFE_FALLBACK
-
-         lang_config = self.LANGUAGES[language]
-
-         # Try premium model first (better quality)
-         if 'premium' in lang_config:
-             try:
-                 premium_model = lang_config['premium']
-                 # Quick test to see if this model type works
-                 if self._test_model_compatibility(premium_model):
-                     if self.debug_mode:
-                         print(f"✨ Using premium quality model: {premium_model}")
-                     return premium_model
-                 elif self.debug_mode:
-                     print(f"⚠️ Premium model not compatible, using default")
-             except Exception:
-                 if self.debug_mode:
-                     print(f"⚠️ Premium model failed, using default")
-
-         # Use reliable default model
-         default_model = lang_config.get('default', self.SAFE_FALLBACK)
-         if self.debug_mode:
-             print(f"🔧 Using reliable default model: {default_model}")
-         return default_model
-
-     def _test_model_compatibility(self, model_name):
-         """Quick test if a model is compatible with current system.
-
-         Args:
-             model_name: TTS model name
-
-         Returns:
-             True if compatible, False otherwise
-         """
-         # For VITS models, check if espeak-ng is available
-         if 'vits' in model_name.lower():
-             try:
-                 import subprocess
-                 result = subprocess.run(['espeak-ng', '--version'],
-                                         capture_output=True, timeout=2)
-                 return result.returncode == 0
-             except (FileNotFoundError, subprocess.TimeoutExpired, subprocess.SubprocessError):
-                 return False
-
-         # For other models, assume they work (they're more compatible)
-         return True
-
-     def set_voice_variant(self, language, variant):
-         """Set a specific voice variant for a language.
-
-         Args:
-             language: Language code ('fr', 'it')
-             variant: Variant name ('female', 'alternative', etc.)
-
-         Returns:
-             True if successful, False otherwise
-
-         Examples:
-             vm.set_voice_variant('it', 'female') # Use female Italian voice
-             vm.set_voice_variant('fr', 'alternative') # Use original French model
-         """
-         if language not in self.ALTERNATIVE_MODELS:
-             if self.debug_mode:
-                 available_langs = ', '.join(self.ALTERNATIVE_MODELS.keys())
-                 print(f"⚠️ No variants available for '{language}'. Languages with variants: {available_langs}")
-             return False
-
-         if variant not in self.ALTERNATIVE_MODELS[language]:
-             if self.debug_mode:
-                 available_variants = ', '.join(self.ALTERNATIVE_MODELS[language].keys())
-                 print(f"⚠️ Variant '{variant}' not available for {language}. Available: {available_variants}")
-             return False
-
-         # Get the specific model for this variant
-         model_name = self.ALTERNATIVE_MODELS[language][variant]
-
-         if self.debug_mode:
-             lang_name = self.LANGUAGES[language]['name']
-             print(f"🎭 Switching to {lang_name} {variant} voice: {model_name}")
-
-         # Set the specific model
-         return self.set_tts_model(model_name)
-
-     def get_model_info(self):
-         """Get information about currently loaded models and system capabilities.
-
-         Returns:
-             Dict with model information and system capabilities
-         """
-         info = {
-             'current_language': self.language,
-             'language_name': self.get_language_name(),
-             'espeak_available': self._test_model_compatibility('test_vits'),
-             'supported_languages': self.get_supported_languages()
-         }
-
-         # Add model recommendations for each language
-         info['models'] = {}
-         for lang in self.get_supported_languages():
-             selected_model = self._select_best_model(lang)
-             lang_config = self.LANGUAGES[lang]
-             is_premium = selected_model == lang_config.get('premium', '')
-
-             info['models'][lang] = {
-                 'name': lang_config['name'],
-                 'selected_model': selected_model,
-                 'quality': 'premium' if is_premium else 'default',
-                 'default_available': lang_config.get('default', ''),
-                 'premium_available': lang_config.get('premium', '')
-             }
-
-         return info
-
-     def browse_voices(self, language=None, quality=None, gender=None):
-         """Browse available voices with filtering options.
-
-         Args:
-             language: Language code ('en', 'fr', etc.) or None for all
-             quality: 'premium', 'good', or None for all
-             gender: 'male', 'female', 'multiple', or None for all
-
-         Returns:
-             Dict of available voices with metadata
-         """
-         voices = {}
-
-         # Get languages to check
-         languages_to_check = [language] if language else self.VOICE_CATALOG.keys()
-
-         for lang in languages_to_check:
-             if lang not in self.VOICE_CATALOG:
-                 continue
-
-             lang_voices = {}
-             for voice_id, voice_info in self.VOICE_CATALOG[lang].items():
-                 # Apply filters
-                 if quality and voice_info['quality'] != quality:
-                     continue
-                 if gender and voice_info['gender'] != gender:
-                     continue
-
-                 # Check if voice is compatible with current system
-                 compatible = True
-                 if voice_info['requires'] == 'espeak-ng':
-                     compatible = self._test_model_compatibility(voice_info['model'])
-
-                 # Add compatibility info
-                 voice_data = voice_info.copy()
-                 voice_data['compatible'] = compatible
-                 lang_voices[voice_id] = voice_data
-
-             if lang_voices:
-                 voices[lang] = lang_voices
-
-         return voices
-
-     def list_voices(self, language=None):
-         """List available voices in a user-friendly format.
-
-         Args:
-             language: Language code or None for all languages
-         """
-         voices = self.browse_voices(language)
-
-         if not voices:
-             print("No voices found matching criteria.")
-             return
-
-         # License links mapping
-         license_links = {
-             'CSS10': 'https://github.com/Kyubyong/CSS10',
-             'M-AILABS': 'https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/',
-             'LJSpeech': 'https://keithito.com/LJ-Speech-Dataset/',
-             'VCTK': 'https://datashare.ed.ac.uk/handle/10283/3443',
-             'Thorsten': 'https://www.thorsten-voice.de/en/'
-         }
-
-         for lang, lang_voices in voices.items():
-             lang_name = self.LANGUAGES.get(lang, {}).get('name', lang)
-             print(f"\n🌍 {lang_name} ({lang}) - {len(lang_voices)} voices available:")
-
-             for voice_id, voice_info in lang_voices.items():
-                 quality_icon = "✨" if voice_info['quality'] == 'premium' else "🔧"
-                 compat_icon = "✅" if voice_info['compatible'] else "⚠️"
-                 gender_icon = {"male": "👨", "female": "👩", "multiple": "👥"}.get(voice_info['gender'], "🗣️")
-
-                 # Show full format: language.voice_id
-                 full_voice_id = f"{lang}.{voice_id}"
-                 print(f" {compat_icon} {quality_icon} {gender_icon} {full_voice_id}")
-                 print(f" {voice_info['accent']} - {voice_info['gender']} voice")
-
-                 # Extract license name and add link if available
-                 license_text = voice_info['license']
-                 license_with_link = license_text
-                 for dataset_name, link in license_links.items():
-                     if dataset_name in license_text:
-                         license_with_link = f"{license_text} - {link}"
-                         break
-
-                 print(f" License: {license_with_link}")
-                 if not voice_info['compatible'] and voice_info['requires'] == 'espeak-ng':
-                     print(f" ⚠️ Requires: espeak-ng (install for premium quality)")
-
-     def set_voice(self, language, voice_id):
-         """Set a specific voice by ID.
-
-         Args:
-             language: Language code
-             voice_id: Voice ID from voice catalog
-
-         Returns:
-             True if successful
-
-         Example:
-             vm.set_voice('fr', 'css10_vits') # Use CSS10 French VITS voice
-             vm.set_voice('it', 'mai_female_vits') # Use female Italian VITS voice
-         """
-         if language not in self.VOICE_CATALOG:
-             if self.debug_mode:
-                 print(f"⚠️ Language '{language}' not available")
-             return False
-
-         if voice_id not in self.VOICE_CATALOG[language]:
-             if self.debug_mode:
-                 available = ', '.join(self.VOICE_CATALOG[language].keys())
-                 print(f"⚠️ Voice '{voice_id}' not available for {language}. Available: {available}")
-             return False
-
-         voice_info = self.VOICE_CATALOG[language][voice_id]
-         model_name = voice_info['model']
-
-         # CRITICAL FIX: Download model if not cached
-         from .instant_setup import is_model_cached
-         from .simple_model_manager import download_model
-
-         if not is_model_cached(model_name):
-             print(f"📥 Voice model '{voice_id}' not cached, downloading...")
-             success = download_model(model_name)
-             if not success:
-                 print(f"❌ Failed to download voice '{voice_id}'")
-                 print(f" Check your internet connection and try again")
-                 return False
-             print(f"✅ Voice model '{voice_id}' downloaded successfully")
-
-         # Check compatibility after download
-         if voice_info['requires'] == 'espeak-ng' and not self._test_model_compatibility(model_name):
-             if self.debug_mode:
-                 print(f"⚠️ Voice '{voice_id}' requires espeak-ng. Install it for premium quality.")
-             # Don't fail - try to load anyway
-             # return False
-
-         # Set the specific voice
-         if self.debug_mode:
-             print(f"🎭 Setting {language} voice to: {voice_id}")
-             print(f" Model: {model_name}")
-             print(f" Quality: {voice_info['quality']} | Gender: {voice_info['gender']}")
-             print(f" Accent: {voice_info['accent']}")
-
-         # Switch to the language and specific model
-         self.language = language
-
-         # Set voice-specific speed if available
-         if 'speed' in voice_info:
-             self.speed = voice_info['speed']
-             if self.debug_mode:
-                 print(f" Speed: {voice_info['speed']} (adjusted for optimal pace)")
-         else:
-             self.speed = 1.0 # Default speed
-
-         return self.set_tts_model(model_name)
-
-     def change_vad_aggressiveness(self, aggressiveness):
-         """Change VAD aggressiveness.
-
-         Args:
-             aggressiveness: New aggressiveness level (0-3)
-
-         Returns:
-             True if changed, False otherwise
-         """
-         if self.voice_recognizer:
-             return self.voice_recognizer.change_vad_aggressiveness(aggressiveness)
-         return False
-
-     # ===== SIMPLE MODEL MANAGEMENT METHODS =====
-     # Clean, simple APIs for both CLI and third-party applications
-
-     def list_available_models(self, language: str = None) -> dict:
-         """Get available models with metadata.
-
-         Args:
-             language: Optional language filter
-
-         Returns:
-             dict: Model information with cache status
-
-         Example:
-             >>> vm = VoiceManager()
-             >>> models = vm.list_available_models('en')
-             >>> print(json.dumps(models, indent=2))
-         """
-         from .simple_model_manager import get_model_manager
-         manager = get_model_manager(self.debug_mode)
-         return manager.list_available_models(language)
-
-     def download_model(self, model_name: str, progress_callback=None) -> bool:
-         """Download a specific model.
-
-         Args:
-             model_name: Model name or voice ID (e.g., 'en.vits' or full model path)
-             progress_callback: Optional function(model_name, success)
-
-         Returns:
-             bool: True if successful
-
-         Example:
-             >>> vm = VoiceManager()
-             >>> vm.download_model('en.vits') # or 'tts_models/en/ljspeech/vits'
-         """
-         from .simple_model_manager import download_model
-         return download_model(model_name, progress_callback)
-
-     def is_model_ready(self) -> bool:
-         """Check if essential model is ready for immediate use.
-
-         Returns:
-             bool: True if can speak immediately without download
-         """
-         from .simple_model_manager import is_ready
-         return is_ready()
-
-     def ensure_ready(self, auto_download: bool = True) -> bool:
-         """Ensure TTS is ready for immediate use.
-
-         Args:
-             auto_download: Whether to download essential model if needed
-
-         Returns:
-             bool: True if TTS is ready
-
-         Example:
-             >>> vm = VoiceManager()
-             >>> if vm.ensure_ready():
-             ...     vm.speak("Ready to go!")
-         """
-         if self.is_model_ready():
-             return True
-
-         if not auto_download:
-             return False
-
-         from .simple_model_manager import get_model_manager
-         manager = get_model_manager(self.debug_mode)
-         return manager.download_essential_model()
-
-     def get_cache_status(self) -> dict:
-         """Get model cache status.
-
-         Returns:
-             dict: Cache information including total models, sizes, etc.
-         """
-         from .simple_model_manager import get_model_manager
-         manager = get_model_manager(self.debug_mode)
-         return manager.get_status()
-
-     def cleanup(self):
-         """Clean up resources.
-
-         Returns:
-             True if cleanup successful
-         """
-         if self.voice_recognizer:
-             self.voice_recognizer.stop()
-
-         self.stop_speaking()
-         return True
-
-     def _on_audio_start(self):
-         """Called when audio actually starts playing."""
-         if self.on_audio_start:
-             self.on_audio_start()
-
-     def _on_audio_end(self):
-         """Called when audio actually finishes playing."""
-         if self.on_audio_end:
-             self.on_audio_end()
-
-     def _on_audio_pause(self):
-         """Called when audio is paused."""
-         if self.on_audio_pause:
-             self.on_audio_pause()
-
-     def _on_audio_resume(self):
-         """Called when audio is resumed."""
-         if self.on_audio_resume:
-             self.on_audio_resume()
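
Migration note: in 0.6.1, abstractvoice/voice_manager.py becomes a thin re-export, so the old import path keeps resolving while the behaviour moves to abstractvoice/vm/. Below is a minimal usage sketch, assuming the new facade in abstractvoice/vm/manager.py preserves the constructor arguments and the speak/listen/cleanup methods of the removed 0.5.1 implementation shown above; none of this is confirmed by the diff beyond the re-export itself.

# Sketch only: method names and arguments are copied from the removed 0.5.1 code
# and are assumed (not verified here) to be kept by the 0.6.1 facade.
from abstractvoice.voice_manager import VoiceManager  # still resolves; now re-exports .vm.manager

vm = VoiceManager(language="en", whisper_model="tiny", debug_mode=True)
vm.speak("Hello from the refactored facade.")            # text-to-speech
vm.listen(on_transcription=lambda text: print(text))     # speech-to-text via callback
vm.stop_listening()
vm.cleanup()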