abstractvoice 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__main__.py +33 -11
- abstractvoice/dependency_check.py +274 -0
- abstractvoice/examples/cli_repl.py +198 -13
- abstractvoice/examples/voice_cli.py +20 -6
- abstractvoice/recognition.py +50 -7
- abstractvoice/stt/transcriber.py +17 -2
- abstractvoice/tts/tts_engine.py +138 -32
- abstractvoice/vad/voice_detector.py +16 -2
- abstractvoice/voice_manager.py +558 -16
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/METADATA +196 -50
- abstractvoice-0.2.1.dist-info/RECORD +21 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/licenses/LICENSE +1 -1
- abstractvoice-0.1.1.dist-info/RECORD +0 -20
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/WHEEL +0 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/entry_points.txt +0 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/top_level.txt +0 -0
abstractvoice/voice_manager.py
CHANGED
|
@@ -1,25 +1,198 @@
|
|
|
1
1
|
"""Main Voice Manager class for coordinating TTS and STT components."""
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
# Lazy imports - heavy dependencies are only imported when needed
|
|
4
|
+
def _import_tts_engine():
    """Return the ``TTSEngine`` class, importing it only when first needed.

    The import is deferred so that the heavy optional TTS stack is not
    loaded unless text-to-speech is actually used.

    Returns:
        The ``TTSEngine`` class from the ``.tts`` subpackage.

    Raises:
        ImportError: With installation instructions when the failure message
            mentions ``TTS``, ``torch`` or ``librosa``; any other import
            failure is re-raised unchanged.
    """
    try:
        from .tts import TTSEngine
    except ImportError as exc:
        detail = str(exc)
        # Only failures that name one of the optional packages get the hint;
        # everything else is propagated untouched.
        if not any(marker in detail for marker in ("TTS", "torch", "librosa")):
            raise
        hint = (
            "TTS functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[tts] # For TTS only\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return TTSEngine
|
|
18
|
+
|
|
19
|
+
def _import_voice_recognizer():
    """Return the ``VoiceRecognizer`` class, importing it only when needed.

    The import is deferred so that the optional speech-recognition stack is
    not loaded unless listening is actually used.

    Returns:
        The ``VoiceRecognizer`` class from the ``.recognition`` module.

    Raises:
        ImportError: With installation instructions when the failure message
            mentions ``whisper`` or ``tiktoken``; any other import failure is
            re-raised unchanged.
    """
    try:
        from .recognition import VoiceRecognizer
    except ImportError as exc:
        detail = str(exc)
        # Only failures that name one of the optional packages get the hint.
        if not any(marker in detail for marker in ("whisper", "tiktoken")):
            raise
        hint = (
            "Speech recognition functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[stt] # For speech recognition only\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return VoiceRecognizer
|
|
5
33
|
|
|
6
34
|
|
|
7
35
|
class VoiceManager:
|
|
8
|
-
"""Main class for voice interaction capabilities."""
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
36
|
+
"""Main class for voice interaction capabilities with multilingual support."""
|
|
37
|
+
|
|
38
|
+
# Smart language configuration - high quality stable defaults
|
|
39
|
+
LANGUAGES = {
|
|
40
|
+
'en': {
|
|
41
|
+
'default': 'tts_models/en/ljspeech/vits', # High quality premium voice
|
|
42
|
+
'premium': 'tts_models/en/ljspeech/vits', # Use same stable model
|
|
43
|
+
'name': 'English'
|
|
44
|
+
},
|
|
45
|
+
'fr': {
|
|
46
|
+
'default': 'tts_models/fr/css10/vits', # High quality cleaner audio
|
|
47
|
+
'premium': 'tts_models/fr/css10/vits', # Use same stable model
|
|
48
|
+
'name': 'French'
|
|
49
|
+
},
|
|
50
|
+
'es': {
|
|
51
|
+
'default': 'tts_models/es/mai/tacotron2-DDC', # Keep stable Spanish model
|
|
52
|
+
'premium': 'tts_models/es/mai/tacotron2-DDC', # Same model (reliable)
|
|
53
|
+
'name': 'Spanish'
|
|
54
|
+
},
|
|
55
|
+
'de': {
|
|
56
|
+
'default': 'tts_models/de/thorsten/vits', # High quality German
|
|
57
|
+
'premium': 'tts_models/de/thorsten/vits', # Use same stable model
|
|
58
|
+
'name': 'German'
|
|
59
|
+
},
|
|
60
|
+
'it': {
|
|
61
|
+
'default': 'tts_models/it/mai_male/vits', # Use slower male voice as default
|
|
62
|
+
'premium': 'tts_models/it/mai_male/vits', # Same stable model
|
|
63
|
+
'name': 'Italian'
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
# Universal safe fallback
|
|
68
|
+
SAFE_FALLBACK = 'tts_models/en/ljspeech/fast_pitch'
|
|
69
|
+
|
|
70
|
+
# Complete voice catalog with metadata
|
|
71
|
+
VOICE_CATALOG = {
|
|
72
|
+
'en': {
|
|
73
|
+
'vits_premium': {
|
|
74
|
+
'model': 'tts_models/en/ljspeech/vits',
|
|
75
|
+
'quality': 'premium',
|
|
76
|
+
'gender': 'female',
|
|
77
|
+
'accent': 'US English',
|
|
78
|
+
'license': 'Open source (LJSpeech)',
|
|
79
|
+
'requires': 'espeak-ng'
|
|
80
|
+
},
|
|
81
|
+
'fast_pitch_reliable': {
|
|
82
|
+
'model': 'tts_models/en/ljspeech/fast_pitch',
|
|
83
|
+
'quality': 'good',
|
|
84
|
+
'gender': 'female',
|
|
85
|
+
'accent': 'US English',
|
|
86
|
+
'license': 'Open source (LJSpeech)',
|
|
87
|
+
'requires': 'none'
|
|
88
|
+
},
|
|
89
|
+
'vctk_multi': {
|
|
90
|
+
'model': 'tts_models/en/vctk/vits',
|
|
91
|
+
'quality': 'premium',
|
|
92
|
+
'gender': 'multiple',
|
|
93
|
+
'accent': 'British English',
|
|
94
|
+
'license': 'Open source (VCTK)',
|
|
95
|
+
'requires': 'espeak-ng'
|
|
96
|
+
}
|
|
97
|
+
},
|
|
98
|
+
'fr': {
|
|
99
|
+
'css10_vits': {
|
|
100
|
+
'model': 'tts_models/fr/css10/vits',
|
|
101
|
+
'quality': 'premium',
|
|
102
|
+
'gender': 'male',
|
|
103
|
+
'accent': 'France French',
|
|
104
|
+
'license': 'Apache 2.0 (CSS10/LibriVox)',
|
|
105
|
+
'requires': 'espeak-ng'
|
|
106
|
+
},
|
|
107
|
+
'mai_tacotron': {
|
|
108
|
+
'model': 'tts_models/fr/mai/tacotron2-DDC',
|
|
109
|
+
'quality': 'good',
|
|
110
|
+
'gender': 'female',
|
|
111
|
+
'accent': 'France French',
|
|
112
|
+
'license': 'Permissive (M-AILABS/LibriVox)',
|
|
113
|
+
'requires': 'none'
|
|
114
|
+
}
|
|
115
|
+
},
|
|
116
|
+
'es': {
|
|
117
|
+
'mai_tacotron': {
|
|
118
|
+
'model': 'tts_models/es/mai/tacotron2-DDC',
|
|
119
|
+
'quality': 'good',
|
|
120
|
+
'gender': 'female',
|
|
121
|
+
'accent': 'Spain Spanish',
|
|
122
|
+
'license': 'Permissive (M-AILABS)',
|
|
123
|
+
'requires': 'none'
|
|
124
|
+
}
|
|
125
|
+
},
|
|
126
|
+
'de': {
|
|
127
|
+
'thorsten_vits': {
|
|
128
|
+
'model': 'tts_models/de/thorsten/vits',
|
|
129
|
+
'quality': 'premium',
|
|
130
|
+
'gender': 'male',
|
|
131
|
+
'accent': 'Standard German',
|
|
132
|
+
'license': 'Open source (Thorsten)',
|
|
133
|
+
'requires': 'espeak-ng'
|
|
134
|
+
},
|
|
135
|
+
'thorsten_tacotron': {
|
|
136
|
+
'model': 'tts_models/de/thorsten/tacotron2-DDC',
|
|
137
|
+
'quality': 'good',
|
|
138
|
+
'gender': 'male',
|
|
139
|
+
'accent': 'Standard German',
|
|
140
|
+
'license': 'Open source (Thorsten)',
|
|
141
|
+
'requires': 'none'
|
|
142
|
+
}
|
|
143
|
+
},
|
|
144
|
+
'it': {
|
|
145
|
+
'mai_male_vits': {
|
|
146
|
+
'model': 'tts_models/it/mai_male/vits',
|
|
147
|
+
'quality': 'premium',
|
|
148
|
+
'gender': 'male',
|
|
149
|
+
'accent': 'Standard Italian',
|
|
150
|
+
'license': 'Permissive (M-AILABS)',
|
|
151
|
+
'requires': 'espeak-ng',
|
|
152
|
+
'speed': 0.8 # Slow down to fix pace issues
|
|
153
|
+
},
|
|
154
|
+
'mai_female_vits': {
|
|
155
|
+
'model': 'tts_models/it/mai_female/vits',
|
|
156
|
+
'quality': 'premium',
|
|
157
|
+
'gender': 'female',
|
|
158
|
+
'accent': 'Standard Italian',
|
|
159
|
+
'license': 'Permissive (M-AILABS)',
|
|
160
|
+
'requires': 'espeak-ng',
|
|
161
|
+
'speed': 0.8 # Slow down to fix pace issues
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
def __init__(self, language='en', tts_model=None, whisper_model="tiny", debug_mode=False):
|
|
167
|
+
"""Initialize the Voice Manager with language support.
|
|
168
|
+
|
|
14
169
|
Args:
|
|
15
|
-
|
|
170
|
+
language: Language code ('en', 'fr', 'es', 'de', 'it')
|
|
171
|
+
tts_model: Specific TTS model name or None for language default
|
|
16
172
|
whisper_model: Whisper model name to use
|
|
17
173
|
debug_mode: Enable debug logging
|
|
18
174
|
"""
|
|
19
175
|
self.debug_mode = debug_mode
|
|
20
|
-
self.speed = 1.0
|
|
21
|
-
|
|
22
|
-
#
|
|
176
|
+
self.speed = 1.0
|
|
177
|
+
|
|
178
|
+
# Validate and set language
|
|
179
|
+
language = language.lower()
|
|
180
|
+
if language not in self.LANGUAGES:
|
|
181
|
+
if debug_mode:
|
|
182
|
+
available = ', '.join(self.LANGUAGES.keys())
|
|
183
|
+
print(f"⚠️ Unsupported language '{language}', using English. Available: {available}")
|
|
184
|
+
language = 'en'
|
|
185
|
+
self.language = language
|
|
186
|
+
|
|
187
|
+
# Select TTS model with smart detection
|
|
188
|
+
if tts_model is None:
|
|
189
|
+
tts_model = self._select_best_model(self.language)
|
|
190
|
+
if debug_mode:
|
|
191
|
+
lang_name = self.LANGUAGES[self.language]['name']
|
|
192
|
+
print(f"🌍 Using {lang_name} voice: {tts_model}")
|
|
193
|
+
|
|
194
|
+
# Initialize TTS engine using lazy import
|
|
195
|
+
TTSEngine = _import_tts_engine()
|
|
23
196
|
self.tts_engine = TTSEngine(
|
|
24
197
|
model_name=tts_model,
|
|
25
198
|
debug_mode=debug_mode
|
|
@@ -143,14 +316,16 @@ class VoiceManager:
|
|
|
143
316
|
def _transcription_handler(text):
|
|
144
317
|
if self._transcription_callback:
|
|
145
318
|
self._transcription_callback(text)
|
|
146
|
-
|
|
319
|
+
|
|
147
320
|
def _stop_handler():
|
|
148
321
|
# Stop listening
|
|
149
322
|
self.stop_listening()
|
|
150
323
|
# Call user's stop callback if provided
|
|
151
324
|
if self._stop_callback:
|
|
152
325
|
self._stop_callback()
|
|
153
|
-
|
|
326
|
+
|
|
327
|
+
# Use lazy import for VoiceRecognizer
|
|
328
|
+
VoiceRecognizer = _import_voice_recognizer()
|
|
154
329
|
self.voice_recognizer = VoiceRecognizer(
|
|
155
330
|
transcription_callback=_transcription_handler,
|
|
156
331
|
stop_callback=_stop_handler,
|
|
@@ -235,7 +410,8 @@ class VoiceManager:
|
|
|
235
410
|
# Stop any current speech
|
|
236
411
|
self.stop_speaking()
|
|
237
412
|
|
|
238
|
-
# Reinitialize TTS engine with new model
|
|
413
|
+
# Reinitialize TTS engine with new model using lazy import
|
|
414
|
+
TTSEngine = _import_tts_engine()
|
|
239
415
|
self.tts_engine = TTSEngine(
|
|
240
416
|
model_name=model_name,
|
|
241
417
|
debug_mode=self.debug_mode
|
|
@@ -262,11 +438,377 @@ class VoiceManager:
|
|
|
262
438
|
|
|
263
439
|
def get_whisper(self):
    """Report which Whisper model this manager is configured with.

    Returns:
        The value stored in ``self.whisper_model``.
    """
    current_model = self.whisper_model
    return current_model
|
|
446
|
+
|
|
447
|
+
def set_language(self, language):
    """Switch the voice to another supported language.

    The language code is lower-cased before lookup. When the language
    actually changes, current speech and the voice recognizer (if any) are
    stopped, then the model chosen by ``_select_best_model`` is loaded,
    with ``SAFE_FALLBACK`` tried second if the first one raises.

    Args:
        language: Language code ('en', 'fr', 'es', 'de', 'it')

    Returns:
        True if the language is already active or a model was loaded,
        False if the language is unsupported or every candidate model failed
    """
    language = language.lower()
    known = self.LANGUAGES

    # Guard: reject codes that are not in the language table.
    if language not in known:
        if self.debug_mode:
            available = ', '.join(known.keys())
            print(f"⚠️ Unsupported language '{language}'. Available: {available}")
        return False

    # Guard: nothing to do when the language is already active.
    if self.language == language:
        if self.debug_mode:
            print(f"✓ Already using {known[language]['name']} voice")
        return True

    # Quiesce ongoing activity before swapping the engine.
    self.stop_speaking()
    if self.voice_recognizer:
        self.voice_recognizer.stop()

    for candidate in (self._select_best_model(language), self.SAFE_FALLBACK):
        try:
            if self.debug_mode:
                print(f"🌍 Switching to {known[language]['name']} voice: {candidate}")

            engine_cls = _import_tts_engine()
            self.tts_engine = engine_cls(model_name=candidate, debug_mode=self.debug_mode)

            # A fresh engine has no hooks; re-attach the playback callbacks.
            self.tts_engine.on_playback_start = self._on_tts_start
            self.tts_engine.on_playback_end = self._on_tts_end

            self.language = language

            # Italian voices are slowed down; every other language is reset
            # to normal speed.
            slowed = language == 'it'
            self.speed = 0.8 if slowed else 1.0
            if slowed and self.debug_mode:
                print(f" Speed: {self.speed} (adjusted for optimal Italian pace)")

            return True
        except Exception as err:
            # Best effort: report the failure and move on to the next candidate.
            if self.debug_mode:
                print(f"⚠️ Model {candidate} failed: {err}")

    if self.debug_mode:
        print(f"❌ All models failed for language '{language}'")
    return False
|
|
515
|
+
|
|
516
|
+
def get_language(self):
    """Report the language code the manager is currently using.

    Returns:
        The value stored in ``self.language``
    """
    active_code = self.language
    return active_code
|
|
523
|
+
|
|
524
|
+
def get_supported_languages(self):
    """Enumerate the language codes present in ``LANGUAGES``.

    Returns:
        A new list of language codes, in the table's own order
    """
    return [code for code in self.LANGUAGES.keys()]
|
|
531
|
+
|
|
532
|
+
def get_language_name(self, language_code=None):
    """Translate a language code into its display name.

    Args:
        language_code: Language code; a falsy value means "use the current
            language"

    Returns:
        The ``'name'`` entry for the code, or the code itself when the
        language table has no name for it
    """
    code = language_code if language_code else self.language
    entry = self.LANGUAGES.get(code, {})
    return entry.get('name', code)
|
|
543
|
+
|
|
544
|
+
def _select_best_model(self, language):
    """Select the best available TTS model for a language.

    Candidates are tried in this order:

    1. the language's ``'premium'`` model, if it passes the compatibility
       probe;
    2. the language's ``'default'`` model, if it passes the probe;
    3. the first voice listed for the language in ``VOICE_CATALOG`` that
       passes the probe (this matters because ``'premium'`` and
       ``'default'`` may name the same model, in which case step 2 cannot
       succeed after step 1 failed);
    4. the ``'default'`` model anyway, leaving it to the caller to deal
       with a load failure.

    Args:
        language: Language code

    Returns:
        Model name string; ``SAFE_FALLBACK`` when the language is unknown
    """
    if language not in self.LANGUAGES:
        return self.SAFE_FALLBACK

    lang_config = self.LANGUAGES[language]
    verdicts = {}  # model name -> probe result, so each model is probed once

    def usable(model_name):
        # A probe that raises is treated the same as an incompatible model.
        if model_name not in verdicts:
            try:
                verdicts[model_name] = bool(self._test_model_compatibility(model_name))
            except Exception:
                verdicts[model_name] = False
        return verdicts[model_name]

    # Try premium model first (better quality)
    if 'premium' in lang_config:
        premium_model = lang_config['premium']
        if usable(premium_model):
            if self.debug_mode:
                print(f"✨ Using premium quality model: {premium_model}")
            return premium_model
        if self.debug_mode:
            print("⚠️ Premium model not compatible, using default")

    default_model = lang_config.get('default', self.SAFE_FALLBACK)

    # The default may be the very model that just failed; in that case look
    # for another voice of the same language that works on this system.
    if not usable(default_model):
        for voice_info in self.VOICE_CATALOG.get(language, {}).values():
            candidate = voice_info.get('model')
            if candidate and usable(candidate):
                if self.debug_mode:
                    print(f"🔧 Using compatible alternative model: {candidate}")
                return candidate

    # Use reliable default model
    if self.debug_mode:
        print(f"🔧 Using reliable default model: {default_model}")
    return default_model
|
|
580
|
+
|
|
581
|
+
def _test_model_compatibility(self, model_name):
    """Quick test if a model is compatible with current system.

    Models whose name contains ``vits`` (case-insensitive) are considered
    compatible only when ``espeak-ng --version`` can be executed and exits
    with status 0. All other models are assumed to work.

    Args:
        model_name: TTS model name

    Returns:
        True if compatible, False otherwise
    """
    # Non-VITS models are assumed to work (they're more compatible).
    if 'vits' not in model_name.lower():
        return True

    import subprocess

    try:
        result = subprocess.run(['espeak-ng', '--version'],
                                capture_output=True, timeout=2)
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing binary as well as one that exists but
        # cannot be executed (e.g. PermissionError); SubprocessError covers
        # the timeout. Either way espeak-ng is not usable.
        return False
    return result.returncode == 0
|
|
602
|
+
|
|
603
|
+
def set_voice_variant(self, language, variant):
    """Set a specific voice variant for a language.

    Variants are looked up in ``self.ALTERNATIVE_MODELS`` when such a table
    exists. When it does not, the table is derived from ``VOICE_CATALOG``:
    every voice id is a variant name, and each gender label ('male',
    'female', ...) is an alias for the first catalogue voice of that gender.

    Args:
        language: Language code ('fr', 'it')
        variant: Variant name ('female', 'mai_female_vits', etc.)

    Returns:
        False when the language or variant is unknown; otherwise the
        result of ``set_tts_model`` for the variant's model

    Examples:
        vm.set_voice_variant('it', 'female')  # Use female Italian voice
        vm.set_voice_variant('fr', 'mai_tacotron')  # Use the French Tacotron voice
    """
    alternatives = getattr(self, 'ALTERNATIVE_MODELS', None)
    if alternatives is None:
        # No explicit variant table: build one from the voice catalogue so
        # the method works instead of failing on the missing attribute.
        alternatives = {}
        for lang_code, lang_voices in self.VOICE_CATALOG.items():
            table = {voice_id: info['model'] for voice_id, info in lang_voices.items()}
            for info in lang_voices.values():
                # Gender aliases never shadow a real voice id.
                table.setdefault(info['gender'], info['model'])
            alternatives[lang_code] = table

    if language not in alternatives:
        if self.debug_mode:
            available_langs = ', '.join(alternatives.keys())
            print(f"⚠️ No variants available for '{language}'. Languages with variants: {available_langs}")
        return False

    if variant not in alternatives[language]:
        if self.debug_mode:
            available_variants = ', '.join(alternatives[language].keys())
            print(f"⚠️ Variant '{variant}' not available for {language}. Available: {available_variants}")
        return False

    # Get the specific model for this variant
    model_name = alternatives[language][variant]

    if self.debug_mode:
        # Fall back to the raw code if the language has no display name.
        lang_name = self.LANGUAGES.get(language, {}).get('name', language)
        print(f"🎭 Switching to {lang_name} {variant} voice: {model_name}")

    # Set the specific model
    return self.set_tts_model(model_name)
|
|
638
|
+
|
|
639
|
+
def get_model_info(self):
    """Summarise the active language and the model chosen for each language.

    Returns:
        Dict with the keys ``current_language``, ``language_name``,
        ``espeak_available``, ``supported_languages`` and ``models``; the
        last maps every supported language code to its display name, the
        model ``_select_best_model`` picks, and the configured default and
        premium model names
    """
    summary = {}
    summary['current_language'] = self.language
    summary['language_name'] = self.get_language_name()
    # 'test_vits' is a placeholder name: it only has to contain "vits" so
    # that the compatibility probe performs its espeak-ng check.
    summary['espeak_available'] = self._test_model_compatibility('test_vits')
    summary['supported_languages'] = self.get_supported_languages()

    per_language = {}
    summary['models'] = per_language
    for code in self.get_supported_languages():
        chosen = self._select_best_model(code)
        config = self.LANGUAGES[code]
        # The pick counts as "premium" only if it equals the premium entry.
        if chosen == config.get('premium', ''):
            tier = 'premium'
        else:
            tier = 'default'
        per_language[code] = {
            'name': config['name'],
            'selected_model': chosen,
            'quality': tier,
            'default_available': config.get('default', ''),
            'premium_available': config.get('premium', ''),
        }

    return summary
|
|
668
|
+
|
|
669
|
+
def browse_voices(self, language=None, quality=None, gender=None):
    """Return catalogue voices that match the given filters.

    Args:
        language: Language code ('en', 'fr', etc.) or None for all
        quality: 'premium', 'good', or None for all
        gender: 'male', 'female', 'multiple', or None for all

    Returns:
        Dict mapping language code to a dict of voice id -> metadata copy.
        Each copy carries an extra ``'compatible'`` entry; languages with no
        matching voice are left out
    """
    catalog = self.VOICE_CATALOG
    if language:
        wanted = [language]
    else:
        wanted = catalog.keys()

    matches_by_language = {}
    for code in wanted:
        if code not in catalog:
            continue

        matches = {}
        for voice_id, meta in catalog[code].items():
            # A filter applies only when it was given; a voice is dropped as
            # soon as one active filter disagrees with it.
            rejected = (quality and meta['quality'] != quality) or \
                       (gender and meta['gender'] != gender)
            if rejected:
                continue

            # Only voices that declare an espeak-ng requirement are probed;
            # the rest are reported as compatible.
            usable = (meta['requires'] != 'espeak-ng'
                      or self._test_model_compatibility(meta['model']))

            # Copy so the shared catalogue entry is never modified.
            matches[voice_id] = dict(meta, compatible=usable)

        if matches:
            matches_by_language[code] = matches

    return matches_by_language
|
|
711
|
+
|
|
712
|
+
def list_voices(self, language=None):
    """Print the voices returned by ``browse_voices`` in a readable layout.

    For each voice this prints its ``language.voice_id`` identifier with
    status icons, its accent and gender, its license (followed by a dataset
    link when the license text names a known dataset), and a notice when
    the voice is unusable because espeak-ng is required.

    Args:
        language: Language code or None for all languages
    """
    found = self.browse_voices(language)
    if not found:
        print("No voices found matching criteria.")
        return

    # Dataset name (as it appears inside a license string) -> home page.
    dataset_urls = {
        'CSS10': 'https://github.com/Kyubyong/CSS10',
        'M-AILABS': 'https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/',
        'LJSpeech': 'https://keithito.com/LJ-Speech-Dataset/',
        'VCTK': 'https://datashare.ed.ac.uk/handle/10283/3443',
        'Thorsten': 'https://www.thorsten-voice.de/en/',
    }
    gender_icons = {"male": "👨", "female": "👩", "multiple": "👥"}

    for code, entries in found.items():
        title = self.LANGUAGES.get(code, {}).get('name', code)
        print(f"\n🌍 {title} ({code}) - {len(entries)} voices available:")

        for voice_id, meta in entries.items():
            if meta['quality'] == 'premium':
                quality_icon = "✨"
            else:
                quality_icon = "🔧"
            if meta['compatible']:
                compat_icon = "✅"
            else:
                compat_icon = "⚠️"
            gender_icon = gender_icons.get(meta['gender'], "🗣️")

            # Identifier is shown in the language.voice_id form.
            print(f" {compat_icon} {quality_icon} {gender_icon} {code}.{voice_id}")
            print(f" {meta['accent']} - {meta['gender']} voice")

            # Append the link of the first known dataset named in the license.
            licence = meta['license']
            url = next((link for name, link in dataset_urls.items() if name in licence), None)
            shown = licence if url is None else f"{licence} - {url}"
            print(f" License: {shown}")

            if meta['requires'] == 'espeak-ng' and not meta['compatible']:
                print(" ⚠️ Requires: espeak-ng (install for premium quality)")
|
|
758
|
+
|
|
759
|
+
def set_voice(self, language, voice_id):
    """Set a specific voice by ID.

    The language code is lower-cased before lookup, as ``set_language``
    does. ``self.language`` and ``self.speed`` are updated for the new
    voice, and restored to their previous values if ``set_tts_model``
    raises or returns False, so a failed switch does not leave the manager
    claiming a voice it never loaded.

    Args:
        language: Language code
        voice_id: Voice ID from voice catalog

    Returns:
        False when the language or voice is unknown or the voice needs
        espeak-ng and it is unavailable; otherwise the result of
        ``set_tts_model``

    Example:
        vm.set_voice('fr', 'css10_vits')  # Use CSS10 French VITS voice
        vm.set_voice('it', 'mai_female_vits')  # Use female Italian VITS voice
    """
    if isinstance(language, str):
        language = language.lower()

    if language not in self.VOICE_CATALOG:
        if self.debug_mode:
            print(f"⚠️ Language '{language}' not available")
        return False

    if voice_id not in self.VOICE_CATALOG[language]:
        if self.debug_mode:
            available = ', '.join(self.VOICE_CATALOG[language].keys())
            print(f"⚠️ Voice '{voice_id}' not available for {language}. Available: {available}")
        return False

    voice_info = self.VOICE_CATALOG[language][voice_id]

    # Check compatibility
    if voice_info['requires'] == 'espeak-ng' and not self._test_model_compatibility(voice_info['model']):
        if self.debug_mode:
            print(f"⚠️ Voice '{voice_id}' requires espeak-ng. Install it for premium quality.")
        return False

    model_name = voice_info['model']
    if self.debug_mode:
        print(f"🎭 Setting {language} voice to: {voice_id}")
        print(f" Model: {model_name}")
        print(f" Quality: {voice_info['quality']} | Gender: {voice_info['gender']}")
        print(f" Accent: {voice_info['accent']}")

    # Remember the current state so it can be restored if the switch fails.
    previous_state = (self.language, self.speed)

    # Switch to the language and the voice-specific speed (1.0 by default).
    self.language = language
    if 'speed' in voice_info:
        self.speed = voice_info['speed']
        if self.debug_mode:
            print(f" Speed: {voice_info['speed']} (adjusted for optimal pace)")
    else:
        self.speed = 1.0  # Default speed

    try:
        outcome = self.set_tts_model(model_name)
    except Exception:
        self.language, self.speed = previous_state
        raise
    # Only an explicit False counts as failure; any other return value is
    # passed through without rolling back.
    if outcome is False:
        self.language, self.speed = previous_state
    return outcome
|
|
270
812
|
|
|
271
813
|
def change_vad_aggressiveness(self, aggressiveness):
|
|
272
814
|
"""Change VAD aggressiveness.
|