abstractvoice 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,198 @@
1
1
  """Main Voice Manager class for coordinating TTS and STT components."""
2
2
 
3
- from .tts import TTSEngine
4
- from .recognition import VoiceRecognizer
3
+ # Lazy imports - heavy dependencies are only imported when needed
4
def _import_tts_engine():
    """Load the ``TTSEngine`` class on demand and return it.

    The import happens inside the function so that the heavy TTS stack is
    only pulled in when a caller actually needs speech synthesis.

    Returns:
        The ``TTSEngine`` class from the sibling ``tts`` module.

    Raises:
        ImportError: With installation instructions when the failure message
            mentions ``TTS``, ``torch`` or ``librosa``; any other import
            failure is re-raised untouched.
    """
    try:
        from .tts import TTSEngine
    except ImportError as e:
        detail = str(e)
        # Only failures that name one of the optional packages get the
        # friendlier message; everything else propagates unchanged.
        if not any(marker in detail for marker in ("TTS", "torch", "librosa")):
            raise
        raise ImportError(
            "TTS functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[tts] # For TTS only\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {e}"
        ) from e
    else:
        return TTSEngine
18
+
19
def _import_voice_recognizer():
    """Load the ``VoiceRecognizer`` class on demand and return it.

    The import is deferred to call time so the speech-recognition stack is
    only required by callers that actually listen.

    Returns:
        The ``VoiceRecognizer`` class from the sibling ``recognition`` module.

    Raises:
        ImportError: With installation instructions when the failure message
            mentions ``whisper`` or ``tiktoken``; any other import failure is
            re-raised untouched.
    """
    try:
        from .recognition import VoiceRecognizer
    except ImportError as e:
        detail = str(e)
        # Unrelated import problems must surface exactly as they were raised.
        if not any(marker in detail for marker in ("whisper", "tiktoken")):
            raise
        raise ImportError(
            "Speech recognition functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[stt] # For speech recognition only\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {e}"
        ) from e
    else:
        return VoiceRecognizer
5
33
 
6
34
 
7
35
  class VoiceManager:
8
- """Main class for voice interaction capabilities."""
9
-
10
- def __init__(self, tts_model="tts_models/en/ljspeech/vits",
11
- whisper_model="tiny", debug_mode=False):
12
- """Initialize the Voice Manager.
13
-
36
+ """Main class for voice interaction capabilities with multilingual support."""
37
+
38
+ # Smart language configuration - high quality stable defaults
39
+ LANGUAGES = {
40
+ 'en': {
41
+ 'default': 'tts_models/en/ljspeech/vits', # High quality premium voice
42
+ 'premium': 'tts_models/en/ljspeech/vits', # Use same stable model
43
+ 'name': 'English'
44
+ },
45
+ 'fr': {
46
+ 'default': 'tts_models/fr/css10/vits', # High quality cleaner audio
47
+ 'premium': 'tts_models/fr/css10/vits', # Use same stable model
48
+ 'name': 'French'
49
+ },
50
+ 'es': {
51
+ 'default': 'tts_models/es/mai/tacotron2-DDC', # Keep stable Spanish model
52
+ 'premium': 'tts_models/es/mai/tacotron2-DDC', # Same model (reliable)
53
+ 'name': 'Spanish'
54
+ },
55
+ 'de': {
56
+ 'default': 'tts_models/de/thorsten/vits', # High quality German
57
+ 'premium': 'tts_models/de/thorsten/vits', # Use same stable model
58
+ 'name': 'German'
59
+ },
60
+ 'it': {
61
+ 'default': 'tts_models/it/mai_male/vits', # Use slower male voice as default
62
+ 'premium': 'tts_models/it/mai_male/vits', # Same stable model
63
+ 'name': 'Italian'
64
+ }
65
+ }
66
+
67
+ # Universal safe fallback
68
+ SAFE_FALLBACK = 'tts_models/en/ljspeech/fast_pitch'
69
+
70
+ # Complete voice catalog with metadata
71
+ VOICE_CATALOG = {
72
+ 'en': {
73
+ 'vits_premium': {
74
+ 'model': 'tts_models/en/ljspeech/vits',
75
+ 'quality': 'premium',
76
+ 'gender': 'female',
77
+ 'accent': 'US English',
78
+ 'license': 'Open source (LJSpeech)',
79
+ 'requires': 'espeak-ng'
80
+ },
81
+ 'fast_pitch_reliable': {
82
+ 'model': 'tts_models/en/ljspeech/fast_pitch',
83
+ 'quality': 'good',
84
+ 'gender': 'female',
85
+ 'accent': 'US English',
86
+ 'license': 'Open source (LJSpeech)',
87
+ 'requires': 'none'
88
+ },
89
+ 'vctk_multi': {
90
+ 'model': 'tts_models/en/vctk/vits',
91
+ 'quality': 'premium',
92
+ 'gender': 'multiple',
93
+ 'accent': 'British English',
94
+ 'license': 'Open source (VCTK)',
95
+ 'requires': 'espeak-ng'
96
+ }
97
+ },
98
+ 'fr': {
99
+ 'css10_vits': {
100
+ 'model': 'tts_models/fr/css10/vits',
101
+ 'quality': 'premium',
102
+ 'gender': 'male',
103
+ 'accent': 'France French',
104
+ 'license': 'Apache 2.0 (CSS10/LibriVox)',
105
+ 'requires': 'espeak-ng'
106
+ },
107
+ 'mai_tacotron': {
108
+ 'model': 'tts_models/fr/mai/tacotron2-DDC',
109
+ 'quality': 'good',
110
+ 'gender': 'female',
111
+ 'accent': 'France French',
112
+ 'license': 'Permissive (M-AILABS/LibriVox)',
113
+ 'requires': 'none'
114
+ }
115
+ },
116
+ 'es': {
117
+ 'mai_tacotron': {
118
+ 'model': 'tts_models/es/mai/tacotron2-DDC',
119
+ 'quality': 'good',
120
+ 'gender': 'female',
121
+ 'accent': 'Spain Spanish',
122
+ 'license': 'Permissive (M-AILABS)',
123
+ 'requires': 'none'
124
+ }
125
+ },
126
+ 'de': {
127
+ 'thorsten_vits': {
128
+ 'model': 'tts_models/de/thorsten/vits',
129
+ 'quality': 'premium',
130
+ 'gender': 'male',
131
+ 'accent': 'Standard German',
132
+ 'license': 'Open source (Thorsten)',
133
+ 'requires': 'espeak-ng'
134
+ },
135
+ 'thorsten_tacotron': {
136
+ 'model': 'tts_models/de/thorsten/tacotron2-DDC',
137
+ 'quality': 'good',
138
+ 'gender': 'male',
139
+ 'accent': 'Standard German',
140
+ 'license': 'Open source (Thorsten)',
141
+ 'requires': 'none'
142
+ }
143
+ },
144
+ 'it': {
145
+ 'mai_male_vits': {
146
+ 'model': 'tts_models/it/mai_male/vits',
147
+ 'quality': 'premium',
148
+ 'gender': 'male',
149
+ 'accent': 'Standard Italian',
150
+ 'license': 'Permissive (M-AILABS)',
151
+ 'requires': 'espeak-ng',
152
+ 'speed': 0.8 # Slow down to fix pace issues
153
+ },
154
+ 'mai_female_vits': {
155
+ 'model': 'tts_models/it/mai_female/vits',
156
+ 'quality': 'premium',
157
+ 'gender': 'female',
158
+ 'accent': 'Standard Italian',
159
+ 'license': 'Permissive (M-AILABS)',
160
+ 'requires': 'espeak-ng',
161
+ 'speed': 0.8 # Slow down to fix pace issues
162
+ }
163
+ }
164
+ }
165
+
166
+ def __init__(self, language='en', tts_model=None, whisper_model="tiny", debug_mode=False):
167
+ """Initialize the Voice Manager with language support.
168
+
14
169
  Args:
15
- tts_model: TTS model name to use
170
+ language: Language code ('en', 'fr', 'es', 'de', 'it')
171
+ tts_model: Specific TTS model name or None for language default
16
172
  whisper_model: Whisper model name to use
17
173
  debug_mode: Enable debug logging
18
174
  """
19
175
  self.debug_mode = debug_mode
20
- self.speed = 1.0
21
-
22
- # Initialize TTS engine
176
+ self.speed = 1.0
177
+
178
+ # Validate and set language
179
+ language = language.lower()
180
+ if language not in self.LANGUAGES:
181
+ if debug_mode:
182
+ available = ', '.join(self.LANGUAGES.keys())
183
+ print(f"⚠️ Unsupported language '{language}', using English. Available: {available}")
184
+ language = 'en'
185
+ self.language = language
186
+
187
+ # Select TTS model with smart detection
188
+ if tts_model is None:
189
+ tts_model = self._select_best_model(self.language)
190
+ if debug_mode:
191
+ lang_name = self.LANGUAGES[self.language]['name']
192
+ print(f"🌍 Using {lang_name} voice: {tts_model}")
193
+
194
+ # Initialize TTS engine using lazy import
195
+ TTSEngine = _import_tts_engine()
23
196
  self.tts_engine = TTSEngine(
24
197
  model_name=tts_model,
25
198
  debug_mode=debug_mode
@@ -143,14 +316,16 @@ class VoiceManager:
143
316
  def _transcription_handler(text):
144
317
  if self._transcription_callback:
145
318
  self._transcription_callback(text)
146
-
319
+
147
320
  def _stop_handler():
148
321
  # Stop listening
149
322
  self.stop_listening()
150
323
  # Call user's stop callback if provided
151
324
  if self._stop_callback:
152
325
  self._stop_callback()
153
-
326
+
327
+ # Use lazy import for VoiceRecognizer
328
+ VoiceRecognizer = _import_voice_recognizer()
154
329
  self.voice_recognizer = VoiceRecognizer(
155
330
  transcription_callback=_transcription_handler,
156
331
  stop_callback=_stop_handler,
@@ -235,7 +410,8 @@ class VoiceManager:
235
410
  # Stop any current speech
236
411
  self.stop_speaking()
237
412
 
238
- # Reinitialize TTS engine with new model
413
+ # Reinitialize TTS engine with new model using lazy import
414
+ TTSEngine = _import_tts_engine()
239
415
  self.tts_engine = TTSEngine(
240
416
  model_name=model_name,
241
417
  debug_mode=self.debug_mode
@@ -262,11 +438,377 @@ class VoiceManager:
262
438
 
263
439
def get_whisper(self):
    """Report which Whisper model this manager is configured with.

    Returns:
        The value stored in ``self.whisper_model``.
    """
    configured_model = self.whisper_model
    return configured_model
446
+
447
def set_language(self, language):
    """Switch the TTS voice to another supported language.

    Current speech is stopped, and the voice recognizer (when one exists) is
    stopped as well, before a new TTS engine is created. The model chosen by
    ``_select_best_model`` is tried first and ``SAFE_FALLBACK`` second; the
    first one that loads wins.

    Args:
        language: Language code ('en', 'fr', 'es', 'de', 'it'); matched
            case-insensitively.

    Returns:
        True if the language is already active or a model was loaded,
        False if the code is not a supported language (including ``None``
        or any non-string value) or every candidate model failed to load.
    """
    # A non-string (e.g. None) used to crash on ``.lower()``; the documented
    # contract is a boolean result, so treat it as an unsupported language.
    if not isinstance(language, str):
        if self.debug_mode:
            available = ', '.join(self.LANGUAGES.keys())
            print(f"⚠️ Unsupported language '{language}'. Available: {available}")
        return False

    # Validate language
    language = language.lower()
    if language not in self.LANGUAGES:
        if self.debug_mode:
            available = ', '.join(self.LANGUAGES.keys())
            print(f"⚠️ Unsupported language '{language}'. Available: {available}")
        return False

    # Nothing to do when the requested language is already active
    if language == self.language:
        if self.debug_mode:
            print(f"✓ Already using {self.LANGUAGES[language]['name']} voice")
        return True

    # Stop any current operations before swapping the engine
    self.stop_speaking()
    if self.voice_recognizer:
        self.voice_recognizer.stop()

    # Preferred model first, universal fallback second. dict.fromkeys keeps
    # the order but drops a duplicate, so the same model is never loaded twice.
    selected_model = self._select_best_model(language)
    models_to_try = list(dict.fromkeys([selected_model, self.SAFE_FALLBACK]))

    for model_name in models_to_try:
        try:
            if self.debug_mode:
                lang_name = self.LANGUAGES[language]['name']
                print(f"🌍 Switching to {lang_name} voice: {model_name}")

            # Reinitialize TTS engine (lazy import keeps heavy deps optional)
            TTSEngine = _import_tts_engine()
            self.tts_engine = TTSEngine(model_name=model_name, debug_mode=self.debug_mode)

            # Restore playback callbacks on the fresh engine
            self.tts_engine.on_playback_start = self._on_tts_start
            self.tts_engine.on_playback_end = self._on_tts_end

            self.language = language

            # Language-specific speed: Italian voices are slowed down,
            # every other language runs at the default speed.
            if language == 'it':
                self.speed = 0.8
                if self.debug_mode:
                    print(f" Speed: {self.speed} (adjusted for optimal Italian pace)")
            else:
                self.speed = 1.0

            return True

        except Exception as e:
            # Deliberate best-effort: a failing model is reported in debug
            # mode and the next candidate is tried.
            if self.debug_mode:
                print(f"⚠️ Model {model_name} failed: {e}")
            continue

    # All models failed
    if self.debug_mode:
        print(f"❌ All models failed for language '{language}'")
    return False
515
+
516
def get_language(self):
    """Report the language code the manager is currently using.

    Returns:
        The value stored in ``self.language``.
    """
    active_code = self.language
    return active_code
523
+
524
def get_supported_languages(self):
    """Enumerate the language codes defined in ``LANGUAGES``.

    Returns:
        A new list holding every key of ``self.LANGUAGES``, in the order the
        mapping defines them.
    """
    return [code for code in self.LANGUAGES.keys()]
531
+
532
def get_language_name(self, language_code=None):
    """Translate a language code into its display name.

    Args:
        language_code: Code to look up; when falsy, the manager's current
            language is used instead.

    Returns:
        The ``'name'`` entry for the code in ``LANGUAGES``, or the code
        itself when the code (or its name) is not present.
    """
    code = language_code if language_code else self.language
    if code not in self.LANGUAGES:
        return code
    return self.LANGUAGES[code].get('name', code)
543
+
544
def _select_best_model(self, language):
    """Pick the TTS model name to use for a language.

    The language's ``'premium'`` entry is returned when it exists and
    ``_test_model_compatibility`` accepts it. Otherwise the ``'default'``
    entry is returned, with ``SAFE_FALLBACK`` standing in when the language
    has no default or is not in ``LANGUAGES`` at all.

    Args:
        language: Language code

    Returns:
        Model name string
    """
    try:
        lang_config = self.LANGUAGES[language]
    except KeyError:
        return self.SAFE_FALLBACK

    if 'premium' in lang_config:
        try:
            candidate = lang_config['premium']
            # Probe the system before committing to the premium model
            if not self._test_model_compatibility(candidate):
                if self.debug_mode:
                    print(f"⚠️ Premium model not compatible, using default")
            else:
                if self.debug_mode:
                    print(f"✨ Using premium quality model: {candidate}")
                return candidate
        except Exception:
            # A failing probe is not fatal; fall through to the default
            if self.debug_mode:
                print(f"⚠️ Premium model failed, using default")

    fallback_choice = lang_config.get('default', self.SAFE_FALLBACK)
    if self.debug_mode:
        print(f"🔧 Using reliable default model: {fallback_choice}")
    return fallback_choice
580
+
581
def _test_model_compatibility(self, model_name):
    """Quick test if a model is compatible with the current system.

    Only model names containing ``vits`` (case-insensitive) are probed: for
    those, ``espeak-ng --version`` is run with a 2 second timeout and the
    model counts as compatible when the command exits with status 0. Every
    other model name is assumed to work.

    Args:
        model_name: TTS model name

    Returns:
        True if compatible, False otherwise
    """
    if 'vits' not in model_name.lower():
        # Non-VITS models are assumed to work without espeak-ng
        return True

    import subprocess

    try:
        result = subprocess.run(['espeak-ng', '--version'],
                                capture_output=True, timeout=2)
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing binary (FileNotFoundError) as well as one
        # that exists but cannot be executed (PermissionError and friends);
        # SubprocessError covers the timeout. A probe must answer False
        # rather than crash the caller in all of these cases.
        return False
    return result.returncode == 0
602
+
603
def set_voice_variant(self, language, variant):
    """Set a specific voice variant for a language.

    Variants are looked up in ``ALTERNATIVE_MODELS`` (language code ->
    variant name -> model name) and the matching model is handed to
    ``set_tts_model``.

    Args:
        language: Language code ('fr', 'it')
        variant: Variant name ('female', 'alternative', etc.)

    Returns:
        False when no variant table exists, or the language or variant is
        not in it; otherwise whatever ``set_tts_model`` returns for the
        variant's model.

    Examples:
        vm.set_voice_variant('it', 'female') # Use female Italian voice
        vm.set_voice_variant('fr', 'alternative') # Use original French model
    """
    # NOTE(review): no ALTERNATIVE_MODELS definition is visible next to
    # LANGUAGES / SAFE_FALLBACK / VOICE_CATALOG - confirm whether it exists
    # elsewhere. Reading it defensively makes a missing table mean "no
    # variants" instead of an AttributeError on every call.
    alternatives = getattr(self, 'ALTERNATIVE_MODELS', None) or {}

    if language not in alternatives:
        if self.debug_mode:
            available_langs = ', '.join(alternatives.keys())
            print(f"⚠️ No variants available for '{language}'. Languages with variants: {available_langs}")
        return False

    variants = alternatives[language]
    if variant not in variants:
        if self.debug_mode:
            available_variants = ', '.join(variants.keys())
            print(f"⚠️ Variant '{variant}' not available for {language}. Available: {available_variants}")
        return False

    # Get the specific model for this variant
    model_name = variants[language] if False else variants[variant]

    if self.debug_mode:
        # Fall back to the raw code so a language that has variants but no
        # LANGUAGES entry cannot raise KeyError in a debug print.
        lang_name = self.LANGUAGES.get(language, {}).get('name', language)
        print(f"🎭 Switching to {lang_name} {variant} voice: {model_name}")

    # Set the specific model
    return self.set_tts_model(model_name)
638
+
639
def get_model_info(self):
    """Summarize the active language and the model chosen for each language.

    Returns:
        Dict with the current language code and display name, whether the
        compatibility probe accepts a VITS-style model name
        (``'espeak_available'``), the supported language codes, and a
        ``'models'`` mapping describing the selected model for every
        supported language.
    """
    info = {
        'current_language': self.language,
        'language_name': self.get_language_name(),
        # 'test_vits' is only a name that triggers the VITS probe
        'espeak_available': self._test_model_compatibility('test_vits'),
        'supported_languages': self.get_supported_languages()
    }

    def describe(code):
        """Build the per-language entry for ``code``."""
        chosen = self._select_best_model(code)
        config = self.LANGUAGES[code]
        tier = 'premium' if chosen == config.get('premium', '') else 'default'
        return {
            'name': config['name'],
            'selected_model': chosen,
            'quality': tier,
            'default_available': config.get('default', ''),
            'premium_available': config.get('premium', '')
        }

    info['models'] = {code: describe(code) for code in self.get_supported_languages()}
    return info
668
+
669
def browse_voices(self, language=None, quality=None, gender=None):
    """Collect catalog voices that match the given filters.

    Args:
        language: Language code ('en', 'fr', etc.) or None for all
        quality: 'premium', 'good', or None for all
        gender: 'male', 'female', 'multiple', or None for all

    Returns:
        Dict mapping language code to a dict of voice id -> metadata. Each
        metadata dict is a copy of the catalog entry with an added
        ``'compatible'`` flag; languages with no matching voice are left out.
    """
    catalog = self.VOICE_CATALOG
    codes = [language] if language else catalog.keys()

    voices = {}
    for code in codes:
        if code not in catalog:
            continue

        matching = {}
        for voice_id, meta in catalog[code].items():
            rejected_by_quality = quality and meta['quality'] != quality
            if rejected_by_quality:
                continue
            rejected_by_gender = gender and meta['gender'] != gender
            if rejected_by_gender:
                continue

            # Only voices that declare an espeak-ng requirement are probed;
            # everything else is reported as compatible.
            compatible = (
                meta['requires'] != 'espeak-ng'
                or self._test_model_compatibility(meta['model'])
            )

            # Copy so the shared catalog entry is never modified
            entry = meta.copy()
            entry['compatible'] = compatible
            matching[voice_id] = entry

        if matching:
            voices[code] = matching

    return voices
711
+
712
def list_voices(self, language=None):
    """Print the available voices in a human-readable listing.

    For every language returned by ``browse_voices`` a header line is
    printed, followed by one entry per voice showing compatibility, quality
    and gender icons, the ``language.voice_id`` identifier, accent, license
    (with a dataset link when one is known) and an espeak-ng hint for
    incompatible voices that need it.

    Args:
        language: Language code or None for all languages
    """
    voices = self.browse_voices(language)
    if not voices:
        print("No voices found matching criteria.")
        return

    # Dataset name (as it appears in the license text) -> project page
    license_links = {
        'CSS10': 'https://github.com/Kyubyong/CSS10',
        'M-AILABS': 'https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/',
        'LJSpeech': 'https://keithito.com/LJ-Speech-Dataset/',
        'VCTK': 'https://datashare.ed.ac.uk/handle/10283/3443',
        'Thorsten': 'https://www.thorsten-voice.de/en/'
    }
    gender_icons = {"male": "👨", "female": "👩", "multiple": "👥"}

    for lang, lang_voices in voices.items():
        lang_name = self.LANGUAGES.get(lang, {}).get('name', lang)
        print(f"\n🌍 {lang_name} ({lang}) - {len(lang_voices)} voices available:")

        for voice_id, meta in lang_voices.items():
            if meta['quality'] == 'premium':
                quality_icon = "✨"
            else:
                quality_icon = "🔧"
            if meta['compatible']:
                compat_icon = "✅"
            else:
                compat_icon = "⚠️"
            gender_icon = gender_icons.get(meta['gender'], "🗣️")

            # Identifier shown in full form: language.voice_id
            full_voice_id = f"{lang}.{voice_id}"
            print(f" {compat_icon} {quality_icon} {gender_icon} {full_voice_id}")
            print(f" {meta['accent']} - {meta['gender']} voice")

            # Append the link of the first dataset named in the license text
            license_text = meta['license']
            link = next(
                (url for dataset, url in license_links.items() if dataset in license_text),
                None,
            )
            if link is None:
                license_with_link = license_text
            else:
                license_with_link = f"{license_text} - {link}"

            print(f" License: {license_with_link}")
            if not meta['compatible'] and meta['requires'] == 'espeak-ng':
                print(f" ⚠️ Requires: espeak-ng (install for premium quality)")
758
+
759
def set_voice(self, language, voice_id):
    """Set a specific voice by ID.

    The voice is looked up in ``VOICE_CATALOG``. Voices that declare an
    espeak-ng requirement are rejected when the compatibility probe fails.
    On acceptance the manager's language and speed are updated (the voice's
    own ``'speed'`` if it has one, 1.0 otherwise) and the model is loaded via
    ``set_tts_model``.

    Args:
        language: Language code
        voice_id: Voice ID from voice catalog

    Returns:
        False if the language or voice is unknown or the voice's espeak-ng
        requirement is not met; otherwise the result of ``set_tts_model``.

    Raises:
        Whatever ``set_tts_model`` raises; the previous language and speed
        are restored before the exception propagates.

    Example:
        vm.set_voice('fr', 'css10_vits') # Use CSS10 French VITS voice
        vm.set_voice('it', 'mai_female_vits') # Use female Italian VITS voice
    """
    catalog = self.VOICE_CATALOG

    if language not in catalog:
        if self.debug_mode:
            print(f"⚠️ Language '{language}' not available")
        return False

    if voice_id not in catalog[language]:
        if self.debug_mode:
            available = ', '.join(catalog[language].keys())
            print(f"⚠️ Voice '{voice_id}' not available for {language}. Available: {available}")
        return False

    voice_info = catalog[language][voice_id]

    # Check compatibility
    if voice_info['requires'] == 'espeak-ng' and not self._test_model_compatibility(voice_info['model']):
        if self.debug_mode:
            print(f"⚠️ Voice '{voice_id}' requires espeak-ng. Install it for premium quality.")
        return False

    model_name = voice_info['model']
    if self.debug_mode:
        print(f"🎭 Setting {language} voice to: {voice_id}")
        print(f" Model: {model_name}")
        print(f" Quality: {voice_info['quality']} | Gender: {voice_info['gender']}")
        print(f" Accent: {voice_info['accent']}")

    # Remember the current state so a failed model switch cannot leave the
    # manager claiming a language/speed whose model never loaded.
    previous_language, previous_speed = self.language, self.speed

    # Language and speed are still set before the model is loaded, exactly
    # as before, in case set_tts_model reads them.
    self.language = language
    if 'speed' in voice_info:
        self.speed = voice_info['speed']
        if self.debug_mode:
            print(f" Speed: {voice_info['speed']} (adjusted for optimal pace)")
    else:
        self.speed = 1.0  # Default speed

    try:
        result = self.set_tts_model(model_name)
    except Exception:
        self.language, self.speed = previous_language, previous_speed
        raise

    # Only an explicit False counts as failure; set_tts_model's return
    # contract is not visible here, so None is left untouched.
    if result is False:
        self.language, self.speed = previous_language, previous_speed
    return result
270
812
 
271
813
  def change_vad_aggressiveness(self, aggressiveness):
272
814
  """Change VAD aggressiveness.