abstractvoice 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,50 @@
2
2
 
3
3
  import threading
4
4
  import time
5
- import pyaudio
6
- from .vad import VoiceDetector
7
- from .stt import Transcriber
5
+
6
+ # Lazy imports for heavy dependencies
7
def _import_audio_deps():
    """Return the ``pyaudio`` module, importing it only when first needed.

    Returns:
        The imported ``pyaudio`` module object.

    Raises:
        ImportError: ``pyaudio`` could not be imported. The message lists the
            ``abstractvoice`` extras to install and ends with the text of the
            original error, which is also chained as ``__cause__``.
    """
    try:
        import pyaudio
    except ImportError as exc:
        # Build the user-facing hint first so the raise itself stays short.
        hint = (
            "Audio functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[voice] # For basic audio\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return pyaudio
19
+
20
def _import_vad():
    """Return the ``VoiceDetector`` class from the sibling ``vad`` module.

    The import is performed at call time rather than at module load.

    Returns:
        The ``VoiceDetector`` class.

    Raises:
        ImportError: the import failed. When the failure text mentions
            ``webrtcvad`` the error is replaced by one listing the
            ``abstractvoice`` extras to install (original error chained);
            any other ``ImportError`` is re-raised untouched.
    """
    try:
        from .vad import VoiceDetector
    except ImportError as exc:
        # Only a webrtcvad-related failure gets the install hint; everything
        # else propagates exactly as it was raised.
        if "webrtcvad" not in str(exc):
            raise
        hint = (
            "Voice activity detection requires optional dependencies. Install with:\n"
            " pip install abstractvoice[voice] # For basic audio\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return VoiceDetector
34
+
35
def _import_transcriber():
    """Return the ``Transcriber`` class from the sibling ``stt`` module.

    The import is performed at call time rather than at module load.

    Returns:
        The ``Transcriber`` class.

    Raises:
        ImportError: the import failed. When the failure text mentions
            ``whisper`` or ``tiktoken`` the error is replaced by one listing
            the ``abstractvoice`` extras to install (original error chained);
            any other ``ImportError`` is re-raised untouched.
    """
    try:
        from .stt import Transcriber
    except ImportError as exc:
        detail = str(exc)
        # Only failures that name one of the speech-recognition packages get
        # the install hint; everything else propagates as raised.
        if not any(marker in detail for marker in ("whisper", "tiktoken")):
            raise
        hint = (
            "Speech recognition functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[stt] # For speech recognition only\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return Transcriber
8
49
 
9
50
 
10
51
  class VoiceRecognizer:
@@ -40,13 +81,15 @@ class VoiceRecognizer:
40
81
  self.min_speech_chunks = int(min_speech_duration / chunk_duration)
41
82
  self.silence_timeout_chunks = int(silence_timeout / chunk_duration)
42
83
 
43
- # Initialize components
84
+ # Initialize components using lazy imports
85
+ VoiceDetector = _import_vad()
44
86
  self.voice_detector = VoiceDetector(
45
87
  aggressiveness=vad_aggressiveness,
46
88
  sample_rate=sample_rate,
47
89
  debug_mode=debug_mode
48
90
  )
49
-
91
+
92
+ Transcriber = _import_transcriber()
50
93
  self.transcriber = Transcriber(
51
94
  model_name=whisper_model,
52
95
  min_transcription_length=min_transcription_length,
@@ -109,8 +152,8 @@ class VoiceRecognizer:
109
152
 
110
153
  def _recognition_loop(self):
111
154
  """Main recognition loop."""
112
- import pyaudio
113
-
155
+ pyaudio = _import_audio_deps()
156
+
114
157
  self.pyaudio = pyaudio.PyAudio()
115
158
  self.stream = self.pyaudio.open(
116
159
  format=pyaudio.paInt16,
@@ -1,11 +1,24 @@
1
1
  """Speech-to-text transcription using OpenAI's Whisper."""
2
2
 
3
- import whisper
4
3
  import numpy as np
5
4
  import os
6
5
  import sys
7
6
  import logging
8
7
 
8
+ # Lazy import for heavy dependencies
9
def _import_whisper():
    """Return the ``whisper`` module, importing it only when first needed.

    Returns:
        The imported ``whisper`` module object.

    Raises:
        ImportError: ``whisper`` could not be imported. The message lists the
            ``abstractvoice`` extras to install and ends with the text of the
            original error, which is also chained as ``__cause__``.
    """
    try:
        import whisper
    except ImportError as exc:
        hint = (
            "Speech recognition functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[stt] # For speech recognition only\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return whisper
21
+
9
22
 
10
23
  class Transcriber:
11
24
  """Transcribes audio using OpenAI's Whisper model."""
@@ -38,7 +51,8 @@ class Transcriber:
38
51
  null_out = open(os.devnull, 'w')
39
52
  sys.stdout = null_out
40
53
 
41
- # Load the Whisper model
54
+ # Load the Whisper model using lazy import
55
+ whisper = _import_whisper()
42
56
  self.model = whisper.load_model(model_name)
43
57
  finally:
44
58
  # Restore stdout if we redirected it
@@ -120,6 +134,7 @@ class Transcriber:
120
134
  sys.stdout = null_out
121
135
 
122
136
  try:
137
+ whisper = _import_whisper()
123
138
  self.model = whisper.load_model(model_name)
124
139
  self.model_name = model_name
125
140
  finally:
@@ -10,16 +10,97 @@ This module implements best practices for TTS synthesis including:
10
10
  import threading
11
11
  import time
12
12
  import numpy as np
13
- import sounddevice as sd
14
13
  import os
15
14
  import sys
16
15
  import logging
17
16
  import warnings
18
17
  import re
19
- from TTS.api import TTS
20
- import librosa
21
18
  import queue
22
19
 
20
+ # Lazy imports for heavy dependencies
21
def _import_tts():
    """Return the ``TTS`` class from ``TTS.api``, importing it only when needed.

    Failures recognised as dependency problems are re-raised as an
    ``ImportError`` whose message carries install/repair instructions followed
    by the original error text; the original exception is chained as
    ``__cause__``.

    Returns:
        The ``TTS`` class exported by ``TTS.api``.

    Raises:
        ImportError: ``TTS.api`` could not be imported, or importing it hit
            the known PyTorch/TorchVision version conflict.
        RuntimeError: importing ``TTS.api`` raised a ``RuntimeError`` that is
            not the known version conflict; it is propagated unchanged.
    """
    try:
        from TTS.api import TTS
    except (ImportError, RuntimeError) as e:
        # The "operator torchvision::nms does not exist" failure is reported
        # by PyTorch as a RuntimeError, not an ImportError, so RuntimeError
        # must be caught here for the conflict guidance below to be reachable.
        error_msg = str(e).lower()

        # Check for specific PyTorch/TorchVision conflicts
        if "torchvision::nms does not exist" in error_msg or "gpt2pretrainedmodel" in error_msg:
            raise ImportError(
                "❌ PyTorch/TorchVision version conflict detected!\n\n"
                "This is a known compatibility issue. To fix:\n\n"
                "1. Uninstall conflicting packages:\n"
                " pip uninstall torch torchvision torchaudio transformers\n\n"
                "2. Reinstall with compatible versions:\n"
                " pip install abstractvoice[all] # Installs tested compatible versions\n\n"
                "3. Or use specific PyTorch version:\n"
                " pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1\n"
                " pip install abstractvoice[voice-full]\n\n"
                "For conda environments, consider:\n"
                " conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n\n"
                f"Original error: {e}"
            ) from e

        if isinstance(e, RuntimeError):
            # An unrelated runtime failure is not a missing-dependency
            # problem; do not relabel it with install instructions.
            raise

        if "no module named 'tts'" in error_msg or "coqui" in error_msg:
            raise ImportError(
                "TTS functionality requires coqui-tts. Install with:\n"
                " pip install abstractvoice[tts] # For TTS only\n"
                " pip install abstractvoice[voice-full] # For complete voice functionality\n"
                " pip install abstractvoice[all] # For all features\n"
                f"Original error: {e}"
            ) from e

        # Generic import error
        raise ImportError(
            "TTS functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[tts] # For TTS only\n"
            " pip install abstractvoice[voice-full] # For complete voice functionality\n"
            " pip install abstractvoice[all] # For all features\n\n"
            "If you're getting PyTorch-related errors, try:\n"
            " pip install abstractvoice[core-tts] # Lightweight TTS without extras\n\n"
            f"Original error: {e}"
        ) from e
    return TTS
64
+
65
def _import_audio_deps():
    """Return ``(sounddevice, librosa)``, importing both only when needed.

    Both modules are always imported, even when the caller uses only one
    element of the returned pair.

    Returns:
        A 2-tuple ``(sd, librosa)`` of the imported module objects.

    Raises:
        ImportError: either module could not be loaded. The message is chosen
            from the failure text (sounddevice/PortAudio, librosa, or a
            generic hint), ends with the original error text, and chains the
            original exception as ``__cause__``.
    """
    try:
        import sounddevice as sd
        import librosa
    except (ImportError, OSError) as e:
        # sounddevice reports a missing PortAudio system library as OSError
        # at import time, so OSError is caught too; otherwise the PortAudio
        # guidance below would never be shown for that failure.
        error_msg = str(e).lower()

        if "sounddevice" in error_msg or "portaudio" in error_msg:
            raise ImportError(
                "Audio playback requires sounddevice. Install with:\n"
                " pip install abstractvoice[audio-only] # For audio processing only\n"
                " pip install abstractvoice[voice-full] # For complete voice functionality\n"
                " pip install abstractvoice[all] # For all features\n\n"
                "On some systems, you may need system audio libraries:\n"
                " Ubuntu/Debian: sudo apt-get install portaudio19-dev\n"
                " macOS: brew install portaudio\n"
                " Windows: Usually works out of the box\n\n"
                f"Original error: {e}"
            ) from e

        if "librosa" in error_msg:
            raise ImportError(
                "Audio processing requires librosa. Install with:\n"
                " pip install abstractvoice[tts] # For TTS functionality\n"
                " pip install abstractvoice[voice-full] # For complete voice functionality\n"
                " pip install abstractvoice[all] # For all features\n\n"
                f"Original error: {e}"
            ) from e

        # Generic audio import error
        raise ImportError(
            "Audio functionality requires optional dependencies. Install with:\n"
            " pip install abstractvoice[audio-only] # For audio processing only\n"
            " pip install abstractvoice[voice-full] # For complete voice functionality\n"
            " pip install abstractvoice[all] # For all features\n\n"
            f"Original error: {e}"
        ) from e
    return sd, librosa
103
+
23
104
  # Suppress the PyTorch FutureWarning about torch.load
24
105
  warnings.filterwarnings(
25
106
  "ignore",
@@ -103,6 +184,7 @@ def apply_speed_without_pitch_change(audio, speed, sr=22050):
103
184
  # rate < 1.0 makes audio slower (longer)
104
185
  # This matches our speed semantics
105
186
  try:
187
+ _, librosa = _import_audio_deps()
106
188
  stretched_audio = librosa.effects.time_stretch(audio, rate=speed)
107
189
  return stretched_audio
108
190
  except Exception as e:
@@ -189,6 +271,7 @@ class NonBlockingAudioPlayer:
189
271
  """Start the audio stream."""
190
272
  if self.stream is None:
191
273
  try:
274
+ sd, _ = _import_audio_deps()
192
275
  self.stream = sd.OutputStream(
193
276
  samplerate=self.sample_rate,
194
277
  channels=1, # Mono output
@@ -384,8 +467,9 @@ class TTSEngine:
384
467
  if self.debug_mode:
385
468
  print(f" > Loading TTS model: {model_name}")
386
469
 
387
- # Try to initialize TTS
470
+ # Try to initialize TTS using lazy import
388
471
  try:
472
+ TTS = _import_tts()
389
473
  self.tts = TTS(model_name=model_name, progress_bar=self.debug_mode)
390
474
  except Exception as e:
391
475
  error_msg = str(e).lower()
@@ -443,105 +527,124 @@ class TTSEngine:
443
527
  if self.on_playback_end:
444
528
  self.on_playback_end()
445
529
 
446
def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
    """Synthesize ``text`` chunk by chunk on a background thread and queue it for playback.

    The text is preprocessed, split into chunks of at most 300 characters,
    and each chunk is synthesized with ``self.tts`` and handed to
    ``self.audio_player.play_audio``. Synthesis runs in a daemon thread, so
    this method returns before audio has been produced.

    Args:
        text: Text to speak. A falsy value makes the method return ``False``.
        speed: Speed multiplier; values other than 1.0 are applied through
            ``apply_speed_without_pitch_change``.
        callback: Accepted for interface compatibility; this method does not
            invoke it.
        language: Language code passed to the model only when the model name
            contains ``'xtts'``; otherwise it is ignored.

    Returns:
        ``True`` once the synthesis thread has been started, ``False`` when
        ``text`` is empty or setup raised an exception.
    """
    # Stop any existing playback
    self.stop()

    if not text:
        return False

    try:
        # Preprocess text for better synthesis quality
        processed_text = preprocess_text(text)

        if self.debug_mode:
            print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
            print(f" > Text length: {len(processed_text)} chars")
            if language != 'en':
                print(f" > Language: {language}")
            if speed != 1.0:
                print(f" > Using speed multiplier: {speed}x")

        # For very long text, chunk it at natural boundaries
        text_chunks = chunk_long_text(processed_text, max_chunk_size=300)

        if self.debug_mode and len(text_chunks) > 1:
            print(f" > Split into {len(text_chunks)} chunks for processing")

        # Set playing state
        self.is_playing = True
        self.is_paused_state = False

        # Call start callback
        if self.on_playback_start:
            self.on_playback_start()

        # Synthesize and queue audio chunks
        def synthesis_worker():
            try:
                # Only XTTS models take a language argument. A missing or
                # non-string model name is treated as "not XTTS".
                model_name = str(getattr(self.tts, 'model_name', '') or '')
                supports_language = 'xtts' in model_name.lower()

                for i, chunk in enumerate(text_chunks):
                    if self.stop_flag.is_set():
                        break

                    if self.debug_mode and len(text_chunks) > 1:
                        print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")

                    if supports_language:
                        try:
                            chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
                            if self.debug_mode and language != 'en':
                                print(f" > Using XTTS with language: {language}")
                        except Exception as tts_error:
                            # Fallback: try without language parameter
                            if self.debug_mode:
                                print(f" > TTS with language failed, trying without: {tts_error}")
                            chunk_audio = self.tts.tts(chunk, split_sentences=True)
                    else:
                        # Monolingual model - ignore language parameter. A
                        # failure here is not retried: the fallback would be
                        # the identical call.
                        chunk_audio = self.tts.tts(chunk, split_sentences=True)
                        if self.debug_mode and language != 'en':
                            print(f" > Monolingual model - ignoring language parameter")

                    # Explicit None/length test: a bare truth test raises
                    # ValueError if the backend returns a multi-element
                    # numpy array instead of a list.
                    if chunk_audio is not None and len(chunk_audio) > 0:
                        # Apply speed adjustment
                        if speed != 1.0:
                            chunk_audio = apply_speed_without_pitch_change(
                                np.array(chunk_audio), speed
                            )

                        # Queue the audio for playback
                        self.audio_player.play_audio(np.array(chunk_audio))

                        if self.debug_mode:
                            print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")

                    # Small delay between chunks to prevent overwhelming the queue
                    time.sleep(0.01)

            except Exception as e:
                # Best effort: a synthesis failure ends the worker quietly and
                # is reported only in debug mode.
                # NOTE(review): is_playing is left as-is here; presumably the
                # audio player signals completion - confirm.
                if self.debug_mode:
                    print(f"Error in synthesis worker: {e}")

        # Start synthesis in background thread
        synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
        synthesis_thread.start()

        return True

    except Exception as e:
        if self.debug_mode:
            print(f"Error in _speak_with_nonblocking_player: {e}")
        self.is_playing = False
        return False
525
626
 
526
- def speak(self, text, speed=1.0, callback=None):
527
- """Convert text to speech and play audio.
528
-
627
+ def speak(self, text, speed=1.0, callback=None, language='en'):
628
+ """Convert text to speech and play audio with language support.
629
+
529
630
  Implements SOTA best practices for long text synthesis:
530
631
  - Text preprocessing and normalization
531
632
  - Intelligent chunking for very long text (>500 chars)
532
633
  - Sentence segmentation to prevent attention degradation
533
634
  - Seamless audio concatenation for chunks
534
-
635
+ - Multilingual support via XTTS models
636
+
535
637
  Args:
536
638
  text: Text to convert to speech
537
639
  speed: Speed multiplier (0.5-2.0)
538
640
  callback: Function to call when speech is complete
539
-
641
+ language: Language code for XTTS models ('en', 'fr', 'es', 'de', 'it', 'ru')
642
+
540
643
  Returns:
541
644
  True if speech started, False if text was empty
542
645
  """
543
646
  # Use the new non-blocking audio player for immediate pause/resume
544
- return self._speak_with_nonblocking_player(text, speed, callback)
647
+ return self._speak_with_nonblocking_player(text, speed, callback, language)
545
648
 
546
649
  if not text:
547
650
  return False
@@ -674,6 +777,9 @@ class TTSEngine:
674
777
  null_out.close()
675
778
 
676
779
  def _audio_playback():
780
+ # Import sounddevice at runtime to avoid loading heavy dependencies
781
+ sd, _ = _import_audio_deps()
782
+
677
783
  try:
678
784
  self.is_playing = True
679
785
  self.start_time = time.time()
@@ -1,8 +1,21 @@
1
1
  """Voice activity detection using WebRTC VAD."""
2
2
 
3
- import webrtcvad
4
3
  import logging
5
4
 
5
+ # Lazy import for heavy dependencies
6
def _import_webrtcvad():
    """Return the ``webrtcvad`` module, importing it only when first needed.

    Returns:
        The imported ``webrtcvad`` module object.

    Raises:
        ImportError: ``webrtcvad`` could not be imported. The message lists
            the ``abstractvoice`` extras to install and ends with the text of
            the original error, which is also chained as ``__cause__``.
    """
    try:
        import webrtcvad
    except ImportError as exc:
        hint = (
            "Voice activity detection requires optional dependencies. Install with:\n"
            " pip install abstractvoice[voice] # For basic audio\n"
            " pip install abstractvoice[all] # For all features\n"
            f"Original error: {exc}"
        )
        raise ImportError(hint) from exc
    return webrtcvad
18
+
6
19
 
7
20
  class VoiceDetector:
8
21
  """Detects voice activity in audio streams."""
@@ -23,8 +36,9 @@ class VoiceDetector:
23
36
  if sample_rate not in [8000, 16000, 32000, 48000]:
24
37
  raise ValueError("Sample rate must be 8000, 16000, 32000, or 48000 Hz")
25
38
 
26
- # Initialize WebRTC VAD
39
+ # Initialize WebRTC VAD using lazy import
27
40
  try:
41
+ webrtcvad = _import_webrtcvad()
28
42
  self.vad = webrtcvad.Vad(aggressiveness)
29
43
  if self.debug_mode:
30
44
  print(f" > VAD initialized with aggressiveness {aggressiveness}")