PyPI - abstractvoice - Versions diffs - 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

abstractvoice 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

abstractvoice/__main__.py +33 -11
abstractvoice/dependency_check.py +274 -0
abstractvoice/examples/cli_repl.py +198 -13
abstractvoice/examples/voice_cli.py +20 -6
abstractvoice/recognition.py +50 -7
abstractvoice/stt/transcriber.py +17 -2
abstractvoice/tts/tts_engine.py +138 -32
abstractvoice/vad/voice_detector.py +16 -2
abstractvoice/voice_manager.py +558 -16
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/METADATA +196 -50
abstractvoice-0.2.1.dist-info/RECORD +21 -0
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/licenses/LICENSE +1 -1
abstractvoice-0.1.1.dist-info/RECORD +0 -20
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/WHEEL +0 -0
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/entry_points.txt +0 -0
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/top_level.txt +0 -0

abstractvoice/recognition.py CHANGED Viewed

@@ -2,9 +2,50 @@
 import threading
 import time
-import pyaudio
-from .vad import VoiceDetector
-from .stt import Transcriber
+# Lazy imports for heavy dependencies
+def _import_audio_deps():
+    """Import audio dependencies with helpful error message if missing."""
+    try:
+        import pyaudio
+        return pyaudio
+    except ImportError as e:
+        raise ImportError(
+            "Audio functionality requires optional dependencies. Install with:\n"
+            "  pip install abstractvoice[voice]  # For basic audio\n"
+            "  pip install abstractvoice[all]    # For all features\n"
+            f"Original error: {e}"
+        ) from e
+def _import_vad():
+    """Import VoiceDetector with helpful error message if dependencies missing."""
+    try:
+        from .vad import VoiceDetector
+        return VoiceDetector
+    except ImportError as e:
+        if "webrtcvad" in str(e):
+            raise ImportError(
+                "Voice activity detection requires optional dependencies. Install with:\n"
+                "  pip install abstractvoice[voice]  # For basic audio\n"
+                "  pip install abstractvoice[all]    # For all features\n"
+                f"Original error: {e}"
+            ) from e
+        raise
+def _import_transcriber():
+    """Import Transcriber with helpful error message if dependencies missing."""
+    try:
+        from .stt import Transcriber
+        return Transcriber
+    except ImportError as e:
+        if "whisper" in str(e) or "tiktoken" in str(e):
+            raise ImportError(
+                "Speech recognition functionality requires optional dependencies. Install with:\n"
+                "  pip install abstractvoice[stt]    # For speech recognition only\n"
+                "  pip install abstractvoice[all]    # For all features\n"
+                f"Original error: {e}"
+            ) from e
+        raise
 class VoiceRecognizer:
@@ -40,13 +81,15 @@ class VoiceRecognizer:
         self.min_speech_chunks = int(min_speech_duration / chunk_duration)
         self.silence_timeout_chunks = int(silence_timeout / chunk_duration)
-        # Initialize components
+        # Initialize components using lazy imports
+        VoiceDetector = _import_vad()
         self.voice_detector = VoiceDetector(
             aggressiveness=vad_aggressiveness,
             sample_rate=sample_rate,
             debug_mode=debug_mode
         )
+        Transcriber = _import_transcriber()
         self.transcriber = Transcriber(
             model_name=whisper_model,
             min_transcription_length=min_transcription_length,
@@ -109,8 +152,8 @@ class VoiceRecognizer:
     def _recognition_loop(self):
         """Main recognition loop."""
-        import pyaudio
+        pyaudio = _import_audio_deps()
         self.pyaudio = pyaudio.PyAudio()
         self.stream = self.pyaudio.open(
             format=pyaudio.paInt16,

abstractvoice/stt/transcriber.py CHANGED Viewed

@@ -1,11 +1,24 @@
 """Speech-to-text transcription using OpenAI's Whisper."""
-import whisper
 import numpy as np
 import os
 import sys
 import logging
+# Lazy import for heavy dependencies
+def _import_whisper():
+    """Import whisper with helpful error message if dependencies missing."""
+    try:
+        import whisper
+        return whisper
+    except ImportError as e:
+        raise ImportError(
+            "Speech recognition functionality requires optional dependencies. Install with:\n"
+            "  pip install abstractvoice[stt]    # For speech recognition only\n"
+            "  pip install abstractvoice[all]    # For all features\n"
+            f"Original error: {e}"
+        ) from e
 class Transcriber:
     """Transcribes audio using OpenAI's Whisper model."""
@@ -38,7 +51,8 @@ class Transcriber:
                 null_out = open(os.devnull, 'w')
                 sys.stdout = null_out
-            # Load the Whisper model
+            # Load the Whisper model using lazy import
+            whisper = _import_whisper()
             self.model = whisper.load_model(model_name)
         finally:
             # Restore stdout if we redirected it
@@ -120,6 +134,7 @@ class Transcriber:
                 sys.stdout = null_out
             try:
+                whisper = _import_whisper()
                 self.model = whisper.load_model(model_name)
                 self.model_name = model_name
             finally:

abstractvoice/tts/tts_engine.py CHANGED Viewed

@@ -10,16 +10,97 @@ This module implements best practices for TTS synthesis including:
 import threading
 import time
 import numpy as np
-import sounddevice as sd
 import os
 import sys
 import logging
 import warnings
 import re
-from TTS.api import TTS
-import librosa
 import queue
+# Lazy imports for heavy dependencies
+def _import_tts():
+    """Import TTS with helpful error message if dependencies missing."""
+    try:
+        from TTS.api import TTS
+        return TTS
+    except ImportError as e:
+        error_msg = str(e).lower()
+        # Check for specific PyTorch/TorchVision conflicts
+        if "torchvision::nms does not exist" in error_msg or "gpt2pretrainedmodel" in error_msg:
+            raise ImportError(
+                "❌ PyTorch/TorchVision version conflict detected!\n\n"
+                "This is a known compatibility issue. To fix:\n\n"
+                "1. Uninstall conflicting packages:\n"
+                "   pip uninstall torch torchvision torchaudio transformers\n\n"
+                "2. Reinstall with compatible versions:\n"
+                "   pip install abstractvoice[all]  # Installs tested compatible versions\n\n"
+                "3. Or use specific PyTorch version:\n"
+                "   pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1\n"
+                "   pip install abstractvoice[voice-full]\n\n"
+                "For conda environments, consider:\n"
+                "   conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n\n"
+                f"Original error: {e}"
+            ) from e
+        elif "no module named 'tts'" in error_msg or "coqui" in error_msg:
+            raise ImportError(
+                "TTS functionality requires coqui-tts. Install with:\n"
+                "  pip install abstractvoice[tts]        # For TTS only\n"
+                "  pip install abstractvoice[voice-full] # For complete voice functionality\n"
+                "  pip install abstractvoice[all]        # For all features\n"
+                f"Original error: {e}"
+            ) from e
+        else:
+            # Generic import error
+            raise ImportError(
+                "TTS functionality requires optional dependencies. Install with:\n"
+                "  pip install abstractvoice[tts]        # For TTS only\n"
+                "  pip install abstractvoice[voice-full] # For complete voice functionality\n"
+                "  pip install abstractvoice[all]        # For all features\n\n"
+                "If you're getting PyTorch-related errors, try:\n"
+                "  pip install abstractvoice[core-tts]   # Lightweight TTS without extras\n\n"
+                f"Original error: {e}"
+            ) from e
+def _import_audio_deps():
+    """Import audio dependencies with helpful error message if missing."""
+    try:
+        import sounddevice as sd
+        import librosa
+        return sd, librosa
+    except ImportError as e:
+        error_msg = str(e).lower()
+        if "sounddevice" in error_msg:
+            raise ImportError(
+                "Audio playback requires sounddevice. Install with:\n"
+                "  pip install abstractvoice[audio-only]  # For audio processing only\n"
+                "  pip install abstractvoice[voice-full]  # For complete voice functionality\n"
+                "  pip install abstractvoice[all]         # For all features\n\n"
+                "On some systems, you may need system audio libraries:\n"
+                "  Ubuntu/Debian: sudo apt-get install portaudio19-dev\n"
+                "  macOS: brew install portaudio\n"
+                "  Windows: Usually works out of the box\n\n"
+                f"Original error: {e}"
+            ) from e
+        elif "librosa" in error_msg:
+            raise ImportError(
+                "Audio processing requires librosa. Install with:\n"
+                "  pip install abstractvoice[tts]         # For TTS functionality\n"
+                "  pip install abstractvoice[voice-full]  # For complete voice functionality\n"
+                "  pip install abstractvoice[all]         # For all features\n\n"
+                f"Original error: {e}"
+            ) from e
+        else:
+            # Generic audio import error
+            raise ImportError(
+                "Audio functionality requires optional dependencies. Install with:\n"
+                "  pip install abstractvoice[audio-only]  # For audio processing only\n"
+                "  pip install abstractvoice[voice-full]  # For complete voice functionality\n"
+                "  pip install abstractvoice[all]         # For all features\n\n"
+                f"Original error: {e}"
+            ) from e
 # Suppress the PyTorch FutureWarning about torch.load
 warnings.filterwarnings(
     "ignore",
@@ -103,6 +184,7 @@ def apply_speed_without_pitch_change(audio, speed, sr=22050):
     # rate < 1.0 makes audio slower (longer)
     # This matches our speed semantics
     try:
+        _, librosa = _import_audio_deps()
         stretched_audio = librosa.effects.time_stretch(audio, rate=speed)
         return stretched_audio
     except Exception as e:
@@ -189,6 +271,7 @@ class NonBlockingAudioPlayer:
         """Start the audio stream."""
         if self.stream is None:
             try:
+                sd, _ = _import_audio_deps()
                 self.stream = sd.OutputStream(
                     samplerate=self.sample_rate,
                     channels=1,  # Mono output
@@ -384,8 +467,9 @@ class TTSEngine:
             if self.debug_mode:
                 print(f" > Loading TTS model: {model_name}")
-            # Try to initialize TTS
+            # Try to initialize TTS using lazy import
             try:
+                TTS = _import_tts()
                 self.tts = TTS(model_name=model_name, progress_bar=self.debug_mode)
             except Exception as e:
                 error_msg = str(e).lower()
@@ -443,105 +527,124 @@ class TTSEngine:
         if self.on_playback_end:
             self.on_playback_end()
-    def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None):
-        """Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume."""
+    def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
+        """Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume with language support."""
         # Stop any existing playback
         self.stop()
         if not text:
             return False
         try:
             # Preprocess text for better synthesis quality
             processed_text = preprocess_text(text)
             if self.debug_mode:
                 print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
                 print(f" > Text length: {len(processed_text)} chars")
+                if language != 'en':
+                    print(f" > Language: {language}")
                 if speed != 1.0:
                     print(f" > Using speed multiplier: {speed}x")
             # For very long text, chunk it at natural boundaries
             text_chunks = chunk_long_text(processed_text, max_chunk_size=300)
             if self.debug_mode and len(text_chunks) > 1:
                 print(f" > Split into {len(text_chunks)} chunks for processing")
             # Set playing state
             self.is_playing = True
             self.is_paused_state = False
             # Call start callback
             if self.on_playback_start:
                 self.on_playback_start()
             # Synthesize and queue audio chunks
             def synthesis_worker():
                 try:
                     for i, chunk in enumerate(text_chunks):
                         if self.stop_flag.is_set():
                             break
                         if self.debug_mode and len(text_chunks) > 1:
                             print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")
-                        # Generate audio for this chunk
-                        chunk_audio = self.tts.tts(chunk, split_sentences=True)
+                        # Generate audio for this chunk with language support
+                        try:
+                            # Check if this is an XTTS model (supports language parameter)
+                            if 'xtts' in self.tts.model_name.lower():
+                                chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
+                                if self.debug_mode and language != 'en':
+                                    print(f" > Using XTTS with language: {language}")
+                            else:
+                                # Monolingual model - ignore language parameter
+                                chunk_audio = self.tts.tts(chunk, split_sentences=True)
+                                if self.debug_mode and language != 'en':
+                                    print(f" > Monolingual model - ignoring language parameter")
+                        except Exception as tts_error:
+                            # Fallback: try without language parameter
+                            if self.debug_mode:
+                                print(f" > TTS with language failed, trying without: {tts_error}")
+                            chunk_audio = self.tts.tts(chunk, split_sentences=True)
                         if chunk_audio and len(chunk_audio) > 0:
                             # Apply speed adjustment
                             if speed != 1.0:
                                 chunk_audio = apply_speed_without_pitch_change(
                                     np.array(chunk_audio), speed
                                 )
                             # Queue the audio for playback
                             self.audio_player.play_audio(np.array(chunk_audio))
                             if self.debug_mode:
                                 print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")
                         # Small delay between chunks to prevent overwhelming the queue
                         time.sleep(0.01)
                 except Exception as e:
                     if self.debug_mode:
                         print(f"Error in synthesis worker: {e}")
                 finally:
                     # Synthesis complete - audio player will handle completion callback
                     pass
             # Start synthesis in background thread
             synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
             synthesis_thread.start()
             return True
         except Exception as e:
             if self.debug_mode:
                 print(f"Error in _speak_with_nonblocking_player: {e}")
             self.is_playing = False
             return False
-    def speak(self, text, speed=1.0, callback=None):
-        """Convert text to speech and play audio.
+    def speak(self, text, speed=1.0, callback=None, language='en'):
+        """Convert text to speech and play audio with language support.
         Implements SOTA best practices for long text synthesis:
         - Text preprocessing and normalization
         - Intelligent chunking for very long text (>500 chars)
         - Sentence segmentation to prevent attention degradation
         - Seamless audio concatenation for chunks
+        - Multilingual support via XTTS models
         Args:
             text: Text to convert to speech
             speed: Speed multiplier (0.5-2.0)
             callback: Function to call when speech is complete
+            language: Language code for XTTS models ('en', 'fr', 'es', 'de', 'it', 'ru')
         Returns:
             True if speech started, False if text was empty
         """
         # Use the new non-blocking audio player for immediate pause/resume
-        return self._speak_with_nonblocking_player(text, speed, callback)
+        return self._speak_with_nonblocking_player(text, speed, callback, language)
         if not text:
             return False
@@ -674,6 +777,9 @@ class TTSEngine:
                         null_out.close()
             def _audio_playback():
+                # Import sounddevice at runtime to avoid loading heavy dependencies
+                sd, _ = _import_audio_deps()
                 try:
                     self.is_playing = True
                     self.start_time = time.time()

abstractvoice/vad/voice_detector.py CHANGED Viewed

@@ -1,8 +1,21 @@
 """Voice activity detection using WebRTC VAD."""
-import webrtcvad
 import logging
+# Lazy import for heavy dependencies
+def _import_webrtcvad():
+    """Import webrtcvad with helpful error message if dependencies missing."""
+    try:
+        import webrtcvad
+        return webrtcvad
+    except ImportError as e:
+        raise ImportError(
+            "Voice activity detection requires optional dependencies. Install with:\n"
+            "  pip install abstractvoice[voice]  # For basic audio\n"
+            "  pip install abstractvoice[all]    # For all features\n"
+            f"Original error: {e}"
+        ) from e
 class VoiceDetector:
     """Detects voice activity in audio streams."""
@@ -23,8 +36,9 @@ class VoiceDetector:
         if sample_rate not in [8000, 16000, 32000, 48000]:
             raise ValueError("Sample rate must be 8000, 16000, 32000, or 48000 Hz")
-        # Initialize WebRTC VAD
+        # Initialize WebRTC VAD using lazy import
         try:
+            webrtcvad = _import_webrtcvad()
             self.vad = webrtcvad.Vad(aggressiveness)
             if self.debug_mode:
                 print(f" > VAD initialized with aggressiveness {aggressiveness}")

abstractvoice 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

abstractvoice 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl