PyPI - abstractvoice - Versions diffs - 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

abstractvoice-0.1.1-py3-none-any.whl → abstractvoice-0.2.0-py3-none-any.whl

This diff compares the contents of two publicly available package versions, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (15)

abstractvoice/__main__.py +20 -10
abstractvoice/examples/cli_repl.py +198 -13
abstractvoice/examples/voice_cli.py +20 -6
abstractvoice/recognition.py +50 -7
abstractvoice/stt/transcriber.py +17 -2
abstractvoice/tts/tts_engine.py +84 -32
abstractvoice/vad/voice_detector.py +16 -2
abstractvoice/voice_manager.py +558 -16
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/METADATA +228 -50
abstractvoice-0.2.0.dist-info/RECORD +20 -0
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/licenses/LICENSE +1 -1
abstractvoice-0.1.1.dist-info/RECORD +0 -20
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/WHEEL +0 -0
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/entry_points.txt +0 -0
{abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/top_level.txt +0 -0

abstractvoice/tts/tts_engine.py CHANGED Viewed

@@ -10,16 +10,43 @@ This module implements best practices for TTS synthesis including:
 import threading
 import time
 import numpy as np
-import sounddevice as sd
 import os
 import sys
 import logging
 import warnings
 import re
-from TTS.api import TTS
-import librosa
 import queue
+# Lazy imports for heavy dependencies
+def _import_tts():
+    """Import TTS with helpful error message if dependencies missing."""
+    try:
+        from TTS.api import TTS
+        return TTS
+    except ImportError as e:
+        raise ImportError(
+            "TTS functionality requires optional dependencies. Install with:\n"
+            "  pip install abstractvoice[tts]    # For TTS only\n"
+            "  pip install abstractvoice[all]    # For all features\n"
+            f"Original error: {e}"
+        ) from e
+def _import_audio_deps():
+    """Import audio dependencies with helpful error message if missing."""
+    try:
+        import sounddevice as sd
+        import librosa
+        return sd, librosa
+    except ImportError as e:
+        if "sounddevice" in str(e) or "librosa" in str(e):
+            raise ImportError(
+                "Audio functionality requires optional dependencies. Install with:\n"
+                "  pip install abstractvoice[voice]  # For basic audio\n"
+                "  pip install abstractvoice[all]    # For all features\n"
+                f"Original error: {e}"
+            ) from e
+        raise
 # Suppress the PyTorch FutureWarning about torch.load
 warnings.filterwarnings(
     "ignore",
@@ -103,6 +130,7 @@ def apply_speed_without_pitch_change(audio, speed, sr=22050):
     # rate < 1.0 makes audio slower (longer)
     # This matches our speed semantics
     try:
+        _, librosa = _import_audio_deps()
         stretched_audio = librosa.effects.time_stretch(audio, rate=speed)
         return stretched_audio
     except Exception as e:
@@ -189,6 +217,7 @@ class NonBlockingAudioPlayer:
         """Start the audio stream."""
         if self.stream is None:
             try:
+                sd, _ = _import_audio_deps()
                 self.stream = sd.OutputStream(
                     samplerate=self.sample_rate,
                     channels=1,  # Mono output
@@ -384,8 +413,9 @@ class TTSEngine:
             if self.debug_mode:
                 print(f" > Loading TTS model: {model_name}")
-            # Try to initialize TTS
+            # Try to initialize TTS using lazy import
             try:
+                TTS = _import_tts()
                 self.tts = TTS(model_name=model_name, progress_bar=self.debug_mode)
             except Exception as e:
                 error_msg = str(e).lower()
@@ -443,105 +473,124 @@ class TTSEngine:
         if self.on_playback_end:
             self.on_playback_end()
-    def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None):
-        """Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume."""
+    def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
+        """Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume with language support."""
         # Stop any existing playback
         self.stop()
         if not text:
             return False
         try:
             # Preprocess text for better synthesis quality
             processed_text = preprocess_text(text)
             if self.debug_mode:
                 print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
                 print(f" > Text length: {len(processed_text)} chars")
+                if language != 'en':
+                    print(f" > Language: {language}")
                 if speed != 1.0:
                     print(f" > Using speed multiplier: {speed}x")
             # For very long text, chunk it at natural boundaries
             text_chunks = chunk_long_text(processed_text, max_chunk_size=300)
             if self.debug_mode and len(text_chunks) > 1:
                 print(f" > Split into {len(text_chunks)} chunks for processing")
             # Set playing state
             self.is_playing = True
             self.is_paused_state = False
             # Call start callback
             if self.on_playback_start:
                 self.on_playback_start()
             # Synthesize and queue audio chunks
             def synthesis_worker():
                 try:
                     for i, chunk in enumerate(text_chunks):
                         if self.stop_flag.is_set():
                             break
                         if self.debug_mode and len(text_chunks) > 1:
                             print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")
-                        # Generate audio for this chunk
-                        chunk_audio = self.tts.tts(chunk, split_sentences=True)
+                        # Generate audio for this chunk with language support
+                        try:
+                            # Check if this is an XTTS model (supports language parameter)
+                            if 'xtts' in self.tts.model_name.lower():
+                                chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
+                                if self.debug_mode and language != 'en':
+                                    print(f" > Using XTTS with language: {language}")
+                            else:
+                                # Monolingual model - ignore language parameter
+                                chunk_audio = self.tts.tts(chunk, split_sentences=True)
+                                if self.debug_mode and language != 'en':
+                                    print(f" > Monolingual model - ignoring language parameter")
+                        except Exception as tts_error:
+                            # Fallback: try without language parameter
+                            if self.debug_mode:
+                                print(f" > TTS with language failed, trying without: {tts_error}")
+                            chunk_audio = self.tts.tts(chunk, split_sentences=True)
                         if chunk_audio and len(chunk_audio) > 0:
                             # Apply speed adjustment
                             if speed != 1.0:
                                 chunk_audio = apply_speed_without_pitch_change(
                                     np.array(chunk_audio), speed
                                 )
                             # Queue the audio for playback
                             self.audio_player.play_audio(np.array(chunk_audio))
                             if self.debug_mode:
                                 print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")
                         # Small delay between chunks to prevent overwhelming the queue
                         time.sleep(0.01)
                 except Exception as e:
                     if self.debug_mode:
                         print(f"Error in synthesis worker: {e}")
                 finally:
                     # Synthesis complete - audio player will handle completion callback
                     pass
             # Start synthesis in background thread
             synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
             synthesis_thread.start()
             return True
         except Exception as e:
             if self.debug_mode:
                 print(f"Error in _speak_with_nonblocking_player: {e}")
             self.is_playing = False
             return False
-    def speak(self, text, speed=1.0, callback=None):
-        """Convert text to speech and play audio.
+    def speak(self, text, speed=1.0, callback=None, language='en'):
+        """Convert text to speech and play audio with language support.
         Implements SOTA best practices for long text synthesis:
         - Text preprocessing and normalization
         - Intelligent chunking for very long text (>500 chars)
         - Sentence segmentation to prevent attention degradation
         - Seamless audio concatenation for chunks
+        - Multilingual support via XTTS models
         Args:
             text: Text to convert to speech
             speed: Speed multiplier (0.5-2.0)
             callback: Function to call when speech is complete
+            language: Language code for XTTS models ('en', 'fr', 'es', 'de', 'it', 'ru')
         Returns:
             True if speech started, False if text was empty
         """
         # Use the new non-blocking audio player for immediate pause/resume
-        return self._speak_with_nonblocking_player(text, speed, callback)
+        return self._speak_with_nonblocking_player(text, speed, callback, language)
         if not text:
             return False
@@ -674,6 +723,9 @@ class TTSEngine:
                         null_out.close()
             def _audio_playback():
+                # Import sounddevice at runtime to avoid loading heavy dependencies
+                sd, _ = _import_audio_deps()
                 try:
                     self.is_playing = True
                     self.start_time = time.time()

abstractvoice/vad/voice_detector.py CHANGED Viewed

@@ -1,8 +1,21 @@
 """Voice activity detection using WebRTC VAD."""
-import webrtcvad
 import logging
+# Lazy import for heavy dependencies
+def _import_webrtcvad():
+    """Import webrtcvad with helpful error message if dependencies missing."""
+    try:
+        import webrtcvad
+        return webrtcvad
+    except ImportError as e:
+        raise ImportError(
+            "Voice activity detection requires optional dependencies. Install with:\n"
+            "  pip install abstractvoice[voice]  # For basic audio\n"
+            "  pip install abstractvoice[all]    # For all features\n"
+            f"Original error: {e}"
+        ) from e
 class VoiceDetector:
     """Detects voice activity in audio streams."""
@@ -23,8 +36,9 @@ class VoiceDetector:
         if sample_rate not in [8000, 16000, 32000, 48000]:
             raise ValueError("Sample rate must be 8000, 16000, 32000, or 48000 Hz")
-        # Initialize WebRTC VAD
+        # Initialize WebRTC VAD using lazy import
         try:
+            webrtcvad = _import_webrtcvad()
             self.vad = webrtcvad.Vad(aggressiveness)
             if self.debug_mode:
                 print(f" > VAD initialized with aggressiveness {aggressiveness}")

abstractvoice 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

abstractvoice-0.1.1-py3-none-any.whl → abstractvoice-0.2.0-py3-none-any.whl