abstractvoice-0.1.1-py3-none-any.whl → abstractvoice-0.2.0-py3-none-any.whl

@@ -10,16 +10,43 @@ This module implements best practices for TTS synthesis including:
  import threading
  import time
  import numpy as np
- import sounddevice as sd
  import os
  import sys
  import logging
  import warnings
  import re
- from TTS.api import TTS
- import librosa
  import queue

+ # Lazy imports for heavy dependencies
+ def _import_tts():
+     """Import TTS with helpful error message if dependencies missing."""
+     try:
+         from TTS.api import TTS
+         return TTS
+     except ImportError as e:
+         raise ImportError(
+             "TTS functionality requires optional dependencies. Install with:\n"
+             "  pip install abstractvoice[tts]    # For TTS only\n"
+             "  pip install abstractvoice[all]    # For all features\n"
+             f"Original error: {e}"
+         ) from e
+
+ def _import_audio_deps():
+     """Import audio dependencies with helpful error message if missing."""
+     try:
+         import sounddevice as sd
+         import librosa
+         return sd, librosa
+     except ImportError as e:
+         if "sounddevice" in str(e) or "librosa" in str(e):
+             raise ImportError(
+                 "Audio functionality requires optional dependencies. Install with:\n"
+                 "  pip install abstractvoice[voice]  # For basic audio\n"
+                 "  pip install abstractvoice[all]    # For all features\n"
+                 f"Original error: {e}"
+             ) from e
+         raise
+
  # Suppress the PyTorch FutureWarning about torch.load
  warnings.filterwarnings(
      "ignore",
@@ -103,6 +130,7 @@ def apply_speed_without_pitch_change(audio, speed, sr=22050):
      # rate < 1.0 makes audio slower (longer)
      # This matches our speed semantics
      try:
+         _, librosa = _import_audio_deps()
          stretched_audio = librosa.effects.time_stretch(audio, rate=speed)
          return stretched_audio
      except Exception as e:
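For reference, the rate semantics this hunk relies on: librosa.effects.time_stretch keeps pitch constant, shortening the audio for rate > 1.0 and lengthening it for rate < 1.0. A standalone sketch with synthetic input (not code from the package):

    import numpy as np
    import librosa

    sr = 22050
    audio = np.random.randn(sr).astype(np.float32)            # ~1 second of test signal

    faster = librosa.effects.time_stretch(audio, rate=1.5)    # speed up -> shorter
    slower = librosa.effects.time_stretch(audio, rate=0.75)   # slow down -> longer

    print(f"original:  {len(audio) / sr:.2f}s")   # ~1.00s
    print(f"rate=1.5:  {len(faster) / sr:.2f}s")  # ~0.67s
    print(f"rate=0.75: {len(slower) / sr:.2f}s")  # ~1.33s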
@@ -189,6 +217,7 @@ class NonBlockingAudioPlayer:
          """Start the audio stream."""
          if self.stream is None:
              try:
+                 sd, _ = _import_audio_deps()
                  self.stream = sd.OutputStream(
                      samplerate=self.sample_rate,
                      channels=1,  # Mono output
@@ -384,8 +413,9 @@ class TTSEngine:
          if self.debug_mode:
              print(f" > Loading TTS model: {model_name}")

-         # Try to initialize TTS
+         # Try to initialize TTS using lazy import
          try:
+             TTS = _import_tts()
              self.tts = TTS(model_name=model_name, progress_bar=self.debug_mode)
          except Exception as e:
              error_msg = str(e).lower()
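For orientation, the Coqui TTS call pattern that _import_tts() defers, shown standalone; the model name here is an example chosen for illustration, not taken from this diff:

    from TTS.api import TTS

    # Load a model and synthesize; tts() returns a list of float samples
    # at the model's native sample rate.
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
    wav = tts.tts("Hello from a lazily imported TTS engine.")
    print(len(wav))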
@@ -443,105 +473,124 @@ class TTSEngine:
          if self.on_playback_end:
              self.on_playback_end()

-     def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None):
-         """Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume."""
+     def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
+         """Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume with language support."""
          # Stop any existing playback
          self.stop()
-
+
          if not text:
              return False
-
+
          try:
              # Preprocess text for better synthesis quality
              processed_text = preprocess_text(text)
-
+
              if self.debug_mode:
                  print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
                  print(f" > Text length: {len(processed_text)} chars")
+                 if language != 'en':
+                     print(f" > Language: {language}")
                  if speed != 1.0:
                      print(f" > Using speed multiplier: {speed}x")
-
+
              # For very long text, chunk it at natural boundaries
              text_chunks = chunk_long_text(processed_text, max_chunk_size=300)
-
+
              if self.debug_mode and len(text_chunks) > 1:
                  print(f" > Split into {len(text_chunks)} chunks for processing")
-
+
              # Set playing state
              self.is_playing = True
              self.is_paused_state = False
-
+
              # Call start callback
              if self.on_playback_start:
                  self.on_playback_start()
-
+
              # Synthesize and queue audio chunks
              def synthesis_worker():
                  try:
                      for i, chunk in enumerate(text_chunks):
                          if self.stop_flag.is_set():
                              break
-
+
                          if self.debug_mode and len(text_chunks) > 1:
                              print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")
-
-                         # Generate audio for this chunk
-                         chunk_audio = self.tts.tts(chunk, split_sentences=True)
-
+
+                         # Generate audio for this chunk with language support
+                         try:
+                             # Check if this is an XTTS model (supports language parameter)
+                             if 'xtts' in self.tts.model_name.lower():
+                                 chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
+                                 if self.debug_mode and language != 'en':
+                                     print(f" > Using XTTS with language: {language}")
+                             else:
+                                 # Monolingual model - ignore language parameter
+                                 chunk_audio = self.tts.tts(chunk, split_sentences=True)
+                                 if self.debug_mode and language != 'en':
+                                     print(f" > Monolingual model - ignoring language parameter")
+                         except Exception as tts_error:
+                             # Fallback: try without language parameter
+                             if self.debug_mode:
+                                 print(f" > TTS with language failed, trying without: {tts_error}")
+                             chunk_audio = self.tts.tts(chunk, split_sentences=True)
+
                          if chunk_audio and len(chunk_audio) > 0:
                              # Apply speed adjustment
                              if speed != 1.0:
                                  chunk_audio = apply_speed_without_pitch_change(
                                      np.array(chunk_audio), speed
                                  )
-
+
                              # Queue the audio for playback
                              self.audio_player.play_audio(np.array(chunk_audio))
-
+
                              if self.debug_mode:
                                  print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")
-
+
                          # Small delay between chunks to prevent overwhelming the queue
                          time.sleep(0.01)
-
+
                  except Exception as e:
                      if self.debug_mode:
                          print(f"Error in synthesis worker: {e}")
                  finally:
                      # Synthesis complete - audio player will handle completion callback
                      pass
-
+
              # Start synthesis in background thread
              synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
              synthesis_thread.start()
-
+
              return True
-
+
          except Exception as e:
              if self.debug_mode:
                  print(f"Error in _speak_with_nonblocking_player: {e}")
              self.is_playing = False
              return False

-     def speak(self, text, speed=1.0, callback=None):
-         """Convert text to speech and play audio.
-
+     def speak(self, text, speed=1.0, callback=None, language='en'):
+         """Convert text to speech and play audio with language support.
+
          Implements SOTA best practices for long text synthesis:
          - Text preprocessing and normalization
          - Intelligent chunking for very long text (>500 chars)
          - Sentence segmentation to prevent attention degradation
          - Seamless audio concatenation for chunks
-
+         - Multilingual support via XTTS models
+
          Args:
              text: Text to convert to speech
              speed: Speed multiplier (0.5-2.0)
              callback: Function to call when speech is complete
-
+             language: Language code for XTTS models ('en', 'fr', 'es', 'de', 'it', 'ru')
+
          Returns:
              True if speech started, False if text was empty
          """
          # Use the new non-blocking audio player for immediate pause/resume
-         return self._speak_with_nonblocking_player(text, speed, callback)
+         return self._speak_with_nonblocking_player(text, speed, callback, language)

          if not text:
              return False
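A hypothetical caller-side sketch of the extended speak() signature above. The import path, constructor arguments, and XTTS model name are assumptions; only the text, speed, callback, and language parameters come from this diff.

    from abstractvoice import TTSEngine  # assumed import path

    # Assumed constructor; the diff only shows that a model_name reaches TTS(...).
    engine = TTSEngine(model_name="tts_models/multilingual/multi-dataset/xtts_v2")

    def on_done():
        print("playback finished")

    # XTTS models honour the language code; monolingual models ignore it.
    engine.speak("Bonjour tout le monde.", speed=1.0, callback=on_done, language='fr')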
@@ -674,6 +723,9 @@ class TTSEngine:
              null_out.close()

          def _audio_playback():
+             # Import sounddevice at runtime to avoid loading heavy dependencies
+             sd, _ = _import_audio_deps()
+
              try:
                  self.is_playing = True
                  self.start_time = time.time()
@@ -1,8 +1,21 @@
  """Voice activity detection using WebRTC VAD."""

- import webrtcvad
  import logging

+ # Lazy import for heavy dependencies
+ def _import_webrtcvad():
+     """Import webrtcvad with helpful error message if dependencies missing."""
+     try:
+         import webrtcvad
+         return webrtcvad
+     except ImportError as e:
+         raise ImportError(
+             "Voice activity detection requires optional dependencies. Install with:\n"
+             "  pip install abstractvoice[voice]  # For basic audio\n"
+             "  pip install abstractvoice[all]    # For all features\n"
+             f"Original error: {e}"
+         ) from e
+

  class VoiceDetector:
      """Detects voice activity in audio streams."""
@@ -23,8 +36,9 @@ class VoiceDetector:
          if sample_rate not in [8000, 16000, 32000, 48000]:
              raise ValueError("Sample rate must be 8000, 16000, 32000, or 48000 Hz")

-         # Initialize WebRTC VAD
+         # Initialize WebRTC VAD using lazy import
          try:
+             webrtcvad = _import_webrtcvad()
              self.vad = webrtcvad.Vad(aggressiveness)
              if self.debug_mode:
                  print(f" > VAD initialized with aggressiveness {aggressiveness}")
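 VAD">
Background for the constraints enforced above, as a standalone webrtcvad sketch independent of VoiceDetector (whose full constructor is not shown in this hunk): frames must be 10, 20, or 30 ms of 16-bit mono PCM at 8, 16, 32, or 48 kHz.

    import webrtcvad

    vad = webrtcvad.Vad(2)      # aggressiveness 0 (lenient) .. 3 (strict)

    sample_rate = 16000         # must be 8000, 16000, 32000, or 48000 Hz
    frame_ms = 30               # webrtcvad accepts 10, 20, or 30 ms frames
    frame_bytes = int(sample_rate * frame_ms / 1000) * 2  # 16-bit mono PCM

    silence = b"\x00" * frame_bytes
    print(vad.is_speech(silence, sample_rate))  # an all-zero frame is not speech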