abstractvoice 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__main__.py +20 -10
- abstractvoice/examples/cli_repl.py +198 -13
- abstractvoice/examples/voice_cli.py +20 -6
- abstractvoice/recognition.py +50 -7
- abstractvoice/stt/transcriber.py +17 -2
- abstractvoice/tts/tts_engine.py +84 -32
- abstractvoice/vad/voice_detector.py +16 -2
- abstractvoice/voice_manager.py +558 -16
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/METADATA +228 -50
- abstractvoice-0.2.0.dist-info/RECORD +20 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/licenses/LICENSE +1 -1
- abstractvoice-0.1.1.dist-info/RECORD +0 -20
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/WHEEL +0 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/entry_points.txt +0 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.0.dist-info}/top_level.txt +0 -0
abstractvoice/tts/tts_engine.py
CHANGED
|
@@ -10,16 +10,43 @@ This module implements best practices for TTS synthesis including:
|
|
|
10
10
|
import threading
|
|
11
11
|
import time
|
|
12
12
|
import numpy as np
|
|
13
|
-
import sounddevice as sd
|
|
14
13
|
import os
|
|
15
14
|
import sys
|
|
16
15
|
import logging
|
|
17
16
|
import warnings
|
|
18
17
|
import re
|
|
19
|
-
from TTS.api import TTS
|
|
20
|
-
import librosa
|
|
21
18
|
import queue
|
|
22
19
|
|
|
20
|
+
# Lazy imports for heavy dependencies
|
|
21
|
+
def _import_tts():
|
|
22
|
+
"""Import TTS with helpful error message if dependencies missing."""
|
|
23
|
+
try:
|
|
24
|
+
from TTS.api import TTS
|
|
25
|
+
return TTS
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
raise ImportError(
|
|
28
|
+
"TTS functionality requires optional dependencies. Install with:\n"
|
|
29
|
+
" pip install abstractvoice[tts] # For TTS only\n"
|
|
30
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
31
|
+
f"Original error: {e}"
|
|
32
|
+
) from e
|
|
33
|
+
|
|
34
|
+
def _import_audio_deps():
|
|
35
|
+
"""Import audio dependencies with helpful error message if missing."""
|
|
36
|
+
try:
|
|
37
|
+
import sounddevice as sd
|
|
38
|
+
import librosa
|
|
39
|
+
return sd, librosa
|
|
40
|
+
except ImportError as e:
|
|
41
|
+
if "sounddevice" in str(e) or "librosa" in str(e):
|
|
42
|
+
raise ImportError(
|
|
43
|
+
"Audio functionality requires optional dependencies. Install with:\n"
|
|
44
|
+
" pip install abstractvoice[voice] # For basic audio\n"
|
|
45
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
46
|
+
f"Original error: {e}"
|
|
47
|
+
) from e
|
|
48
|
+
raise
|
|
49
|
+
|
|
23
50
|
# Suppress the PyTorch FutureWarning about torch.load
|
|
24
51
|
warnings.filterwarnings(
|
|
25
52
|
"ignore",
|
|
@@ -103,6 +130,7 @@ def apply_speed_without_pitch_change(audio, speed, sr=22050):
|
|
|
103
130
|
# rate < 1.0 makes audio slower (longer)
|
|
104
131
|
# This matches our speed semantics
|
|
105
132
|
try:
|
|
133
|
+
_, librosa = _import_audio_deps()
|
|
106
134
|
stretched_audio = librosa.effects.time_stretch(audio, rate=speed)
|
|
107
135
|
return stretched_audio
|
|
108
136
|
except Exception as e:
|
|
@@ -189,6 +217,7 @@ class NonBlockingAudioPlayer:
|
|
|
189
217
|
"""Start the audio stream."""
|
|
190
218
|
if self.stream is None:
|
|
191
219
|
try:
|
|
220
|
+
sd, _ = _import_audio_deps()
|
|
192
221
|
self.stream = sd.OutputStream(
|
|
193
222
|
samplerate=self.sample_rate,
|
|
194
223
|
channels=1, # Mono output
|
|
@@ -384,8 +413,9 @@ class TTSEngine:
|
|
|
384
413
|
if self.debug_mode:
|
|
385
414
|
print(f" > Loading TTS model: {model_name}")
|
|
386
415
|
|
|
387
|
-
# Try to initialize TTS
|
|
416
|
+
# Try to initialize TTS using lazy import
|
|
388
417
|
try:
|
|
418
|
+
TTS = _import_tts()
|
|
389
419
|
self.tts = TTS(model_name=model_name, progress_bar=self.debug_mode)
|
|
390
420
|
except Exception as e:
|
|
391
421
|
error_msg = str(e).lower()
|
|
@@ -443,105 +473,124 @@ class TTSEngine:
|
|
|
443
473
|
if self.on_playback_end:
|
|
444
474
|
self.on_playback_end()
|
|
445
475
|
|
|
446
|
-
def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None):
|
|
447
|
-
"""Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume."""
|
|
476
|
+
def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
|
|
477
|
+
"""Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume with language support."""
|
|
448
478
|
# Stop any existing playback
|
|
449
479
|
self.stop()
|
|
450
|
-
|
|
480
|
+
|
|
451
481
|
if not text:
|
|
452
482
|
return False
|
|
453
|
-
|
|
483
|
+
|
|
454
484
|
try:
|
|
455
485
|
# Preprocess text for better synthesis quality
|
|
456
486
|
processed_text = preprocess_text(text)
|
|
457
|
-
|
|
487
|
+
|
|
458
488
|
if self.debug_mode:
|
|
459
489
|
print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
|
|
460
490
|
print(f" > Text length: {len(processed_text)} chars")
|
|
491
|
+
if language != 'en':
|
|
492
|
+
print(f" > Language: {language}")
|
|
461
493
|
if speed != 1.0:
|
|
462
494
|
print(f" > Using speed multiplier: {speed}x")
|
|
463
|
-
|
|
495
|
+
|
|
464
496
|
# For very long text, chunk it at natural boundaries
|
|
465
497
|
text_chunks = chunk_long_text(processed_text, max_chunk_size=300)
|
|
466
|
-
|
|
498
|
+
|
|
467
499
|
if self.debug_mode and len(text_chunks) > 1:
|
|
468
500
|
print(f" > Split into {len(text_chunks)} chunks for processing")
|
|
469
|
-
|
|
501
|
+
|
|
470
502
|
# Set playing state
|
|
471
503
|
self.is_playing = True
|
|
472
504
|
self.is_paused_state = False
|
|
473
|
-
|
|
505
|
+
|
|
474
506
|
# Call start callback
|
|
475
507
|
if self.on_playback_start:
|
|
476
508
|
self.on_playback_start()
|
|
477
|
-
|
|
509
|
+
|
|
478
510
|
# Synthesize and queue audio chunks
|
|
479
511
|
def synthesis_worker():
|
|
480
512
|
try:
|
|
481
513
|
for i, chunk in enumerate(text_chunks):
|
|
482
514
|
if self.stop_flag.is_set():
|
|
483
515
|
break
|
|
484
|
-
|
|
516
|
+
|
|
485
517
|
if self.debug_mode and len(text_chunks) > 1:
|
|
486
518
|
print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")
|
|
487
|
-
|
|
488
|
-
# Generate audio for this chunk
|
|
489
|
-
|
|
490
|
-
|
|
519
|
+
|
|
520
|
+
# Generate audio for this chunk with language support
|
|
521
|
+
try:
|
|
522
|
+
# Check if this is an XTTS model (supports language parameter)
|
|
523
|
+
if 'xtts' in self.tts.model_name.lower():
|
|
524
|
+
chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
|
|
525
|
+
if self.debug_mode and language != 'en':
|
|
526
|
+
print(f" > Using XTTS with language: {language}")
|
|
527
|
+
else:
|
|
528
|
+
# Monolingual model - ignore language parameter
|
|
529
|
+
chunk_audio = self.tts.tts(chunk, split_sentences=True)
|
|
530
|
+
if self.debug_mode and language != 'en':
|
|
531
|
+
print(f" > Monolingual model - ignoring language parameter")
|
|
532
|
+
except Exception as tts_error:
|
|
533
|
+
# Fallback: try without language parameter
|
|
534
|
+
if self.debug_mode:
|
|
535
|
+
print(f" > TTS with language failed, trying without: {tts_error}")
|
|
536
|
+
chunk_audio = self.tts.tts(chunk, split_sentences=True)
|
|
537
|
+
|
|
491
538
|
if chunk_audio and len(chunk_audio) > 0:
|
|
492
539
|
# Apply speed adjustment
|
|
493
540
|
if speed != 1.0:
|
|
494
541
|
chunk_audio = apply_speed_without_pitch_change(
|
|
495
542
|
np.array(chunk_audio), speed
|
|
496
543
|
)
|
|
497
|
-
|
|
544
|
+
|
|
498
545
|
# Queue the audio for playback
|
|
499
546
|
self.audio_player.play_audio(np.array(chunk_audio))
|
|
500
|
-
|
|
547
|
+
|
|
501
548
|
if self.debug_mode:
|
|
502
549
|
print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")
|
|
503
|
-
|
|
550
|
+
|
|
504
551
|
# Small delay between chunks to prevent overwhelming the queue
|
|
505
552
|
time.sleep(0.01)
|
|
506
|
-
|
|
553
|
+
|
|
507
554
|
except Exception as e:
|
|
508
555
|
if self.debug_mode:
|
|
509
556
|
print(f"Error in synthesis worker: {e}")
|
|
510
557
|
finally:
|
|
511
558
|
# Synthesis complete - audio player will handle completion callback
|
|
512
559
|
pass
|
|
513
|
-
|
|
560
|
+
|
|
514
561
|
# Start synthesis in background thread
|
|
515
562
|
synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
|
|
516
563
|
synthesis_thread.start()
|
|
517
|
-
|
|
564
|
+
|
|
518
565
|
return True
|
|
519
|
-
|
|
566
|
+
|
|
520
567
|
except Exception as e:
|
|
521
568
|
if self.debug_mode:
|
|
522
569
|
print(f"Error in _speak_with_nonblocking_player: {e}")
|
|
523
570
|
self.is_playing = False
|
|
524
571
|
return False
|
|
525
572
|
|
|
526
|
-
def speak(self, text, speed=1.0, callback=None):
|
|
527
|
-
"""Convert text to speech and play audio.
|
|
528
|
-
|
|
573
|
+
def speak(self, text, speed=1.0, callback=None, language='en'):
|
|
574
|
+
"""Convert text to speech and play audio with language support.
|
|
575
|
+
|
|
529
576
|
Implements SOTA best practices for long text synthesis:
|
|
530
577
|
- Text preprocessing and normalization
|
|
531
578
|
- Intelligent chunking for very long text (>500 chars)
|
|
532
579
|
- Sentence segmentation to prevent attention degradation
|
|
533
580
|
- Seamless audio concatenation for chunks
|
|
534
|
-
|
|
581
|
+
- Multilingual support via XTTS models
|
|
582
|
+
|
|
535
583
|
Args:
|
|
536
584
|
text: Text to convert to speech
|
|
537
585
|
speed: Speed multiplier (0.5-2.0)
|
|
538
586
|
callback: Function to call when speech is complete
|
|
539
|
-
|
|
587
|
+
language: Language code for XTTS models ('en', 'fr', 'es', 'de', 'it', 'ru')
|
|
588
|
+
|
|
540
589
|
Returns:
|
|
541
590
|
True if speech started, False if text was empty
|
|
542
591
|
"""
|
|
543
592
|
# Use the new non-blocking audio player for immediate pause/resume
|
|
544
|
-
return self._speak_with_nonblocking_player(text, speed, callback)
|
|
593
|
+
return self._speak_with_nonblocking_player(text, speed, callback, language)
|
|
545
594
|
|
|
546
595
|
if not text:
|
|
547
596
|
return False
|
|
@@ -674,6 +723,9 @@ class TTSEngine:
|
|
|
674
723
|
null_out.close()
|
|
675
724
|
|
|
676
725
|
def _audio_playback():
|
|
726
|
+
# Import sounddevice at runtime to avoid loading heavy dependencies
|
|
727
|
+
sd, _ = _import_audio_deps()
|
|
728
|
+
|
|
677
729
|
try:
|
|
678
730
|
self.is_playing = True
|
|
679
731
|
self.start_time = time.time()
|
|
@@ -1,8 +1,21 @@
|
|
|
1
1
|
"""Voice activity detection using WebRTC VAD."""
|
|
2
2
|
|
|
3
|
-
import webrtcvad
|
|
4
3
|
import logging
|
|
5
4
|
|
|
5
|
+
# Lazy import for heavy dependencies
|
|
6
|
+
def _import_webrtcvad():
|
|
7
|
+
"""Import webrtcvad with helpful error message if dependencies missing."""
|
|
8
|
+
try:
|
|
9
|
+
import webrtcvad
|
|
10
|
+
return webrtcvad
|
|
11
|
+
except ImportError as e:
|
|
12
|
+
raise ImportError(
|
|
13
|
+
"Voice activity detection requires optional dependencies. Install with:\n"
|
|
14
|
+
" pip install abstractvoice[voice] # For basic audio\n"
|
|
15
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
16
|
+
f"Original error: {e}"
|
|
17
|
+
) from e
|
|
18
|
+
|
|
6
19
|
|
|
7
20
|
class VoiceDetector:
|
|
8
21
|
"""Detects voice activity in audio streams."""
|
|
@@ -23,8 +36,9 @@ class VoiceDetector:
|
|
|
23
36
|
if sample_rate not in [8000, 16000, 32000, 48000]:
|
|
24
37
|
raise ValueError("Sample rate must be 8000, 16000, 32000, or 48000 Hz")
|
|
25
38
|
|
|
26
|
-
# Initialize WebRTC VAD
|
|
39
|
+
# Initialize WebRTC VAD using lazy import
|
|
27
40
|
try:
|
|
41
|
+
webrtcvad = _import_webrtcvad()
|
|
28
42
|
self.vad = webrtcvad.Vad(aggressiveness)
|
|
29
43
|
if self.debug_mode:
|
|
30
44
|
print(f" > VAD initialized with aggressiveness {aggressiveness}")
|