abstractvoice 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__main__.py +33 -11
- abstractvoice/dependency_check.py +274 -0
- abstractvoice/examples/cli_repl.py +198 -13
- abstractvoice/examples/voice_cli.py +20 -6
- abstractvoice/recognition.py +50 -7
- abstractvoice/stt/transcriber.py +17 -2
- abstractvoice/tts/tts_engine.py +138 -32
- abstractvoice/vad/voice_detector.py +16 -2
- abstractvoice/voice_manager.py +558 -16
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/METADATA +196 -50
- abstractvoice-0.2.1.dist-info/RECORD +21 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/licenses/LICENSE +1 -1
- abstractvoice-0.1.1.dist-info/RECORD +0 -20
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/WHEEL +0 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/entry_points.txt +0 -0
- {abstractvoice-0.1.1.dist-info → abstractvoice-0.2.1.dist-info}/top_level.txt +0 -0
abstractvoice/recognition.py
CHANGED
|
@@ -2,9 +2,50 @@
|
|
|
2
2
|
|
|
3
3
|
import threading
|
|
4
4
|
import time
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
|
|
6
|
+
# Lazy imports for heavy dependencies
|
|
7
|
+
def _import_audio_deps():
|
|
8
|
+
"""Import audio dependencies with helpful error message if missing."""
|
|
9
|
+
try:
|
|
10
|
+
import pyaudio
|
|
11
|
+
return pyaudio
|
|
12
|
+
except ImportError as e:
|
|
13
|
+
raise ImportError(
|
|
14
|
+
"Audio functionality requires optional dependencies. Install with:\n"
|
|
15
|
+
" pip install abstractvoice[voice] # For basic audio\n"
|
|
16
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
17
|
+
f"Original error: {e}"
|
|
18
|
+
) from e
|
|
19
|
+
|
|
20
|
+
def _import_vad():
|
|
21
|
+
"""Import VoiceDetector with helpful error message if dependencies missing."""
|
|
22
|
+
try:
|
|
23
|
+
from .vad import VoiceDetector
|
|
24
|
+
return VoiceDetector
|
|
25
|
+
except ImportError as e:
|
|
26
|
+
if "webrtcvad" in str(e):
|
|
27
|
+
raise ImportError(
|
|
28
|
+
"Voice activity detection requires optional dependencies. Install with:\n"
|
|
29
|
+
" pip install abstractvoice[voice] # For basic audio\n"
|
|
30
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
31
|
+
f"Original error: {e}"
|
|
32
|
+
) from e
|
|
33
|
+
raise
|
|
34
|
+
|
|
35
|
+
def _import_transcriber():
|
|
36
|
+
"""Import Transcriber with helpful error message if dependencies missing."""
|
|
37
|
+
try:
|
|
38
|
+
from .stt import Transcriber
|
|
39
|
+
return Transcriber
|
|
40
|
+
except ImportError as e:
|
|
41
|
+
if "whisper" in str(e) or "tiktoken" in str(e):
|
|
42
|
+
raise ImportError(
|
|
43
|
+
"Speech recognition functionality requires optional dependencies. Install with:\n"
|
|
44
|
+
" pip install abstractvoice[stt] # For speech recognition only\n"
|
|
45
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
46
|
+
f"Original error: {e}"
|
|
47
|
+
) from e
|
|
48
|
+
raise
|
|
8
49
|
|
|
9
50
|
|
|
10
51
|
class VoiceRecognizer:
|
|
@@ -40,13 +81,15 @@ class VoiceRecognizer:
|
|
|
40
81
|
self.min_speech_chunks = int(min_speech_duration / chunk_duration)
|
|
41
82
|
self.silence_timeout_chunks = int(silence_timeout / chunk_duration)
|
|
42
83
|
|
|
43
|
-
# Initialize components
|
|
84
|
+
# Initialize components using lazy imports
|
|
85
|
+
VoiceDetector = _import_vad()
|
|
44
86
|
self.voice_detector = VoiceDetector(
|
|
45
87
|
aggressiveness=vad_aggressiveness,
|
|
46
88
|
sample_rate=sample_rate,
|
|
47
89
|
debug_mode=debug_mode
|
|
48
90
|
)
|
|
49
|
-
|
|
91
|
+
|
|
92
|
+
Transcriber = _import_transcriber()
|
|
50
93
|
self.transcriber = Transcriber(
|
|
51
94
|
model_name=whisper_model,
|
|
52
95
|
min_transcription_length=min_transcription_length,
|
|
@@ -109,8 +152,8 @@ class VoiceRecognizer:
|
|
|
109
152
|
|
|
110
153
|
def _recognition_loop(self):
|
|
111
154
|
"""Main recognition loop."""
|
|
112
|
-
|
|
113
|
-
|
|
155
|
+
pyaudio = _import_audio_deps()
|
|
156
|
+
|
|
114
157
|
self.pyaudio = pyaudio.PyAudio()
|
|
115
158
|
self.stream = self.pyaudio.open(
|
|
116
159
|
format=pyaudio.paInt16,
|
abstractvoice/stt/transcriber.py
CHANGED
|
@@ -1,11 +1,24 @@
|
|
|
1
1
|
"""Speech-to-text transcription using OpenAI's Whisper."""
|
|
2
2
|
|
|
3
|
-
import whisper
|
|
4
3
|
import numpy as np
|
|
5
4
|
import os
|
|
6
5
|
import sys
|
|
7
6
|
import logging
|
|
8
7
|
|
|
8
|
+
# Lazy import for heavy dependencies
|
|
9
|
+
def _import_whisper():
|
|
10
|
+
"""Import whisper with helpful error message if dependencies missing."""
|
|
11
|
+
try:
|
|
12
|
+
import whisper
|
|
13
|
+
return whisper
|
|
14
|
+
except ImportError as e:
|
|
15
|
+
raise ImportError(
|
|
16
|
+
"Speech recognition functionality requires optional dependencies. Install with:\n"
|
|
17
|
+
" pip install abstractvoice[stt] # For speech recognition only\n"
|
|
18
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
19
|
+
f"Original error: {e}"
|
|
20
|
+
) from e
|
|
21
|
+
|
|
9
22
|
|
|
10
23
|
class Transcriber:
|
|
11
24
|
"""Transcribes audio using OpenAI's Whisper model."""
|
|
@@ -38,7 +51,8 @@ class Transcriber:
|
|
|
38
51
|
null_out = open(os.devnull, 'w')
|
|
39
52
|
sys.stdout = null_out
|
|
40
53
|
|
|
41
|
-
# Load the Whisper model
|
|
54
|
+
# Load the Whisper model using lazy import
|
|
55
|
+
whisper = _import_whisper()
|
|
42
56
|
self.model = whisper.load_model(model_name)
|
|
43
57
|
finally:
|
|
44
58
|
# Restore stdout if we redirected it
|
|
@@ -120,6 +134,7 @@ class Transcriber:
|
|
|
120
134
|
sys.stdout = null_out
|
|
121
135
|
|
|
122
136
|
try:
|
|
137
|
+
whisper = _import_whisper()
|
|
123
138
|
self.model = whisper.load_model(model_name)
|
|
124
139
|
self.model_name = model_name
|
|
125
140
|
finally:
|
abstractvoice/tts/tts_engine.py
CHANGED
|
@@ -10,16 +10,97 @@ This module implements best practices for TTS synthesis including:
|
|
|
10
10
|
import threading
|
|
11
11
|
import time
|
|
12
12
|
import numpy as np
|
|
13
|
-
import sounddevice as sd
|
|
14
13
|
import os
|
|
15
14
|
import sys
|
|
16
15
|
import logging
|
|
17
16
|
import warnings
|
|
18
17
|
import re
|
|
19
|
-
from TTS.api import TTS
|
|
20
|
-
import librosa
|
|
21
18
|
import queue
|
|
22
19
|
|
|
20
|
+
# Lazy imports for heavy dependencies
|
|
21
|
+
def _import_tts():
|
|
22
|
+
"""Import TTS with helpful error message if dependencies missing."""
|
|
23
|
+
try:
|
|
24
|
+
from TTS.api import TTS
|
|
25
|
+
return TTS
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
error_msg = str(e).lower()
|
|
28
|
+
|
|
29
|
+
# Check for specific PyTorch/TorchVision conflicts
|
|
30
|
+
if "torchvision::nms does not exist" in error_msg or "gpt2pretrainedmodel" in error_msg:
|
|
31
|
+
raise ImportError(
|
|
32
|
+
"❌ PyTorch/TorchVision version conflict detected!\n\n"
|
|
33
|
+
"This is a known compatibility issue. To fix:\n\n"
|
|
34
|
+
"1. Uninstall conflicting packages:\n"
|
|
35
|
+
" pip uninstall torch torchvision torchaudio transformers\n\n"
|
|
36
|
+
"2. Reinstall with compatible versions:\n"
|
|
37
|
+
" pip install abstractvoice[all] # Installs tested compatible versions\n\n"
|
|
38
|
+
"3. Or use specific PyTorch version:\n"
|
|
39
|
+
" pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1\n"
|
|
40
|
+
" pip install abstractvoice[voice-full]\n\n"
|
|
41
|
+
"For conda environments, consider:\n"
|
|
42
|
+
" conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n\n"
|
|
43
|
+
f"Original error: {e}"
|
|
44
|
+
) from e
|
|
45
|
+
elif "no module named 'tts'" in error_msg or "coqui" in error_msg:
|
|
46
|
+
raise ImportError(
|
|
47
|
+
"TTS functionality requires coqui-tts. Install with:\n"
|
|
48
|
+
" pip install abstractvoice[tts] # For TTS only\n"
|
|
49
|
+
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
50
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
51
|
+
f"Original error: {e}"
|
|
52
|
+
) from e
|
|
53
|
+
else:
|
|
54
|
+
# Generic import error
|
|
55
|
+
raise ImportError(
|
|
56
|
+
"TTS functionality requires optional dependencies. Install with:\n"
|
|
57
|
+
" pip install abstractvoice[tts] # For TTS only\n"
|
|
58
|
+
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
59
|
+
" pip install abstractvoice[all] # For all features\n\n"
|
|
60
|
+
"If you're getting PyTorch-related errors, try:\n"
|
|
61
|
+
" pip install abstractvoice[core-tts] # Lightweight TTS without extras\n\n"
|
|
62
|
+
f"Original error: {e}"
|
|
63
|
+
) from e
|
|
64
|
+
|
|
65
|
+
def _import_audio_deps():
|
|
66
|
+
"""Import audio dependencies with helpful error message if missing."""
|
|
67
|
+
try:
|
|
68
|
+
import sounddevice as sd
|
|
69
|
+
import librosa
|
|
70
|
+
return sd, librosa
|
|
71
|
+
except ImportError as e:
|
|
72
|
+
error_msg = str(e).lower()
|
|
73
|
+
|
|
74
|
+
if "sounddevice" in error_msg:
|
|
75
|
+
raise ImportError(
|
|
76
|
+
"Audio playback requires sounddevice. Install with:\n"
|
|
77
|
+
" pip install abstractvoice[audio-only] # For audio processing only\n"
|
|
78
|
+
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
79
|
+
" pip install abstractvoice[all] # For all features\n\n"
|
|
80
|
+
"On some systems, you may need system audio libraries:\n"
|
|
81
|
+
" Ubuntu/Debian: sudo apt-get install portaudio19-dev\n"
|
|
82
|
+
" macOS: brew install portaudio\n"
|
|
83
|
+
" Windows: Usually works out of the box\n\n"
|
|
84
|
+
f"Original error: {e}"
|
|
85
|
+
) from e
|
|
86
|
+
elif "librosa" in error_msg:
|
|
87
|
+
raise ImportError(
|
|
88
|
+
"Audio processing requires librosa. Install with:\n"
|
|
89
|
+
" pip install abstractvoice[tts] # For TTS functionality\n"
|
|
90
|
+
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
91
|
+
" pip install abstractvoice[all] # For all features\n\n"
|
|
92
|
+
f"Original error: {e}"
|
|
93
|
+
) from e
|
|
94
|
+
else:
|
|
95
|
+
# Generic audio import error
|
|
96
|
+
raise ImportError(
|
|
97
|
+
"Audio functionality requires optional dependencies. Install with:\n"
|
|
98
|
+
" pip install abstractvoice[audio-only] # For audio processing only\n"
|
|
99
|
+
" pip install abstractvoice[voice-full] # For complete voice functionality\n"
|
|
100
|
+
" pip install abstractvoice[all] # For all features\n\n"
|
|
101
|
+
f"Original error: {e}"
|
|
102
|
+
) from e
|
|
103
|
+
|
|
23
104
|
# Suppress the PyTorch FutureWarning about torch.load
|
|
24
105
|
warnings.filterwarnings(
|
|
25
106
|
"ignore",
|
|
@@ -103,6 +184,7 @@ def apply_speed_without_pitch_change(audio, speed, sr=22050):
|
|
|
103
184
|
# rate < 1.0 makes audio slower (longer)
|
|
104
185
|
# This matches our speed semantics
|
|
105
186
|
try:
|
|
187
|
+
_, librosa = _import_audio_deps()
|
|
106
188
|
stretched_audio = librosa.effects.time_stretch(audio, rate=speed)
|
|
107
189
|
return stretched_audio
|
|
108
190
|
except Exception as e:
|
|
@@ -189,6 +271,7 @@ class NonBlockingAudioPlayer:
|
|
|
189
271
|
"""Start the audio stream."""
|
|
190
272
|
if self.stream is None:
|
|
191
273
|
try:
|
|
274
|
+
sd, _ = _import_audio_deps()
|
|
192
275
|
self.stream = sd.OutputStream(
|
|
193
276
|
samplerate=self.sample_rate,
|
|
194
277
|
channels=1, # Mono output
|
|
@@ -384,8 +467,9 @@ class TTSEngine:
|
|
|
384
467
|
if self.debug_mode:
|
|
385
468
|
print(f" > Loading TTS model: {model_name}")
|
|
386
469
|
|
|
387
|
-
# Try to initialize TTS
|
|
470
|
+
# Try to initialize TTS using lazy import
|
|
388
471
|
try:
|
|
472
|
+
TTS = _import_tts()
|
|
389
473
|
self.tts = TTS(model_name=model_name, progress_bar=self.debug_mode)
|
|
390
474
|
except Exception as e:
|
|
391
475
|
error_msg = str(e).lower()
|
|
@@ -443,105 +527,124 @@ class TTSEngine:
|
|
|
443
527
|
if self.on_playback_end:
|
|
444
528
|
self.on_playback_end()
|
|
445
529
|
|
|
446
|
-
def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None):
|
|
447
|
-
"""Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume."""
|
|
530
|
+
def _speak_with_nonblocking_player(self, text, speed=1.0, callback=None, language='en'):
|
|
531
|
+
"""Alternative speak method using NonBlockingAudioPlayer for immediate pause/resume with language support."""
|
|
448
532
|
# Stop any existing playback
|
|
449
533
|
self.stop()
|
|
450
|
-
|
|
534
|
+
|
|
451
535
|
if not text:
|
|
452
536
|
return False
|
|
453
|
-
|
|
537
|
+
|
|
454
538
|
try:
|
|
455
539
|
# Preprocess text for better synthesis quality
|
|
456
540
|
processed_text = preprocess_text(text)
|
|
457
|
-
|
|
541
|
+
|
|
458
542
|
if self.debug_mode:
|
|
459
543
|
print(f" > Speaking (non-blocking): '{processed_text[:100]}{'...' if len(processed_text) > 100 else ''}'")
|
|
460
544
|
print(f" > Text length: {len(processed_text)} chars")
|
|
545
|
+
if language != 'en':
|
|
546
|
+
print(f" > Language: {language}")
|
|
461
547
|
if speed != 1.0:
|
|
462
548
|
print(f" > Using speed multiplier: {speed}x")
|
|
463
|
-
|
|
549
|
+
|
|
464
550
|
# For very long text, chunk it at natural boundaries
|
|
465
551
|
text_chunks = chunk_long_text(processed_text, max_chunk_size=300)
|
|
466
|
-
|
|
552
|
+
|
|
467
553
|
if self.debug_mode and len(text_chunks) > 1:
|
|
468
554
|
print(f" > Split into {len(text_chunks)} chunks for processing")
|
|
469
|
-
|
|
555
|
+
|
|
470
556
|
# Set playing state
|
|
471
557
|
self.is_playing = True
|
|
472
558
|
self.is_paused_state = False
|
|
473
|
-
|
|
559
|
+
|
|
474
560
|
# Call start callback
|
|
475
561
|
if self.on_playback_start:
|
|
476
562
|
self.on_playback_start()
|
|
477
|
-
|
|
563
|
+
|
|
478
564
|
# Synthesize and queue audio chunks
|
|
479
565
|
def synthesis_worker():
|
|
480
566
|
try:
|
|
481
567
|
for i, chunk in enumerate(text_chunks):
|
|
482
568
|
if self.stop_flag.is_set():
|
|
483
569
|
break
|
|
484
|
-
|
|
570
|
+
|
|
485
571
|
if self.debug_mode and len(text_chunks) > 1:
|
|
486
572
|
print(f" > Processing chunk {i+1}/{len(text_chunks)} ({len(chunk)} chars)...")
|
|
487
|
-
|
|
488
|
-
# Generate audio for this chunk
|
|
489
|
-
|
|
490
|
-
|
|
573
|
+
|
|
574
|
+
# Generate audio for this chunk with language support
|
|
575
|
+
try:
|
|
576
|
+
# Check if this is an XTTS model (supports language parameter)
|
|
577
|
+
if 'xtts' in self.tts.model_name.lower():
|
|
578
|
+
chunk_audio = self.tts.tts(chunk, language=language, split_sentences=True)
|
|
579
|
+
if self.debug_mode and language != 'en':
|
|
580
|
+
print(f" > Using XTTS with language: {language}")
|
|
581
|
+
else:
|
|
582
|
+
# Monolingual model - ignore language parameter
|
|
583
|
+
chunk_audio = self.tts.tts(chunk, split_sentences=True)
|
|
584
|
+
if self.debug_mode and language != 'en':
|
|
585
|
+
print(f" > Monolingual model - ignoring language parameter")
|
|
586
|
+
except Exception as tts_error:
|
|
587
|
+
# Fallback: try without language parameter
|
|
588
|
+
if self.debug_mode:
|
|
589
|
+
print(f" > TTS with language failed, trying without: {tts_error}")
|
|
590
|
+
chunk_audio = self.tts.tts(chunk, split_sentences=True)
|
|
591
|
+
|
|
491
592
|
if chunk_audio and len(chunk_audio) > 0:
|
|
492
593
|
# Apply speed adjustment
|
|
493
594
|
if speed != 1.0:
|
|
494
595
|
chunk_audio = apply_speed_without_pitch_change(
|
|
495
596
|
np.array(chunk_audio), speed
|
|
496
597
|
)
|
|
497
|
-
|
|
598
|
+
|
|
498
599
|
# Queue the audio for playback
|
|
499
600
|
self.audio_player.play_audio(np.array(chunk_audio))
|
|
500
|
-
|
|
601
|
+
|
|
501
602
|
if self.debug_mode:
|
|
502
603
|
print(f" > Chunk {i+1} queued ({len(chunk_audio)} samples)")
|
|
503
|
-
|
|
604
|
+
|
|
504
605
|
# Small delay between chunks to prevent overwhelming the queue
|
|
505
606
|
time.sleep(0.01)
|
|
506
|
-
|
|
607
|
+
|
|
507
608
|
except Exception as e:
|
|
508
609
|
if self.debug_mode:
|
|
509
610
|
print(f"Error in synthesis worker: {e}")
|
|
510
611
|
finally:
|
|
511
612
|
# Synthesis complete - audio player will handle completion callback
|
|
512
613
|
pass
|
|
513
|
-
|
|
614
|
+
|
|
514
615
|
# Start synthesis in background thread
|
|
515
616
|
synthesis_thread = threading.Thread(target=synthesis_worker, daemon=True)
|
|
516
617
|
synthesis_thread.start()
|
|
517
|
-
|
|
618
|
+
|
|
518
619
|
return True
|
|
519
|
-
|
|
620
|
+
|
|
520
621
|
except Exception as e:
|
|
521
622
|
if self.debug_mode:
|
|
522
623
|
print(f"Error in _speak_with_nonblocking_player: {e}")
|
|
523
624
|
self.is_playing = False
|
|
524
625
|
return False
|
|
525
626
|
|
|
526
|
-
def speak(self, text, speed=1.0, callback=None):
|
|
527
|
-
"""Convert text to speech and play audio.
|
|
528
|
-
|
|
627
|
+
def speak(self, text, speed=1.0, callback=None, language='en'):
|
|
628
|
+
"""Convert text to speech and play audio with language support.
|
|
629
|
+
|
|
529
630
|
Implements SOTA best practices for long text synthesis:
|
|
530
631
|
- Text preprocessing and normalization
|
|
531
632
|
- Intelligent chunking for very long text (>500 chars)
|
|
532
633
|
- Sentence segmentation to prevent attention degradation
|
|
533
634
|
- Seamless audio concatenation for chunks
|
|
534
|
-
|
|
635
|
+
- Multilingual support via XTTS models
|
|
636
|
+
|
|
535
637
|
Args:
|
|
536
638
|
text: Text to convert to speech
|
|
537
639
|
speed: Speed multiplier (0.5-2.0)
|
|
538
640
|
callback: Function to call when speech is complete
|
|
539
|
-
|
|
641
|
+
language: Language code for XTTS models ('en', 'fr', 'es', 'de', 'it', 'ru')
|
|
642
|
+
|
|
540
643
|
Returns:
|
|
541
644
|
True if speech started, False if text was empty
|
|
542
645
|
"""
|
|
543
646
|
# Use the new non-blocking audio player for immediate pause/resume
|
|
544
|
-
return self._speak_with_nonblocking_player(text, speed, callback)
|
|
647
|
+
return self._speak_with_nonblocking_player(text, speed, callback, language)
|
|
545
648
|
|
|
546
649
|
if not text:
|
|
547
650
|
return False
|
|
@@ -674,6 +777,9 @@ class TTSEngine:
|
|
|
674
777
|
null_out.close()
|
|
675
778
|
|
|
676
779
|
def _audio_playback():
|
|
780
|
+
# Import sounddevice at runtime to avoid loading heavy dependencies
|
|
781
|
+
sd, _ = _import_audio_deps()
|
|
782
|
+
|
|
677
783
|
try:
|
|
678
784
|
self.is_playing = True
|
|
679
785
|
self.start_time = time.time()
|
abstractvoice/vad/voice_detector.py
CHANGED
|
@@ -1,8 +1,21 @@
|
|
|
1
1
|
"""Voice activity detection using WebRTC VAD."""
|
|
2
2
|
|
|
3
|
-
import webrtcvad
|
|
4
3
|
import logging
|
|
5
4
|
|
|
5
|
+
# Lazy import for heavy dependencies
|
|
6
|
+
def _import_webrtcvad():
|
|
7
|
+
"""Import webrtcvad with helpful error message if dependencies missing."""
|
|
8
|
+
try:
|
|
9
|
+
import webrtcvad
|
|
10
|
+
return webrtcvad
|
|
11
|
+
except ImportError as e:
|
|
12
|
+
raise ImportError(
|
|
13
|
+
"Voice activity detection requires optional dependencies. Install with:\n"
|
|
14
|
+
" pip install abstractvoice[voice] # For basic audio\n"
|
|
15
|
+
" pip install abstractvoice[all] # For all features\n"
|
|
16
|
+
f"Original error: {e}"
|
|
17
|
+
) from e
|
|
18
|
+
|
|
6
19
|
|
|
7
20
|
class VoiceDetector:
|
|
8
21
|
"""Detects voice activity in audio streams."""
|
|
@@ -23,8 +36,9 @@ class VoiceDetector:
|
|
|
23
36
|
if sample_rate not in [8000, 16000, 32000, 48000]:
|
|
24
37
|
raise ValueError("Sample rate must be 8000, 16000, 32000, or 48000 Hz")
|
|
25
38
|
|
|
26
|
-
# Initialize WebRTC VAD
|
|
39
|
+
# Initialize WebRTC VAD using lazy import
|
|
27
40
|
try:
|
|
41
|
+
webrtcvad = _import_webrtcvad()
|
|
28
42
|
self.vad = webrtcvad.Vad(aggressiveness)
|
|
29
43
|
if self.debug_mode:
|
|
30
44
|
print(f" > VAD initialized with aggressiveness {aggressiveness}")
|