abstractvoice 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +33 -0
- abstractvoice/__main__.py +119 -0
- abstractvoice/examples/__init__.py +1 -0
- abstractvoice/examples/cli_repl.py +861 -0
- abstractvoice/examples/voice_cli.py +85 -0
- abstractvoice/examples/web_api.py +214 -0
- abstractvoice/recognition.py +252 -0
- abstractvoice/stt/__init__.py +5 -0
- abstractvoice/stt/transcriber.py +138 -0
- abstractvoice/tts/__init__.py +5 -0
- abstractvoice/tts/tts_engine.py +931 -0
- abstractvoice/vad/__init__.py +5 -0
- abstractvoice/vad/voice_detector.py +75 -0
- abstractvoice/voice_manager.py +294 -0
- abstractvoice-0.1.0.dist-info/METADATA +1132 -0
- abstractvoice-0.1.0.dist-info/RECORD +20 -0
- abstractvoice-0.1.0.dist-info/WHEEL +5 -0
- abstractvoice-0.1.0.dist-info/entry_points.txt +3 -0
- abstractvoice-0.1.0.dist-info/licenses/LICENSE +21 -0
- abstractvoice-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Voice activity detection using WebRTC VAD."""
|
|
2
|
+
|
|
3
|
+
import webrtcvad
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class VoiceDetector:
    """Detects voice activity in audio streams via WebRTC VAD."""

    def __init__(self, aggressiveness=1, sample_rate=16000, debug_mode=False):
        """Initialize the voice detector.

        Args:
            aggressiveness: VAD aggressiveness (0-3, higher is more strict)
            sample_rate: Audio sample rate (8000, 16000, 32000, 48000 Hz)
            debug_mode: Enable debug output

        Raises:
            ValueError: If sample_rate or aggressiveness is out of range.
        """
        self.debug_mode = debug_mode
        self.sample_rate = sample_rate
        self.aggressiveness = aggressiveness

        # WebRTC VAD only accepts these four sample rates.
        if sample_rate not in (8000, 16000, 32000, 48000):
            raise ValueError("Sample rate must be 8000, 16000, 32000, or 48000 Hz")

        # Fail fast with a clear message rather than letting webrtcvad raise
        # an opaque error below (mirrors the sample-rate check above and the
        # range enforced by set_aggressiveness()).
        if not 0 <= aggressiveness <= 3:
            raise ValueError("Aggressiveness must be between 0 and 3")

        # Initialize WebRTC VAD
        try:
            self.vad = webrtcvad.Vad(aggressiveness)
            if self.debug_mode:
                print(f" > VAD initialized with aggressiveness {aggressiveness}")
        except Exception as e:
            if self.debug_mode:
                print(f"VAD initialization error: {e}")
            raise

    def is_speech(self, audio_frame):
        """Check if audio frame contains speech.

        Args:
            audio_frame: Audio frame as bytes (must be 10, 20, or 30ms at sample_rate)

        Returns:
            True if speech detected, False otherwise (including on VAD errors,
            e.g. a frame of invalid length — errors are reported only in debug mode).
        """
        try:
            return self.vad.is_speech(audio_frame, self.sample_rate)
        except Exception as e:
            if self.debug_mode:
                print(f"VAD processing error: {e}")
            return False

    def set_aggressiveness(self, aggressiveness):
        """Change VAD aggressiveness.

        Args:
            aggressiveness: New aggressiveness level (0-3)

        Returns:
            True if changed, False otherwise
        """
        if 0 <= aggressiveness <= 3:
            try:
                self.vad.set_mode(aggressiveness)
                # Only record the new level once the VAD accepted it.
                self.aggressiveness = aggressiveness
                if self.debug_mode:
                    print(f" > VAD aggressiveness changed to {aggressiveness}")
                return True
            except Exception as e:
                if self.debug_mode:
                    print(f"VAD aggressiveness change error: {e}")
                return False
        else:
            if self.debug_mode:
                print(f" > Invalid aggressiveness: {aggressiveness}")
            return False
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""Main Voice Manager class for coordinating TTS and STT components."""
|
|
2
|
+
|
|
3
|
+
from .tts import TTSEngine
|
|
4
|
+
from .recognition import VoiceRecognizer
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class VoiceManager:
    """Main class for voice interaction capabilities.

    Coordinates a TTS engine and an on-demand voice recognizer, wiring
    playback-start/end callbacks so the system does not transcribe (or
    interrupt itself with) its own speech.
    """

    def __init__(self, tts_model="tts_models/en/ljspeech/vits",
                 whisper_model="tiny", debug_mode=False):
        """Initialize the Voice Manager.

        Args:
            tts_model: TTS model name to use
            whisper_model: Whisper model name to use
            debug_mode: Enable debug logging
        """
        self.debug_mode = debug_mode
        self.speed = 1.0

        # Initialize TTS engine
        self.tts_engine = TTSEngine(
            model_name=tts_model,
            debug_mode=debug_mode
        )

        # Set up callbacks to pause/resume voice recognition during TTS playback.
        # This prevents the system from interrupting its own speech.
        self.tts_engine.on_playback_start = self._on_tts_start
        self.tts_engine.on_playback_end = self._on_tts_end

        # Voice recognizer is initialized on demand (first call to listen())
        self.voice_recognizer = None
        self.whisper_model = whisper_model

        # State tracking
        self._transcription_callback = None
        self._stop_callback = None
        self._voice_mode = "full"  # full, wait, stop, ptt

    def _on_tts_start(self):
        """Called when TTS playback starts - handle based on voice mode."""
        if not self.voice_recognizer:
            return

        if self._voice_mode == "full":
            # Full mode: keep listening but pause interrupt capability
            self.voice_recognizer.pause_tts_interrupt()
        elif self._voice_mode in ("wait", "stop", "ptt"):
            # Wait/Stop/PTT modes: pause listening entirely during TTS
            self.voice_recognizer.pause_listening()

    def _on_tts_end(self):
        """Called when TTS playback ends - handle based on voice mode."""
        if not self.voice_recognizer:
            return

        if self._voice_mode == "full":
            # Full mode: resume interrupt capability
            self.voice_recognizer.resume_tts_interrupt()
        elif self._voice_mode in ("wait", "stop", "ptt"):
            # Wait/Stop/PTT modes: resume listening
            self.voice_recognizer.resume_listening()

    def speak(self, text, speed=1.0, callback=None):
        """Convert text to speech and play audio.

        Args:
            text: Text to convert to speech
            speed: Speech speed (0.5-2.0); the default 1.0 means
                "use the manager's stored speed" (see set_speed()).
            callback: Function to call when speech completes

        Returns:
            True if speech started, False otherwise
        """
        # An explicit non-default speed overrides the stored one for this call.
        effective_speed = speed if speed != 1.0 else self.speed
        return self.tts_engine.speak(text, effective_speed, callback)

    def stop_speaking(self):
        """Stop current speech playback.

        Returns:
            True if stopped, False if no playback was active
        """
        return self.tts_engine.stop()

    def pause_speaking(self):
        """Pause current speech playback.

        Pauses at chunk boundaries in streaming mode. Can be resumed with
        resume_speaking().

        Returns:
            True if paused, False if no playback was active
        """
        return self.tts_engine.pause()

    def resume_speaking(self):
        """Resume paused speech playback.

        Returns:
            True if resumed, False if not paused or no playback active
        """
        return self.tts_engine.resume()

    def is_paused(self):
        """Check if TTS is currently paused.

        Returns:
            True if paused, False otherwise
        """
        return self.tts_engine.is_paused()

    def is_speaking(self):
        """Check if TTS is currently active.

        Returns:
            True if speaking, False otherwise
        """
        return self.tts_engine.is_active()

    def listen(self, on_transcription, on_stop=None):
        """Start listening for speech with callbacks.

        Args:
            on_transcription: Callback for transcribed text
            on_stop: Callback when 'stop' command detected

        Returns:
            True if started, False if already listening
        """
        # Store callbacks (indirection lets them be swapped on later calls
        # without rebuilding the recognizer)
        self._transcription_callback = on_transcription
        self._stop_callback = on_stop

        # Initialize recognizer lazily on first use
        if not self.voice_recognizer:
            def _transcription_handler(text):
                if self._transcription_callback:
                    self._transcription_callback(text)

            def _stop_handler():
                # Stop listening first, then notify the user's callback
                self.stop_listening()
                if self._stop_callback:
                    self._stop_callback()

            self.voice_recognizer = VoiceRecognizer(
                transcription_callback=_transcription_handler,
                stop_callback=_stop_handler,
                whisper_model=self.whisper_model,
                debug_mode=self.debug_mode
            )

        # Start with TTS interrupt capability (speech input cancels playback)
        return self.voice_recognizer.start(
            tts_interrupt_callback=self.stop_speaking
        )

    def stop_listening(self):
        """Stop listening for speech.

        Returns:
            True if stopped, False if not listening
        """
        if self.voice_recognizer:
            return self.voice_recognizer.stop()
        return False

    def is_listening(self):
        """Check if currently listening for speech.

        Returns:
            True if listening, False otherwise
        """
        # bool() so callers always get True/False, never None or the
        # recognizer object leaked by short-circuit `and`.
        return bool(self.voice_recognizer and self.voice_recognizer.is_running)

    def set_voice_mode(self, mode):
        """Set the voice mode (full, wait, stop, ptt).

        Args:
            mode: Voice mode to use

        Returns:
            True if successful, False for an unknown mode
        """
        if mode in ("full", "wait", "stop", "ptt"):
            self._voice_mode = mode
            return True
        return False

    def set_speed(self, speed):
        """Set the TTS speed.

        Args:
            speed: Speech speed multiplier (0.5-2.0)

        Returns:
            True if successful
        """
        self.speed = speed
        return True

    def get_speed(self):
        """Get the TTS speed.

        Returns:
            Current TTS speed multiplier
        """
        return self.speed

    def set_tts_model(self, model_name):
        """Change the TTS model.

        Known models (all pure Python, cross-platform):
        - "tts_models/en/ljspeech/vits" (constructor default)
        - "tts_models/en/ljspeech/fast_pitch"
        - "tts_models/en/ljspeech/glow-tts"
        - "tts_models/en/ljspeech/tacotron2-DDC" (legacy)

        Args:
            model_name: TTS model name to use

        Returns:
            True if successful

        Example:
            vm.set_tts_model("tts_models/en/ljspeech/glow-tts")
        """
        # Stop any current speech before swapping engines
        self.stop_speaking()

        # Reinitialize TTS engine with the new model
        self.tts_engine = TTSEngine(
            model_name=model_name,
            debug_mode=self.debug_mode
        )

        # Restore playback callbacks on the fresh engine
        self.tts_engine.on_playback_start = self._on_tts_start
        self.tts_engine.on_playback_end = self._on_tts_end

        return True

    def set_whisper(self, model_name):
        """Set the Whisper model.

        Args:
            model_name: Whisper model name (tiny, base, etc.)

        Returns:
            True if successful
        """
        self.whisper_model = model_name
        if self.voice_recognizer:
            return self.voice_recognizer.change_whisper_model(model_name)
        # No active recognizer: the stored name will be used when one is
        # created, so the change still succeeded (previously returned None).
        return True

    def get_whisper(self):
        """Get the Whisper model.

        Returns:
            Current Whisper model name
        """
        return self.whisper_model

    def change_vad_aggressiveness(self, aggressiveness):
        """Change VAD aggressiveness.

        Args:
            aggressiveness: New aggressiveness level (0-3)

        Returns:
            True if changed, False otherwise (including when no recognizer exists)
        """
        if self.voice_recognizer:
            return self.voice_recognizer.change_vad_aggressiveness(aggressiveness)
        return False

    def cleanup(self):
        """Clean up resources (stop recognition and playback).

        Returns:
            True if cleanup successful
        """
        if self.voice_recognizer:
            self.voice_recognizer.stop()

        self.stop_speaking()
        return True
|