abstractvoice 0.5.1-py3-none-any.whl → 0.6.1-py3-none-any.whl
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2403 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.1.dist-info/METADATA +213 -0
- abstractvoice-0.6.1.dist-info/RECORD +52 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.1.dist-info/METADATA +0 -1458
- abstractvoice-0.5.1.dist-info/RECORD +0 -23
- abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/recognition.py
CHANGED
@@ -2,18 +2,26 @@
 
 import threading
 import time
+from typing import Optional
+from collections import deque
+
+import numpy as np
+import re
+
+from .stop_phrase import is_stop_phrase
+from .audio.resample import linear_resample_mono
 
 # Lazy imports for heavy dependencies
 def _import_audio_deps():
     """Import audio dependencies with helpful error message if missing."""
     try:
-        import
-        return
+        import sounddevice as sd
+        return sd
     except ImportError as e:
         raise ImportError(
-            "Audio
-            " pip install abstractvoice
-            " pip install abstractvoice[all]
+            "Audio capture/playback requires sounddevice. Install with:\n"
+            "  pip install abstractvoice       # Core install (includes sounddevice)\n"
+            "  pip install abstractvoice[all]  # All features\n"
            f"Original error: {e}"
        ) from e
 
@@ -33,21 +41,34 @@ def _import_vad():
        raise
 
 def _import_transcriber():
-    """Import
+    """Import STT adapter with helpful error message if dependencies missing."""
     try:
-        from .
-        return
+        from .adapters.stt_faster_whisper import FasterWhisperAdapter
+        return FasterWhisperAdapter
     except ImportError as e:
-
-
-
-
-
-
-
+        raise ImportError(
+            "Speech recognition requires faster-whisper (core dependency). "
+            "If this error occurs, your installation is inconsistent.\n"
+            "Try reinstalling:\n"
+            "  pip install --upgrade abstractvoice\n"
+            f"Original error: {e}"
+        ) from e
        raise
 
 
+def _import_aec_processor():
+    """Import AEC processor with helpful error if dependencies missing."""
+    try:
+        from .aec.webrtc_apm import AecConfig, WebRtcAecProcessor
+        return AecConfig, WebRtcAecProcessor
+    except ImportError as e:
+        raise ImportError(
+            "AEC is optional and requires extra dependencies.\n"
+            "Install with: pip install \"abstractvoice[aec]\"\n"
+            f"Original error: {e}"
+        ) from e
+
+
 class VoiceRecognizer:
     """Voice recognition with VAD and STT."""
 
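All three `_import_*` helpers follow one lazy-import pattern: heavy or optional dependencies are imported at call time, and failures are re-raised as `ImportError` with install instructions. A minimal sketch of probing the optional AEC extra without crashing (the `probe_aec` name is illustrative, not part of the package):

    def probe_aec() -> bool:
        """Return True if the optional abstractvoice[aec] extra is importable."""
        try:
            from abstractvoice.recognition import _import_aec_processor
            _import_aec_processor()  # raises ImportError with install hints if missing
            return True
        except ImportError as exc:
            print(f"AEC unavailable: {exc}")
            return False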
@@ -55,7 +76,10 @@ class VoiceRecognizer:
                  vad_aggressiveness=1, min_speech_duration=600,
                  silence_timeout=1500, sample_rate=16000,
                  chunk_duration=30, whisper_model="tiny",
-                 min_transcription_length=5, debug_mode=False
+                 min_transcription_length=5, debug_mode=False,
+                 aec_enabled: bool = False, aec_stream_delay_ms: int = 0,
+                 language: str | None = None,
+                 allow_downloads: bool = True):
        """Initialize voice recognizer.
 
        Args:
@@ -73,6 +97,25 @@ class VoiceRecognizer:
        self.debug_mode = debug_mode
        self.transcription_callback = transcription_callback
        self.stop_callback = stop_callback
+        self.language = (language or None)
+        self.allow_downloads = bool(allow_downloads)
+
+        # Stop phrase(s): robust “interrupt” without requiring echo cancellation.
+        # Keep it conservative to avoid accidental stops from the assistant audio.
+        # Include bare "stop" because users will naturally say it.
+        self.stop_phrases = ["stop", "ok stop", "okay stop"]
+
+        # While TTS is playing we can end up with continuous "speech" from speaker echo,
+        # which prevents end-of-utterance detection and therefore prevents stop phrase
+        # transcription. To keep STOP mode usable without AEC, we run a low-rate rolling
+        # window transcription ONLY for stop-phrase detection when transcriptions are paused.
+        self._stop_ring = bytearray()
+        self._stop_last_check = 0.0
+        # Faster checks help catch "ok stop" early during playback.
+        self._stop_check_interval_s = 0.6
+        self._stop_window_s = 2.0
+        self._stop_hit_count = 0
+        self._stop_hit_deadline = 0.0
 
        # Configuration
        self.sample_rate = sample_rate
@@ -80,6 +123,8 @@ class VoiceRecognizer:
        self.chunk_size = int(sample_rate * chunk_duration / 1000)
        self.min_speech_chunks = int(min_speech_duration / chunk_duration)
        self.silence_timeout_chunks = int(silence_timeout / chunk_duration)
+        self._default_min_speech_chunks = int(self.min_speech_chunks)
+        self._default_silence_timeout_chunks = int(self.silence_timeout_chunks)
 
        # Initialize components using lazy imports
        VoiceDetector = _import_vad()
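With the constructor defaults (sample_rate=16000, chunk_duration=30 ms, min_speech_duration=600 ms, silence_timeout=1500 ms), the chunk arithmetic works out as follows:

    sample_rate = 16000        # Hz
    chunk_duration = 30        # ms per VAD chunk
    chunk_size = int(sample_rate * chunk_duration / 1000)  # 480 samples per chunk
    min_speech_chunks = int(600 / chunk_duration)          # 20 chunks (~600 ms) before recording starts
    silence_timeout_chunks = int(1500 / chunk_duration)    # 50 chunks (~1.5 s) of silence ends the utterance

The new `_default_*` copies let `set_profile()` (below) restore these values after the PTT and FULL profiles override them.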
@@ -89,21 +134,181 @@ class VoiceRecognizer:
            debug_mode=debug_mode
        )
 
-
-
-
-
-
+        # STT: use faster-whisper adapter by default (core dependency)
+        STTAdapter = _import_transcriber()
+        self.stt_adapter = STTAdapter(
+            model_size=whisper_model,
+            device="auto",
+            compute_type="int8",
+            allow_downloads=bool(self.allow_downloads),
        )
+        self.min_transcription_length = min_transcription_length
 
        # State
        self.is_running = False
        self.thread = None
-        self.pyaudio = None
        self.stream = None
        self.tts_interrupt_callback = None
        self.tts_interrupt_enabled = True  # Can be disabled during TTS playback
        self.listening_paused = False  # Can be paused to completely stop processing audio
+        # While TTS is playing (esp. without AEC), we often want to suppress normal
+        # transcriptions to avoid self-feedback loops, but still allow stop phrase.
+        self.transcriptions_paused = False
+        self._profile = "stop"
+
+        # Last STT metrics (best-effort; used by verbose REPL output).
+        # Populated only for "normal" transcriptions that invoke transcription_callback.
+        self.last_stt_metrics: dict | None = None
+
+        # Optional AEC (echo cancellation) state.
+        self.aec_enabled = False
+        self._aec = None
+        self._far_end_lock = threading.Lock()
+        self._far_end_pcm16 = bytearray()
+        # Lightweight echo gating (for full-mode barge-in without AEC).
+        self._echo_gate_enabled = False
+        self._echo_corr_threshold = 0.72
+        if aec_enabled:
+            self.enable_aec(True, stream_delay_ms=aec_stream_delay_ms)
+
+        # Apply initial profile.
+        self.set_profile("stop")
+
+    def set_profile(self, profile: str) -> None:
+        """Set listening profile tuned for the current interaction mode.
+
+        Why this exists:
+        - PTT needs *very* low thresholds to reliably capture short utterances.
+        - STOP/WAIT should use more conservative defaults to reduce false triggers.
+        """
+        p = (profile or "").strip().lower()
+        if p not in ("stop", "wait", "full", "ptt"):
+            return
+        self._profile = p
+
+        if p == "ptt":
+            # Make capture responsive: start recording as soon as we see speech,
+            # and end quickly after short silence.
+            self.min_speech_chunks = 1
+            # ~700ms of silence to end (tuned for quick PTT turns).
+            self.silence_timeout_chunks = max(8, int(round(700.0 / float(self.chunk_duration))))
+            self.transcriptions_paused = False
+            self.listening_paused = False
+            return
+
+        if p == "full":
+            # Make FULL responsive: start recording sooner, end sooner.
+            # This improves "didn't recognize me" reports on headsets.
+            self.min_speech_chunks = max(3, int(round(180.0 / float(self.chunk_duration))))
+            self.silence_timeout_chunks = max(12, int(round(900.0 / float(self.chunk_duration))))
+            # Echo gating is useful when AEC is not enabled.
+            self._echo_gate_enabled = True
+            return
+
+        # Default/conservative for continuous modes.
+        self.min_speech_chunks = int(self._default_min_speech_chunks)
+        self.silence_timeout_chunks = int(self._default_silence_timeout_chunks)
+        self._echo_gate_enabled = False
+
+    def enable_aec(self, enabled: bool = True, *, stream_delay_ms: int = 0) -> bool:
+        """Enable/disable acoustic echo cancellation (optional).
+
+        When enabled, the recognizer expects far-end audio via `feed_far_end_audio()`.
+        """
+        if not enabled:
+            self.aec_enabled = False
+            self._aec = None
+            with self._far_end_lock:
+                self._far_end_pcm16 = bytearray()
+            return True
+
+        AecConfig, WebRtcAecProcessor = _import_aec_processor()
+        self._aec = WebRtcAecProcessor(
+            AecConfig(sample_rate=int(self.sample_rate), channels=1, stream_delay_ms=int(stream_delay_ms))
+        )
+        self.aec_enabled = True
+        return True
+
+    def feed_far_end_audio(self, audio_chunk: np.ndarray, *, sample_rate: int) -> None:
+        """Provide far-end (speaker) audio reference for AEC.
+
+        audio_chunk: mono float32 in [-1, 1] (as written to speaker output)
+        """
+        # Store far-end audio for AEC and/or echo gating.
+        if audio_chunk is None or len(audio_chunk) == 0:
+            return
+
+        mono = audio_chunk.astype(np.float32, copy=False)
+        if int(sample_rate) != int(self.sample_rate):
+            mono = linear_resample_mono(mono, int(sample_rate), int(self.sample_rate))
+
+        pcm16 = np.clip(mono, -1.0, 1.0)
+        pcm16 = (pcm16 * 32767.0).astype(np.int16).tobytes()
+
+        with self._far_end_lock:
+            self._far_end_pcm16.extend(pcm16)
+            # Cap buffer to a few seconds to avoid unbounded growth.
+            max_bytes = int(self.sample_rate * 3.0) * 2
+            if len(self._far_end_pcm16) > max_bytes:
+                del self._far_end_pcm16[: len(self._far_end_pcm16) - max_bytes]
+
+    def _is_likely_echo(self, near_pcm16: bytes) -> bool:
+        """Return True if near-end chunk looks like far-end echo.
+
+        This is a lightweight correlation gate (not AEC). It reduces false barge-in
+        triggers in FULL mode when AEC is not enabled.
+        """
+        try:
+            far = self._pop_far_end_pcm16(len(near_pcm16))
+            if not far or far == b"\x00" * len(far):
+                return False
+            n = np.frombuffer(near_pcm16, dtype=np.int16).astype(np.float32)
+            f = np.frombuffer(far, dtype=np.int16).astype(np.float32)
+            if n.size < 32:
+                return False
+            # Normalize.
+            n = n - float(np.mean(n))
+            f = f - float(np.mean(f))
+            nn = float(np.linalg.norm(n)) + 1e-6
+            fn = float(np.linalg.norm(f)) + 1e-6
+            corr = float(np.dot(n, f) / (nn * fn))
+            return corr >= float(self._echo_corr_threshold)
+        except Exception:
+            return False
+
+    def _pop_far_end_pcm16(self, nbytes: int) -> bytes:
+        if nbytes <= 0:
+            return b""
+        with self._far_end_lock:
+            if not self._far_end_pcm16:
+                return b"\x00" * nbytes
+            take = min(nbytes, len(self._far_end_pcm16))
+            out = bytes(self._far_end_pcm16[:take])
+            del self._far_end_pcm16[:take]
+        if take < nbytes:
+            out += b"\x00" * (nbytes - take)
+        return out
+
+    def _apply_aec(self, near_pcm16: bytes) -> bytes:
+        if not (self.aec_enabled and self._aec):
+            return near_pcm16
+
+        # The underlying APM typically expects 10ms frames. We can split any chunk
+        # size into 10ms sub-frames for robustness.
+        frame_bytes = int(self.sample_rate * 0.01) * 2  # 10ms * int16
+        if frame_bytes <= 0:
+            return near_pcm16
+        if len(near_pcm16) % frame_bytes != 0:
+            # Pad to whole frames.
+            pad = frame_bytes - (len(near_pcm16) % frame_bytes)
+            near_pcm16 = near_pcm16 + (b"\x00" * pad)
+
+        out = bytearray()
+        for i in range(0, len(near_pcm16), frame_bytes):
+            near = near_pcm16[i : i + frame_bytes]
+            far = self._pop_far_end_pcm16(frame_bytes)
+            out.extend(self._aec.process(near_pcm16=near, far_pcm16=far))
+        return bytes(out)
 
    def start(self, tts_interrupt_callback=None):
        """Start voice recognition in a separate thread.
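For either AEC or the correlation gate to have a reference signal, the playback side must mirror every chunk it writes to the speaker into `feed_far_end_audio()`; the recognizer resamples it to its own rate and keeps only about 3 s of reference. A minimal sketch of that wiring, assuming a recognizer instance and a float32 mono playback path (`play_and_feed` and `output_stream` are illustrative names; only `feed_far_end_audio` is the API from this diff):

    import numpy as np

    def play_and_feed(recognizer, output_stream, chunk: np.ndarray, sr: int) -> None:
        """Play a mono float32 chunk and mirror it as the AEC far-end reference."""
        output_stream.write(chunk)  # speaker playback, e.g. a sounddevice.OutputStream
        recognizer.feed_far_end_audio(chunk, sample_rate=sr)  # buffered/resampled internally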
@@ -140,28 +345,140 @@ class VoiceRecognizer:
            self.thread.join()
 
        if self.stream:
-
-
-
-
-
+            try:
+                self.stream.stop()
+            except Exception:
+                pass
+            try:
+                self.stream.close()
+            except Exception:
+                pass
+            self.stream = None
 
        if self.debug_mode:
            print(" > Voice recognition stopped")
        return True
+
+    def pop_last_stt_metrics(self) -> dict | None:
+        """Return and clear the most recent STT metrics (if any)."""
+        m = self.last_stt_metrics
+        self.last_stt_metrics = None
+        return m
+
+    def _transcribe_pcm16(
+        self,
+        pcm16_bytes: bytes,
+        language: Optional[str] = None,
+        *,
+        hotwords: str | None = None,
+        condition_on_previous_text: bool = True,
+    ) -> str:
+        """Transcribe raw PCM16 mono audio bytes."""
+        if not pcm16_bytes:
+            return ""
+
+        audio = np.frombuffer(pcm16_bytes, dtype=np.int16).astype(np.float32) / 32768.0
+        lang = language if language is not None else self.language
+        text = self.stt_adapter.transcribe_from_array(
+            audio,
+            sample_rate=self.sample_rate,
+            language=lang,
+            hotwords=hotwords,
+            condition_on_previous_text=bool(condition_on_previous_text),
+        )
+        return (text or "").strip()
+
+    def _is_stop_command(self, text: str) -> bool:
+        """Return True if text matches a configured stop phrase."""
+        return is_stop_phrase(text, self.stop_phrases)
+
+    def _match_stop_phrase(self, text: str) -> str | None:
+        """Return the matched stop phrase (normalized) or None."""
+        from .stop_phrase import normalize_stop_phrase
+
+        normalized = normalize_stop_phrase(text)
+        if not normalized:
+            return None
+        phrases = [normalize_stop_phrase(p) for p in (self.stop_phrases or []) if p]
+        for ph in phrases:
+            if not ph:
+                continue
+            if normalized == ph or normalized.startswith(ph + " ") or normalized.endswith(" " + ph):
+                return ph
+        return None
+
+    def _maybe_detect_stop_phrase_continuous(self, pcm16_chunk: bytes) -> bool:
+        """Best-effort rolling stop-phrase detection during TTS playback.
+
+        Returns True if stop_callback was invoked.
+        """
+        if not (self.transcriptions_paused and self.stop_callback):
+            return False
+
+        now = time.time()
+        self._stop_ring.extend(pcm16_chunk)
+        max_bytes = int(self.sample_rate * float(self._stop_window_s) * 2)
+        if max_bytes > 0 and len(self._stop_ring) > max_bytes:
+            del self._stop_ring[: len(self._stop_ring) - max_bytes]
+
+        if (now - float(self._stop_last_check)) < float(self._stop_check_interval_s):
+            return False
+        self._stop_last_check = now
+
+        try:
+            text = self._transcribe_pcm16(
+                bytes(self._stop_ring),
+                hotwords="stop, ok stop, okay stop",
+                condition_on_previous_text=False,
+            )
+        except Exception:
+            return False
+
+        # Keep this conservative to avoid hallucinated "stop" from hotword bias:
+        # - only accept short transcripts
+        # - require confirmation for bare "stop"
+        words = (text or "").strip().split()
+        if len(words) > 4:
+            self._stop_hit_count = 0
+            return False
+
+        matched = self._match_stop_phrase(text or "")
+        if matched:
+            now2 = time.time()
+            # Confirmation: for bare "stop" require 2 hits within 2.5s.
+            if matched == "stop":
+                if now2 > float(self._stop_hit_deadline):
+                    self._stop_hit_count = 0
+                    self._stop_hit_deadline = now2 + 2.5
+                self._stop_hit_count += 1
+                if self._stop_hit_count < 2:
+                    return False
+            else:
+                self._stop_hit_count = 0
+
+            try:
+                self.stop_callback()
+            except Exception:
+                pass
+            self._stop_ring = bytearray()
+            # small cooldown
+            self._stop_last_check = time.time()
+            return True
+        return False
 
    def _recognition_loop(self):
        """Main recognition loop."""
-
+        sd = _import_audio_deps()
 
-
-
-
+        # NOTE: sounddevice uses PortAudio under the hood (same as our TTS playback).
+        # Keeping microphone capture in-process avoids PyAudio install issues.
+        self.stream = sd.InputStream(
+            samplerate=self.sample_rate,
            channels=1,
-
-
-            frames_per_buffer=self.chunk_size
+            dtype="int16",
+            blocksize=self.chunk_size,
        )
+        self.stream.start()
 
        speech_buffer = []
        speech_count = 0
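The rolling stop detector trades precision for liveness: at 16 kHz the 2.0 s window caps the ring at 16000 × 2.0 × 2 = 64000 bytes, and a transcription over that window runs at most every 0.6 s. The trim logic can be reproduced in isolation (a standalone sketch, not package code):

    sample_rate = 16000
    window_s = 2.0
    ring = bytearray()

    def extend_ring(pcm16_chunk: bytes) -> None:
        """Append a mic chunk, keeping only the most recent window_s seconds."""
        ring.extend(pcm16_chunk)
        max_bytes = int(sample_rate * window_s * 2)  # 2 bytes per int16 sample -> 64000
        if len(ring) > max_bytes:
            del ring[: len(ring) - max_bytes]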
@@ -176,7 +493,21 @@ class VoiceRecognizer:
                    continue
 
                # Read audio data
-
+                audio_chunk, overflowed = self.stream.read(self.chunk_size)
+                if overflowed and self.debug_mode:
+                    print(" > Mic input overflow")
+                audio_data = audio_chunk.tobytes()
+
+                # Optional AEC: remove speaker echo from mic input before VAD/STT.
+                if self.aec_enabled and self._aec:
+                    audio_data = self._apply_aec(audio_data)
+
+                # While transcriptions are paused (typically during TTS in STOP mode),
+                # run a rolling stop-phrase detector so "stop" can still work even if
+                # VAD never sees a clean end-of-utterance due to speaker echo.
+                if self._maybe_detect_stop_phrase_continuous(audio_data):
+                    # Don't also feed this chunk into VAD/recording state.
+                    continue
 
                # Check for speech
                is_speech = self.voice_detector.is_speech(audio_data)
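Note that the loop keeps audio as raw PCM16 bytes between capture, VAD, AEC, and the stop detector; `_transcribe_pcm16` converts to normalized float32 only at the STT boundary. The round trip (standalone sketch):

    import numpy as np

    chunk = (np.random.uniform(-1, 1, 480) * 32767).astype(np.int16)  # fake 30 ms mic block
    audio_data = chunk.tobytes()  # what VAD, AEC, and the stop ring consume
    audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
    assert -1.0 <= audio.min() and audio.max() <= 1.0  # normalized for the STT adapter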
@@ -192,7 +523,16 @@ class VoiceRecognizer:
                    self.tts_interrupt_enabled and
                    speech_count >= self.min_speech_chunks and
                    not recording):
-
+                    # In FULL mode without AEC, avoid false barge-in from echo by
+                    # gating on near/far correlation.
+                    if self._profile == "full" and self._echo_gate_enabled and not self.aec_enabled:
+                        if self._is_likely_echo(audio_data):
+                            if self.debug_mode:
+                                print(" > Echo-gated barge-in (ignored)")
+                        else:
+                            self.tts_interrupt_callback()
+                    else:
+                        self.tts_interrupt_callback()
                    if self.debug_mode:
                        print(" > TTS interrupted by user speech")
 
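The gate behind this branch, `_is_likely_echo`, is a zero-lag normalized correlation: both buffers are mean-centered and their cosine similarity is compared against the 0.72 threshold, so a scaled copy of the speaker signal is suppressed while independent speech passes. A synthetic check (standalone sketch):

    import numpy as np

    def gate_corr(near: np.ndarray, far: np.ndarray) -> float:
        near = near - near.mean()
        far = far - far.mean()
        return float(np.dot(near, far) / ((np.linalg.norm(near) + 1e-6) * (np.linalg.norm(far) + 1e-6)))

    rng = np.random.default_rng(0)
    far = rng.normal(0, 1000, 480).astype(np.float32)
    echo = 0.3 * far                                     # attenuated speaker bleed
    voice = rng.normal(0, 1000, 480).astype(np.float32)  # independent near-end speech
    print(gate_corr(echo, far))   # ~1.0 -> gated (>= 0.72)
    print(gate_corr(voice, far))  # ~0.0 -> passes, barge-in fires

As the in-code comment says, this is a lightweight gate rather than echo cancellation; enabling real AEC remains the robust option.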
@@ -212,19 +552,41 @@ class VoiceRecognizer:
                        print(f" > Speech detected ({len(speech_buffer)} chunks), transcribing...")
 
                    audio_bytes = b''.join(speech_buffer)
-
+                    audio_seconds = 0.0
+                    try:
+                        if self.sample_rate and self.sample_rate > 0:
+                            audio_seconds = float(len(audio_bytes)) / float(int(self.sample_rate) * 2)
+                    except Exception:
+                        audio_seconds = 0.0
+
+                    t0 = time.monotonic()
+                    text = self._transcribe_pcm16(audio_bytes)
+                    t1 = time.monotonic()
+                    stt_s = float(t1 - t0)
+                    metrics = {
+                        "stt_s": stt_s,
+                        "audio_s": float(audio_seconds),
+                        "rtf": (stt_s / float(audio_seconds)) if audio_seconds else None,
+                        "sample_rate": int(self.sample_rate),
+                        "chunks": int(len(speech_buffer)),
+                        "chunk_ms": int(self.chunk_duration),
+                        "ts": time.time(),
+                    }
 
                    if text:
                        # Check for stop command
-                        if
+                        if self._is_stop_command(text):
                            if self.stop_callback:
                                self.stop_callback()
                            else:
                                # If no stop callback, invoke transcription callback anyway
                                self.transcription_callback(text)
                        else:
-                            # Normal transcription
-                            self.
+                            # Normal transcription (can be suppressed during TTS)
+                            if not self.transcriptions_paused:
+                                # Record metrics only when this transcription is actually emitted.
+                                self.last_stt_metrics = metrics
+                                self.transcription_callback(text)
 
                    # Reset state
                    speech_buffer = []
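The metrics dict supports a real-time-factor readout: `rtf = stt_s / audio_s`, with `audio_s` recovered from the PCM16 byte count. Worked numbers (illustrative values):

    sample_rate = 16000
    audio_bytes = 3 * sample_rate * 2          # 3.0 s of PCM16 mono = 96000 bytes
    audio_s = audio_bytes / (sample_rate * 2)  # 3.0 s
    stt_s = 0.45                               # wall time measured via time.monotonic()
    rtf = stt_s / audio_s                      # 0.15 -> ~6.7x faster than real time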
@@ -251,7 +613,15 @@ class VoiceRecognizer:
        Returns:
            True if changed, False otherwise
        """
-
+        try:
+            # Recreate adapter to switch model size.
+            STTAdapter = _import_transcriber()
+            self.stt_adapter = STTAdapter(model_size=model_name, device="cpu", compute_type="int8")
+            return True
+        except Exception as e:
+            if self.debug_mode:
+                print(f"STT model change error: {e}")
+            return False
 
    def change_vad_aggressiveness(self, aggressiveness):
        """Change VAD aggressiveness.
@@ -292,4 +662,16 @@ class VoiceRecognizer:
        """Resume audio processing after it was paused."""
        self.listening_paused = False
        if self.debug_mode:
-            print(" > Listening resumed")
+            print(" > Listening resumed")
+
+    def pause_transcriptions(self):
+        """Suppress normal transcriptions while still allowing stop phrase detection."""
+        self.transcriptions_paused = True
+        if self.debug_mode:
+            print(" > Transcriptions paused")
+
+    def resume_transcriptions(self):
+        """Re-enable normal transcriptions after they were suppressed."""
+        self.transcriptions_paused = False
+        if self.debug_mode:
+            print(" > Transcriptions resumed")
abstractvoice/stop_phrase.py
ADDED
@@ -0,0 +1,103 @@
+"""Stop phrase matching utilities (no heavy deps).
+
+Keep this module dependency-free so it can be used in:
+- core unit tests
+- recognition pipeline (without forcing VAD/STT imports)
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Iterable
+
+
+def normalize_stop_phrase(text: str) -> str:
+    """Normalize text for conservative stop-phrase matching."""
+    if not text:
+        return ""
+    normalized = re.sub(r"[^a-z0-9\s]+", " ", text.lower()).strip()
+    normalized = re.sub(r"\s+", " ", normalized)
+    return normalized
+
+
+def _levenshtein_leq(a: str, b: str, *, max_dist: int) -> bool:
+    """Return True if Levenshtein(a,b) <= max_dist (small, early-exit).
+
+    This is intentionally tiny and only used for short tokens like "ok"/"okay".
+    """
+    a = a or ""
+    b = b or ""
+    if a == b:
+        return True
+    if max_dist <= 0:
+        return False
+    # Fast bounds.
+    if abs(len(a) - len(b)) > max_dist:
+        return False
+
+    # DP with early exit.
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, start=1):
+        cur = [i]
+        row_min = cur[0]
+        for j, cb in enumerate(b, start=1):
+            cost = 0 if ca == cb else 1
+            cur_val = min(
+                prev[j] + 1,         # deletion
+                cur[j - 1] + 1,      # insertion
+                prev[j - 1] + cost,  # substitution
+            )
+            cur.append(cur_val)
+            row_min = min(row_min, cur_val)
+        if row_min > max_dist:
+            return False
+        prev = cur
+    return prev[-1] <= max_dist
+
+
+def is_stop_phrase(text: str, phrases: Iterable[str]) -> bool:
+    """Return True if text matches any configured stop phrase.
+
+    Matching is intentionally:
+    - conservative about normalization (no fancy text transforms)
+    - but tolerant to common STT variations like "stop." / "stop please"
+
+    We match phrases as whole-word sequences inside the normalized text.
+    """
+    normalized = normalize_stop_phrase(text)
+    if not normalized:
+        return False
+    phrase_set = {normalize_stop_phrase(p) for p in phrases if p}
+    for phrase in phrase_set:
+        if not phrase:
+            continue
+        # Special-case: tolerate common STT variants for "ok/okay stop"
+        # (e.g. "okay stop", "okey stop", "oh stop").
+        # Keep it conservative:
+        # - require "stop" at the end
+        # - require an ok-like token right before it (or one token earlier with "please")
+        phrase_toks = phrase.split()
+        toks = normalized.split()
+
+        if phrase_toks == ["ok", "stop"] or phrase_toks == ["okay", "stop"]:
+            if len(toks) in (2, 3) and toks[-1] == "stop":
+                candidates = [toks[-2]]
+                if len(toks) == 3:
+                    candidates.append(toks[-3])
+                for t in candidates:
+                    if _levenshtein_leq(t, "ok", max_dist=1) or _levenshtein_leq(t, "okay", max_dist=1):
+                        return True
+
+        # Default rule:
+        # - exact (stop)
+        # - prefix (stop please)
+        # - suffix (please stop)
+        # This avoids false positives like "don't stop now" when "stop" is a phrase.
+        if normalized == phrase:
+            return True
+        if normalized.startswith(phrase + " "):
+            return True
+        if normalized.endswith(" " + phrase):
+            return True
+    return False
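A quick behavioral check of the matcher, following the rules above:

    from abstractvoice.stop_phrase import is_stop_phrase, normalize_stop_phrase

    phrases = ["stop", "ok stop", "okay stop"]       # the recognizer's defaults
    print(normalize_stop_phrase("Okay, STOP!"))      # "okay stop"
    print(is_stop_phrase("stop.", phrases))          # True  (exact after normalization)
    print(is_stop_phrase("stop please", phrases))    # True  (prefix rule)
    print(is_stop_phrase("please stop", phrases))    # True  (suffix rule)
    print(is_stop_phrase("okey stop", ["ok stop"]))  # True  (Levenshtein-tolerant ok-variant)
    print(is_stop_phrase("don't stop now", phrases)) # False (no mid-sentence match)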