abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/recognition.py
@@ -2,18 +2,26 @@
 
 import threading
 import time
+from typing import Optional
+from collections import deque
+
+import numpy as np
+import re
+
+from .stop_phrase import is_stop_phrase
+from .audio.resample import linear_resample_mono
 
 # Lazy imports for heavy dependencies
 def _import_audio_deps():
     """Import audio dependencies with helpful error message if missing."""
     try:
-        import pyaudio
-        return pyaudio
+        import sounddevice as sd
+        return sd
     except ImportError as e:
         raise ImportError(
-            "Audio functionality requires optional dependencies. Install with:\n"
-            "  pip install abstractvoice[voice]  # For basic audio\n"
-            "  pip install abstractvoice[all]   # For all features\n"
+            "Audio capture/playback requires sounddevice. Install with:\n"
+            "  pip install abstractvoice       # Core install (includes sounddevice)\n"
+            "  pip install abstractvoice[all]  # All features\n"
            f"Original error: {e}"
        ) from e
 
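Note: 0.6.1 reads int16 frames via `sounddevice` rather than PyAudio. A minimal sketch of the capture pattern the new `_recognition_loop` builds on (the rate and chunk size below are the recognizer's documented defaults, not values read from a device):

```python
import sounddevice as sd  # PortAudio-backed, same backend as the package's playback

SAMPLE_RATE = 16000
CHUNK = 480  # 30 ms at 16 kHz

stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="int16", blocksize=CHUNK)
stream.start()
try:
    chunk, overflowed = stream.read(CHUNK)  # (ndarray of shape (CHUNK, 1), bool)
    pcm16_bytes = chunk.tobytes()           # the byte form VAD/STT consume downstream
finally:
    stream.stop()
    stream.close()
```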
@@ -33,21 +41,34 @@ def _import_vad():
         raise
 
 def _import_transcriber():
-    """Import Transcriber with helpful error message if dependencies missing."""
+    """Import STT adapter with helpful error message if dependencies missing."""
     try:
-        from .stt import Transcriber
-        return Transcriber
+        from .adapters.stt_faster_whisper import FasterWhisperAdapter
+        return FasterWhisperAdapter
     except ImportError as e:
-        if "whisper" in str(e) or "tiktoken" in str(e):
-            raise ImportError(
-                "Speech recognition functionality requires optional dependencies. Install with:\n"
-                "  pip install abstractvoice[stt]  # For speech recognition only\n"
-                "  pip install abstractvoice[all]  # For all features\n"
-                f"Original error: {e}"
-            ) from e
+        raise ImportError(
+            "Speech recognition requires faster-whisper (core dependency). "
+            "If this error occurs, your installation is inconsistent.\n"
+            "Try reinstalling:\n"
+            "  pip install --upgrade abstractvoice\n"
+            f"Original error: {e}"
+        ) from e
         raise
 
 
+def _import_aec_processor():
+    """Import AEC processor with helpful error if dependencies missing."""
+    try:
+        from .aec.webrtc_apm import AecConfig, WebRtcAecProcessor
+        return AecConfig, WebRtcAecProcessor
+    except ImportError as e:
+        raise ImportError(
+            "AEC is optional and requires extra dependencies.\n"
+            "Install with: pip install \"abstractvoice[aec]\"\n"
+            f"Original error: {e}"
+        ) from e
+
+
 class VoiceRecognizer:
     """Voice recognition with VAD and STT."""
 
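All three importers follow the same lazy-import shape: defer the heavy import, and on failure re-raise an `ImportError` chained with `from e` so both the install hint and the original failure survive in the traceback. A generic sketch of the pattern (the helper name is hypothetical, not part of the package):

```python
def _import_optional(module_name: str, install_hint: str):
    """Hypothetical helper mirroring the lazy-import pattern above."""
    try:
        return __import__(module_name)
    except ImportError as e:
        # Chaining with `from e` keeps the original failure visible in the traceback.
        raise ImportError(
            f"{module_name} is an optional dependency.\n"
            f"Install with: {install_hint}\n"
            f"Original error: {e}"
        ) from e
```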
@@ -55,7 +76,10 @@ class VoiceRecognizer:
                  vad_aggressiveness=1, min_speech_duration=600,
                  silence_timeout=1500, sample_rate=16000,
                  chunk_duration=30, whisper_model="tiny",
-                 min_transcription_length=5, debug_mode=False):
+                 min_transcription_length=5, debug_mode=False,
+                 aec_enabled: bool = False, aec_stream_delay_ms: int = 0,
+                 language: str | None = None,
+                 allow_downloads: bool = True):
         """Initialize voice recognizer.
 
         Args:
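A hedged construction example for the extended signature (assuming `transcription_callback` remains the first positional parameter, as the assignments later in `__init__` suggest):

```python
from abstractvoice.recognition import VoiceRecognizer

def on_text(text: str) -> None:
    print("heard:", text)

recognizer = VoiceRecognizer(
    on_text,
    whisper_model="tiny",
    language=None,         # None lets the STT adapter autodetect
    aec_enabled=False,     # opting in requires the abstractvoice[aec] extra
    allow_downloads=True,  # permit a first-use model download
)
```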
@@ -73,6 +97,25 @@
         self.debug_mode = debug_mode
         self.transcription_callback = transcription_callback
         self.stop_callback = stop_callback
+        self.language = (language or None)
+        self.allow_downloads = bool(allow_downloads)
+
+        # Stop phrase(s): robust "interrupt" without requiring echo cancellation.
+        # Keep it conservative to avoid accidental stops from the assistant audio.
+        # Include bare "stop" because users will naturally say it.
+        self.stop_phrases = ["stop", "ok stop", "okay stop"]
+
+        # While TTS is playing we can end up with continuous "speech" from speaker echo,
+        # which prevents end-of-utterance detection and therefore prevents stop phrase
+        # transcription. To keep STOP mode usable without AEC, we run a low-rate rolling
+        # window transcription ONLY for stop-phrase detection when transcriptions are paused.
+        self._stop_ring = bytearray()
+        self._stop_last_check = 0.0
+        # Faster checks help catch "ok stop" early during playback.
+        self._stop_check_interval_s = 0.6
+        self._stop_window_s = 2.0
+        self._stop_hit_count = 0
+        self._stop_hit_deadline = 0.0
 
         # Configuration
         self.sample_rate = sample_rate
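Sizing check for the stop-phrase ring configured above: at 16 kHz mono PCM16, the 2.0 s window is `16000 * 2.0 * 2 = 64000` bytes, and trimming keeps only the newest audio:

```python
sample_rate, stop_window_s = 16000, 2.0
max_bytes = int(sample_rate * stop_window_s * 2)  # 64000: 2 bytes per int16 sample
ring = bytearray()

def push(chunk: bytes) -> None:
    ring.extend(chunk)
    if len(ring) > max_bytes:
        del ring[: len(ring) - max_bytes]  # drop the oldest audio first
```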
@@ -80,6 +123,8 @@
         self.chunk_size = int(sample_rate * chunk_duration / 1000)
         self.min_speech_chunks = int(min_speech_duration / chunk_duration)
         self.silence_timeout_chunks = int(silence_timeout / chunk_duration)
+        self._default_min_speech_chunks = int(self.min_speech_chunks)
+        self._default_silence_timeout_chunks = int(self.silence_timeout_chunks)
 
         # Initialize components using lazy imports
         VoiceDetector = _import_vad()
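Worked numbers for the derived values with the default arguments (`sample_rate=16000`, `chunk_duration=30`, `min_speech_duration=600`, `silence_timeout=1500`):

```python
chunk_size = int(16000 * 30 / 1000)      # 480 samples per 30 ms chunk
min_speech_chunks = int(600 / 30)        # 20 chunks (~600 ms) before speech "counts"
silence_timeout_chunks = int(1500 / 30)  # 50 chunks (~1.5 s) of silence ends a turn
```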
@@ -89,21 +134,181 @@
             debug_mode=debug_mode
         )
 
-        Transcriber = _import_transcriber()
-        self.transcriber = Transcriber(
-            model_name=whisper_model,
-            min_transcription_length=min_transcription_length,
-            debug_mode=debug_mode
+        # STT: use faster-whisper adapter by default (core dependency)
+        STTAdapter = _import_transcriber()
+        self.stt_adapter = STTAdapter(
+            model_size=whisper_model,
+            device="auto",
+            compute_type="int8",
+            allow_downloads=bool(self.allow_downloads),
         )
+        self.min_transcription_length = min_transcription_length
 
         # State
         self.is_running = False
         self.thread = None
-        self.pyaudio = None
         self.stream = None
         self.tts_interrupt_callback = None
         self.tts_interrupt_enabled = True  # Can be disabled during TTS playback
         self.listening_paused = False  # Can be paused to completely stop processing audio
+        # While TTS is playing (esp. without AEC), we often want to suppress normal
+        # transcriptions to avoid self-feedback loops, but still allow stop phrase.
+        self.transcriptions_paused = False
+        self._profile = "stop"
+
+        # Last STT metrics (best-effort; used by verbose REPL output).
+        # Populated only for "normal" transcriptions that invoke transcription_callback.
+        self.last_stt_metrics: dict | None = None
+
+        # Optional AEC (echo cancellation) state.
+        self.aec_enabled = False
+        self._aec = None
+        self._far_end_lock = threading.Lock()
+        self._far_end_pcm16 = bytearray()
+        # Lightweight echo gating (for full-mode barge-in without AEC).
+        self._echo_gate_enabled = False
+        self._echo_corr_threshold = 0.72
+        if aec_enabled:
+            self.enable_aec(True, stream_delay_ms=aec_stream_delay_ms)
+
+        # Apply initial profile.
+        self.set_profile("stop")
+
+    def set_profile(self, profile: str) -> None:
+        """Set listening profile tuned for the current interaction mode.
+
+        Why this exists:
+        - PTT needs *very* low thresholds to reliably capture short utterances.
+        - STOP/WAIT should use more conservative defaults to reduce false triggers.
+        """
+        p = (profile or "").strip().lower()
+        if p not in ("stop", "wait", "full", "ptt"):
+            return
+        self._profile = p
+
+        if p == "ptt":
+            # Make capture responsive: start recording as soon as we see speech,
+            # and end quickly after short silence.
+            self.min_speech_chunks = 1
+            # ~700ms of silence to end (tuned for quick PTT turns).
+            self.silence_timeout_chunks = max(8, int(round(700.0 / float(self.chunk_duration))))
+            self.transcriptions_paused = False
+            self.listening_paused = False
+            return
+
+        if p == "full":
+            # Make FULL responsive: start recording sooner, end sooner.
+            # This improves "didn't recognize me" reports on headsets.
+            self.min_speech_chunks = max(3, int(round(180.0 / float(self.chunk_duration))))
+            self.silence_timeout_chunks = max(12, int(round(900.0 / float(self.chunk_duration))))
+            # Echo gating is useful when AEC is not enabled.
+            self._echo_gate_enabled = True
+            return
+
+        # Default/conservative for continuous modes.
+        self.min_speech_chunks = int(self._default_min_speech_chunks)
+        self.silence_timeout_chunks = int(self._default_silence_timeout_chunks)
+        self._echo_gate_enabled = False
+
+    def enable_aec(self, enabled: bool = True, *, stream_delay_ms: int = 0) -> bool:
+        """Enable/disable acoustic echo cancellation (optional).
+
+        When enabled, the recognizer expects far-end audio via `feed_far_end_audio()`.
+        """
+        if not enabled:
+            self.aec_enabled = False
+            self._aec = None
+            with self._far_end_lock:
+                self._far_end_pcm16 = bytearray()
+            return True
+
+        AecConfig, WebRtcAecProcessor = _import_aec_processor()
+        self._aec = WebRtcAecProcessor(
+            AecConfig(sample_rate=int(self.sample_rate), channels=1, stream_delay_ms=int(stream_delay_ms))
+        )
+        self.aec_enabled = True
+        return True
+
+    def feed_far_end_audio(self, audio_chunk: np.ndarray, *, sample_rate: int) -> None:
+        """Provide far-end (speaker) audio reference for AEC.
+
+        audio_chunk: mono float32 in [-1, 1] (as written to speaker output)
+        """
+        # Store far-end audio for AEC and/or echo gating.
+        if audio_chunk is None or len(audio_chunk) == 0:
+            return
+
+        mono = audio_chunk.astype(np.float32, copy=False)
+        if int(sample_rate) != int(self.sample_rate):
+            mono = linear_resample_mono(mono, int(sample_rate), int(self.sample_rate))
+
+        pcm16 = np.clip(mono, -1.0, 1.0)
+        pcm16 = (pcm16 * 32767.0).astype(np.int16).tobytes()
+
+        with self._far_end_lock:
+            self._far_end_pcm16.extend(pcm16)
+            # Cap buffer to a few seconds to avoid unbounded growth.
+            max_bytes = int(self.sample_rate * 3.0) * 2
+            if len(self._far_end_pcm16) > max_bytes:
+                del self._far_end_pcm16[: len(self._far_end_pcm16) - max_bytes]
+
+    def _is_likely_echo(self, near_pcm16: bytes) -> bool:
+        """Return True if near-end chunk looks like far-end echo.
+
+        This is a lightweight correlation gate (not AEC). It reduces false barge-in
+        triggers in FULL mode when AEC is not enabled.
+        """
+        try:
+            far = self._pop_far_end_pcm16(len(near_pcm16))
+            if not far or far == b"\x00" * len(far):
+                return False
+            n = np.frombuffer(near_pcm16, dtype=np.int16).astype(np.float32)
+            f = np.frombuffer(far, dtype=np.int16).astype(np.float32)
+            if n.size < 32:
+                return False
+            # Normalize.
+            n = n - float(np.mean(n))
+            f = f - float(np.mean(f))
+            nn = float(np.linalg.norm(n)) + 1e-6
+            fn = float(np.linalg.norm(f)) + 1e-6
+            corr = float(np.dot(n, f) / (nn * fn))
+            return corr >= float(self._echo_corr_threshold)
+        except Exception:
+            return False
+
+    def _pop_far_end_pcm16(self, nbytes: int) -> bytes:
+        if nbytes <= 0:
+            return b""
+        with self._far_end_lock:
+            if not self._far_end_pcm16:
+                return b"\x00" * nbytes
+            take = min(nbytes, len(self._far_end_pcm16))
+            out = bytes(self._far_end_pcm16[:take])
+            del self._far_end_pcm16[:take]
+            if take < nbytes:
+                out += b"\x00" * (nbytes - take)
+            return out
+
+    def _apply_aec(self, near_pcm16: bytes) -> bytes:
+        if not (self.aec_enabled and self._aec):
+            return near_pcm16
+
+        # The underlying APM typically expects 10ms frames. We can split any chunk
+        # size into 10ms sub-frames for robustness.
+        frame_bytes = int(self.sample_rate * 0.01) * 2  # 10ms * int16
+        if frame_bytes <= 0:
+            return near_pcm16
+        if len(near_pcm16) % frame_bytes != 0:
+            # Pad to whole frames.
+            pad = frame_bytes - (len(near_pcm16) % frame_bytes)
+            near_pcm16 = near_pcm16 + (b"\x00" * pad)
+
+        out = bytearray()
+        for i in range(0, len(near_pcm16), frame_bytes):
+            near = near_pcm16[i : i + frame_bytes]
+            far = self._pop_far_end_pcm16(frame_bytes)
+            out.extend(self._aec.process(near_pcm16=near, far_pcm16=far))
+        return bytes(out)
 
     def start(self, tts_interrupt_callback=None):
         """Start voice recognition in a separate thread.
@@ -140,28 +345,140 @@
             self.thread.join()
 
         if self.stream:
-            self.stream.stop_stream()
-            self.stream.close()
-
-        if self.pyaudio:
-            self.pyaudio.terminate()
+            try:
+                self.stream.stop()
+            except Exception:
+                pass
+            try:
+                self.stream.close()
+            except Exception:
+                pass
+            self.stream = None
 
         if self.debug_mode:
             print(" > Voice recognition stopped")
         return True
+
+    def pop_last_stt_metrics(self) -> dict | None:
+        """Return and clear the most recent STT metrics (if any)."""
+        m = self.last_stt_metrics
+        self.last_stt_metrics = None
+        return m
+
+    def _transcribe_pcm16(
+        self,
+        pcm16_bytes: bytes,
+        language: Optional[str] = None,
+        *,
+        hotwords: str | None = None,
+        condition_on_previous_text: bool = True,
+    ) -> str:
+        """Transcribe raw PCM16 mono audio bytes."""
+        if not pcm16_bytes:
+            return ""
+
+        audio = np.frombuffer(pcm16_bytes, dtype=np.int16).astype(np.float32) / 32768.0
+        lang = language if language is not None else self.language
+        text = self.stt_adapter.transcribe_from_array(
+            audio,
+            sample_rate=self.sample_rate,
+            language=lang,
+            hotwords=hotwords,
+            condition_on_previous_text=bool(condition_on_previous_text),
+        )
+        return (text or "").strip()
+
+    def _is_stop_command(self, text: str) -> bool:
+        """Return True if text matches a configured stop phrase."""
+        return is_stop_phrase(text, self.stop_phrases)
+
+    def _match_stop_phrase(self, text: str) -> str | None:
+        """Return the matched stop phrase (normalized) or None."""
+        from .stop_phrase import normalize_stop_phrase
+
+        normalized = normalize_stop_phrase(text)
+        if not normalized:
+            return None
+        phrases = [normalize_stop_phrase(p) for p in (self.stop_phrases or []) if p]
+        for ph in phrases:
+            if not ph:
+                continue
+            if normalized == ph or normalized.startswith(ph + " ") or normalized.endswith(" " + ph):
+                return ph
+        return None
+
+    def _maybe_detect_stop_phrase_continuous(self, pcm16_chunk: bytes) -> bool:
+        """Best-effort rolling stop-phrase detection during TTS playback.
+
+        Returns True if stop_callback was invoked.
+        """
+        if not (self.transcriptions_paused and self.stop_callback):
+            return False
+
+        now = time.time()
+        self._stop_ring.extend(pcm16_chunk)
+        max_bytes = int(self.sample_rate * float(self._stop_window_s) * 2)
+        if max_bytes > 0 and len(self._stop_ring) > max_bytes:
+            del self._stop_ring[: len(self._stop_ring) - max_bytes]
+
+        if (now - float(self._stop_last_check)) < float(self._stop_check_interval_s):
+            return False
+        self._stop_last_check = now
+
+        try:
+            text = self._transcribe_pcm16(
+                bytes(self._stop_ring),
+                hotwords="stop, ok stop, okay stop",
+                condition_on_previous_text=False,
+            )
+        except Exception:
+            return False
+
+        # Keep this conservative to avoid hallucinated "stop" from hotword bias:
+        # - only accept short transcripts
+        # - require confirmation for bare "stop"
+        words = (text or "").strip().split()
+        if len(words) > 4:
+            self._stop_hit_count = 0
+            return False
+
+        matched = self._match_stop_phrase(text or "")
+        if matched:
+            now2 = time.time()
+            # Confirmation: for bare "stop" require 2 hits within 2.5s.
+            if matched == "stop":
+                if now2 > float(self._stop_hit_deadline):
+                    self._stop_hit_count = 0
+                    self._stop_hit_deadline = now2 + 2.5
+                self._stop_hit_count += 1
+                if self._stop_hit_count < 2:
+                    return False
+            else:
+                self._stop_hit_count = 0
+
+            try:
+                self.stop_callback()
+            except Exception:
+                pass
+            self._stop_ring = bytearray()
+            # small cooldown
+            self._stop_last_check = time.time()
+            return True
+        return False
 
     def _recognition_loop(self):
         """Main recognition loop."""
-        pyaudio = _import_audio_deps()
+        sd = _import_audio_deps()
 
-        self.pyaudio = pyaudio.PyAudio()
-        self.stream = self.pyaudio.open(
-            format=pyaudio.paInt16,
+        # NOTE: sounddevice uses PortAudio under the hood (same as our TTS playback).
+        # Keeping microphone capture in-process avoids PyAudio install issues.
+        self.stream = sd.InputStream(
+            samplerate=self.sample_rate,
             channels=1,
-            rate=self.sample_rate,
-            input=True,
-            frames_per_buffer=self.chunk_size
+            dtype="int16",
+            blocksize=self.chunk_size,
        )
+        self.stream.start()
 
         speech_buffer = []
         speech_count = 0
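The bare-"stop" confirmation in `_maybe_detect_stop_phrase_continuous` above reduces hotword-bias false positives by requiring two matches inside one 2.5 s window. A standalone sketch of that rule (state kept in globals only for brevity):

```python
hits, deadline = 0, 0.0

def register_stop_hit(now: float) -> bool:
    global hits, deadline
    if now > deadline:  # window expired: start a fresh confirmation window
        hits = 0
        deadline = now + 2.5
    hits += 1
    return hits >= 2    # fire only on the confirming second hit

print(register_stop_hit(10.0))  # False: first hit opens the window
print(register_stop_hit(11.2))  # True:  confirmed within 2.5 s
print(register_stop_hit(20.0))  # False: window expired, counts as a first hit again
```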
@@ -176,7 +493,21 @@
                 continue
 
             # Read audio data
-            audio_data = self.stream.read(self.chunk_size, exception_on_overflow=False)
+            audio_chunk, overflowed = self.stream.read(self.chunk_size)
+            if overflowed and self.debug_mode:
+                print(" > Mic input overflow")
+            audio_data = audio_chunk.tobytes()
+
+            # Optional AEC: remove speaker echo from mic input before VAD/STT.
+            if self.aec_enabled and self._aec:
+                audio_data = self._apply_aec(audio_data)
+
+            # While transcriptions are paused (typically during TTS in STOP mode),
+            # run a rolling stop-phrase detector so "stop" can still work even if
+            # VAD never sees a clean end-of-utterance due to speaker echo.
+            if self._maybe_detect_stop_phrase_continuous(audio_data):
+                # Don't also feed this chunk into VAD/recording state.
+                continue
 
             # Check for speech
             is_speech = self.voice_detector.is_speech(audio_data)
@@ -192,7 +523,16 @@
                 self.tts_interrupt_enabled and
                 speech_count >= self.min_speech_chunks and
                 not recording):
-                self.tts_interrupt_callback()
+                # In FULL mode without AEC, avoid false barge-in from echo by
+                # gating on near/far correlation.
+                if self._profile == "full" and self._echo_gate_enabled and not self.aec_enabled:
+                    if self._is_likely_echo(audio_data):
+                        if self.debug_mode:
+                            print(" > Echo-gated barge-in (ignored)")
+                    else:
+                        self.tts_interrupt_callback()
+                else:
+                    self.tts_interrupt_callback()
                 if self.debug_mode:
                     print(" > TTS interrupted by user speech")
 
@@ -212,19 +552,41 @@
                     print(f" > Speech detected ({len(speech_buffer)} chunks), transcribing...")
 
                 audio_bytes = b''.join(speech_buffer)
-                text = self.transcriber.transcribe(audio_bytes)
+                audio_seconds = 0.0
+                try:
+                    if self.sample_rate and self.sample_rate > 0:
+                        audio_seconds = float(len(audio_bytes)) / float(int(self.sample_rate) * 2)
+                except Exception:
+                    audio_seconds = 0.0
+
+                t0 = time.monotonic()
+                text = self._transcribe_pcm16(audio_bytes)
+                t1 = time.monotonic()
+                stt_s = float(t1 - t0)
+                metrics = {
+                    "stt_s": stt_s,
+                    "audio_s": float(audio_seconds),
+                    "rtf": (stt_s / float(audio_seconds)) if audio_seconds else None,
+                    "sample_rate": int(self.sample_rate),
+                    "chunks": int(len(speech_buffer)),
+                    "chunk_ms": int(self.chunk_duration),
+                    "ts": time.time(),
+                }
 
                 if text:
                     # Check for stop command
-                    if text.lower() == "stop":
+                    if self._is_stop_command(text):
                         if self.stop_callback:
                             self.stop_callback()
                         else:
                             # If no stop callback, invoke transcription callback anyway
                             self.transcription_callback(text)
                     else:
-                        # Normal transcription
-                        self.transcription_callback(text)
+                        # Normal transcription (can be suppressed during TTS)
+                        if not self.transcriptions_paused:
+                            # Record metrics only when this transcription is actually emitted.
+                            self.last_stt_metrics = metrics
+                            self.transcription_callback(text)
 
                 # Reset state
                 speech_buffer = []
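Example metrics for a 2.0 s utterance transcribed in 0.5 s (numbers illustrative): the real-time factor is `rtf = stt_s / audio_s`, so values below 1.0 mean faster-than-real-time transcription:

```python
metrics = {
    "stt_s": 0.5,          # wall-clock transcription time
    "audio_s": 2.0,        # captured speech duration
    "rtf": 0.5 / 2.0,      # 0.25: well under real time
    "sample_rate": 16000,
    "chunks": 67,          # ~2.0 s / 30 ms per chunk
    "chunk_ms": 30,
}
```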
@@ -251,7 +613,15 @@
         Returns:
             True if changed, False otherwise
         """
-        return self.transcriber.change_model(model_name)
+        try:
+            # Recreate adapter to switch model size.
+            STTAdapter = _import_transcriber()
+            self.stt_adapter = STTAdapter(model_size=model_name, device="cpu", compute_type="int8")
+            return True
+        except Exception as e:
+            if self.debug_mode:
+                print(f"STT model change error: {e}")
+            return False
 
     def change_vad_aggressiveness(self, aggressiveness):
         """Change VAD aggressiveness.
@@ -292,4 +662,16 @@
         """Resume audio processing after it was paused."""
         self.listening_paused = False
         if self.debug_mode:
-            print(" > Listening resumed")
+            print(" > Listening resumed")
+
+    def pause_transcriptions(self):
+        """Suppress normal transcriptions while still allowing stop phrase detection."""
+        self.transcriptions_paused = True
+        if self.debug_mode:
+            print(" > Transcriptions paused")
+
+    def resume_transcriptions(self):
+        """Re-enable normal transcriptions after they were suppressed."""
+        self.transcriptions_paused = False
+        if self.debug_mode:
+            print(" > Transcriptions resumed")
abstractvoice/stop_phrase.py (new file)
@@ -0,0 +1,103 @@
+"""Stop phrase matching utilities (no heavy deps).
+
+Keep this module dependency-free so it can be used in:
+- core unit tests
+- recognition pipeline (without forcing VAD/STT imports)
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Iterable
+
+
+def normalize_stop_phrase(text: str) -> str:
+    """Normalize text for conservative stop-phrase matching."""
+    if not text:
+        return ""
+    normalized = re.sub(r"[^a-z0-9\s]+", " ", text.lower()).strip()
+    normalized = re.sub(r"\s+", " ", normalized)
+    return normalized
+
+
+def _levenshtein_leq(a: str, b: str, *, max_dist: int) -> bool:
+    """Return True if Levenshtein(a,b) <= max_dist (small, early-exit).
+
+    This is intentionally tiny and only used for short tokens like "ok"/"okay".
+    """
+    a = a or ""
+    b = b or ""
+    if a == b:
+        return True
+    if max_dist <= 0:
+        return False
+    # Fast bounds.
+    if abs(len(a) - len(b)) > max_dist:
+        return False
+
+    # DP with early exit.
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, start=1):
+        cur = [i]
+        row_min = cur[0]
+        for j, cb in enumerate(b, start=1):
+            cost = 0 if ca == cb else 1
+            cur_val = min(
+                prev[j] + 1,         # deletion
+                cur[j - 1] + 1,      # insertion
+                prev[j - 1] + cost,  # substitution
+            )
+            cur.append(cur_val)
+            row_min = min(row_min, cur_val)
+        if row_min > max_dist:
+            return False
+        prev = cur
+    return prev[-1] <= max_dist
+
+
+def is_stop_phrase(text: str, phrases: Iterable[str]) -> bool:
+    """Return True if text matches any configured stop phrase.
+
+    Matching is intentionally:
+    - conservative about normalization (no fancy text transforms)
+    - but tolerant to common STT variations like "stop." / "stop please"
+
+    We match phrases as whole-word sequences inside the normalized text.
+    """
+    normalized = normalize_stop_phrase(text)
+    if not normalized:
+        return False
+    phrase_set = {normalize_stop_phrase(p) for p in phrases if p}
+    for phrase in phrase_set:
+        if not phrase:
+            continue
+        # Special-case: tolerate common STT variants for "ok/okay stop"
+        # (e.g. "okay stop", "okey stop", "oh stop").
+        # Keep it conservative:
+        # - require "stop" at the end
+        # - require an ok-like token right before it (or one token earlier with "please")
+        phrase_toks = phrase.split()
+        toks = normalized.split()
+
+        if phrase_toks == ["ok", "stop"] or phrase_toks == ["okay", "stop"]:
+            if len(toks) in (2, 3) and toks[-1] == "stop":
+                candidates = [toks[-2]]
+                if len(toks) == 3:
+                    candidates.append(toks[-3])
+                for t in candidates:
+                    if _levenshtein_leq(t, "ok", max_dist=1) or _levenshtein_leq(t, "okay", max_dist=1):
+                        return True
+
+        # Default rule:
+        # - exact (stop)
+        # - prefix (stop please)
+        # - suffix (please stop)
+        # This avoids false positives like "don't stop now" when "stop" is a phrase.
+        if normalized == phrase:
+            return True
+        if normalized.startswith(phrase + " "):
+            return True
+        if normalized.endswith(" " + phrase):
+            return True
+    return False
+
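Expected behavior of the matcher, following directly from the rules above:

```python
from abstractvoice.stop_phrase import is_stop_phrase, normalize_stop_phrase

phrases = ["stop", "ok stop", "okay stop"]
print(normalize_stop_phrase("Okay, STOP!"))       # "okay stop"
print(is_stop_phrase("Stop.", phrases))           # True  (exact match after normalization)
print(is_stop_phrase("stop please", phrases))     # True  (prefix rule)
print(is_stop_phrase("please stop", phrases))     # True  (suffix rule)
print(is_stop_phrase("don't stop now", phrases))  # False (no mid-sentence matches)
print(is_stop_phrase("okey stop", ["ok stop"]))   # True  (Levenshtein-tolerant "ok" variant)
```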