abstractvoice 0.5.1-py3-none-any.whl → 0.6.1-py3-none-any.whl
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2403 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.1.dist-info/METADATA +213 -0
- abstractvoice-0.6.1.dist-info/RECORD +52 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.1.dist-info/METADATA +0 -1458
- abstractvoice-0.5.1.dist-info/RECORD +0 -23
- abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/recognition.py
CHANGED
@@ -2,18 +2,26 @@
 
 import threading
 import time
+from typing import Optional
+from collections import deque
+
+import numpy as np
+import re
+
+from .stop_phrase import is_stop_phrase
+from .audio.resample import linear_resample_mono
 
 # Lazy imports for heavy dependencies
 def _import_audio_deps():
     """Import audio dependencies with helpful error message if missing."""
     try:
-        import
-        return
+        import sounddevice as sd
+        return sd
     except ImportError as e:
         raise ImportError(
-            "Audio
-            " pip install abstractvoice
-            " pip install abstractvoice[all]
+            "Audio capture/playback requires sounddevice. Install with:\n"
+            "  pip install abstractvoice       # Core install (includes sounddevice)\n"
+            "  pip install abstractvoice[all]  # All features\n"
            f"Original error: {e}"
        ) from e
 
@@ -33,21 +41,34 @@ def _import_vad():
        raise
 
 def _import_transcriber():
-    """Import
+    """Import STT adapter with helpful error message if dependencies missing."""
     try:
-        from .
-        return
+        from .adapters.stt_faster_whisper import FasterWhisperAdapter
+        return FasterWhisperAdapter
     except ImportError as e:
-
-
-
-
-
-
-
+        raise ImportError(
+            "Speech recognition requires faster-whisper (core dependency). "
+            "If this error occurs, your installation is inconsistent.\n"
+            "Try reinstalling:\n"
+            "  pip install --upgrade abstractvoice\n"
+            f"Original error: {e}"
+        ) from e
        raise
 
 
+def _import_aec_processor():
+    """Import AEC processor with helpful error if dependencies missing."""
+    try:
+        from .aec.webrtc_apm import AecConfig, WebRtcAecProcessor
+        return AecConfig, WebRtcAecProcessor
+    except ImportError as e:
+        raise ImportError(
+            "AEC is optional and requires extra dependencies.\n"
+            "Install with: pip install \"abstractvoice[aec]\"\n"
+            f"Original error: {e}"
+        ) from e
+
+
 class VoiceRecognizer:
     """Voice recognition with VAD and STT."""
 
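All three `_import_*` helpers follow one lazy-import pattern: heavy or optional dependencies are imported at call time, and failures are re-raised as `ImportError` with install instructions. A minimal sketch of probing the optional AEC extra without crashing (the `probe_aec` name is illustrative, not part of the package):

    def probe_aec() -> bool:
        """Return True if the optional abstractvoice[aec] extra is importable."""
        try:
            from abstractvoice.recognition import _import_aec_processor
            _import_aec_processor()  # raises ImportError with install hints if missing
            return True
        except ImportError as exc:
            print(f"AEC unavailable: {exc}")
            return False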
@@ -55,7 +76,10 @@ class VoiceRecognizer:
                  vad_aggressiveness=1, min_speech_duration=600,
                  silence_timeout=1500, sample_rate=16000,
                  chunk_duration=30, whisper_model="tiny",
-                 min_transcription_length=5, debug_mode=False
+                 min_transcription_length=5, debug_mode=False,
+                 aec_enabled: bool = False, aec_stream_delay_ms: int = 0,
+                 language: str | None = None,
+                 allow_downloads: bool = True):
        """Initialize voice recognizer.
 
        Args:
@@ -73,6 +97,25 @@ class VoiceRecognizer:
        self.debug_mode = debug_mode
        self.transcription_callback = transcription_callback
        self.stop_callback = stop_callback
+        self.language = (language or None)
+        self.allow_downloads = bool(allow_downloads)
+
+        # Stop phrase(s): robust “interrupt” without requiring echo cancellation.
+        # Keep it conservative to avoid accidental stops from the assistant audio.
+        # Include bare "stop" because users will naturally say it.
+        self.stop_phrases = ["stop", "ok stop", "okay stop"]
+
+        # While TTS is playing we can end up with continuous "speech" from speaker echo,
+        # which prevents end-of-utterance detection and therefore prevents stop phrase
+        # transcription. To keep STOP mode usable without AEC, we run a low-rate rolling
+        # window transcription ONLY for stop-phrase detection when transcriptions are paused.
+        self._stop_ring = bytearray()
+        self._stop_last_check = 0.0
+        # Faster checks help catch "ok stop" early during playback.
+        self._stop_check_interval_s = 0.6
+        self._stop_window_s = 2.0
+        self._stop_hit_count = 0
+        self._stop_hit_deadline = 0.0
 
        # Configuration
        self.sample_rate = sample_rate
@@ -80,6 +123,8 @@ class VoiceRecognizer:
        self.chunk_size = int(sample_rate * chunk_duration / 1000)
        self.min_speech_chunks = int(min_speech_duration / chunk_duration)
        self.silence_timeout_chunks = int(silence_timeout / chunk_duration)
+        self._default_min_speech_chunks = int(self.min_speech_chunks)
+        self._default_silence_timeout_chunks = int(self.silence_timeout_chunks)
 
        # Initialize components using lazy imports
        VoiceDetector = _import_vad()
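With the constructor defaults (sample_rate=16000, chunk_duration=30 ms, min_speech_duration=600 ms, silence_timeout=1500 ms), the chunk arithmetic works out as follows:

    sample_rate = 16000        # Hz
    chunk_duration = 30        # ms per VAD chunk
    chunk_size = int(sample_rate * chunk_duration / 1000)  # 480 samples per chunk
    min_speech_chunks = int(600 / chunk_duration)          # 20 chunks (~600 ms) before recording starts
    silence_timeout_chunks = int(1500 / chunk_duration)    # 50 chunks (~1.5 s) of silence ends the utterance

The new `_default_*` copies let `set_profile()` (below) restore these values after the PTT and FULL profiles override them.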
@@ -89,21 +134,181 @@ class VoiceRecognizer:
            debug_mode=debug_mode
        )
 
-
-
-
-
-
+        # STT: use faster-whisper adapter by default (core dependency)
+        STTAdapter = _import_transcriber()
+        self.stt_adapter = STTAdapter(
+            model_size=whisper_model,
+            device="auto",
+            compute_type="int8",
+            allow_downloads=bool(self.allow_downloads),
        )
+        self.min_transcription_length = min_transcription_length
 
        # State
        self.is_running = False
        self.thread = None
-        self.pyaudio = None
        self.stream = None
        self.tts_interrupt_callback = None
        self.tts_interrupt_enabled = True  # Can be disabled during TTS playback
        self.listening_paused = False  # Can be paused to completely stop processing audio
+        # While TTS is playing (esp. without AEC), we often want to suppress normal
+        # transcriptions to avoid self-feedback loops, but still allow stop phrase.
+        self.transcriptions_paused = False
+        self._profile = "stop"
+
+        # Last STT metrics (best-effort; used by verbose REPL output).
+        # Populated only for "normal" transcriptions that invoke transcription_callback.
+        self.last_stt_metrics: dict | None = None
+
+        # Optional AEC (echo cancellation) state.
+        self.aec_enabled = False
+        self._aec = None
+        self._far_end_lock = threading.Lock()
+        self._far_end_pcm16 = bytearray()
+        # Lightweight echo gating (for full-mode barge-in without AEC).
+        self._echo_gate_enabled = False
+        self._echo_corr_threshold = 0.72
+        if aec_enabled:
+            self.enable_aec(True, stream_delay_ms=aec_stream_delay_ms)
+
+        # Apply initial profile.
+        self.set_profile("stop")
+
+    def set_profile(self, profile: str) -> None:
+        """Set listening profile tuned for the current interaction mode.
+
+        Why this exists:
+        - PTT needs *very* low thresholds to reliably capture short utterances.
+        - STOP/WAIT should use more conservative defaults to reduce false triggers.
+        """
+        p = (profile or "").strip().lower()
+        if p not in ("stop", "wait", "full", "ptt"):
+            return
+        self._profile = p
+
+        if p == "ptt":
+            # Make capture responsive: start recording as soon as we see speech,
+            # and end quickly after short silence.
+            self.min_speech_chunks = 1
+            # ~700ms of silence to end (tuned for quick PTT turns).
+            self.silence_timeout_chunks = max(8, int(round(700.0 / float(self.chunk_duration))))
+            self.transcriptions_paused = False
+            self.listening_paused = False
+            return
+
+        if p == "full":
+            # Make FULL responsive: start recording sooner, end sooner.
+            # This improves "didn't recognize me" reports on headsets.
+            self.min_speech_chunks = max(3, int(round(180.0 / float(self.chunk_duration))))
+            self.silence_timeout_chunks = max(12, int(round(900.0 / float(self.chunk_duration))))
+            # Echo gating is useful when AEC is not enabled.
+            self._echo_gate_enabled = True
+            return
+
+        # Default/conservative for continuous modes.
+        self.min_speech_chunks = int(self._default_min_speech_chunks)
+        self.silence_timeout_chunks = int(self._default_silence_timeout_chunks)
+        self._echo_gate_enabled = False
+
+    def enable_aec(self, enabled: bool = True, *, stream_delay_ms: int = 0) -> bool:
+        """Enable/disable acoustic echo cancellation (optional).
+
+        When enabled, the recognizer expects far-end audio via `feed_far_end_audio()`.
+        """
+        if not enabled:
+            self.aec_enabled = False
+            self._aec = None
+            with self._far_end_lock:
+                self._far_end_pcm16 = bytearray()
+            return True
+
+        AecConfig, WebRtcAecProcessor = _import_aec_processor()
+        self._aec = WebRtcAecProcessor(
+            AecConfig(sample_rate=int(self.sample_rate), channels=1, stream_delay_ms=int(stream_delay_ms))
+        )
+        self.aec_enabled = True
+        return True
+
+    def feed_far_end_audio(self, audio_chunk: np.ndarray, *, sample_rate: int) -> None:
+        """Provide far-end (speaker) audio reference for AEC.
+
+        audio_chunk: mono float32 in [-1, 1] (as written to speaker output)
+        """
+        # Store far-end audio for AEC and/or echo gating.
+        if audio_chunk is None or len(audio_chunk) == 0:
+            return
+
+        mono = audio_chunk.astype(np.float32, copy=False)
+        if int(sample_rate) != int(self.sample_rate):
+            mono = linear_resample_mono(mono, int(sample_rate), int(self.sample_rate))
+
+        pcm16 = np.clip(mono, -1.0, 1.0)
+        pcm16 = (pcm16 * 32767.0).astype(np.int16).tobytes()
+
+        with self._far_end_lock:
+            self._far_end_pcm16.extend(pcm16)
+            # Cap buffer to a few seconds to avoid unbounded growth.
+            max_bytes = int(self.sample_rate * 3.0) * 2
+            if len(self._far_end_pcm16) > max_bytes:
+                del self._far_end_pcm16[: len(self._far_end_pcm16) - max_bytes]
+
+    def _is_likely_echo(self, near_pcm16: bytes) -> bool:
+        """Return True if near-end chunk looks like far-end echo.
+
+        This is a lightweight correlation gate (not AEC). It reduces false barge-in
+        triggers in FULL mode when AEC is not enabled.
+        """
+        try:
+            far = self._pop_far_end_pcm16(len(near_pcm16))
+            if not far or far == b"\x00" * len(far):
+                return False
+            n = np.frombuffer(near_pcm16, dtype=np.int16).astype(np.float32)
+            f = np.frombuffer(far, dtype=np.int16).astype(np.float32)
+            if n.size < 32:
+                return False
+            # Normalize.
+            n = n - float(np.mean(n))
+            f = f - float(np.mean(f))
+            nn = float(np.linalg.norm(n)) + 1e-6
+            fn = float(np.linalg.norm(f)) + 1e-6
+            corr = float(np.dot(n, f) / (nn * fn))
+            return corr >= float(self._echo_corr_threshold)
+        except Exception:
+            return False
+
+    def _pop_far_end_pcm16(self, nbytes: int) -> bytes:
+        if nbytes <= 0:
+            return b""
+        with self._far_end_lock:
+            if not self._far_end_pcm16:
+                return b"\x00" * nbytes
+            take = min(nbytes, len(self._far_end_pcm16))
+            out = bytes(self._far_end_pcm16[:take])
+            del self._far_end_pcm16[:take]
+        if take < nbytes:
+            out += b"\x00" * (nbytes - take)
+        return out
+
+    def _apply_aec(self, near_pcm16: bytes) -> bytes:
+        if not (self.aec_enabled and self._aec):
+            return near_pcm16
+
+        # The underlying APM typically expects 10ms frames. We can split any chunk
+        # size into 10ms sub-frames for robustness.
+        frame_bytes = int(self.sample_rate * 0.01) * 2  # 10ms * int16
+        if frame_bytes <= 0:
+            return near_pcm16
+        if len(near_pcm16) % frame_bytes != 0:
+            # Pad to whole frames.
+            pad = frame_bytes - (len(near_pcm16) % frame_bytes)
+            near_pcm16 = near_pcm16 + (b"\x00" * pad)
+
+        out = bytearray()
+        for i in range(0, len(near_pcm16), frame_bytes):
+            near = near_pcm16[i : i + frame_bytes]
+            far = self._pop_far_end_pcm16(frame_bytes)
+            out.extend(self._aec.process(near_pcm16=near, far_pcm16=far))
+        return bytes(out)
 
    def start(self, tts_interrupt_callback=None):
        """Start voice recognition in a separate thread.
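For either AEC or the correlation gate to have a reference signal, the playback side must mirror every chunk it writes to the speaker into `feed_far_end_audio()`; the recognizer resamples it to its own rate and keeps only about 3 s of reference. A minimal sketch of that wiring, assuming a recognizer instance and a float32 mono playback path (`play_and_feed` and `output_stream` are illustrative names; only `feed_far_end_audio` is the API from this diff):

    import numpy as np

    def play_and_feed(recognizer, output_stream, chunk: np.ndarray, sr: int) -> None:
        """Play a mono float32 chunk and mirror it as the AEC far-end reference."""
        output_stream.write(chunk)  # speaker playback, e.g. a sounddevice.OutputStream
        recognizer.feed_far_end_audio(chunk, sample_rate=sr)  # buffered/resampled internally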
@@ -140,28 +345,140 @@ class VoiceRecognizer:
            self.thread.join()
 
        if self.stream:
-
-
-
-
-
+            try:
+                self.stream.stop()
+            except Exception:
+                pass
+            try:
+                self.stream.close()
+            except Exception:
+                pass
+            self.stream = None
 
        if self.debug_mode:
            print(" > Voice recognition stopped")
        return True
+
+    def pop_last_stt_metrics(self) -> dict | None:
+        """Return and clear the most recent STT metrics (if any)."""
+        m = self.last_stt_metrics
+        self.last_stt_metrics = None
+        return m
+
+    def _transcribe_pcm16(
+        self,
+        pcm16_bytes: bytes,
+        language: Optional[str] = None,
+        *,
+        hotwords: str | None = None,
+        condition_on_previous_text: bool = True,
+    ) -> str:
+        """Transcribe raw PCM16 mono audio bytes."""
+        if not pcm16_bytes:
+            return ""
+
+        audio = np.frombuffer(pcm16_bytes, dtype=np.int16).astype(np.float32) / 32768.0
+        lang = language if language is not None else self.language
+        text = self.stt_adapter.transcribe_from_array(
+            audio,
+            sample_rate=self.sample_rate,
+            language=lang,
+            hotwords=hotwords,
+            condition_on_previous_text=bool(condition_on_previous_text),
+        )
+        return (text or "").strip()
+
+    def _is_stop_command(self, text: str) -> bool:
+        """Return True if text matches a configured stop phrase."""
+        return is_stop_phrase(text, self.stop_phrases)
+
+    def _match_stop_phrase(self, text: str) -> str | None:
+        """Return the matched stop phrase (normalized) or None."""
+        from .stop_phrase import normalize_stop_phrase
+
+        normalized = normalize_stop_phrase(text)
+        if not normalized:
+            return None
+        phrases = [normalize_stop_phrase(p) for p in (self.stop_phrases or []) if p]
+        for ph in phrases:
+            if not ph:
+                continue
+            if normalized == ph or normalized.startswith(ph + " ") or normalized.endswith(" " + ph):
+                return ph
+        return None
+
+    def _maybe_detect_stop_phrase_continuous(self, pcm16_chunk: bytes) -> bool:
+        """Best-effort rolling stop-phrase detection during TTS playback.
+
+        Returns True if stop_callback was invoked.
+        """
+        if not (self.transcriptions_paused and self.stop_callback):
+            return False
+
+        now = time.time()
+        self._stop_ring.extend(pcm16_chunk)
+        max_bytes = int(self.sample_rate * float(self._stop_window_s) * 2)
+        if max_bytes > 0 and len(self._stop_ring) > max_bytes:
+            del self._stop_ring[: len(self._stop_ring) - max_bytes]
+
+        if (now - float(self._stop_last_check)) < float(self._stop_check_interval_s):
+            return False
+        self._stop_last_check = now
+
+        try:
+            text = self._transcribe_pcm16(
+                bytes(self._stop_ring),
+                hotwords="stop, ok stop, okay stop",
+                condition_on_previous_text=False,
+            )
+        except Exception:
+            return False
+
+        # Keep this conservative to avoid hallucinated "stop" from hotword bias:
+        # - only accept short transcripts
+        # - require confirmation for bare "stop"
+        words = (text or "").strip().split()
+        if len(words) > 4:
+            self._stop_hit_count = 0
+            return False
+
+        matched = self._match_stop_phrase(text or "")
+        if matched:
+            now2 = time.time()
+            # Confirmation: for bare "stop" require 2 hits within 2.5s.
+            if matched == "stop":
+                if now2 > float(self._stop_hit_deadline):
+                    self._stop_hit_count = 0
+                    self._stop_hit_deadline = now2 + 2.5
+                self._stop_hit_count += 1
+                if self._stop_hit_count < 2:
+                    return False
+            else:
+                self._stop_hit_count = 0
+
+            try:
+                self.stop_callback()
+            except Exception:
+                pass
+            self._stop_ring = bytearray()
+            # small cooldown
+            self._stop_last_check = time.time()
+            return True
+        return False
 
    def _recognition_loop(self):
        """Main recognition loop."""
-
+        sd = _import_audio_deps()
 
-
-
-
+        # NOTE: sounddevice uses PortAudio under the hood (same as our TTS playback).
+        # Keeping microphone capture in-process avoids PyAudio install issues.
+        self.stream = sd.InputStream(
+            samplerate=self.sample_rate,
            channels=1,
-
-
-            frames_per_buffer=self.chunk_size
+            dtype="int16",
+            blocksize=self.chunk_size,
        )
+        self.stream.start()
 
        speech_buffer = []
        speech_count = 0
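The rolling stop detector trades precision for liveness: at 16 kHz the 2.0 s window caps the ring at 16000 × 2.0 × 2 = 64000 bytes, and a transcription over that window runs at most every 0.6 s. The trim logic can be reproduced in isolation (a standalone sketch, not package code):

    sample_rate = 16000
    window_s = 2.0
    ring = bytearray()

    def extend_ring(pcm16_chunk: bytes) -> None:
        """Append a mic chunk, keeping only the most recent window_s seconds."""
        ring.extend(pcm16_chunk)
        max_bytes = int(sample_rate * window_s * 2)  # 2 bytes per int16 sample -> 64000
        if len(ring) > max_bytes:
            del ring[: len(ring) - max_bytes]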
@@ -176,7 +493,21 @@ class VoiceRecognizer:
                    continue
 
                # Read audio data
-
+                audio_chunk, overflowed = self.stream.read(self.chunk_size)
+                if overflowed and self.debug_mode:
+                    print(" > Mic input overflow")
+                audio_data = audio_chunk.tobytes()
+
+                # Optional AEC: remove speaker echo from mic input before VAD/STT.
+                if self.aec_enabled and self._aec:
+                    audio_data = self._apply_aec(audio_data)
+
+                # While transcriptions are paused (typically during TTS in STOP mode),
+                # run a rolling stop-phrase detector so "stop" can still work even if
+                # VAD never sees a clean end-of-utterance due to speaker echo.
+                if self._maybe_detect_stop_phrase_continuous(audio_data):
+                    # Don't also feed this chunk into VAD/recording state.
+                    continue
 
                # Check for speech
                is_speech = self.voice_detector.is_speech(audio_data)
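Note that the loop keeps audio as raw PCM16 bytes between capture, VAD, AEC, and the stop detector; `_transcribe_pcm16` converts to normalized float32 only at the STT boundary. The round trip (standalone sketch):

    import numpy as np

    chunk = (np.random.uniform(-1, 1, 480) * 32767).astype(np.int16)  # fake 30 ms mic block
    audio_data = chunk.tobytes()  # what VAD, AEC, and the stop ring consume
    audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
    assert -1.0 <= audio.min() and audio.max() <= 1.0  # normalized for the STT adapter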
@@ -192,7 +523,16 @@ class VoiceRecognizer:
                    self.tts_interrupt_enabled and
                    speech_count >= self.min_speech_chunks and
                    not recording):
-
+                    # In FULL mode without AEC, avoid false barge-in from echo by
+                    # gating on near/far correlation.
+                    if self._profile == "full" and self._echo_gate_enabled and not self.aec_enabled:
+                        if self._is_likely_echo(audio_data):
+                            if self.debug_mode:
+                                print(" > Echo-gated barge-in (ignored)")
+                        else:
+                            self.tts_interrupt_callback()
+                    else:
+                        self.tts_interrupt_callback()
                    if self.debug_mode:
                        print(" > TTS interrupted by user speech")
 
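The gate behind this branch, `_is_likely_echo`, is a zero-lag normalized correlation: both buffers are mean-centered and their cosine similarity is compared against the 0.72 threshold, so a scaled copy of the speaker signal is suppressed while independent speech passes. A synthetic check (standalone sketch):

    import numpy as np

    def gate_corr(near: np.ndarray, far: np.ndarray) -> float:
        near = near - near.mean()
        far = far - far.mean()
        return float(np.dot(near, far) / ((np.linalg.norm(near) + 1e-6) * (np.linalg.norm(far) + 1e-6)))

    rng = np.random.default_rng(0)
    far = rng.normal(0, 1000, 480).astype(np.float32)
    echo = 0.3 * far                                     # attenuated speaker bleed
    voice = rng.normal(0, 1000, 480).astype(np.float32)  # independent near-end speech
    print(gate_corr(echo, far))   # ~1.0 -> gated (>= 0.72)
    print(gate_corr(voice, far))  # ~0.0 -> passes, barge-in fires

As the in-code comment says, this is a lightweight gate rather than echo cancellation; enabling real AEC remains the robust option.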
@@ -212,19 +552,41 @@ class VoiceRecognizer:
                        print(f" > Speech detected ({len(speech_buffer)} chunks), transcribing...")
 
                    audio_bytes = b''.join(speech_buffer)
-
+                    audio_seconds = 0.0
+                    try:
+                        if self.sample_rate and self.sample_rate > 0:
+                            audio_seconds = float(len(audio_bytes)) / float(int(self.sample_rate) * 2)
+                    except Exception:
+                        audio_seconds = 0.0
+
+                    t0 = time.monotonic()
+                    text = self._transcribe_pcm16(audio_bytes)
+                    t1 = time.monotonic()
+                    stt_s = float(t1 - t0)
+                    metrics = {
+                        "stt_s": stt_s,
+                        "audio_s": float(audio_seconds),
+                        "rtf": (stt_s / float(audio_seconds)) if audio_seconds else None,
+                        "sample_rate": int(self.sample_rate),
+                        "chunks": int(len(speech_buffer)),
+                        "chunk_ms": int(self.chunk_duration),
+                        "ts": time.time(),
+                    }
 
                    if text:
                        # Check for stop command
-                        if
+                        if self._is_stop_command(text):
                            if self.stop_callback:
                                self.stop_callback()
                            else:
                                # If no stop callback, invoke transcription callback anyway
                                self.transcription_callback(text)
                        else:
-                            # Normal transcription
-                            self.
+                            # Normal transcription (can be suppressed during TTS)
+                            if not self.transcriptions_paused:
+                                # Record metrics only when this transcription is actually emitted.
+                                self.last_stt_metrics = metrics
+                                self.transcription_callback(text)
 
                    # Reset state
                    speech_buffer = []
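The metrics dict supports a real-time-factor readout: `rtf = stt_s / audio_s`, with `audio_s` recovered from the PCM16 byte count. Worked numbers (illustrative values):

    sample_rate = 16000
    audio_bytes = 3 * sample_rate * 2          # 3.0 s of PCM16 mono = 96000 bytes
    audio_s = audio_bytes / (sample_rate * 2)  # 3.0 s
    stt_s = 0.45                               # wall time measured via time.monotonic()
    rtf = stt_s / audio_s                      # 0.15 -> ~6.7x faster than real time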
@@ -251,7 +613,15 @@ class VoiceRecognizer:
        Returns:
            True if changed, False otherwise
        """
-
+        try:
+            # Recreate adapter to switch model size.
+            STTAdapter = _import_transcriber()
+            self.stt_adapter = STTAdapter(model_size=model_name, device="cpu", compute_type="int8")
+            return True
+        except Exception as e:
+            if self.debug_mode:
+                print(f"STT model change error: {e}")
+            return False
 
    def change_vad_aggressiveness(self, aggressiveness):
        """Change VAD aggressiveness.
@@ -292,4 +662,16 @@ class VoiceRecognizer:
        """Resume audio processing after it was paused."""
        self.listening_paused = False
        if self.debug_mode:
-            print(" > Listening resumed")
+            print(" > Listening resumed")
+
+    def pause_transcriptions(self):
+        """Suppress normal transcriptions while still allowing stop phrase detection."""
+        self.transcriptions_paused = True
+        if self.debug_mode:
+            print(" > Transcriptions paused")
+
+    def resume_transcriptions(self):
+        """Re-enable normal transcriptions after they were suppressed."""
+        self.transcriptions_paused = False
+        if self.debug_mode:
+            print(" > Transcriptions resumed")
abstractvoice/stop_phrase.py
ADDED
@@ -0,0 +1,103 @@
+"""Stop phrase matching utilities (no heavy deps).
+
+Keep this module dependency-free so it can be used in:
+- core unit tests
+- recognition pipeline (without forcing VAD/STT imports)
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Iterable
+
+
+def normalize_stop_phrase(text: str) -> str:
+    """Normalize text for conservative stop-phrase matching."""
+    if not text:
+        return ""
+    normalized = re.sub(r"[^a-z0-9\s]+", " ", text.lower()).strip()
+    normalized = re.sub(r"\s+", " ", normalized)
+    return normalized
+
+
+def _levenshtein_leq(a: str, b: str, *, max_dist: int) -> bool:
+    """Return True if Levenshtein(a,b) <= max_dist (small, early-exit).
+
+    This is intentionally tiny and only used for short tokens like "ok"/"okay".
+    """
+    a = a or ""
+    b = b or ""
+    if a == b:
+        return True
+    if max_dist <= 0:
+        return False
+    # Fast bounds.
+    if abs(len(a) - len(b)) > max_dist:
+        return False
+
+    # DP with early exit.
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, start=1):
+        cur = [i]
+        row_min = cur[0]
+        for j, cb in enumerate(b, start=1):
+            cost = 0 if ca == cb else 1
+            cur_val = min(
+                prev[j] + 1,         # deletion
+                cur[j - 1] + 1,      # insertion
+                prev[j - 1] + cost,  # substitution
+            )
+            cur.append(cur_val)
+            row_min = min(row_min, cur_val)
+        if row_min > max_dist:
+            return False
+        prev = cur
+    return prev[-1] <= max_dist
+
+
+def is_stop_phrase(text: str, phrases: Iterable[str]) -> bool:
+    """Return True if text matches any configured stop phrase.
+
+    Matching is intentionally:
+    - conservative about normalization (no fancy text transforms)
+    - but tolerant to common STT variations like "stop." / "stop please"
+
+    We match phrases as whole-word sequences inside the normalized text.
+    """
+    normalized = normalize_stop_phrase(text)
+    if not normalized:
+        return False
+    phrase_set = {normalize_stop_phrase(p) for p in phrases if p}
+    for phrase in phrase_set:
+        if not phrase:
+            continue
+        # Special-case: tolerate common STT variants for "ok/okay stop"
+        # (e.g. "okay stop", "okey stop", "oh stop").
+        # Keep it conservative:
+        # - require "stop" at the end
+        # - require an ok-like token right before it (or one token earlier with "please")
+        phrase_toks = phrase.split()
+        toks = normalized.split()
+
+        if phrase_toks == ["ok", "stop"] or phrase_toks == ["okay", "stop"]:
+            if len(toks) in (2, 3) and toks[-1] == "stop":
+                candidates = [toks[-2]]
+                if len(toks) == 3:
+                    candidates.append(toks[-3])
+                for t in candidates:
+                    if _levenshtein_leq(t, "ok", max_dist=1) or _levenshtein_leq(t, "okay", max_dist=1):
+                        return True
+
+        # Default rule:
+        # - exact (stop)
+        # - prefix (stop please)
+        # - suffix (please stop)
+        # This avoids false positives like "don't stop now" when "stop" is a phrase.
+        if normalized == phrase:
+            return True
+        if normalized.startswith(phrase + " "):
+            return True
+        if normalized.endswith(" " + phrase):
+            return True
+    return False
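A quick behavioral check of the matcher, following the rules above:

    from abstractvoice.stop_phrase import is_stop_phrase, normalize_stop_phrase

    phrases = ["stop", "ok stop", "okay stop"]       # the recognizer's defaults
    print(normalize_stop_phrase("Okay, STOP!"))      # "okay stop"
    print(is_stop_phrase("stop.", phrases))          # True  (exact after normalization)
    print(is_stop_phrase("stop please", phrases))    # True  (prefix rule)
    print(is_stop_phrase("please stop", phrases))    # True  (suffix rule)
    print(is_stop_phrase("okey stop", ["ok stop"]))  # True  (Levenshtein-tolerant ok-variant)
    print(is_stop_phrase("don't stop now", phrases)) # False (no mid-sentence match)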