abstractvoice-0.5.1-py3-none-any.whl → abstractvoice-0.6.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (51)
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/vm/__init__.py
@@ -0,0 +1,2 @@
+"""Internal modules used to keep `VoiceManager` small and focused."""
+
abstractvoice/vm/common.py
@@ -0,0 +1,21 @@
+"""Common helpers for VoiceManager parts.
+
+This module exists to avoid circular imports while keeping `voice_manager.py`
+small and focused on the public façade.
+"""
+
+from __future__ import annotations
+
+def import_voice_recognizer():
+    """Import VoiceRecognizer with a helpful error if dependencies are missing."""
+    try:
+        from ..recognition import VoiceRecognizer
+        return VoiceRecognizer
+    except ImportError as e:
+        raise ImportError(
+            "Microphone capture/listen() requires optional dependencies to be installed correctly.\n"
+            "Try:\n"
+            " pip install --upgrade abstractvoice\n"
+            f"Original error: {e}"
+        ) from e
+
abstractvoice/vm/core.py
@@ -0,0 +1,139 @@
+"""VoiceManager core (init + lifecycle callbacks + cleanup)."""
+
+from __future__ import annotations
+
+
+class VoiceManagerCore:
+    """Core orchestration (shared state and callbacks)."""
+
+    def _wire_tts_callbacks(self) -> None:
+        if self.tts_engine is None:
+            return
+
+        # TTS lifecycle used to coordinate listening modes.
+        self.tts_engine.on_playback_start = self._on_tts_start
+        self.tts_engine.on_playback_end = self._on_tts_end
+
+        # Audio lifecycle callbacks (actual playback).
+        if hasattr(self.tts_engine, "audio_player") and self.tts_engine.audio_player:
+            self.tts_engine.audio_player.on_audio_start = self._on_audio_start
+            self.tts_engine.audio_player.on_audio_end = self._on_audio_end
+            self.tts_engine.audio_player.on_audio_pause = self._on_audio_pause
+            self.tts_engine.audio_player.on_audio_resume = self._on_audio_resume
+            # Optional: feed far-end playback audio to the listener for AEC.
+            try:
+                self.tts_engine.audio_player.on_audio_chunk = self._on_audio_chunk
+            except Exception:
+                pass
+
+    def _on_audio_chunk(self, audio_chunk, sample_rate: int):
+        """Called with chunks actually written to speaker output.
+
+        This is used only for advanced features like AEC-based barge-in.
+        """
+        if not self.voice_recognizer:
+            return
+        if hasattr(self.voice_recognizer, "feed_far_end_audio"):
+            try:
+                self.voice_recognizer.feed_far_end_audio(audio_chunk, sample_rate=sample_rate)
+            except Exception:
+                pass
+
+    def _on_tts_start(self):
+        """Called when TTS playback starts - handle based on voice mode."""
+        if not self.voice_recognizer:
+            return
+
+        if self._voice_mode == "full":
+            # Full mode is intended for headsets (minimal echo) OR AEC-enabled setups.
+            #
+            # - Always allow speech-triggered TTS interruption (barge-in).
+            # - Keep transcriptions enabled (headset assumption). If you're on speakers,
+            #   prefer STOP/WAIT modes or enable AEC.
+            return
+
+        if self._voice_mode == "wait":
+            # WAIT: fully pause mic processing while we speak (max robustness).
+            # Trade-off: user can't barge-in by voice while TTS is playing.
+            if hasattr(self.voice_recognizer, "pause_listening"):
+                self.voice_recognizer.pause_listening()
+            return
+
+        if self._voice_mode == "stop":
+            # STOP: keep listening, but suppress normal transcriptions while speaking
+            # to avoid self-feedback loops; stop phrase remains available.
+            self.voice_recognizer.pause_tts_interrupt()
+            if hasattr(self.voice_recognizer, "pause_transcriptions"):
+                self.voice_recognizer.pause_transcriptions()
+            return
+
+        if self._voice_mode == "ptt":
+            # PTT: listening should be controlled explicitly by the integrator/REPL.
+            # If we happen to be listening, treat speaking like STOP mode.
+            self.voice_recognizer.pause_tts_interrupt()
+            if hasattr(self.voice_recognizer, "pause_transcriptions"):
+                self.voice_recognizer.pause_transcriptions()
+            return
+
+    def _on_tts_end(self):
+        """Called when TTS playback ends - handle based on voice mode."""
+        if not self.voice_recognizer:
+            return
+
+        if self._voice_mode == "full":
+            self.voice_recognizer.resume_tts_interrupt()
+            return
+
+        if self._voice_mode == "wait":
+            if hasattr(self.voice_recognizer, "resume_listening"):
+                self.voice_recognizer.resume_listening()
+            return
+
+        if self._voice_mode in ("stop", "ptt"):
+            self.voice_recognizer.resume_tts_interrupt()
+            if hasattr(self.voice_recognizer, "resume_transcriptions"):
+                self.voice_recognizer.resume_transcriptions()
+            return
+
+    def _on_audio_start(self):
+        if self.on_audio_start:
+            self.on_audio_start()
+
+    def _on_audio_end(self):
+        if self.on_audio_end:
+            self.on_audio_end()
+
+    def _on_audio_pause(self):
+        if self.on_audio_pause:
+            self.on_audio_pause()
+
+    def _on_audio_resume(self):
+        if self.on_audio_resume:
+            self.on_audio_resume()
+
+    def cleanup(self):
+        """Clean up resources."""
+        if self.voice_recognizer:
+            self.voice_recognizer.stop()
+
+        self.stop_speaking()
+
+        # Best-effort: fully release audio resources.
+        try:
+            if self.tts_engine is not None:
+                if hasattr(self.tts_engine, "cleanup"):
+                    self.tts_engine.cleanup()
+                elif hasattr(self.tts_engine, "audio_player") and self.tts_engine.audio_player:
+                    self.tts_engine.audio_player.cleanup()
+        except Exception:
+            pass
+
+        # Best-effort: release any loaded cloning engine weights (GPU-heavy).
+        try:
+            unload = getattr(self, "unload_cloning_engines", None)
+            if callable(unload):
+                unload()
+        except Exception:
+            pass
+
+        return True
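The `_on_audio_*` callbacks above only forward to public hooks, and the `_on_tts_*` callbacks switch the recognizer's behaviour according to `_voice_mode` ("full", "wait", "stop", "ptt"). A minimal integrator-side sketch of wiring those public hooks, assuming a constructed `vm` instance of the `VoiceManager` façade defined in `abstractvoice/vm/manager.py` below (the print handlers are illustrative only):

    # Hypothetical handlers attached to the public audio lifecycle hooks.
    vm.on_audio_start = lambda: print("playback started")
    vm.on_audio_end = lambda: print("playback finished")
    vm.on_audio_pause = lambda: print("playback paused")
    vm.on_audio_resume = lambda: print("playback resumed")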
abstractvoice/vm/manager.py
@@ -0,0 +1,108 @@
+"""Small public façade for VoiceManager.
+
+The heavy implementation is split across focused mixins to keep files small
+and responsibilities clear.
+"""
+
+from __future__ import annotations
+
+import threading
+from typing import Optional
+
+from ..config.voice_catalog import LANGUAGES, SAFE_FALLBACK
+from ..tts.adapter_tts_engine import AdapterTTSEngine
+
+from .core import VoiceManagerCore
+from .stt_mixin import SttMixin
+from .tts_mixin import TtsMixin
+
+
+class VoiceManager(VoiceManagerCore, TtsMixin, SttMixin):
+    """Main class for voice interaction capabilities."""
+
+    LANGUAGES = LANGUAGES
+    SAFE_FALLBACK = SAFE_FALLBACK
+
+    def __init__(
+        self,
+        language: str = "en",
+        tts_model: Optional[str] = None,
+        # Default STT model: "base" is a better out-of-box quality baseline than "tiny",
+        # especially for short commands and non-ideal microphone conditions.
+        whisper_model: str = "base",
+        debug_mode: bool = False,
+        tts_engine: str = "auto",
+        stt_engine: str = "auto",
+        allow_downloads: bool = True,
+        cloned_tts_streaming: bool = True,
+        cloning_engine: str = "f5_tts",
+    ):
+        self.debug_mode = debug_mode
+        self.speed = 1.0
+        # Controls whether the library may download model weights implicitly.
+        # The REPL sets this to False to enforce "no surprise downloads".
+        self.allow_downloads = bool(allow_downloads)
+        # Cloned TTS can either stream batches (lower time-to-first-audio, but may
+        # introduce gaps if generation can't stay ahead) or generate full audio first.
+        self.cloned_tts_streaming = bool(cloned_tts_streaming)
+        self.cloning_engine = str(cloning_engine or "f5_tts").strip().lower()
+
+        language = (language or "en").lower()
+        if language not in self.LANGUAGES:
+            if debug_mode:
+                available = ", ".join(self.LANGUAGES.keys())
+                print(f"⚠️ Unsupported language '{language}', using English. Available: {available}")
+            language = "en"
+        self.language = language
+
+        self._tts_engine_preference = tts_engine
+        self._stt_engine_preference = stt_engine
+
+        # TTS selection
+        self.tts_adapter = None
+        self._tts_engine_name = None
+        self.tts_engine = None
+
+        if tts_engine not in ("auto", "piper"):
+            raise ValueError("Only Piper TTS is supported in AbstractVoice core. Use tts_engine='piper'.")
+
+        if tts_engine in ("auto", "piper"):
+            self.tts_adapter = self._try_init_piper(language)
+            # Create the playback engine as long as Piper runtime is importable.
+            # This keeps audio output available for cloning backends even when no
+            # Piper voice model is cached locally (offline-first).
+            if self.tts_adapter:
+                self.tts_engine = AdapterTTSEngine(self.tts_adapter, debug_mode=debug_mode)
+                self._tts_engine_name = "piper"
+
+        # Audio lifecycle callbacks (public hooks)
+        self.on_audio_start = None
+        self.on_audio_end = None
+        self.on_audio_pause = None
+        self.on_audio_resume = None
+
+        self._wire_tts_callbacks()
+
+        # STT / listening
+        self.voice_recognizer = None
+        self.whisper_model = whisper_model
+        self.stt_adapter = None
+        self._voice_cloner = None
+        self._aec_enabled = False
+        self._aec_stream_delay_ms = 0
+
+        # Cloned-speech cancellation token (best-effort).
+        self._cloned_cancel_event = threading.Event()
+        # Tracks whether cloned TTS synthesis is currently running (separate from playback).
+        self._cloned_synthesis_active = threading.Event()
+
+        # Best-effort last TTS metrics (used by verbose REPL output).
+        self._last_tts_metrics = None
+        self._last_tts_metrics_lock = threading.Lock()
+
+        # State tracking
+        self._transcription_callback = None
+        self._stop_callback = None
+        # Default to "wait" for robustness without echo cancellation.
+        # "full" is intended for headset / echo-controlled environments.
+        self._voice_mode = "wait"
abstractvoice/vm/stt_mixin.py
@@ -0,0 +1,158 @@
+"""STT + listening methods for VoiceManager."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from .common import import_voice_recognizer
+
+
+class SttMixin:
+    def transcribe_from_bytes(self, audio_bytes: bytes, language: Optional[str] = None) -> str:
+        import tempfile
+        import os
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            tmp_file.write(audio_bytes)
+            tmp_path = tmp_file.name
+
+        try:
+            return self.transcribe_file(tmp_path, language=language)
+        finally:
+            try:
+                os.unlink(tmp_path)
+            except Exception:
+                pass
+
+    def transcribe_file(self, audio_path: str, language: Optional[str] = None) -> str:
+        stt = self._get_stt_adapter()
+        if stt is not None:
+            return stt.transcribe(audio_path, language=language)
+
+        # Optional fallback to legacy Transcriber if present.
+        from ..stt import Transcriber
+
+        transcriber = Transcriber(model_name=self.whisper_model, debug_mode=self.debug_mode)
+        result = transcriber.transcribe(audio_path)
+        return result["text"] if result and "text" in result else ""
+
+    def _get_stt_adapter(self):
+        if self.stt_adapter is not None:
+            return self.stt_adapter if self.stt_adapter.is_available() else None
+
+        if self._stt_engine_preference not in ("auto", "faster_whisper"):
+            return None
+
+        try:
+            from ..adapters.stt_faster_whisper import FasterWhisperAdapter
+
+            self.stt_adapter = FasterWhisperAdapter(
+                model_size=self.whisper_model,
+                device="cpu",
+                compute_type="int8",
+                allow_downloads=bool(getattr(self, "allow_downloads", True)),
+            )
+            if self.stt_adapter.is_available():
+                return self.stt_adapter
+            return None
+        except Exception as e:
+            if self.debug_mode:
+                print(f"⚠️ Faster-Whisper STT not available: {e}")
+            self.stt_adapter = None
+            return None
+
+    def set_whisper(self, model_name):
+        self.whisper_model = model_name
+        if self.voice_recognizer:
+            return self.voice_recognizer.change_whisper_model(model_name)
+
+    def get_whisper(self):
+        return self.whisper_model
+
+    def listen(self, on_transcription, on_stop=None):
+        self._transcription_callback = on_transcription
+        self._stop_callback = on_stop
+
+        if not self.voice_recognizer:
+            def _transcription_handler(text):
+                if self._transcription_callback:
+                    self._transcription_callback(text)
+
+            def _stop_handler():
+                # Stop phrase semantics (ADR 0002 Phase 1):
+                # - Always stop TTS playback immediately.
+                # - Do NOT forcibly stop listening unless the integrator wants that
+                #   (they can call stop_listening() inside on_stop).
+                self.stop_speaking()
+                if self._stop_callback:
+                    self._stop_callback()
+
+            VoiceRecognizer = import_voice_recognizer()
+            self.voice_recognizer = VoiceRecognizer(
+                transcription_callback=_transcription_handler,
+                stop_callback=_stop_handler,
+                whisper_model=self.whisper_model,
+                debug_mode=self.debug_mode,
+                aec_enabled=bool(getattr(self, "_aec_enabled", False)),
+                aec_stream_delay_ms=int(getattr(self, "_aec_stream_delay_ms", 0)),
+                language=getattr(self, "language", None),
+                allow_downloads=bool(getattr(self, "allow_downloads", True)),
+            )
+            try:
+                if hasattr(self.voice_recognizer, "set_profile"):
+                    self.voice_recognizer.set_profile(getattr(self, "_voice_mode", "stop"))
+            except Exception:
+                pass
+
+        return self.voice_recognizer.start(tts_interrupt_callback=self.stop_speaking)
+
+    def enable_aec(self, enabled: bool = True, *, stream_delay_ms: int = 0) -> bool:
+        """Enable optional AEC-based barge-in support.
+
+        Notes:
+        - This is opt-in and requires: pip install "abstractvoice[aec]"
+        - Intended for `voice_mode="full"` where we want true barge-in.
+        """
+        self._aec_enabled = bool(enabled)
+        self._aec_stream_delay_ms = int(stream_delay_ms)
+        if self.voice_recognizer and hasattr(self.voice_recognizer, "enable_aec"):
+            return bool(self.voice_recognizer.enable_aec(bool(enabled), stream_delay_ms=int(stream_delay_ms)))
+        return True
+
+    def stop_listening(self):
+        if self.voice_recognizer:
+            return self.voice_recognizer.stop()
+        return False
+
+    def pause_listening(self) -> bool:
+        if self.voice_recognizer:
+            self.voice_recognizer.pause_listening()
+            return True
+        return False
+
+    def resume_listening(self) -> bool:
+        if self.voice_recognizer:
+            self.voice_recognizer.resume_listening()
+            return True
+        return False
+
+    def is_listening(self):
+        return self.voice_recognizer and self.voice_recognizer.is_running
+
+    def set_voice_mode(self, mode):
+        if mode in ["full", "wait", "stop", "ptt"]:
+            self._voice_mode = mode
+            # Keep recognizer thresholds aligned with interaction mode.
+            try:
+                if self.voice_recognizer and hasattr(self.voice_recognizer, "set_profile"):
+                    self.voice_recognizer.set_profile(mode)
+            except Exception:
+                pass
+            return True
+        return False
+
+    def change_vad_aggressiveness(self, aggressiveness):
+        if self.voice_recognizer:
+            return self.voice_recognizer.change_vad_aggressiveness(aggressiveness)
+        return False
+
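A usage sketch for the listening and transcription API above, continuing the hypothetical `vm` instance from the earlier example; the callback name and file path are placeholders:

    def on_text(text):
        print("Transcribed:", text)

    vm.set_voice_mode("wait")            # robust default without echo cancellation
    # Optional, for voice_mode="full" barge-in: requires pip install "abstractvoice[aec]"
    # vm.enable_aec(True)
    vm.listen(on_transcription=on_text)  # creates the VoiceRecognizer lazily on first call
    # ... later ...
    vm.stop_listening()

    # One-shot file transcription goes through the Faster-Whisper adapter when available.
    text = vm.transcribe_file("clip.wav", language="en")
    vm.cleanup()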