abstractvoice-0.5.1-py3-none-any.whl → abstractvoice-0.6.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (51)
  1. abstractvoice/__init__.py +2 -5
  2. abstractvoice/__main__.py +82 -3
  3. abstractvoice/adapters/__init__.py +12 -0
  4. abstractvoice/adapters/base.py +207 -0
  5. abstractvoice/adapters/stt_faster_whisper.py +401 -0
  6. abstractvoice/adapters/tts_piper.py +480 -0
  7. abstractvoice/aec/__init__.py +10 -0
  8. abstractvoice/aec/webrtc_apm.py +56 -0
  9. abstractvoice/artifacts.py +173 -0
  10. abstractvoice/audio/__init__.py +7 -0
  11. abstractvoice/audio/recorder.py +46 -0
  12. abstractvoice/audio/resample.py +25 -0
  13. abstractvoice/cloning/__init__.py +7 -0
  14. abstractvoice/cloning/engine_chroma.py +738 -0
  15. abstractvoice/cloning/engine_f5.py +546 -0
  16. abstractvoice/cloning/manager.py +349 -0
  17. abstractvoice/cloning/store.py +362 -0
  18. abstractvoice/compute/__init__.py +6 -0
  19. abstractvoice/compute/device.py +73 -0
  20. abstractvoice/config/__init__.py +2 -0
  21. abstractvoice/config/voice_catalog.py +19 -0
  22. abstractvoice/dependency_check.py +0 -1
  23. abstractvoice/examples/cli_repl.py +2403 -243
  24. abstractvoice/examples/voice_cli.py +64 -63
  25. abstractvoice/integrations/__init__.py +2 -0
  26. abstractvoice/integrations/abstractcore.py +116 -0
  27. abstractvoice/integrations/abstractcore_plugin.py +253 -0
  28. abstractvoice/prefetch.py +82 -0
  29. abstractvoice/recognition.py +424 -42
  30. abstractvoice/stop_phrase.py +103 -0
  31. abstractvoice/tts/__init__.py +3 -3
  32. abstractvoice/tts/adapter_tts_engine.py +210 -0
  33. abstractvoice/tts/tts_engine.py +257 -1208
  34. abstractvoice/vm/__init__.py +2 -0
  35. abstractvoice/vm/common.py +21 -0
  36. abstractvoice/vm/core.py +139 -0
  37. abstractvoice/vm/manager.py +108 -0
  38. abstractvoice/vm/stt_mixin.py +158 -0
  39. abstractvoice/vm/tts_mixin.py +550 -0
  40. abstractvoice/voice_manager.py +6 -1061
  41. abstractvoice-0.6.1.dist-info/METADATA +213 -0
  42. abstractvoice-0.6.1.dist-info/RECORD +52 -0
  43. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
  44. abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
  45. abstractvoice/instant_setup.py +0 -83
  46. abstractvoice/simple_model_manager.py +0 -539
  47. abstractvoice-0.5.1.dist-info/METADATA +0 -1458
  48. abstractvoice-0.5.1.dist-info/RECORD +0 -23
  49. abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
  50. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
  51. {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
abstractvoice/vm/__init__.py
@@ -0,0 +1,2 @@
+"""Internal modules used to keep `VoiceManager` small and focused."""
+
abstractvoice/vm/common.py
@@ -0,0 +1,21 @@
+"""Common helpers for VoiceManager parts.
+
+This module exists to avoid circular imports while keeping `voice_manager.py`
+small and focused on the public façade.
+"""
+
+from __future__ import annotations
+
+def import_voice_recognizer():
+    """Import VoiceRecognizer with a helpful error if dependencies are missing."""
+    try:
+        from ..recognition import VoiceRecognizer
+        return VoiceRecognizer
+    except ImportError as e:
+        raise ImportError(
+            "Microphone capture/listen() requires optional dependencies to be installed correctly.\n"
+            "Try:\n"
+            " pip install --upgrade abstractvoice\n"
+            f"Original error: {e}"
+        ) from e
+
abstractvoice/vm/core.py
@@ -0,0 +1,139 @@
+"""VoiceManager core (init + lifecycle callbacks + cleanup)."""
+
+from __future__ import annotations
+
+
+class VoiceManagerCore:
+    """Core orchestration (shared state and callbacks)."""
+
+    def _wire_tts_callbacks(self) -> None:
+        if self.tts_engine is None:
+            return
+
+        # TTS lifecycle used to coordinate listening modes.
+        self.tts_engine.on_playback_start = self._on_tts_start
+        self.tts_engine.on_playback_end = self._on_tts_end
+
+        # Audio lifecycle callbacks (actual playback).
+        if hasattr(self.tts_engine, "audio_player") and self.tts_engine.audio_player:
+            self.tts_engine.audio_player.on_audio_start = self._on_audio_start
+            self.tts_engine.audio_player.on_audio_end = self._on_audio_end
+            self.tts_engine.audio_player.on_audio_pause = self._on_audio_pause
+            self.tts_engine.audio_player.on_audio_resume = self._on_audio_resume
+            # Optional: feed far-end playback audio to the listener for AEC.
+            try:
+                self.tts_engine.audio_player.on_audio_chunk = self._on_audio_chunk
+            except Exception:
+                pass
+
+    def _on_audio_chunk(self, audio_chunk, sample_rate: int):
+        """Called with chunks actually written to speaker output.
+
+        This is used only for advanced features like AEC-based barge-in.
+        """
+        if not self.voice_recognizer:
+            return
+        if hasattr(self.voice_recognizer, "feed_far_end_audio"):
+            try:
+                self.voice_recognizer.feed_far_end_audio(audio_chunk, sample_rate=sample_rate)
+            except Exception:
+                pass
+
+    def _on_tts_start(self):
+        """Called when TTS playback starts - handle based on voice mode."""
+        if not self.voice_recognizer:
+            return
+
+        if self._voice_mode == "full":
+            # Full mode is intended for headsets (minimal echo) OR AEC-enabled setups.
+            #
+            # - Always allow speech-triggered TTS interruption (barge-in).
+            # - Keep transcriptions enabled (headset assumption). If you're on speakers,
+            #   prefer STOP/WAIT modes or enable AEC.
+            return
+
+        if self._voice_mode == "wait":
+            # WAIT: fully pause mic processing while we speak (max robustness).
+            # Trade-off: user can't barge-in by voice while TTS is playing.
+            if hasattr(self.voice_recognizer, "pause_listening"):
+                self.voice_recognizer.pause_listening()
+            return
+
+        if self._voice_mode == "stop":
+            # STOP: keep listening, but suppress normal transcriptions while speaking
+            # to avoid self-feedback loops; stop phrase remains available.
+            self.voice_recognizer.pause_tts_interrupt()
+            if hasattr(self.voice_recognizer, "pause_transcriptions"):
+                self.voice_recognizer.pause_transcriptions()
+            return
+
+        if self._voice_mode == "ptt":
+            # PTT: listening should be controlled explicitly by the integrator/REPL.
+            # If we happen to be listening, treat speaking like STOP mode.
+            self.voice_recognizer.pause_tts_interrupt()
+            if hasattr(self.voice_recognizer, "pause_transcriptions"):
+                self.voice_recognizer.pause_transcriptions()
+            return
+
+    def _on_tts_end(self):
+        """Called when TTS playback ends - handle based on voice mode."""
+        if not self.voice_recognizer:
+            return
+
+        if self._voice_mode == "full":
+            self.voice_recognizer.resume_tts_interrupt()
+            return
+
+        if self._voice_mode == "wait":
+            if hasattr(self.voice_recognizer, "resume_listening"):
+                self.voice_recognizer.resume_listening()
+            return
+
+        if self._voice_mode in ("stop", "ptt"):
+            self.voice_recognizer.resume_tts_interrupt()
+            if hasattr(self.voice_recognizer, "resume_transcriptions"):
+                self.voice_recognizer.resume_transcriptions()
+            return
+
+    def _on_audio_start(self):
+        if self.on_audio_start:
+            self.on_audio_start()
+
+    def _on_audio_end(self):
+        if self.on_audio_end:
+            self.on_audio_end()
+
+    def _on_audio_pause(self):
+        if self.on_audio_pause:
+            self.on_audio_pause()
+
+    def _on_audio_resume(self):
+        if self.on_audio_resume:
+            self.on_audio_resume()
+
+    def cleanup(self):
+        """Clean up resources."""
+        if self.voice_recognizer:
+            self.voice_recognizer.stop()
+
+        self.stop_speaking()
+
+        # Best-effort: fully release audio resources.
+        try:
+            if self.tts_engine is not None:
+                if hasattr(self.tts_engine, "cleanup"):
+                    self.tts_engine.cleanup()
+                elif hasattr(self.tts_engine, "audio_player") and self.tts_engine.audio_player:
+                    self.tts_engine.audio_player.cleanup()
+        except Exception:
+            pass
+
+        # Best-effort: release any loaded cloning engine weights (GPU-heavy).
+        try:
+            unload = getattr(self, "unload_cloning_engines", None)
+            if callable(unload):
+                unload()
+        except Exception:
+            pass
+
+        return True
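The `_on_audio_*` callbacks above only forward to public hooks, and the `_on_tts_*` callbacks switch the recognizer's behaviour according to `_voice_mode` ("full", "wait", "stop", "ptt"). A minimal integrator-side sketch of wiring those public hooks, assuming a constructed `vm` instance of the `VoiceManager` façade defined in `abstractvoice/vm/manager.py` below (the print handlers are illustrative only):

    # Hypothetical handlers attached to the public audio lifecycle hooks.
    vm.on_audio_start = lambda: print("playback started")
    vm.on_audio_end = lambda: print("playback finished")
    vm.on_audio_pause = lambda: print("playback paused")
    vm.on_audio_resume = lambda: print("playback resumed")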
abstractvoice/vm/manager.py
@@ -0,0 +1,108 @@
+"""Small public façade for VoiceManager.
+
+The heavy implementation is split across focused mixins to keep files small
+and responsibilities clear.
+"""
+
+from __future__ import annotations
+
+import threading
+from typing import Optional
+
+from ..config.voice_catalog import LANGUAGES, SAFE_FALLBACK
+from ..tts.adapter_tts_engine import AdapterTTSEngine
+
+from .core import VoiceManagerCore
+from .stt_mixin import SttMixin
+from .tts_mixin import TtsMixin
+
+
+class VoiceManager(VoiceManagerCore, TtsMixin, SttMixin):
+    """Main class for voice interaction capabilities."""
+
+    LANGUAGES = LANGUAGES
+    SAFE_FALLBACK = SAFE_FALLBACK
+
+    def __init__(
+        self,
+        language: str = "en",
+        tts_model: Optional[str] = None,
+        # Default STT model: "base" is a better out-of-box quality baseline than "tiny",
+        # especially for short commands and non-ideal microphone conditions.
+        whisper_model: str = "base",
+        debug_mode: bool = False,
+        tts_engine: str = "auto",
+        stt_engine: str = "auto",
+        allow_downloads: bool = True,
+        cloned_tts_streaming: bool = True,
+        cloning_engine: str = "f5_tts",
+    ):
+        self.debug_mode = debug_mode
+        self.speed = 1.0
+        # Controls whether the library may download model weights implicitly.
+        # The REPL sets this to False to enforce "no surprise downloads".
+        self.allow_downloads = bool(allow_downloads)
+        # Cloned TTS can either stream batches (lower time-to-first-audio, but may
+        # introduce gaps if generation can't stay ahead) or generate full audio first.
+        self.cloned_tts_streaming = bool(cloned_tts_streaming)
+        self.cloning_engine = str(cloning_engine or "f5_tts").strip().lower()
+
+        language = (language or "en").lower()
+        if language not in self.LANGUAGES:
+            if debug_mode:
+                available = ", ".join(self.LANGUAGES.keys())
+                print(f"⚠️ Unsupported language '{language}', using English. Available: {available}")
+            language = "en"
+        self.language = language
+
+        self._tts_engine_preference = tts_engine
+        self._stt_engine_preference = stt_engine
+
+        # TTS selection
+        self.tts_adapter = None
+        self._tts_engine_name = None
+        self.tts_engine = None
+
+        if tts_engine not in ("auto", "piper"):
+            raise ValueError("Only Piper TTS is supported in AbstractVoice core. Use tts_engine='piper'.")
+
+        if tts_engine in ("auto", "piper"):
+            self.tts_adapter = self._try_init_piper(language)
+            # Create the playback engine as long as Piper runtime is importable.
+            # This keeps audio output available for cloning backends even when no
+            # Piper voice model is cached locally (offline-first).
+            if self.tts_adapter:
+                self.tts_engine = AdapterTTSEngine(self.tts_adapter, debug_mode=debug_mode)
+                self._tts_engine_name = "piper"
+
+        # Audio lifecycle callbacks (public hooks)
+        self.on_audio_start = None
+        self.on_audio_end = None
+        self.on_audio_pause = None
+        self.on_audio_resume = None
+
+        self._wire_tts_callbacks()
+
+        # STT / listening
+        self.voice_recognizer = None
+        self.whisper_model = whisper_model
+        self.stt_adapter = None
+        self._voice_cloner = None
+        self._aec_enabled = False
+        self._aec_stream_delay_ms = 0
+
+        # Cloned-speech cancellation token (best-effort).
+        self._cloned_cancel_event = threading.Event()
+        # Tracks whether cloned TTS synthesis is currently running (separate from playback).
+        self._cloned_synthesis_active = threading.Event()
+
+        # Best-effort last TTS metrics (used by verbose REPL output).
+        self._last_tts_metrics = None
+        self._last_tts_metrics_lock = threading.Lock()
+
+        # State tracking
+        self._transcription_callback = None
+        self._stop_callback = None
+        # Default to "wait" for robustness without echo cancellation.
+        # "full" is intended for headset / echo-controlled environments.
+        self._voice_mode = "wait"
abstractvoice/vm/stt_mixin.py
@@ -0,0 +1,158 @@
+"""STT + listening methods for VoiceManager."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from .common import import_voice_recognizer
+
+
+class SttMixin:
+    def transcribe_from_bytes(self, audio_bytes: bytes, language: Optional[str] = None) -> str:
+        import tempfile
+        import os
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            tmp_file.write(audio_bytes)
+            tmp_path = tmp_file.name
+
+        try:
+            return self.transcribe_file(tmp_path, language=language)
+        finally:
+            try:
+                os.unlink(tmp_path)
+            except Exception:
+                pass
+
+    def transcribe_file(self, audio_path: str, language: Optional[str] = None) -> str:
+        stt = self._get_stt_adapter()
+        if stt is not None:
+            return stt.transcribe(audio_path, language=language)
+
+        # Optional fallback to legacy Transcriber if present.
+        from ..stt import Transcriber
+
+        transcriber = Transcriber(model_name=self.whisper_model, debug_mode=self.debug_mode)
+        result = transcriber.transcribe(audio_path)
+        return result["text"] if result and "text" in result else ""
+
+    def _get_stt_adapter(self):
+        if self.stt_adapter is not None:
+            return self.stt_adapter if self.stt_adapter.is_available() else None
+
+        if self._stt_engine_preference not in ("auto", "faster_whisper"):
+            return None
+
+        try:
+            from ..adapters.stt_faster_whisper import FasterWhisperAdapter
+
+            self.stt_adapter = FasterWhisperAdapter(
+                model_size=self.whisper_model,
+                device="cpu",
+                compute_type="int8",
+                allow_downloads=bool(getattr(self, "allow_downloads", True)),
+            )
+            if self.stt_adapter.is_available():
+                return self.stt_adapter
+            return None
+        except Exception as e:
+            if self.debug_mode:
+                print(f"⚠️ Faster-Whisper STT not available: {e}")
+            self.stt_adapter = None
+            return None
+
+    def set_whisper(self, model_name):
+        self.whisper_model = model_name
+        if self.voice_recognizer:
+            return self.voice_recognizer.change_whisper_model(model_name)
+
+    def get_whisper(self):
+        return self.whisper_model
+
+    def listen(self, on_transcription, on_stop=None):
+        self._transcription_callback = on_transcription
+        self._stop_callback = on_stop
+
+        if not self.voice_recognizer:
+            def _transcription_handler(text):
+                if self._transcription_callback:
+                    self._transcription_callback(text)
+
+            def _stop_handler():
+                # Stop phrase semantics (ADR 0002 Phase 1):
+                # - Always stop TTS playback immediately.
+                # - Do NOT forcibly stop listening unless the integrator wants that
+                #   (they can call stop_listening() inside on_stop).
+                self.stop_speaking()
+                if self._stop_callback:
+                    self._stop_callback()
+
+            VoiceRecognizer = import_voice_recognizer()
+            self.voice_recognizer = VoiceRecognizer(
+                transcription_callback=_transcription_handler,
+                stop_callback=_stop_handler,
+                whisper_model=self.whisper_model,
+                debug_mode=self.debug_mode,
+                aec_enabled=bool(getattr(self, "_aec_enabled", False)),
+                aec_stream_delay_ms=int(getattr(self, "_aec_stream_delay_ms", 0)),
+                language=getattr(self, "language", None),
+                allow_downloads=bool(getattr(self, "allow_downloads", True)),
+            )
+            try:
+                if hasattr(self.voice_recognizer, "set_profile"):
+                    self.voice_recognizer.set_profile(getattr(self, "_voice_mode", "stop"))
+            except Exception:
+                pass
+
+        return self.voice_recognizer.start(tts_interrupt_callback=self.stop_speaking)
+
+    def enable_aec(self, enabled: bool = True, *, stream_delay_ms: int = 0) -> bool:
+        """Enable optional AEC-based barge-in support.
+
+        Notes:
+        - This is opt-in and requires: pip install "abstractvoice[aec]"
+        - Intended for `voice_mode="full"` where we want true barge-in.
+        """
+        self._aec_enabled = bool(enabled)
+        self._aec_stream_delay_ms = int(stream_delay_ms)
+        if self.voice_recognizer and hasattr(self.voice_recognizer, "enable_aec"):
+            return bool(self.voice_recognizer.enable_aec(bool(enabled), stream_delay_ms=int(stream_delay_ms)))
+        return True
+
+    def stop_listening(self):
+        if self.voice_recognizer:
+            return self.voice_recognizer.stop()
+        return False
+
+    def pause_listening(self) -> bool:
+        if self.voice_recognizer:
+            self.voice_recognizer.pause_listening()
+            return True
+        return False
+
+    def resume_listening(self) -> bool:
+        if self.voice_recognizer:
+            self.voice_recognizer.resume_listening()
+            return True
+        return False
+
+    def is_listening(self):
+        return self.voice_recognizer and self.voice_recognizer.is_running
+
+    def set_voice_mode(self, mode):
+        if mode in ["full", "wait", "stop", "ptt"]:
+            self._voice_mode = mode
+            # Keep recognizer thresholds aligned with interaction mode.
+            try:
+                if self.voice_recognizer and hasattr(self.voice_recognizer, "set_profile"):
+                    self.voice_recognizer.set_profile(mode)
+            except Exception:
+                pass
+            return True
+        return False
+
+    def change_vad_aggressiveness(self, aggressiveness):
+        if self.voice_recognizer:
+            return self.voice_recognizer.change_vad_aggressiveness(aggressiveness)
+        return False
+
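A usage sketch for the listening and transcription API above, continuing the hypothetical `vm` instance from the earlier example; the callback name and file path are placeholders:

    def on_text(text):
        print("Transcribed:", text)

    vm.set_voice_mode("wait")            # robust default without echo cancellation
    # Optional, for voice_mode="full" barge-in: requires pip install "abstractvoice[aec]"
    # vm.enable_aec(True)
    vm.listen(on_transcription=on_text)  # creates the VoiceRecognizer lazily on first call
    # ... later ...
    vm.stop_listening()

    # One-shot file transcription goes through the Faster-Whisper adapter when available.
    text = vm.transcribe_file("clip.wav", language="en")
    vm.cleanup()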