abstractvoice 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractvoice/__init__.py +2 -5
- abstractvoice/__main__.py +82 -3
- abstractvoice/adapters/__init__.py +12 -0
- abstractvoice/adapters/base.py +207 -0
- abstractvoice/adapters/stt_faster_whisper.py +401 -0
- abstractvoice/adapters/tts_piper.py +480 -0
- abstractvoice/aec/__init__.py +10 -0
- abstractvoice/aec/webrtc_apm.py +56 -0
- abstractvoice/artifacts.py +173 -0
- abstractvoice/audio/__init__.py +7 -0
- abstractvoice/audio/recorder.py +46 -0
- abstractvoice/audio/resample.py +25 -0
- abstractvoice/cloning/__init__.py +7 -0
- abstractvoice/cloning/engine_chroma.py +738 -0
- abstractvoice/cloning/engine_f5.py +546 -0
- abstractvoice/cloning/manager.py +349 -0
- abstractvoice/cloning/store.py +362 -0
- abstractvoice/compute/__init__.py +6 -0
- abstractvoice/compute/device.py +73 -0
- abstractvoice/config/__init__.py +2 -0
- abstractvoice/config/voice_catalog.py +19 -0
- abstractvoice/dependency_check.py +0 -1
- abstractvoice/examples/cli_repl.py +2403 -243
- abstractvoice/examples/voice_cli.py +64 -63
- abstractvoice/integrations/__init__.py +2 -0
- abstractvoice/integrations/abstractcore.py +116 -0
- abstractvoice/integrations/abstractcore_plugin.py +253 -0
- abstractvoice/prefetch.py +82 -0
- abstractvoice/recognition.py +424 -42
- abstractvoice/stop_phrase.py +103 -0
- abstractvoice/tts/__init__.py +3 -3
- abstractvoice/tts/adapter_tts_engine.py +210 -0
- abstractvoice/tts/tts_engine.py +257 -1208
- abstractvoice/vm/__init__.py +2 -0
- abstractvoice/vm/common.py +21 -0
- abstractvoice/vm/core.py +139 -0
- abstractvoice/vm/manager.py +108 -0
- abstractvoice/vm/stt_mixin.py +158 -0
- abstractvoice/vm/tts_mixin.py +550 -0
- abstractvoice/voice_manager.py +6 -1061
- abstractvoice-0.6.1.dist-info/METADATA +213 -0
- abstractvoice-0.6.1.dist-info/RECORD +52 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/WHEEL +1 -1
- abstractvoice-0.6.1.dist-info/entry_points.txt +6 -0
- abstractvoice/instant_setup.py +0 -83
- abstractvoice/simple_model_manager.py +0 -539
- abstractvoice-0.5.1.dist-info/METADATA +0 -1458
- abstractvoice-0.5.1.dist-info/RECORD +0 -23
- abstractvoice-0.5.1.dist-info/entry_points.txt +0 -2
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {abstractvoice-0.5.1.dist-info → abstractvoice-0.6.1.dist-info}/top_level.txt +0 -0
|
@@ -1,1458 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: abstractvoice
|
|
3
|
-
Version: 0.5.1
|
|
4
|
-
Summary: A modular Python library for voice interactions with AI systems
|
|
5
|
-
Author-email: Laurent-Philippe Albou <contact@abstractcore.ai>
|
|
6
|
-
License-Expression: MIT
|
|
7
|
-
Project-URL: Repository, https://github.com/lpalbou/abstractvoice
|
|
8
|
-
Project-URL: Documentation, https://github.com/lpalbou/abstractvoice#readme
|
|
9
|
-
Classifier: Development Status :: 3 - Alpha
|
|
10
|
-
Classifier: Intended Audience :: Developers
|
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
-
Requires-Python: >=3.8
|
|
18
|
-
Description-Content-Type: text/markdown
|
|
19
|
-
License-File: LICENSE
|
|
20
|
-
Requires-Dist: numpy>=1.24.0
|
|
21
|
-
Requires-Dist: requests>=2.31.0
|
|
22
|
-
Requires-Dist: appdirs>=1.4.0
|
|
23
|
-
Requires-Dist: coqui-tts<0.30.0,>=0.27.0
|
|
24
|
-
Requires-Dist: torch<2.4.0,>=2.0.0
|
|
25
|
-
Requires-Dist: torchvision<0.19.0,>=0.15.0
|
|
26
|
-
Requires-Dist: torchaudio<2.4.0,>=2.0.0
|
|
27
|
-
Requires-Dist: librosa>=0.10.0
|
|
28
|
-
Requires-Dist: sounddevice>=0.4.6
|
|
29
|
-
Requires-Dist: soundfile>=0.12.1
|
|
30
|
-
Provides-Extra: voice
|
|
31
|
-
Requires-Dist: sounddevice>=0.4.6; extra == "voice"
|
|
32
|
-
Requires-Dist: webrtcvad>=2.0.10; extra == "voice"
|
|
33
|
-
Requires-Dist: PyAudio>=0.2.13; extra == "voice"
|
|
34
|
-
Requires-Dist: soundfile>=0.12.1; extra == "voice"
|
|
35
|
-
Provides-Extra: tts
|
|
36
|
-
Requires-Dist: coqui-tts<0.30.0,>=0.27.0; extra == "tts"
|
|
37
|
-
Requires-Dist: torch<2.4.0,>=2.0.0; extra == "tts"
|
|
38
|
-
Requires-Dist: torchvision<0.19.0,>=0.15.0; extra == "tts"
|
|
39
|
-
Requires-Dist: torchaudio<2.4.0,>=2.0.0; extra == "tts"
|
|
40
|
-
Requires-Dist: librosa>=0.10.0; extra == "tts"
|
|
41
|
-
Provides-Extra: stt
|
|
42
|
-
Requires-Dist: openai-whisper>=20230314; extra == "stt"
|
|
43
|
-
Requires-Dist: tiktoken>=0.6.0; extra == "stt"
|
|
44
|
-
Provides-Extra: web
|
|
45
|
-
Requires-Dist: flask>=2.0.0; extra == "web"
|
|
46
|
-
Provides-Extra: all
|
|
47
|
-
Requires-Dist: sounddevice>=0.4.6; extra == "all"
|
|
48
|
-
Requires-Dist: webrtcvad>=2.0.10; extra == "all"
|
|
49
|
-
Requires-Dist: PyAudio>=0.2.13; extra == "all"
|
|
50
|
-
Requires-Dist: openai-whisper>=20230314; extra == "all"
|
|
51
|
-
Requires-Dist: coqui-tts<0.30.0,>=0.27.0; extra == "all"
|
|
52
|
-
Requires-Dist: torch<2.4.0,>=2.0.0; extra == "all"
|
|
53
|
-
Requires-Dist: torchvision<0.19.0,>=0.15.0; extra == "all"
|
|
54
|
-
Requires-Dist: torchaudio<2.4.0,>=2.0.0; extra == "all"
|
|
55
|
-
Requires-Dist: librosa>=0.10.0; extra == "all"
|
|
56
|
-
Requires-Dist: soundfile>=0.12.1; extra == "all"
|
|
57
|
-
Requires-Dist: flask>=2.0.0; extra == "all"
|
|
58
|
-
Requires-Dist: tiktoken>=0.6.0; extra == "all"
|
|
59
|
-
Provides-Extra: dev
|
|
60
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
61
|
-
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
62
|
-
Requires-Dist: flake8>=5.0.0; extra == "dev"
|
|
63
|
-
Provides-Extra: voice-full
|
|
64
|
-
Requires-Dist: sounddevice>=0.4.6; extra == "voice-full"
|
|
65
|
-
Requires-Dist: webrtcvad>=2.0.10; extra == "voice-full"
|
|
66
|
-
Requires-Dist: PyAudio>=0.2.13; extra == "voice-full"
|
|
67
|
-
Requires-Dist: openai-whisper>=20230314; extra == "voice-full"
|
|
68
|
-
Requires-Dist: coqui-tts<0.30.0,>=0.27.0; extra == "voice-full"
|
|
69
|
-
Requires-Dist: torch<2.4.0,>=2.0.0; extra == "voice-full"
|
|
70
|
-
Requires-Dist: torchvision<0.19.0,>=0.15.0; extra == "voice-full"
|
|
71
|
-
Requires-Dist: torchaudio<2.4.0,>=2.0.0; extra == "voice-full"
|
|
72
|
-
Requires-Dist: librosa>=0.10.0; extra == "voice-full"
|
|
73
|
-
Requires-Dist: soundfile>=0.12.1; extra == "voice-full"
|
|
74
|
-
Requires-Dist: tiktoken>=0.6.0; extra == "voice-full"
|
|
75
|
-
Provides-Extra: core-tts
|
|
76
|
-
Requires-Dist: coqui-tts<0.30.0,>=0.27.0; extra == "core-tts"
|
|
77
|
-
Requires-Dist: torch<2.4.0,>=2.0.0; extra == "core-tts"
|
|
78
|
-
Requires-Dist: torchvision<0.19.0,>=0.15.0; extra == "core-tts"
|
|
79
|
-
Requires-Dist: torchaudio<2.4.0,>=2.0.0; extra == "core-tts"
|
|
80
|
-
Requires-Dist: librosa>=0.10.0; extra == "core-tts"
|
|
81
|
-
Provides-Extra: core-stt
|
|
82
|
-
Requires-Dist: openai-whisper>=20230314; extra == "core-stt"
|
|
83
|
-
Requires-Dist: tiktoken>=0.6.0; extra == "core-stt"
|
|
84
|
-
Provides-Extra: audio-only
|
|
85
|
-
Requires-Dist: sounddevice>=0.4.6; extra == "audio-only"
|
|
86
|
-
Requires-Dist: webrtcvad>=2.0.10; extra == "audio-only"
|
|
87
|
-
Requires-Dist: PyAudio>=0.2.13; extra == "audio-only"
|
|
88
|
-
Requires-Dist: soundfile>=0.12.1; extra == "audio-only"
|
|
89
|
-
Dynamic: license-file
|
|
90
|
-
|
|
91
|
-
# AbstractVoice
|
|
92
|
-
|
|
93
|
-
[](https://pypi.org/project/abstractvoice/)
|
|
94
|
-
[](https://pypi.org/project/abstractvoice/)
|
|
95
|
-
[](https://github.com/lpalbou/abstractvoice/blob/main/LICENSE)
|
|
96
|
-
[](https://github.com/lpalbou/abstractvoice/stargazers)
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
A modular Python library for voice interactions with AI systems, providing text-to-speech (TTS) and speech-to-text (STT) capabilities with interrupt handling.
|
|
100
|
-
|
|
101
|
-
While we provide CLI and WEB examples, AbstractVoice is designed to be integrated in other projects.
|
|
102
|
-
|
|
103
|
-
## Features
|
|
104
|
-
|
|
105
|
-
- **High-Quality TTS**: Best-in-class speech synthesis with VITS model
|
|
106
|
-
- Natural prosody and intonation
|
|
107
|
-
- Adjustable speed without pitch distortion (using librosa time-stretching)
|
|
108
|
-
- Multiple quality levels (VITS best, fast_pitch fallback)
|
|
109
|
-
- Automatic fallback if espeak-ng not installed
|
|
110
|
-
- **Cross-Platform**: Works on macOS, Linux, and Windows
|
|
111
|
-
- Best quality: Install espeak-ng (easy on all platforms)
|
|
112
|
-
- Fallback mode: Works without any system dependencies
|
|
113
|
-
- **Speech-to-Text**: Accurate voice recognition using OpenAI's Whisper
|
|
114
|
-
- **Voice Activity Detection**: Efficient speech detection using WebRTC VAD
|
|
115
|
-
- **Interrupt Handling**: Stop TTS by speaking or using stop commands
|
|
116
|
-
- **Modular Design**: Easily integrate with any text generation system
|
|
117
|
-
|
|
118
|
-
Note: *the LLM access is rudimentary, and AbstractVoice is provided more as an example and demonstrator. For a better integration, use the functionalities of this library directly in combination with [AbstractCore](https://github.com/lpalbou/AbstractCore)*.
|
|
119
|
-
|
|
120
|
-
## Installation
|
|
121
|
-
|
|
122
|
-
AbstractVoice is designed to **work everywhere, out of the box** with automatic quality upgrades.
|
|
123
|
-
|
|
124
|
-
### 🚀 Quick Start (Recommended)
|
|
125
|
-
|
|
126
|
-
```bash
|
|
127
|
-
# One command installation - works on all systems
|
|
128
|
-
pip install abstractvoice[all]
|
|
129
|
-
|
|
130
|
-
# Verify it works
|
|
131
|
-
python -c "from abstractvoice import VoiceManager; print('✅ Ready to go!')"
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
**That's it!** AbstractVoice automatically:
|
|
135
|
-
- ✅ **Works everywhere** - Uses reliable models that run on any system
|
|
136
|
-
- ✅ **Auto-upgrades quality** - Detects when better models are available
|
|
137
|
-
- ✅ **No system dependencies required** - Pure Python installation
|
|
138
|
-
- ✅ **Optional quality boost** - Install `espeak-ng` for premium voices
|
|
139
|
-
|
|
140
|
-
### Installation Options
|
|
141
|
-
|
|
142
|
-
```bash
|
|
143
|
-
# Minimal (just 2 dependencies)
|
|
144
|
-
pip install abstractvoice
|
|
145
|
-
|
|
146
|
-
# Add features as needed
|
|
147
|
-
pip install abstractvoice[tts] # Text-to-speech
|
|
148
|
-
pip install abstractvoice[stt] # Speech-to-text
|
|
149
|
-
pip install abstractvoice[all] # Everything (recommended)
|
|
150
|
-
|
|
151
|
-
# Language-specific
|
|
152
|
-
pip install abstractvoice[fr] # French with all features
|
|
153
|
-
pip install abstractvoice[de] # German with all features
|
|
154
|
-
```
|
|
155
|
-
|
|
156
|
-
### Optional Quality Upgrade
|
|
157
|
-
|
|
158
|
-
For the **absolute best voice quality**, install espeak-ng:
|
|
159
|
-
|
|
160
|
-
```bash
|
|
161
|
-
# macOS
|
|
162
|
-
brew install espeak-ng
|
|
163
|
-
|
|
164
|
-
# Linux
|
|
165
|
-
sudo apt-get install espeak-ng
|
|
166
|
-
|
|
167
|
-
# Windows
|
|
168
|
-
conda install espeak-ng
|
|
169
|
-
```
|
|
170
|
-
|
|
171
|
-
AbstractVoice automatically detects espeak-ng and upgrades to premium quality voices when available.
|
|
172
|
-
|
|
173
|
-
## Quick Start
|
|
174
|
-
|
|
175
|
-
### ⚡ Instant TTS (v0.5.0+)
|
|
176
|
-
|
|
177
|
-
```python
|
|
178
|
-
from abstractvoice import VoiceManager
|
|
179
|
-
|
|
180
|
-
# Initialize voice manager - works immediately with included dependencies
|
|
181
|
-
vm = VoiceManager()
|
|
182
|
-
|
|
183
|
-
# Text-to-speech works right away!
|
|
184
|
-
vm.speak("Hello! TTS works out of the box!")
|
|
185
|
-
|
|
186
|
-
# Language switching with automatic model download
|
|
187
|
-
vm.set_language('fr')
|
|
188
|
-
vm.speak("Bonjour! Le français fonctionne aussi!")
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
**That's it!** AbstractVoice v0.5.0+ automatically:
|
|
192
|
-
- ✅ Includes essential TTS dependencies in base installation
|
|
193
|
-
- ✅ Downloads models automatically when switching languages/voices
|
|
194
|
-
- ✅ Works immediately after `pip install abstractvoice`
|
|
195
|
-
- ✅ No silent failures - clear error messages if download fails
|
|
196
|
-
- ✅ No complex configuration needed
|
|
197
|
-
|
|
198
|
-
### 🌍 Multi-Language Support (Auto-Download in v0.5.0+)
|
|
199
|
-
|
|
200
|
-
```python
|
|
201
|
-
# Simply switch language - downloads model automatically if needed!
|
|
202
|
-
vm.set_language('fr')
|
|
203
|
-
vm.speak("Bonjour! Je parle français maintenant.")
|
|
204
|
-
|
|
205
|
-
# Switch to German - no manual download needed
|
|
206
|
-
vm.set_language('de')
|
|
207
|
-
vm.speak("Hallo! Ich spreche jetzt Deutsch.")
|
|
208
|
-
|
|
209
|
-
# Spanish, Italian also supported
|
|
210
|
-
vm.set_language('es')
|
|
211
|
-
vm.speak("¡Hola! Hablo español ahora.")
|
|
212
|
-
|
|
213
|
-
# If download fails, you'll get clear error messages with instructions
|
|
214
|
-
# Example: "❌ Cannot switch to French: Model download failed"
|
|
215
|
-
# " Try: abstractvoice download-models --language fr"
|
|
216
|
-
```
|
|
217
|
-
|
|
218
|
-
**New in v0.5.0:** No more manual `download_model()` calls! Language switching handles downloads automatically.
|
|
219
|
-
|
|
220
|
-
### 🔧 Check System Status
|
|
221
|
-
|
|
222
|
-
```python
|
|
223
|
-
from abstractvoice import is_ready, get_status, list_models
|
|
224
|
-
import json
|
|
225
|
-
|
|
226
|
-
# Quick readiness check
|
|
227
|
-
ready = is_ready()
|
|
228
|
-
print(f"TTS ready: {ready}")
|
|
229
|
-
|
|
230
|
-
# Get detailed status
|
|
231
|
-
status = json.loads(get_status())
|
|
232
|
-
print(f"Models cached: {status['total_cached']}")
|
|
233
|
-
print(f"Offline ready: {status['ready_for_offline']}")
|
|
234
|
-
|
|
235
|
-
# List all available models
|
|
236
|
-
models = json.loads(list_models())
|
|
237
|
-
for lang, voices in models.items():
|
|
238
|
-
print(f"{lang}: {len(voices)} voices available")
|
|
239
|
-
```
|
|
240
|
-
|
|
241
|
-
# Speech-to-text with callbacks
|
|
242
|
-
def on_transcription(text):
|
|
243
|
-
print(f"You said: {text}")
|
|
244
|
-
# Process the transcription
|
|
245
|
-
vm.speak(f"I heard you say: {text}")
|
|
246
|
-
|
|
247
|
-
def on_stop():
|
|
248
|
-
print("Stopping voice interaction")
|
|
249
|
-
|
|
250
|
-
# Start listening
|
|
251
|
-
vm.listen(on_transcription, on_stop)
|
|
252
|
-
|
|
253
|
-
# The voice manager will automatically pause listening when speaking
|
|
254
|
-
# and resume when done to prevent feedback loops
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
## Additional Examples
|
|
258
|
-
|
|
259
|
-
### Language-Specific Usage
|
|
260
|
-
|
|
261
|
-
```python
|
|
262
|
-
# French voice
|
|
263
|
-
vm_fr = VoiceManager(language='fr')
|
|
264
|
-
vm_fr.speak("Bonjour! Je peux parler français.")
|
|
265
|
-
|
|
266
|
-
# Spanish voice
|
|
267
|
-
vm_es = VoiceManager(language='es')
|
|
268
|
-
vm_es.speak("¡Hola! Puedo hablar español.")
|
|
269
|
-
|
|
270
|
-
# Dynamic language switching
|
|
271
|
-
vm.set_language('fr') # Switch to French
|
|
272
|
-
vm.set_language('en') # Switch back to English
|
|
273
|
-
```
|
|
274
|
-
|
|
275
|
-
### Advanced Configuration
|
|
276
|
-
|
|
277
|
-
```python
|
|
278
|
-
from abstractvoice import VoiceManager
|
|
279
|
-
|
|
280
|
-
# Custom TTS model selection
|
|
281
|
-
vm = VoiceManager(
|
|
282
|
-
language='en',
|
|
283
|
-
tts_model='tts_models/en/ljspeech/fast_pitch', # Specific model
|
|
284
|
-
whisper_model='base', # Larger Whisper model for better accuracy
|
|
285
|
-
debug_mode=True
|
|
286
|
-
)
|
|
287
|
-
|
|
288
|
-
# Speed control
|
|
289
|
-
vm.set_speed(1.5) # 1.5x speed
|
|
290
|
-
vm.speak("This text will be spoken faster.")
|
|
291
|
-
|
|
292
|
-
# Model switching at runtime
|
|
293
|
-
vm.set_tts_model('tts_models/en/ljspeech/vits') # Switch to VITS
|
|
294
|
-
vm.set_whisper('small') # Switch to larger Whisper model
|
|
295
|
-
```
|
|
296
|
-
|
|
297
|
-
### Error Handling and Graceful Degradation
|
|
298
|
-
|
|
299
|
-
AbstractVoice is designed to provide helpful error messages and fallback gracefully:
|
|
300
|
-
|
|
301
|
-
```python
|
|
302
|
-
# If you install just the basic package
|
|
303
|
-
# pip install abstractvoice
|
|
304
|
-
|
|
305
|
-
from abstractvoice import VoiceManager # This works fine
|
|
306
|
-
|
|
307
|
-
try:
|
|
308
|
-
vm = VoiceManager() # This will fail with helpful message
|
|
309
|
-
except ImportError as e:
|
|
310
|
-
print(e)
|
|
311
|
-
# Output: "TTS functionality requires optional dependencies. Install with:
|
|
312
|
-
# pip install abstractvoice[tts] # For TTS only
|
|
313
|
-
# pip install abstractvoice[all] # For all features"
|
|
314
|
-
|
|
315
|
-
# Missing espeak-ng automatically falls back to compatible models
|
|
316
|
-
# Missing dependencies show clear installation instructions
|
|
317
|
-
# All errors are graceful with helpful guidance
|
|
318
|
-
```
|
|
319
|
-
|
|
320
|
-
## CLI and Web Examples
|
|
321
|
-
|
|
322
|
-
AbstractVoice includes example applications to demonstrate its capabilities:
|
|
323
|
-
|
|
324
|
-
### Using AbstractVoice from the Command Line
|
|
325
|
-
|
|
326
|
-
The easiest way to get started is to use AbstractVoice directly from your shell:
|
|
327
|
-
|
|
328
|
-
```bash
|
|
329
|
-
# Start AbstractVoice in voice mode (TTS ON, STT ON)
|
|
330
|
-
abstractvoice
|
|
331
|
-
# → Automatically uses VITS if espeak-ng installed (best quality)
|
|
332
|
-
# → Falls back to fast_pitch if espeak-ng not found
|
|
333
|
-
|
|
334
|
-
# Or start with custom settings
|
|
335
|
-
abstractvoice --model gemma3:latest --whisper base
|
|
336
|
-
|
|
337
|
-
# Start in text-only mode (TTS enabled, listening disabled)
|
|
338
|
-
abstractvoice --no-listening
|
|
339
|
-
```
|
|
340
|
-
|
|
341
|
-
Once started, you can interact with the AI using voice or text. Use `/help` to see all available commands.
|
|
342
|
-
|
|
343
|
-
**Note**: AbstractVoice automatically selects the best available TTS model. For best quality, install espeak-ng (see Installation section above).
|
|
344
|
-
|
|
345
|
-
### Integrating AbstractVoice in Your Python Project
|
|
346
|
-
|
|
347
|
-
Here's a simple example of how to integrate AbstractVoice into your own application:
|
|
348
|
-
|
|
349
|
-
```python
|
|
350
|
-
from abstractvoice import VoiceManager
|
|
351
|
-
import time
|
|
352
|
-
|
|
353
|
-
# Initialize voice manager
|
|
354
|
-
voice_manager = VoiceManager(debug_mode=False)
|
|
355
|
-
|
|
356
|
-
# Text to speech
|
|
357
|
-
voice_manager.speak("Hello, I am an AI assistant. How can I help you today?")
|
|
358
|
-
|
|
359
|
-
# Wait for speech to complete
|
|
360
|
-
while voice_manager.is_speaking():
|
|
361
|
-
time.sleep(0.1)
|
|
362
|
-
|
|
363
|
-
# Speech to text with callback
|
|
364
|
-
def on_transcription(text):
|
|
365
|
-
print(f"User said: {text}")
|
|
366
|
-
if text.lower() != "stop":
|
|
367
|
-
# Process with your text generation system
|
|
368
|
-
response = f"You said: {text}"
|
|
369
|
-
voice_manager.speak(response)
|
|
370
|
-
|
|
371
|
-
# Start voice recognition
|
|
372
|
-
voice_manager.listen(on_transcription)
|
|
373
|
-
|
|
374
|
-
# Wait for user to say "stop" or press Ctrl+C
|
|
375
|
-
try:
|
|
376
|
-
while voice_manager.is_listening():
|
|
377
|
-
time.sleep(0.1)
|
|
378
|
-
except KeyboardInterrupt:
|
|
379
|
-
pass
|
|
380
|
-
|
|
381
|
-
# Clean up
|
|
382
|
-
voice_manager.cleanup()
|
|
383
|
-
```
|
|
384
|
-
|
|
385
|
-
## Running Examples
|
|
386
|
-
|
|
387
|
-
The package includes several examples that demonstrate different ways to use AbstractVoice.
|
|
388
|
-
|
|
389
|
-
### Voice Mode (Default)
|
|
390
|
-
|
|
391
|
-
If installed globally, you can launch AbstractVoice directly in voice mode:
|
|
392
|
-
|
|
393
|
-
```bash
|
|
394
|
-
# Start AbstractVoice in voice mode (TTS ON, STT ON)
|
|
395
|
-
abstractvoice
|
|
396
|
-
|
|
397
|
-
# With options
|
|
398
|
-
abstractvoice --debug --whisper base --model gemma3:latest --api http://localhost:11434/api/chat
|
|
399
|
-
```
|
|
400
|
-
|
|
401
|
-
**Command line options:**
|
|
402
|
-
- `--debug` - Enable debug mode with detailed logging
|
|
403
|
-
- `--api <url>` - URL of the Ollama API (default: http://localhost:11434/api/chat)
|
|
404
|
-
- `--model <name>` - Ollama model to use (default: granite3.3:2b)
|
|
405
|
-
- Examples: cogito:3b, phi4-mini:latest, qwen2.5:latest, gemma3:latest, etc.
|
|
406
|
-
- `--whisper <model>` - Whisper model to use (default: tiny)
|
|
407
|
-
- Options: tiny, base, small, medium, large
|
|
408
|
-
- `--no-listening` - Disable speech-to-text (listening), TTS still works
|
|
409
|
-
- **Note**: This creates a "TTS-only" mode where you type and the AI speaks back
|
|
410
|
-
- `--system <prompt>` - Custom system prompt
|
|
411
|
-
|
|
412
|
-
### 🎯 Complete CLI Interface (v0.3.0+)
|
|
413
|
-
|
|
414
|
-
AbstractVoice provides a unified command interface for all functionality:
|
|
415
|
-
|
|
416
|
-
```bash
|
|
417
|
-
# Voice mode (default)
|
|
418
|
-
abstractvoice # Interactive voice mode with AI
|
|
419
|
-
abstractvoice --model cogito:3b # With custom Ollama model
|
|
420
|
-
abstractvoice --language fr # French voice mode
|
|
421
|
-
|
|
422
|
-
# Examples and utilities
|
|
423
|
-
abstractvoice cli # CLI REPL for text interaction
|
|
424
|
-
abstractvoice web # Web API server
|
|
425
|
-
abstractvoice simple # Simple TTS/STT demonstration
|
|
426
|
-
abstractvoice check-deps # Check dependency compatibility
|
|
427
|
-
abstractvoice help # Show available commands
|
|
428
|
-
|
|
429
|
-
# Get help
|
|
430
|
-
abstractvoice --help # Complete help with all options
|
|
431
|
-
```
|
|
432
|
-
|
|
433
|
-
**All functionality through one command!** No more confusion between different entry points.
|
|
434
|
-
|
|
435
|
-
### Command-Line REPL
|
|
436
|
-
|
|
437
|
-
```bash
|
|
438
|
-
# Run the CLI example (TTS ON, STT OFF)
|
|
439
|
-
abstractvoice cli
|
|
440
|
-
|
|
441
|
-
# With debug mode
|
|
442
|
-
abstractvoice cli --debug
|
|
443
|
-
|
|
444
|
-
# With specific language
|
|
445
|
-
abstractvoice cli --language fr
|
|
446
|
-
```
|
|
447
|
-
|
|
448
|
-
#### REPL Commands
|
|
449
|
-
|
|
450
|
-
All commands must start with `/` except `stop`:
|
|
451
|
-
|
|
452
|
-
**Basic Commands:**
|
|
453
|
-
- `/exit`, `/q`, `/quit` - Exit REPL
|
|
454
|
-
- `/clear` - Clear conversation history
|
|
455
|
-
- `/help` - Show help information
|
|
456
|
-
- `stop` - Stop voice mode or TTS (voice command, no `/` needed)
|
|
457
|
-
|
|
458
|
-
**Voice & Audio:**
|
|
459
|
-
- `/tts on|off` - Toggle text-to-speech
|
|
460
|
-
- `/voice <mode>` - Voice input modes:
|
|
461
|
-
- `off` - Disable voice input
|
|
462
|
-
- `full` - Continuous listening, interrupts TTS on speech detection
|
|
463
|
-
- `wait` - Pause listening while speaking (recommended, reduces self-interruption)
|
|
464
|
-
- `stop` - Only stop on 'stop' keyword (planned)
|
|
465
|
-
- `ptt` - Push-to-talk mode (planned)
|
|
466
|
-
- `/speed <number>` - Set TTS speed (0.5-2.0, default: 1.0, **pitch preserved**)
|
|
467
|
-
- `/tts_model <model>` - Switch TTS model:
|
|
468
|
-
- `vits` - **Best quality** (requires espeak-ng)
|
|
469
|
-
- `fast_pitch` - Good quality (works everywhere)
|
|
470
|
-
- `glow-tts` - Alternative (similar quality to fast_pitch)
|
|
471
|
-
- `tacotron2-DDC` - Legacy (slower, lower quality)
|
|
472
|
-
- `/whisper <model>` - Switch Whisper model (tiny|base|small|medium|large)
|
|
473
|
-
- `/stop` - Stop voice mode or TTS playback
|
|
474
|
-
- `/pause` - Pause current TTS playback (can be resumed)
|
|
475
|
-
- `/resume` - Resume paused TTS playback
|
|
476
|
-
|
|
477
|
-
**LLM Configuration:**
|
|
478
|
-
- `/model <name>` - Change LLM model (e.g., `/model gemma3:latest`)
|
|
479
|
-
- `/system <prompt>` - Set system prompt (e.g., `/system You are a helpful coding assistant`)
|
|
480
|
-
- `/temperature <val>` - Set temperature (0.0-2.0, default: 0.7)
|
|
481
|
-
- `/max_tokens <num>` - Set max tokens (default: 4096)
|
|
482
|
-
|
|
483
|
-
**Chat Management:**
|
|
484
|
-
- `/save <filename>` - Save chat history (e.g., `/save conversation`)
|
|
485
|
-
- `/load <filename>` - Load chat history (e.g., `/load conversation`)
|
|
486
|
-
- `/tokens` - Display token usage statistics
|
|
487
|
-
|
|
488
|
-
**Sending Messages:**
|
|
489
|
-
- `<message>` - Any text without `/` prefix is sent to the LLM
|
|
490
|
-
|
|
491
|
-
**Note**: Commands without `/` (except `stop`) are sent to the LLM as regular messages.
|
|
492
|
-
|
|
493
|
-
### Web API
|
|
494
|
-
|
|
495
|
-
```bash
|
|
496
|
-
# Run the web API example
|
|
497
|
-
abstractvoice web
|
|
498
|
-
|
|
499
|
-
# With different host and port
|
|
500
|
-
abstractvoice web --host 0.0.0.0 --port 8000
|
|
501
|
-
```
|
|
502
|
-
|
|
503
|
-
You can also run a simplified version that doesn't load the full models:
|
|
504
|
-
|
|
505
|
-
```bash
|
|
506
|
-
# Run the web API with simulation mode
|
|
507
|
-
abstractvoice web --simulate
|
|
508
|
-
```
|
|
509
|
-
|
|
510
|
-
#### Troubleshooting Web API
|
|
511
|
-
|
|
512
|
-
If you encounter issues with the web API:
|
|
513
|
-
|
|
514
|
-
1. **404 Not Found**: Make sure you're accessing the correct endpoints (e.g., `/api/test`, `/api/tts`)
|
|
515
|
-
2. **Connection Issues**: Ensure no other service is using the port
|
|
516
|
-
3. **Model Loading Errors**: Try running with `--simulate` flag to test without loading models
|
|
517
|
-
4. **Dependencies**: Ensure all required packages are installed:
|
|
518
|
-
```bash
|
|
519
|
-
pip install flask soundfile numpy requests
|
|
520
|
-
```
|
|
521
|
-
5. **Test with a simple Flask script**:
|
|
522
|
-
```python
|
|
523
|
-
from flask import Flask
|
|
524
|
-
app = Flask(__name__)
|
|
525
|
-
@app.route('/')
|
|
526
|
-
def home():
|
|
527
|
-
return "Flask works!"
|
|
528
|
-
app.run(host='127.0.0.1', port=5000)
|
|
529
|
-
```
|
|
530
|
-
|
|
531
|
-
### Simple Demo
|
|
532
|
-
|
|
533
|
-
```bash
|
|
534
|
-
# Run the simple example
|
|
535
|
-
abstractvoice simple
|
|
536
|
-
```
|
|
537
|
-
|
|
538
|
-
## Documentation
|
|
539
|
-
|
|
540
|
-
### 📚 Documentation Overview
|
|
541
|
-
|
|
542
|
-
- **[README.md](README.md)** - This file: User guide, API reference, and examples
|
|
543
|
-
- **[CONTRIBUTING.md](CONTRIBUTING.md)** - Contribution guidelines and development setup
|
|
544
|
-
- **[CHANGELOG.md](CHANGELOG.md)** - Version history and release notes
|
|
545
|
-
- **[docs/](docs/)** - Technical documentation for developers
|
|
546
|
-
|
|
547
|
-
### 🎯 Quick Navigation
|
|
548
|
-
|
|
549
|
-
- **Getting Started**: [Installation](#installation) and [Quick Start](#quick-start)
|
|
550
|
-
- **Pause/Resume Control**: [TTS Control](#quick-reference-tts-control) section
|
|
551
|
-
- **Integration Examples**: [Integration Guide](#integration-guide-for-third-party-applications)
|
|
552
|
-
- **Technical Details**: [docs/architecture.md](docs/architecture.md) - How immediate pause/resume works
|
|
553
|
-
- **Development**: [CONTRIBUTING.md](CONTRIBUTING.md) - Setup and guidelines
|
|
554
|
-
|
|
555
|
-
## Component Overview
|
|
556
|
-
|
|
557
|
-
### VoiceManager
|
|
558
|
-
|
|
559
|
-
The main class that coordinates TTS and STT functionality:
|
|
560
|
-
|
|
561
|
-
```python
|
|
562
|
-
from abstractvoice import VoiceManager
|
|
563
|
-
|
|
564
|
-
# Simple initialization (automatic model selection)
|
|
565
|
-
# - Uses VITS if espeak-ng is installed (best quality)
|
|
566
|
-
# - Falls back to fast_pitch if espeak-ng is missing
|
|
567
|
-
manager = VoiceManager()
|
|
568
|
-
|
|
569
|
-
# Or specify a model explicitly
|
|
570
|
-
manager = VoiceManager(
|
|
571
|
-
tts_model="tts_models/en/ljspeech/vits", # Best quality (needs espeak-ng)
|
|
572
|
-
# tts_model="tts_models/en/ljspeech/fast_pitch", # Good (works everywhere)
|
|
573
|
-
whisper_model="tiny",
|
|
574
|
-
debug_mode=False
|
|
575
|
-
)
|
|
576
|
-
|
|
577
|
-
# === TTS (Text-to-Speech) ===
|
|
578
|
-
|
|
579
|
-
# Basic speech synthesis
|
|
580
|
-
manager.speak("Hello world")
|
|
581
|
-
|
|
582
|
-
# With speed control (pitch preserved via time-stretching!)
|
|
583
|
-
manager.speak("This is 20% faster", speed=1.2)
|
|
584
|
-
manager.speak("This is half speed", speed=0.5)
|
|
585
|
-
|
|
586
|
-
# Check if speaking
|
|
587
|
-
if manager.is_speaking():
|
|
588
|
-
manager.stop_speaking()
|
|
589
|
-
|
|
590
|
-
# Pause and resume TTS (IMMEDIATE response)
|
|
591
|
-
manager.speak("This is a long sentence that can be paused and resumed immediately")
|
|
592
|
-
time.sleep(1)
|
|
593
|
-
success = manager.pause_speaking() # Pause IMMEDIATELY (~20ms response)
|
|
594
|
-
if success:
|
|
595
|
-
print("TTS paused immediately")
|
|
596
|
-
|
|
597
|
-
time.sleep(2)
|
|
598
|
-
success = manager.resume_speaking() # Resume IMMEDIATELY from exact position
|
|
599
|
-
if success:
|
|
600
|
-
print("TTS resumed from exact position")
|
|
601
|
-
|
|
602
|
-
# Check pause status
|
|
603
|
-
if manager.is_paused():
|
|
604
|
-
manager.resume_speaking()
|
|
605
|
-
|
|
606
|
-
# Change TTS speed globally
|
|
607
|
-
manager.set_speed(1.3) # All subsequent speech will be 30% faster
|
|
608
|
-
|
|
609
|
-
# Change TTS model dynamically
|
|
610
|
-
manager.set_tts_model("tts_models/en/ljspeech/glow-tts")
|
|
611
|
-
|
|
612
|
-
# Available TTS models (quality ranking):
|
|
613
|
-
# - "tts_models/en/ljspeech/vits" (BEST quality, requires espeak-ng)
|
|
614
|
-
# - "tts_models/en/ljspeech/fast_pitch" (fallback, works everywhere)
|
|
615
|
-
# - "tts_models/en/ljspeech/glow-tts" (alternative fallback)
|
|
616
|
-
# - "tts_models/en/ljspeech/tacotron2-DDC" (legacy)
|
|
617
|
-
|
|
618
|
-
# === Audio Lifecycle Callbacks (v0.5.1+) ===
|
|
619
|
-
|
|
620
|
-
# NEW: Precise audio timing callbacks for visual status indicators
|
|
621
|
-
def on_synthesis_start():
|
|
622
|
-
print("🔴 Synthesis started - show thinking animation")
|
|
623
|
-
|
|
624
|
-
def on_audio_start():
|
|
625
|
-
print("🔵 Audio started - show speaking animation")
|
|
626
|
-
|
|
627
|
-
def on_audio_pause():
|
|
628
|
-
print("⏸️ Audio paused - show paused animation")
|
|
629
|
-
|
|
630
|
-
def on_audio_resume():
|
|
631
|
-
print("▶️ Audio resumed - continue speaking animation")
|
|
632
|
-
|
|
633
|
-
def on_audio_end():
|
|
634
|
-
print("🟢 Audio ended - show ready animation")
|
|
635
|
-
|
|
636
|
-
def on_synthesis_end():
|
|
637
|
-
print("✅ Synthesis complete")
|
|
638
|
-
|
|
639
|
-
# Wire up callbacks
|
|
640
|
-
manager.tts_engine.on_playback_start = on_synthesis_start # Existing (synthesis phase)
|
|
641
|
-
manager.tts_engine.on_playback_end = on_synthesis_end # Existing (synthesis phase)
|
|
642
|
-
manager.on_audio_start = on_audio_start # NEW (actual audio playback)
|
|
643
|
-
manager.on_audio_end = on_audio_end # NEW (actual audio playback)
|
|
644
|
-
manager.on_audio_pause = on_audio_pause # NEW (pause events)
|
|
645
|
-
manager.on_audio_resume = on_audio_resume # NEW (resume events)
|
|
646
|
-
|
|
647
|
-
# Perfect for system tray icons, UI animations, or coordinating multiple audio streams
|
|
648
|
-
|
|
649
|
-
# === STT (Speech-to-Text) ===
|
|
650
|
-
|
|
651
|
-
def on_transcription(text):
|
|
652
|
-
print(f"You said: {text}")
|
|
653
|
-
|
|
654
|
-
manager.listen(on_transcription, on_stop=None)
|
|
655
|
-
manager.stop_listening()
|
|
656
|
-
manager.is_listening()
|
|
657
|
-
|
|
658
|
-
# Change Whisper model
|
|
659
|
-
manager.set_whisper("base") # tiny, base, small, medium, large
|
|
660
|
-
|
|
661
|
-
# === Voice Modes ===
|
|
662
|
-
|
|
663
|
-
# Control how voice recognition behaves during TTS
|
|
664
|
-
manager.set_voice_mode("wait") # Pause listening while speaking (recommended)
|
|
665
|
-
manager.set_voice_mode("full") # Keep listening, interrupt on speech
|
|
666
|
-
manager.set_voice_mode("off") # Disable voice recognition
|
|
667
|
-
|
|
668
|
-
# === VAD (Voice Activity Detection) ===
|
|
669
|
-
|
|
670
|
-
manager.change_vad_aggressiveness(2) # 0-3, higher = more aggressive
|
|
671
|
-
|
|
672
|
-
# === Cleanup ===
|
|
673
|
-
|
|
674
|
-
manager.cleanup()
|
|
675
|
-
```
|
|
676
|
-
|
|
677
|
-
### TTSEngine
|
|
678
|
-
|
|
679
|
-
Handles text-to-speech synthesis:
|
|
680
|
-
|
|
681
|
-
```python
|
|
682
|
-
from abstractvoice.tts import TTSEngine
|
|
683
|
-
|
|
684
|
-
# Initialize with fast_pitch model (default, no external dependencies)
|
|
685
|
-
tts = TTSEngine(
|
|
686
|
-
model_name="tts_models/en/ljspeech/fast_pitch",
|
|
687
|
-
debug_mode=False,
|
|
688
|
-
streaming=True # Enable progressive playback for long text
|
|
689
|
-
)
|
|
690
|
-
|
|
691
|
-
# Speak with speed control (pitch preserved via time-stretching)
|
|
692
|
-
tts.speak(text, speed=1.2, callback=None) # 20% faster, same pitch
|
|
693
|
-
|
|
694
|
-
# Immediate pause and resume control
|
|
695
|
-
success = tts.pause() # Pause IMMEDIATELY (~20ms response)
|
|
696
|
-
success = tts.resume() # Resume IMMEDIATELY from exact position
|
|
697
|
-
is_paused = tts.is_paused() # Check if currently paused
|
|
698
|
-
|
|
699
|
-
tts.stop() # Stop completely (cannot resume)
|
|
700
|
-
tts.is_active() # Check if active
|
|
701
|
-
```
|
|
702
|
-
|
|
703
|
-
**Important Note on Speed Parameter:**
|
|
704
|
-
- The speed parameter now uses proper time-stretching (via librosa)
|
|
705
|
-
- Changing speed does NOT affect pitch anymore
|
|
706
|
-
- Range: 0.5 (half speed) to 2.0 (double speed)
|
|
707
|
-
- Example: `speed=1.3` makes speech 30% faster while preserving natural pitch
|
|
708
|
-
|
|
709
|
-
### VoiceRecognizer
|
|
710
|
-
|
|
711
|
-
Manages speech recognition with VAD:
|
|
712
|
-
|
|
713
|
-
```python
|
|
714
|
-
from abstractvoice.recognition import VoiceRecognizer
|
|
715
|
-
|
|
716
|
-
def on_transcription(text):
|
|
717
|
-
print(f"Transcribed: {text}")
|
|
718
|
-
|
|
719
|
-
def on_stop():
|
|
720
|
-
print("Stop command detected")
|
|
721
|
-
|
|
722
|
-
recognizer = VoiceRecognizer(transcription_callback=on_transcription,
|
|
723
|
-
stop_callback=on_stop,
|
|
724
|
-
whisper_model="tiny",
|
|
725
|
-
debug_mode=False)
|
|
726
|
-
recognizer.start(tts_interrupt_callback=None)
|
|
727
|
-
recognizer.stop()
|
|
728
|
-
recognizer.change_whisper_model("base")
|
|
729
|
-
recognizer.change_vad_aggressiveness(2)
|
|
730
|
-
```
|
|
731
|
-
|
|
732
|
-
## Quick Reference: TTS Control
|
|
733
|
-
|
|
734
|
-
### Pause and Resume TTS
|
|
735
|
-
|
|
736
|
-
**Professional-grade pause/resume control** with immediate response and no terminal interference.
|
|
737
|
-
|
|
738
|
-
**In CLI/REPL:**
|
|
739
|
-
```bash
|
|
740
|
-
/pause # Pause current TTS playback IMMEDIATELY
|
|
741
|
-
/resume # Resume paused TTS playback IMMEDIATELY
|
|
742
|
-
/stop # Stop TTS completely (cannot resume)
|
|
743
|
-
```
|
|
744
|
-
|
|
745
|
-
**Programmatic Usage:**
|
|
746
|
-
|
|
747
|
-
#### Basic Pause/Resume
|
|
748
|
-
```python
|
|
749
|
-
from abstractvoice import VoiceManager
|
|
750
|
-
import time
|
|
751
|
-
|
|
752
|
-
vm = VoiceManager()
|
|
753
|
-
|
|
754
|
-
# Start speech
|
|
755
|
-
vm.speak("This is a long sentence that demonstrates immediate pause and resume functionality.")
|
|
756
|
-
|
|
757
|
-
# Pause immediately (takes effect within ~20ms)
|
|
758
|
-
time.sleep(1)
|
|
759
|
-
result = vm.pause_speaking()
|
|
760
|
-
if result:
|
|
761
|
-
print("✓ TTS paused immediately")
|
|
762
|
-
|
|
763
|
-
# Resume immediately (takes effect within ~20ms)
|
|
764
|
-
time.sleep(2)
|
|
765
|
-
result = vm.resume_speaking()
|
|
766
|
-
if result:
|
|
767
|
-
print("✓ TTS resumed immediately")
|
|
768
|
-
```
|
|
769
|
-
|
|
770
|
-
#### Advanced Control with Status Checking
|
|
771
|
-
```python
|
|
772
|
-
from abstractvoice import VoiceManager
|
|
773
|
-
import time
|
|
774
|
-
|
|
775
|
-
vm = VoiceManager()
|
|
776
|
-
|
|
777
|
-
# Start long speech
|
|
778
|
-
vm.speak("This is a very long text that will be used to demonstrate the advanced pause and resume control features.")
|
|
779
|
-
|
|
780
|
-
# Wait and pause
|
|
781
|
-
time.sleep(1.5)
|
|
782
|
-
if vm.is_speaking():
|
|
783
|
-
vm.pause_speaking()
|
|
784
|
-
print("Speech paused")
|
|
785
|
-
|
|
786
|
-
# Check pause status
|
|
787
|
-
if vm.is_paused():
|
|
788
|
-
print("Confirmed: TTS is paused")
|
|
789
|
-
time.sleep(2)
|
|
790
|
-
|
|
791
|
-
# Resume from exact position
|
|
792
|
-
vm.resume_speaking()
|
|
793
|
-
print("Speech resumed from exact position")
|
|
794
|
-
|
|
795
|
-
# Wait for completion
|
|
796
|
-
while vm.is_speaking():
|
|
797
|
-
time.sleep(0.1)
|
|
798
|
-
print("Speech completed")
|
|
799
|
-
```
|
|
800
|
-
|
|
801
|
-
#### Interactive Control Example
|
|
802
|
-
```python
|
|
803
|
-
from abstractvoice import VoiceManager
|
|
804
|
-
import threading
|
|
805
|
-
import time
|
|
806
|
-
|
|
807
|
-
vm = VoiceManager()
|
|
808
|
-
|
|
809
|
-
def control_speech():
|
|
810
|
-
"""Interactive control in separate thread"""
|
|
811
|
-
time.sleep(2)
|
|
812
|
-
print("Pausing speech...")
|
|
813
|
-
vm.pause_speaking()
|
|
814
|
-
|
|
815
|
-
time.sleep(3)
|
|
816
|
-
print("Resuming speech...")
|
|
817
|
-
vm.resume_speaking()
|
|
818
|
-
|
|
819
|
-
# Start long speech
|
|
820
|
-
long_text = """
|
|
821
|
-
This is a comprehensive demonstration of AbstractVoice's immediate pause and resume functionality.
|
|
822
|
-
The system uses non-blocking audio streaming with callback-based control.
|
|
823
|
-
You can pause and resume at any time with immediate response.
|
|
824
|
-
The audio continues from the exact position where it was paused.
|
|
825
|
-
"""
|
|
826
|
-
|
|
827
|
-
# Start control thread
|
|
828
|
-
control_thread = threading.Thread(target=control_speech, daemon=True)
|
|
829
|
-
control_thread.start()
|
|
830
|
-
|
|
831
|
-
# Start speech (non-blocking)
|
|
832
|
-
vm.speak(long_text)
|
|
833
|
-
|
|
834
|
-
# Wait for completion
|
|
835
|
-
while vm.is_speaking() or vm.is_paused():
|
|
836
|
-
time.sleep(0.1)
|
|
837
|
-
|
|
838
|
-
vm.cleanup()
|
|
839
|
-
```
|
|
840
|
-
|
|
841
|
-
#### Error Handling
|
|
842
|
-
```python
|
|
843
|
-
from abstractvoice import VoiceManager
|
|
844
|
-
|
|
845
|
-
vm = VoiceManager()
|
|
846
|
-
|
|
847
|
-
# Start speech
|
|
848
|
-
vm.speak("Testing pause/resume with error handling")
|
|
849
|
-
|
|
850
|
-
# Safe pause with error handling
|
|
851
|
-
try:
|
|
852
|
-
if vm.is_speaking():
|
|
853
|
-
success = vm.pause_speaking()
|
|
854
|
-
if success:
|
|
855
|
-
print("Successfully paused")
|
|
856
|
-
else:
|
|
857
|
-
print("No active speech to pause")
|
|
858
|
-
|
|
859
|
-
# Safe resume with error handling
|
|
860
|
-
if vm.is_paused():
|
|
861
|
-
success = vm.resume_speaking()
|
|
862
|
-
if success:
|
|
863
|
-
print("Successfully resumed")
|
|
864
|
-
else:
|
|
865
|
-
print("Was not paused or playback completed")
|
|
866
|
-
|
|
867
|
-
except Exception as e:
|
|
868
|
-
print(f"Error controlling TTS: {e}")
|
|
869
|
-
```
|
|
870
|
-
|
|
871
|
-
**Key Features:**
|
|
872
|
-
- **⚡ Immediate Response**: Pause/resume takes effect within ~20ms
|
|
873
|
-
- **🎯 Exact Position**: Resumes from precise audio position (no repetition)
|
|
874
|
-
- **🖥️ No Terminal Interference**: Uses OutputStream callbacks, never blocks terminal
|
|
875
|
-
- **🔒 Thread-Safe**: Safe to call from any thread or callback
|
|
876
|
-
- **📊 Reliable Status**: `is_paused()` and `is_speaking()` always accurate
|
|
877
|
-
- **🔄 Seamless Streaming**: Works with ongoing text synthesis
|
|
878
|
-
|
|
879
|
-
**How it works:**
|
|
880
|
-
- Uses `sounddevice.OutputStream` with callback function
|
|
881
|
-
- Pause immediately outputs silence in next audio callback (~20ms)
|
|
882
|
-
- Resume immediately continues audio output from exact position
|
|
883
|
-
- No blocking `sd.stop()` calls that interfere with terminal I/O
|
|
884
|
-
- Thread-safe with proper locking mechanisms
|
|
885
|
-
|
|
886
|
-
## Quick Reference: Speed & Model Control
|
|
887
|
-
|
|
888
|
-
### Changing TTS Speed
|
|
889
|
-
|
|
890
|
-
**In CLI/REPL:**
|
|
891
|
-
```bash
|
|
892
|
-
/speed 1.2 # 20% faster, pitch preserved
|
|
893
|
-
/speed 0.8 # 20% slower, pitch preserved
|
|
894
|
-
```
|
|
895
|
-
|
|
896
|
-
**Programmatically:**
|
|
897
|
-
```python
|
|
898
|
-
from abstractvoice import VoiceManager
|
|
899
|
-
|
|
900
|
-
vm = VoiceManager()
|
|
901
|
-
|
|
902
|
-
# Method 1: Set global speed
|
|
903
|
-
vm.set_speed(1.3) # All speech will be 30% faster
|
|
904
|
-
vm.speak("This will be 30% faster")
|
|
905
|
-
|
|
906
|
-
# Method 2: Per-speech speed
|
|
907
|
-
vm.speak("This is 50% faster", speed=1.5)
|
|
908
|
-
vm.speak("This is normal speed", speed=1.0)
|
|
909
|
-
vm.speak("This is half speed", speed=0.5)
|
|
910
|
-
|
|
911
|
-
# Get current speed
|
|
912
|
-
current = vm.get_speed() # Returns 1.3 from set_speed() above
|
|
913
|
-
```
|
|
914
|
-
|
|
915
|
-
### Changing TTS Model
|
|
916
|
-
|
|
917
|
-
**In CLI/REPL:**
|
|
918
|
-
```bash
|
|
919
|
-
/tts_model vits # Best quality (needs espeak-ng)
|
|
920
|
-
/tts_model fast_pitch # Good quality (works everywhere)
|
|
921
|
-
/tts_model glow-tts # Alternative model
|
|
922
|
-
/tts_model tacotron2-DDC # Legacy model
|
|
923
|
-
```
|
|
924
|
-
|
|
925
|
-
**Programmatically:**
|
|
926
|
-
```python
|
|
927
|
-
from abstractvoice import VoiceManager
|
|
928
|
-
|
|
929
|
-
# Method 1: Set at initialization
|
|
930
|
-
vm = VoiceManager(tts_model="tts_models/en/ljspeech/glow-tts")
|
|
931
|
-
|
|
932
|
-
# Method 2: Change dynamically at runtime
|
|
933
|
-
vm.set_tts_model("tts_models/en/ljspeech/fast_pitch")
|
|
934
|
-
vm.speak("Using fast_pitch now")
|
|
935
|
-
|
|
936
|
-
vm.set_tts_model("tts_models/en/ljspeech/glow-tts")
|
|
937
|
-
vm.speak("Using glow-tts now")
|
|
938
|
-
|
|
939
|
-
# Available models (quality ranking):
|
|
940
|
-
models = [
|
|
941
|
-
"tts_models/en/ljspeech/vits", # BEST (requires espeak-ng)
|
|
942
|
-
"tts_models/en/ljspeech/fast_pitch", # Good (works everywhere)
|
|
943
|
-
"tts_models/en/ljspeech/glow-tts", # Alternative fallback
|
|
944
|
-
"tts_models/en/ljspeech/tacotron2-DDC" # Legacy
|
|
945
|
-
]
|
|
946
|
-
```
|
|
947
|
-
|
|
948
|
-
### Complete Example: Experiment with Settings
|
|
949
|
-
|
|
950
|
-
```python
|
|
951
|
-
from abstractvoice import VoiceManager
|
|
952
|
-
import time
|
|
953
|
-
|
|
954
|
-
vm = VoiceManager()
|
|
955
|
-
|
|
956
|
-
# Test different models (vits requires espeak-ng)
|
|
957
|
-
for model in ["vits", "fast_pitch", "glow-tts", "tacotron2-DDC"]:
|
|
958
|
-
full_name = f"tts_models/en/ljspeech/{model}"
|
|
959
|
-
vm.set_tts_model(full_name)
|
|
960
|
-
|
|
961
|
-
# Test different speeds with each model
|
|
962
|
-
for speed in [0.8, 1.0, 1.2]:
|
|
963
|
-
vm.speak(f"Testing {model} at {speed}x speed", speed=speed)
|
|
964
|
-
while vm.is_speaking():
|
|
965
|
-
time.sleep(0.1)
|
|
966
|
-
```
|
|
967
|
-
|
|
968
|
-
## Integration Guide for Third-Party Applications
|
|
969
|
-
|
|
970
|
-
AbstractVoice is designed as a lightweight, modular library for easy integration into your applications. This guide covers everything you need to know.
|
|
971
|
-
|
|
972
|
-
### Quick Start: Basic Integration
|
|
973
|
-
|
|
974
|
-
```python
|
|
975
|
-
from abstractvoice import VoiceManager
|
|
976
|
-
|
|
977
|
-
# 1. Initialize (automatic best-quality model selection)
|
|
978
|
-
vm = VoiceManager()
|
|
979
|
-
|
|
980
|
-
# 2. Text-to-Speech
|
|
981
|
-
vm.speak("Hello from my app!")
|
|
982
|
-
|
|
983
|
-
# 3. Speech-to-Text with callback
|
|
984
|
-
def handle_speech(text):
|
|
985
|
-
print(f"User said: {text}")
|
|
986
|
-
# Process text in your app...
|
|
987
|
-
|
|
988
|
-
vm.listen(on_transcription=handle_speech)
|
|
989
|
-
```
|
|
990
|
-
|
|
991
|
-
### Model Selection: Automatic vs Explicit
|
|
992
|
-
|
|
993
|
-
**Automatic (Recommended):**
|
|
994
|
-
```python
|
|
995
|
-
# Automatically uses best available model
|
|
996
|
-
vm = VoiceManager()
|
|
997
|
-
# → Uses VITS if espeak-ng installed (best quality)
|
|
998
|
-
# → Falls back to fast_pitch if espeak-ng missing
|
|
999
|
-
```
|
|
1000
|
-
|
|
1001
|
-
**Explicit:**
|
|
1002
|
-
```python
|
|
1003
|
-
# Force a specific model (bypasses auto-detection)
|
|
1004
|
-
vm = VoiceManager(tts_model="tts_models/en/ljspeech/fast_pitch")
|
|
1005
|
-
|
|
1006
|
-
# Or change dynamically at runtime
|
|
1007
|
-
vm.set_tts_model("tts_models/en/ljspeech/vits")
|
|
1008
|
-
```
|
|
1009
|
-
|
|
1010
|
-
### Voice Quality Levels
|
|
1011
|
-
|
|
1012
|
-
| Model | Quality | Speed | Requirements |
|
|
1013
|
-
|-------|---------|-------|--------------|
|
|
1014
|
-
| **vits** | ⭐⭐⭐⭐⭐ Excellent | Fast | espeak-ng |
|
|
1015
|
-
| **fast_pitch** | ⭐⭐⭐ Good | Fast | None |
|
|
1016
|
-
| **glow-tts** | ⭐⭐⭐ Good | Fast | None |
|
|
1017
|
-
| **tacotron2-DDC** | ⭐⭐ Fair | Slow | None |
|
|
1018
|
-
|
|
1019
|
-
### Customization Options
|
|
1020
|
-
|
|
1021
|
-
```python
|
|
1022
|
-
from abstractvoice import VoiceManager
|
|
1023
|
-
|
|
1024
|
-
vm = VoiceManager(
|
|
1025
|
-
# TTS Configuration
|
|
1026
|
-
tts_model="tts_models/en/ljspeech/vits", # Model to use
|
|
1027
|
-
|
|
1028
|
-
# STT Configuration
|
|
1029
|
-
whisper_model="base", # tiny, base, small, medium, large
|
|
1030
|
-
|
|
1031
|
-
# Debugging
|
|
1032
|
-
debug_mode=True # Enable detailed logging
|
|
1033
|
-
)
|
|
1034
|
-
|
|
1035
|
-
# Runtime customization
|
|
1036
|
-
vm.set_speed(1.2) # Adjust TTS speed (0.5-2.0)
|
|
1037
|
-
vm.set_tts_model("...") # Change TTS model
|
|
1038
|
-
vm.set_whisper("small") # Change STT model
|
|
1039
|
-
vm.set_voice_mode("wait") # wait, full, or off
|
|
1040
|
-
vm.change_vad_aggressiveness(2) # VAD sensitivity (0-3)
|
|
1041
|
-
```
|
|
1042
|
-
|
|
1043
|
-
### Integration Patterns
|
|
1044
|
-
|
|
1045
|
-
#### Pattern 1: TTS Only (No Voice Input)
|
|
1046
|
-
```python
|
|
1047
|
-
vm = VoiceManager()
|
|
1048
|
-
|
|
1049
|
-
# Speak with different speeds
|
|
1050
|
-
vm.speak("Normal speed")
|
|
1051
|
-
vm.speak("Fast speech", speed=1.5)
|
|
1052
|
-
vm.speak("Slow speech", speed=0.7)
|
|
1053
|
-
|
|
1054
|
-
# Control playback with immediate response
|
|
1055
|
-
if vm.is_speaking():
|
|
1056
|
-
success = vm.pause_speaking() # Pause IMMEDIATELY (~20ms)
|
|
1057
|
-
if success:
|
|
1058
|
-
print("Speech paused immediately")
|
|
1059
|
-
# or
|
|
1060
|
-
vm.stop_speaking() # Stop completely (cannot resume)
|
|
1061
|
-
|
|
1062
|
-
# Resume from exact position
|
|
1063
|
-
if vm.is_paused():
|
|
1064
|
-
success = vm.resume_speaking() # Resume IMMEDIATELY (~20ms)
|
|
1065
|
-
if success:
|
|
1066
|
-
print("Speech resumed from exact position")
|
|
1067
|
-
```
|
|
1068
|
-
|
|
1069
|
-
#### Pattern 2: STT Only (No Text-to-Speech)
|
|
1070
|
-
```python
|
|
1071
|
-
vm = VoiceManager()
|
|
1072
|
-
|
|
1073
|
-
def process_speech(text):
|
|
1074
|
-
# Send to your backend, save to DB, etc.
|
|
1075
|
-
your_app.process(text)
|
|
1076
|
-
|
|
1077
|
-
vm.listen(on_transcription=process_speech)
|
|
1078
|
-
```
|
|
1079
|
-
|
|
1080
|
-
#### Pattern 3: Full Voice Interaction
|
|
1081
|
-
```python
|
|
1082
|
-
vm = VoiceManager()
|
|
1083
|
-
|
|
1084
|
-
def on_speech(text):
|
|
1085
|
-
response = your_llm.generate(text)
|
|
1086
|
-
vm.speak(response)
|
|
1087
|
-
|
|
1088
|
-
def on_stop():
|
|
1089
|
-
print("User said stop")
|
|
1090
|
-
vm.cleanup()
|
|
1091
|
-
|
|
1092
|
-
vm.listen(
|
|
1093
|
-
on_transcription=on_speech,
|
|
1094
|
-
on_stop=on_stop
|
|
1095
|
-
)
|
|
1096
|
-
```
|
|
1097
|
-
|
|
1098
|
-
### Error Handling
|
|
1099
|
-
|
|
1100
|
-
```python
|
|
1101
|
-
try:
|
|
1102
|
-
vm = VoiceManager()
|
|
1103
|
-
vm.speak("Test")
|
|
1104
|
-
except Exception as e:
|
|
1105
|
-
print(f"TTS Error: {e}")
|
|
1106
|
-
# Handle missing dependencies, etc.
|
|
1107
|
-
|
|
1108
|
-
# Check model availability
|
|
1109
|
-
try:
|
|
1110
|
-
vm.set_tts_model("tts_models/en/ljspeech/vits")
|
|
1111
|
-
print("VITS available")
|
|
1112
|
-
except:
|
|
1113
|
-
print("VITS not available, using fallback")
|
|
1114
|
-
vm.set_tts_model("tts_models/en/ljspeech/fast_pitch")
|
|
1115
|
-
```
|
|
1116
|
-
|
|
1117
|
-
### Threading and Async Support
|
|
1118
|
-
|
|
1119
|
-
AbstractVoice handles threading internally for TTS and STT:
|
|
1120
|
-
|
|
1121
|
-
```python
|
|
1122
|
-
# TTS is non-blocking
|
|
1123
|
-
vm.speak("Long text...") # Returns immediately
|
|
1124
|
-
# Your code continues while speech plays
|
|
1125
|
-
|
|
1126
|
-
# Check status
|
|
1127
|
-
if vm.is_speaking():
|
|
1128
|
-
print("Still speaking...")
|
|
1129
|
-
|
|
1130
|
-
# Wait for completion
|
|
1131
|
-
while vm.is_speaking():
|
|
1132
|
-
time.sleep(0.1)
|
|
1133
|
-
|
|
1134
|
-
# STT runs in background thread
|
|
1135
|
-
vm.listen(on_transcription=callback) # Returns immediately
|
|
1136
|
-
# Callbacks fire on background thread
|
|
1137
|
-
```
|
|
1138
|
-
|
|
1139
|
-
### Cleanup and Resource Management
|
|
1140
|
-
|
|
1141
|
-
```python
|
|
1142
|
-
# Always cleanup when done
|
|
1143
|
-
vm.cleanup()
|
|
1144
|
-
|
|
1145
|
-
# Or use context manager pattern
|
|
1146
|
-
from contextlib import contextmanager
|
|
1147
|
-
|
|
1148
|
-
@contextmanager
|
|
1149
|
-
def voice_manager():
|
|
1150
|
-
vm = VoiceManager()
|
|
1151
|
-
try:
|
|
1152
|
-
yield vm
|
|
1153
|
-
finally:
|
|
1154
|
-
vm.cleanup()
|
|
1155
|
-
|
|
1156
|
-
# Usage
|
|
1157
|
-
with voice_manager() as vm:
|
|
1158
|
-
vm.speak("Hello")
|
|
1159
|
-
```
|
|
1160
|
-
|
|
1161
|
-
### Configuration for Different Environments
|
|
1162
|
-
|
|
1163
|
-
**Development (fast iteration):**
|
|
1164
|
-
```python
|
|
1165
|
-
vm = VoiceManager(
|
|
1166
|
-
tts_model="tts_models/en/ljspeech/fast_pitch", # Fast
|
|
1167
|
-
whisper_model="tiny", # Fast STT
|
|
1168
|
-
debug_mode=True
|
|
1169
|
-
)
|
|
1170
|
-
```
|
|
1171
|
-
|
|
1172
|
-
**Production (best quality):**
|
|
1173
|
-
```python
|
|
1174
|
-
vm = VoiceManager(
|
|
1175
|
-
tts_model="tts_models/en/ljspeech/vits", # Best quality
|
|
1176
|
-
whisper_model="base", # Good accuracy
|
|
1177
|
-
debug_mode=False
|
|
1178
|
-
)
|
|
1179
|
-
```
|
|
1180
|
-
|
|
1181
|
-
**Embedded/Resource-Constrained:**
|
|
1182
|
-
```python
|
|
1183
|
-
vm = VoiceManager(
|
|
1184
|
-
tts_model="tts_models/en/ljspeech/fast_pitch", # Lower memory
|
|
1185
|
-
whisper_model="tiny", # Smallest model
|
|
1186
|
-
debug_mode=False
|
|
1187
|
-
)
|
|
1188
|
-
```
|
|
1189
|
-
|
|
1190
|
-
## Integration with Text Generation Systems
|
|
1191
|
-
|
|
1192
|
-
AbstractVoice is designed to be a lightweight, modular library that you can easily integrate into your own applications. Here are complete examples for common use cases:
|
|
1193
|
-
|
|
1194
|
-
### Example 1: Voice-Enabled Chatbot with Ollama
|
|
1195
|
-
|
|
1196
|
-
```python
|
|
1197
|
-
from abstractvoice import VoiceManager
|
|
1198
|
-
import requests
|
|
1199
|
-
import time
|
|
1200
|
-
|
|
1201
|
-
# Initialize voice manager
|
|
1202
|
-
voice_manager = VoiceManager()
|
|
1203
|
-
|
|
1204
|
-
# Function to call Ollama API
|
|
1205
|
-
def generate_text(prompt):
|
|
1206
|
-
response = requests.post("http://localhost:11434/api/chat", json={
|
|
1207
|
-
"model": "granite3.3:2b",
|
|
1208
|
-
"messages": [{"role": "user", "content": prompt}],
|
|
1209
|
-
"stream": False
|
|
1210
|
-
})
|
|
1211
|
-
return response.json()["message"]["content"]
|
|
1212
|
-
|
|
1213
|
-
# Callback for speech recognition
|
|
1214
|
-
def on_transcription(text):
|
|
1215
|
-
if text.lower() == "stop":
|
|
1216
|
-
return
|
|
1217
|
-
|
|
1218
|
-
print(f"User: {text}")
|
|
1219
|
-
|
|
1220
|
-
# Generate response
|
|
1221
|
-
response = generate_text(text)
|
|
1222
|
-
print(f"AI: {response}")
|
|
1223
|
-
|
|
1224
|
-
# Speak response
|
|
1225
|
-
voice_manager.speak(response)
|
|
1226
|
-
|
|
1227
|
-
# Start listening
|
|
1228
|
-
voice_manager.listen(on_transcription)
|
|
1229
|
-
|
|
1230
|
-
# Keep running until interrupted
|
|
1231
|
-
try:
|
|
1232
|
-
while voice_manager.is_listening():
|
|
1233
|
-
time.sleep(0.1)
|
|
1234
|
-
except KeyboardInterrupt:
|
|
1235
|
-
voice_manager.cleanup()
|
|
1236
|
-
```
|
|
1237
|
-
|
|
1238
|
-
### Example 2: Voice-Enabled Assistant with OpenAI
|
|
1239
|
-
|
|
1240
|
-
```python
|
|
1241
|
-
from abstractvoice import VoiceManager
|
|
1242
|
-
import openai
|
|
1243
|
-
import time
|
|
1244
|
-
|
|
1245
|
-
# Initialize
|
|
1246
|
-
voice_manager = VoiceManager()
|
|
1247
|
-
openai.api_key = "your-api-key"
|
|
1248
|
-
|
|
1249
|
-
def on_transcription(text):
|
|
1250
|
-
print(f"User: {text}")
|
|
1251
|
-
|
|
1252
|
-
# Get response from OpenAI
|
|
1253
|
-
response = openai.ChatCompletion.create(
|
|
1254
|
-
model="gpt-4",
|
|
1255
|
-
messages=[{"role": "user", "content": text}]
|
|
1256
|
-
)
|
|
1257
|
-
|
|
1258
|
-
ai_response = response.choices[0].message.content
|
|
1259
|
-
print(f"AI: {ai_response}")
|
|
1260
|
-
|
|
1261
|
-
# Speak the response
|
|
1262
|
-
voice_manager.speak(ai_response)
|
|
1263
|
-
|
|
1264
|
-
# Start voice interaction
|
|
1265
|
-
voice_manager.listen(on_transcription)
|
|
1266
|
-
|
|
1267
|
-
# Keep running
|
|
1268
|
-
try:
|
|
1269
|
-
while voice_manager.is_listening():
|
|
1270
|
-
time.sleep(0.1)
|
|
1271
|
-
except KeyboardInterrupt:
|
|
1272
|
-
voice_manager.cleanup()
|
|
1273
|
-
```
|
|
1274
|
-
|
|
1275
|
-
### Example 3: Text-to-Speech Only (No Voice Input)
|
|
1276
|
-
|
|
1277
|
-
```python
|
|
1278
|
-
from abstractvoice import VoiceManager
|
|
1279
|
-
import time
|
|
1280
|
-
|
|
1281
|
-
# Initialize voice manager
|
|
1282
|
-
voice_manager = VoiceManager()
|
|
1283
|
-
|
|
1284
|
-
# Simple text-to-speech
|
|
1285
|
-
voice_manager.speak("Hello! This is a test of the text to speech system.")
|
|
1286
|
-
|
|
1287
|
-
# Wait for speech to finish
|
|
1288
|
-
while voice_manager.is_speaking():
|
|
1289
|
-
time.sleep(0.1)
|
|
1290
|
-
|
|
1291
|
-
# Adjust speed
|
|
1292
|
-
voice_manager.set_speed(1.5)
|
|
1293
|
-
voice_manager.speak("This speech is 50% faster.")
|
|
1294
|
-
|
|
1295
|
-
while voice_manager.is_speaking():
|
|
1296
|
-
time.sleep(0.1)
|
|
1297
|
-
|
|
1298
|
-
# Cleanup
|
|
1299
|
-
voice_manager.cleanup()
|
|
1300
|
-
```
|
|
1301
|
-
|
|
1302
|
-
### Example 4: Speech-to-Text Only (No TTS)
|
|
1303
|
-
|
|
1304
|
-
```python
|
|
1305
|
-
from abstractvoice import VoiceManager
|
|
1306
|
-
import time
|
|
1307
|
-
|
|
1308
|
-
voice_manager = VoiceManager()
|
|
1309
|
-
|
|
1310
|
-
def on_transcription(text):
|
|
1311
|
-
print(f"Transcribed: {text}")
|
|
1312
|
-
# Do something with the transcribed text
|
|
1313
|
-
# e.g., save to file, send to API, etc.
|
|
1314
|
-
|
|
1315
|
-
# Start listening
|
|
1316
|
-
voice_manager.listen(on_transcription)
|
|
1317
|
-
|
|
1318
|
-
# Keep running
|
|
1319
|
-
try:
|
|
1320
|
-
while voice_manager.is_listening():
|
|
1321
|
-
time.sleep(0.1)
|
|
1322
|
-
except KeyboardInterrupt:
|
|
1323
|
-
voice_manager.cleanup()
|
|
1324
|
-
```
|
|
1325
|
-
|
|
1326
|
-
### Key Integration Points
|
|
1327
|
-
|
|
1328
|
-
**VoiceManager Configuration:**
|
|
1329
|
-
```python
|
|
1330
|
-
# Full configuration example
|
|
1331
|
-
voice_manager = VoiceManager(
|
|
1332
|
-
tts_model="tts_models/en/ljspeech/fast_pitch", # Default (no external deps)
|
|
1333
|
-
whisper_model="base", # Whisper STT model (tiny, base, small, medium, large)
|
|
1334
|
-
debug_mode=True # Enable debug logging
|
|
1335
|
-
)
|
|
1336
|
-
|
|
1337
|
-
# Alternative TTS models (all pure Python, cross-platform):
|
|
1338
|
-
# - "tts_models/en/ljspeech/fast_pitch" - Default (fast, good quality)
|
|
1339
|
-
# - "tts_models/en/ljspeech/glow-tts" - Alternative (similar quality)
|
|
1340
|
-
# - "tts_models/en/ljspeech/tacotron2-DDC" - Legacy (older, slower)
|
|
1341
|
-
|
|
1342
|
-
# Set voice mode (full, wait, off)
|
|
1343
|
-
voice_manager.set_voice_mode("wait") # Recommended to avoid self-interruption
|
|
1344
|
-
|
|
1345
|
-
# Adjust settings (speed now preserves pitch!)
|
|
1346
|
-
voice_manager.set_speed(1.2) # TTS speed (default is 1.0, range 0.5-2.0)
|
|
1347
|
-
voice_manager.change_vad_aggressiveness(2) # VAD sensitivity (0-3)
|
|
1348
|
-
```
|
|
1349
|
-
|
|
1350
|
-
**Callback Functions:**
|
|
1351
|
-
```python
|
|
1352
|
-
def on_transcription(text):
|
|
1353
|
-
"""Called when speech is transcribed"""
|
|
1354
|
-
print(f"User said: {text}")
|
|
1355
|
-
# Your custom logic here
|
|
1356
|
-
|
|
1357
|
-
def on_stop():
|
|
1358
|
-
"""Called when user says 'stop'"""
|
|
1359
|
-
print("Stopping voice mode")
|
|
1360
|
-
# Your cleanup logic here
|
|
1361
|
-
|
|
1362
|
-
voice_manager.listen(
|
|
1363
|
-
on_transcription=on_transcription,
|
|
1364
|
-
on_stop=on_stop
|
|
1365
|
-
)
|
|
1366
|
-
```
|
|
1367
|
-
|
|
1368
|
-
## 💻 CLI Commands (v0.4.0+)
|
|
1369
|
-
|
|
1370
|
-
AbstractVoice provides powerful CLI commands for model management and voice interactions.
|
|
1371
|
-
|
|
1372
|
-
### Model Management
|
|
1373
|
-
|
|
1374
|
-
```bash
|
|
1375
|
-
# Download essential model for offline use (recommended first step)
|
|
1376
|
-
abstractvoice download-models
|
|
1377
|
-
|
|
1378
|
-
# Download models for specific languages
|
|
1379
|
-
abstractvoice download-models --language fr # French
|
|
1380
|
-
abstractvoice download-models --language de # German
|
|
1381
|
-
abstractvoice download-models --language it # Italian
|
|
1382
|
-
abstractvoice download-models --language es # Spanish
|
|
1383
|
-
|
|
1384
|
-
# Download specific model by name
|
|
1385
|
-
abstractvoice download-models --model tts_models/fr/css10/vits
|
|
1386
|
-
|
|
1387
|
-
# Download all available models (large download!)
|
|
1388
|
-
abstractvoice download-models --all
|
|
1389
|
-
|
|
1390
|
-
# Check current cache status
|
|
1391
|
-
abstractvoice download-models --status
|
|
1392
|
-
|
|
1393
|
-
# Clear model cache
|
|
1394
|
-
abstractvoice download-models --clear
|
|
1395
|
-
```
|
|
1396
|
-
|
|
1397
|
-
### Voice Interface
|
|
1398
|
-
|
|
1399
|
-
```bash
|
|
1400
|
-
# Start voice interface (default)
|
|
1401
|
-
abstractvoice
|
|
1402
|
-
|
|
1403
|
-
# Start CLI REPL with specific language
|
|
1404
|
-
abstractvoice cli --language fr
|
|
1405
|
-
|
|
1406
|
-
# Start with specific model
|
|
1407
|
-
abstractvoice --model granite3.3:2b --language de
|
|
1408
|
-
|
|
1409
|
-
# Run simple example
|
|
1410
|
-
abstractvoice simple
|
|
1411
|
-
|
|
1412
|
-
# Check dependencies
|
|
1413
|
-
abstractvoice check-deps
|
|
1414
|
-
```
|
|
1415
|
-
|
|
1416
|
-
### CLI Voice Commands
|
|
1417
|
-
|
|
1418
|
-
In the CLI REPL, use these commands (v0.5.0+):
|
|
1419
|
-
|
|
1420
|
-
```bash
|
|
1421
|
-
# List all available voices with download status
|
|
1422
|
-
/setvoice
|
|
1423
|
-
|
|
1424
|
-
# Automatically download and set specific voice (NEW in v0.5.0!)
|
|
1425
|
-
/setvoice fr.css10_vits # Downloads French CSS10 if needed
|
|
1426
|
-
/setvoice de.thorsten_vits # Downloads German Thorsten if needed
|
|
1427
|
-
/setvoice it.mai_male_vits # Downloads Italian Male if needed
|
|
1428
|
-
/setvoice en.jenny # Downloads Jenny voice if needed
|
|
1429
|
-
|
|
1430
|
-
# Change language (automatically downloads models if needed - NEW!)
|
|
1431
|
-
/language fr # Switches to French, downloads if needed
|
|
1432
|
-
/language de # Switches to German, downloads if needed
|
|
1433
|
-
/language es # Switches to Spanish, downloads if needed
|
|
1434
|
-
|
|
1435
|
-
# Voice controls
|
|
1436
|
-
/pause # Pause current speech
|
|
1437
|
-
/resume # Resume speech
|
|
1438
|
-
/stop # Stop speech
|
|
1439
|
-
|
|
1440
|
-
# Exit
|
|
1441
|
-
/exit
|
|
1442
|
-
```
|
|
1443
|
-
|
|
1444
|
-
**New in v0.5.0:** Language and voice commands now automatically download missing models with progress indicators. No more silent failures!
|
|
1445
|
-
|
|
1446
|
-
## Perspectives
|
|
1447
|
-
|
|
1448
|
-
This is a test project that I designed with examples to work with Ollama, but I will adapt the examples and AbstractVoice to work with any LLM provider (Anthropic, OpenAI, etc.).
|
|
1449
|
-
|
|
1450
|
-
The next iteration will directly leverage [AbstractCore](https://www.abstractcore.ai) to handle everything related to LLMs, their providers, models, and configurations.
|
|
1451
|
-
|
|
1452
|
-
## License and Acknowledgments
|
|
1453
|
-
|
|
1454
|
-
AbstractVoice is licensed under the [MIT License](LICENSE).
|
|
1455
|
-
|
|
1456
|
-
This project depends on several open-source libraries and models, each with their own licenses. Please see [ACKNOWLEDGMENTS.md](ACKNOWLEDGMENTS.md) for a detailed list of dependencies and their respective licenses.
|
|
1457
|
-
|
|
1458
|
-
Some dependencies, particularly certain TTS models, may have non-commercial use restrictions. If you plan to use AbstractVoice in a commercial application, please ensure you are using models that permit commercial use or obtain appropriate licenses.
|