voicesmith-mcp 1.0.17 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -10
- package/config.py +4 -0
- package/package.json +1 -1
- package/server.py +14 -0
- package/stt/__pycache__/mic_capture.cpython-314.pyc +0 -0
- package/tts/__pycache__/audio_player.cpython-314.pyc +0 -0
- package/tts/__pycache__/kokoro_engine.cpython-314.pyc +0 -0
- package/tts/__pycache__/media_duck.cpython-314.pyc +0 -0
- package/tts/__pycache__/speech_queue.cpython-314.pyc +0 -0
package/README.md
CHANGED
|
@@ -39,7 +39,7 @@ What the AI does automatically:
|
|
|
39
39
|
|
|
40
40
|
| Moment | What happens |
|
|
41
41
|
|--------|-------------|
|
|
42
|
-
| You give it a task |
|
|
42
|
+
| You give it a task | Gets to work (speaks only when clarifying approach) |
|
|
43
43
|
| It finishes work | Speaks a summary of what was done |
|
|
44
44
|
| It has a question | Asks out loud, then listens for your voice response |
|
|
45
45
|
| Voice tools unavailable | Falls back to text silently |
|
|
@@ -112,7 +112,8 @@ The MCP server runs as a local process alongside your IDE. It communicates over
|
|
|
112
112
|
- **TTS**: Kokoro ONNX — fast neural TTS, 54 voices, no GPU needed
|
|
113
113
|
- **STT**: faster-whisper — OpenAI Whisper running locally via CTranslate2
|
|
114
114
|
- **VAD**: Silero VAD — voice activity detection for clean recordings
|
|
115
|
-
- **Audio**: mpv for playback
|
|
115
|
+
- **Audio**: mpv for playback; CoreAudio via native app bundle on macOS (sounddevice fallback on Linux)
|
|
116
|
+
- **Media ducking**: Auto-pauses Apple Music, Spotify, and browser audio during speech (macOS)
|
|
116
117
|
|
|
117
118
|
## Multi-Session
|
|
118
119
|
|
|
@@ -131,16 +132,24 @@ Config lives at `~/.local/share/voicesmith-mcp/config.json`. Key settings:
|
|
|
131
132
|
"main_agent": "Eric",
|
|
132
133
|
"tts": {
|
|
133
134
|
"default_voice": "am_eric",
|
|
134
|
-
"audio_player": "mpv"
|
|
135
|
+
"audio_player": "mpv",
|
|
136
|
+
"duck_media": true
|
|
135
137
|
},
|
|
136
138
|
"stt": {
|
|
137
139
|
"model_size": "base",
|
|
138
140
|
"language": "en",
|
|
139
|
-
"vad_threshold": 0.3
|
|
141
|
+
"vad_threshold": 0.3,
|
|
142
|
+
"nudge_on_timeout": false
|
|
140
143
|
}
|
|
141
144
|
}
|
|
142
145
|
```
|
|
143
146
|
|
|
147
|
+
| Setting | Description | Default |
|
|
148
|
+
|---------|-------------|---------|
|
|
149
|
+
| `tts.duck_media` | Auto-pause music/browser audio during speech (macOS) | `true` |
|
|
150
|
+
| `stt.nudge_on_timeout` | Speak "I didn't catch that. Go ahead and type it." when the listen step of `speak_then_listen` times out | `false` |
|
|
151
|
+
| `stt.vad_threshold` | Voice detection sensitivity (lower = more sensitive) | `0.3` |
|
|
152
|
+
|
|
144
153
|
Re-run `npx voicesmith-mcp install` to change your voice or update settings. Existing configuration is preserved — only new defaults are added.
|
|
145
154
|
|
|
146
155
|
## Requirements
|
|
@@ -166,16 +175,14 @@ Re-run `npx voicesmith-mcp install` to change your voice or update settings. Exi
|
|
|
166
175
|
|
|
167
176
|
### The AI can't hear me (listen returns empty or times out)
|
|
168
177
|
|
|
169
|
-
**Check microphone permissions.** On macOS,
|
|
178
|
+
**Check microphone permissions.** On macOS, VoiceSmith uses a native app bundle (`VoiceSmithMCP.app`) for mic access. The first time it records, macOS should show a permission dialog for the app. If it didn't:
|
|
170
179
|
|
|
171
180
|
1. Open **System Settings > Privacy & Security > Microphone**
|
|
172
|
-
2.
|
|
173
|
-
|
|
174
|
-
- **Cursor** or **VS Code** — if using those IDEs directly
|
|
175
|
-
3. If the app isn't listed, the first `listen` call should trigger the permission prompt. Approve it and try again.
|
|
181
|
+
2. Look for **VoiceSmithMCP** and make sure it's enabled
|
|
182
|
+
3. If it's not listed, the LaunchAgent may not be running — try reinstalling: `npx voicesmith-mcp install`
|
|
176
183
|
|
|
177
184
|
> [!IMPORTANT]
|
|
178
|
-
>
|
|
185
|
+
> If the server detects silent audio (all zeros for ~320ms), it returns an error pointing you to the microphone permission settings. This usually means macOS TCC (Transparency, Consent, and Control — the system privacy framework) denied mic access.
|
|
179
186
|
|
|
180
187
|
**Check your audio input device.** If an external mic is selected but not connected, the server opens it but gets silence:
|
|
181
188
|
- Open **System Settings > Sound > Input** and verify the correct mic is selected
|
package/config.py
CHANGED
|
@@ -37,6 +37,7 @@ class STTConfig:
|
|
|
37
37
|
silence_threshold: float = 1.5
|
|
38
38
|
max_listen_timeout: float = 15
|
|
39
39
|
vad_threshold: float = 0.3
|
|
40
|
+
nudge_on_timeout: bool = False
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
@dataclass
|
|
@@ -117,6 +118,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
|
|
|
117
118
|
config.stt.max_listen_timeout = float(stt["max_listen_timeout"])
|
|
118
119
|
if "vad_threshold" in stt:
|
|
119
120
|
config.stt.vad_threshold = float(stt["vad_threshold"])
|
|
121
|
+
if "nudge_on_timeout" in stt:
|
|
122
|
+
config.stt.nudge_on_timeout = bool(stt["nudge_on_timeout"])
|
|
120
123
|
|
|
121
124
|
# Top-level config
|
|
122
125
|
if "main_agent" in data:
|
|
@@ -191,6 +194,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
|
|
|
191
194
|
"silence_threshold": config.stt.silence_threshold,
|
|
192
195
|
"max_listen_timeout": config.stt.max_listen_timeout,
|
|
193
196
|
"vad_threshold": config.stt.vad_threshold,
|
|
197
|
+
"nudge_on_timeout": config.stt.nudge_on_timeout,
|
|
194
198
|
},
|
|
195
199
|
"main_agent": config.main_agent,
|
|
196
200
|
"last_voice_name": config.last_voice_name,
|
package/package.json
CHANGED
package/server.py
CHANGED
|
@@ -565,6 +565,20 @@ async def speak_then_listen(
|
|
|
565
565
|
|
|
566
566
|
listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
|
|
567
567
|
|
|
568
|
+
# Optionally speak a nudge on timeout to prompt user to type instead
|
|
569
|
+
if (listen_result.get("error") == "timeout"
|
|
570
|
+
and _config and _config.stt.nudge_on_timeout
|
|
571
|
+
and _speech_queue):
|
|
572
|
+
nudge_text = "I didn't catch that. Go ahead and type it."
|
|
573
|
+
voice, _ = _registry.get_voice(name) if _registry else (None, False)
|
|
574
|
+
if voice and _tts_engine:
|
|
575
|
+
try:
|
|
576
|
+
result = _tts_engine.synthesize(nudge_text, voice, speed)
|
|
577
|
+
_audio_player.play(result.samples, result.sample_rate)
|
|
578
|
+
listen_result["nudge_spoken"] = True
|
|
579
|
+
except Exception:
|
|
580
|
+
pass
|
|
581
|
+
|
|
568
582
|
return {"speak": speak_result, "listen": listen_result}
|
|
569
583
|
finally:
|
|
570
584
|
_suppress_duck = False
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|