voicesmith-mcp 1.0.17 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -39,7 +39,7 @@ What the AI does automatically:
39
39
 
40
40
  | Moment | What happens |
41
41
  |--------|-------------|
42
- | You give it a task | Speaks a brief acknowledgment |
42
+ | You give it a task | Gets to work (speaks only when clarifying approach) |
43
43
  | It finishes work | Speaks a summary of what was done |
44
44
  | It has a question | Asks out loud, then listens for your voice response |
45
45
  | Voice tools unavailable | Falls back to text silently |
@@ -112,7 +112,8 @@ The MCP server runs as a local process alongside your IDE. It communicates over
112
112
  - **TTS**: Kokoro ONNX — fast neural TTS, 54 voices, no GPU needed
113
113
  - **STT**: faster-whisper — OpenAI Whisper running locally via CTranslate2
114
114
  - **VAD**: Silero VAD — voice activity detection for clean recordings
115
- - **Audio**: mpv for playback, sounddevice for recording
115
+ - **Audio**: mpv for playback; CoreAudio via native app bundle on macOS (sounddevice fallback on Linux)
116
+ - **Media ducking**: Auto-pauses Apple Music, Spotify, and browser audio during speech (macOS)
116
117
 
117
118
  ## Multi-Session
118
119
 
@@ -131,16 +132,24 @@ Config lives at `~/.local/share/voicesmith-mcp/config.json`. Key settings:
131
132
  "main_agent": "Eric",
132
133
  "tts": {
133
134
  "default_voice": "am_eric",
134
- "audio_player": "mpv"
135
+ "audio_player": "mpv",
136
+ "duck_media": true
135
137
  },
136
138
  "stt": {
137
139
  "model_size": "base",
138
140
  "language": "en",
139
- "vad_threshold": 0.3
141
+ "vad_threshold": 0.3,
142
+ "nudge_on_timeout": false
140
143
  }
141
144
  }
142
145
  ```
143
146
 
147
+ | Setting | Description | Default |
148
+ |---------|-------------|---------|
149
+ | `tts.duck_media` | Auto-pause music/browser audio during speech (macOS) | `true` |
150
+ | `stt.nudge_on_timeout` | Speak "I didn't catch that" when listen times out | `false` |
151
+ | `stt.vad_threshold` | Voice detection sensitivity (lower = more sensitive) | `0.3` |
152
+
144
153
  Re-run `npx voicesmith-mcp install` to change your voice or update settings. Existing configuration is preserved — only new defaults are added.
145
154
 
146
155
  ## Requirements
@@ -166,16 +175,14 @@ Re-run `npx voicesmith-mcp install` to change your voice or update settings. Exi
166
175
 
167
176
  ### The AI can't hear me (listen returns empty or times out)
168
177
 
169
- **Check microphone permissions.** On macOS, the terminal app that runs your IDE needs microphone access:
178
+ **Check microphone permissions.** On macOS, VoiceSmith uses a native app bundle (`VoiceSmithMCP.app`) for mic access. The first time it records, macOS should show a permission dialog for the app. If it didn't:
170
179
 
171
180
  1. Open **System Settings > Privacy & Security > Microphone**
172
- 2. Make sure your terminal app is listed and enabled:
173
- - **Warp**, **Terminal.app**, or **iTerm2** — for Claude Code
174
- - **Cursor** or **VS Code** — if using those IDEs directly
175
- 3. If the app isn't listed, the first `listen` call should trigger the permission prompt. Approve it and try again.
181
+ 2. Look for **VoiceSmithMCP** and make sure it's enabled
182
+ 3. If it's not listed, the LaunchAgent may not be running — try reinstalling: `npx voicesmith-mcp install`
176
183
 
177
184
  > [!IMPORTANT]
178
- > The Python process inherits microphone permissions from the app that launched it. If your terminal doesn't have mic access, listen will silently fail.
185
+ > If the server detects silent audio (all zeros for ~320ms), it returns an error pointing you to the microphone permission settings. This usually means macOS TCC denied mic access.
179
186
 
180
187
  **Check your audio input device.** If an external mic is selected but not connected, the server opens it but gets silence:
181
188
  - Open **System Settings > Sound > Input** and verify the correct mic is selected
package/config.py CHANGED
@@ -37,6 +37,7 @@ class STTConfig:
37
37
  silence_threshold: float = 1.5
38
38
  max_listen_timeout: float = 15
39
39
  vad_threshold: float = 0.3
40
+ nudge_on_timeout: bool = False
40
41
 
41
42
 
42
43
  @dataclass
@@ -117,6 +118,8 @@ def load_config(config_path: Optional[Path] = None) -> AppConfig:
117
118
  config.stt.max_listen_timeout = float(stt["max_listen_timeout"])
118
119
  if "vad_threshold" in stt:
119
120
  config.stt.vad_threshold = float(stt["vad_threshold"])
121
+ if "nudge_on_timeout" in stt:
122
+ config.stt.nudge_on_timeout = bool(stt["nudge_on_timeout"])
120
123
 
121
124
  # Top-level config
122
125
  if "main_agent" in data:
@@ -191,6 +194,7 @@ def save_config(config: AppConfig, config_path: Optional[Path] = None) -> None:
191
194
  "silence_threshold": config.stt.silence_threshold,
192
195
  "max_listen_timeout": config.stt.max_listen_timeout,
193
196
  "vad_threshold": config.stt.vad_threshold,
197
+ "nudge_on_timeout": config.stt.nudge_on_timeout,
194
198
  },
195
199
  "main_agent": config.main_agent,
196
200
  "last_voice_name": config.last_voice_name,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voicesmith-mcp",
3
- "version": "1.0.17",
3
+ "version": "1.0.18",
4
4
  "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
5
5
  "bin": {
6
6
  "voicesmith-mcp": "bin/cli.js"
package/server.py CHANGED
@@ -565,6 +565,20 @@ async def speak_then_listen(
565
565
 
566
566
  listen_result = await listen(timeout=timeout, silence_threshold=silence_threshold)
567
567
 
568
+ # Optionally speak a nudge on timeout to prompt user to type instead
569
+ if (listen_result.get("error") == "timeout"
570
+ and _config and _config.stt.nudge_on_timeout
571
+ and _speech_queue):
572
+ nudge_text = "I didn't catch that. Go ahead and type it."
573
+ voice, _ = _registry.get_voice(name) if _registry else (None, False)
574
+ if voice and _tts_engine:
575
+ try:
576
+ result = _tts_engine.synthesize(nudge_text, voice, speed)
577
+ _audio_player.play(result.samples, result.sample_rate)
578
+ listen_result["nudge_spoken"] = True
579
+ except Exception:
580
+ pass
581
+
568
582
  return {"speak": speak_result, "listen": listen_result}
569
583
  finally:
570
584
  _suppress_duck = False