voice-mcp-server 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,6 +5,7 @@
5
5
  **Give your AI agents a voice, real ears, and the ability to handle interruptions in real-time.**
6
6
 
7
7
  [![npm version](https://img.shields.io/npm/v/voice-mcp-server.svg?color=red&style=flat-square&logo=npm)](https://www.npmjs.com/package/voice-mcp-server)
8
+ [![Publish to NPM](https://img.shields.io/github/actions/workflow/status/erickvs/voice-mcp-server/publish.yml?style=flat-square&logo=github)](https://github.com/erickvs/voice-mcp-server/actions)
8
9
  [![Platform: macOS Apple Silicon](https://img.shields.io/badge/Platform-macOS%20%7C%20Apple%20Silicon-lightgrey?style=flat-square&logo=apple)](#-target-environment)
9
10
  [![Python](https://img.shields.io/badge/Python-3.10%2B-blue?logo=python&style=flat-square)](https://python.org)
10
11
  [![MCP Compatible](https://img.shields.io/badge/MCP-Compatible-success?style=flat-square)](https://modelcontextprotocol.io/)
@@ -149,7 +150,34 @@ Simply use `voice-mcp-server` as the command in your configuration.
149
150
  > [!NOTE]
150
151
  > **First Run Performance:** The very first time you invoke the voice tool, it will take a few minutes to initialize the Python environment and download the heavy ML weights (~4GB). **The tools will not be available until this background setup completes.** You can monitor progress in your terminal logs. *Depending on your AI client, you may need to restart the application/CLI for the tools to appear after setup.*
151
152
 
152
- ### 4. Uninstalling
153
+ ### 4. Customizing the Voice (ElevenLabs)
154
+
155
+ If you prefer to use **ElevenLabs** for ultra-realistic cloud TTS instead of the default local Kokoro engine, you can configure it with environment variables.
156
+
157
+ > [!WARNING]
158
+ > **Privacy Notice:** By configuring and using ElevenLabs, the text generated by your LLM will be transmitted over the internet to ElevenLabs' servers for audio rendering. This data is subject to ElevenLabs' own privacy policies and terms of service. If you require absolute privacy and air-gapped security, do not configure this key and continue using the default local MLX engine.
159
+
160
+ When adding the server to your MCP client's configuration file (for example `claude_desktop_config.json`), provide your API key and your preferred Voice ID in the `env` object:
161
+
162
+ ```json
163
+ {
164
+ "mcpServers": {
165
+ "voice-mcp-server": {
166
+ "command": "voice-mcp-server",
167
+ "args": [],
168
+ "env": {
169
+ "ELEVENLABS_API_KEY": "sk_your_api_key_here",
170
+ "ELEVENLABS_VOICE_ID": "aEO01A4wXwd1O8GPgGlF"
171
+ }
172
+ }
173
+ }
174
+ }
175
+ ```
176
+ *(If you are using Gemini CLI or Claude Code, you can instead `export` these variables in your shell profile, e.g. `.zshrc`.)*
177
+
178
+ Once configured, simply tell your AI: *"Switch your audio engine to use the elevenlabs_speaker adapter."*
179
+
180
+ ### 5. Uninstalling
153
181
 
154
182
  If you wish to completely remove the server and its downloaded ML models from your system to free up space:
155
183
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voice-mcp-server",
3
- "version": "0.2.0",
3
+ "version": "0.3.1",
4
4
  "description": "An MCP server to allow LLMs to speak and listen via bidirectional voice loops",
5
5
  "main": "build/index.js",
6
6
  "type": "module",
@@ -30,6 +30,10 @@
30
30
  ],
31
31
  "author": "Erick Vazquez Santillan",
32
32
  "license": "MIT",
33
+ "repository": {
34
+ "type": "git",
35
+ "url": "git+https://github.com/erickvs/voice-mcp-server.git"
36
+ },
33
37
  "dependencies": {
34
38
  "@modelcontextprotocol/sdk": "^1.5.0"
35
39
  },
@@ -2,9 +2,11 @@ import os
2
2
  import time
3
3
  import subprocess
4
4
  import httpx
5
+ import threading
5
6
  from simulation.ports import ISpeaker
6
7
  from simulation.models import VirtualAudioFrame
7
8
  from dotenv import load_dotenv
9
+ from logger import logger
8
10
 
9
11
  load_dotenv()
10
12
 
@@ -16,65 +18,98 @@ class ElevenLabsSpeaker(ISpeaker):
16
18
  self.words = []
17
19
  self.process = None
18
20
  self.start_time = 0
19
- self.voice_id = voice_id
21
+ self.voice_id = os.getenv("ELEVENLABS_VOICE_ID", voice_id)
20
22
  self.api_key = os.getenv("ELEVENLABS_API_KEY")
21
23
  self.temp_file = "/tmp/elevenlabs_output.mp3"
24
+
25
+ self._lock = threading.RLock()
26
+ self._is_preparing = False
27
+ self._stop_event = threading.Event()
28
+ self._thread = None
29
+
30
+ logger.info(f"ElevenLabs Speaker initialized. Voice ID: {self.voice_id}, API Key Present: {bool(self.api_key)}")
22
31
 
23
32
  def speak(self, text: str):
24
33
  if not text.strip():
25
34
  return
26
35
 
27
- self.current_text = text
28
- self.words = text.split()
36
+ # Cancel any current playback or preparation
37
+ self.flush()
38
+
39
+ with self._lock:
40
+ self.current_text = text
41
+ self.words = text.split()
42
+ self._is_preparing = True
43
+ self._stop_event.clear()
44
+ self.start_time = 0 # Won't start until afplay starts
45
+
46
+ self._thread = threading.Thread(target=self._generate_and_play, args=(text,), daemon=True)
47
+ self._thread.start()
29
48
 
30
- if self.api_key:
31
- url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
32
- headers = {
33
- "Accept": "audio/mpeg",
34
- "Content-Type": "application/json",
35
- "xi-api-key": self.api_key
36
- }
37
- data = {
38
- "text": text,
39
- "model_id": "eleven_multilingual_v2",
40
- "voice_settings": {
41
- "stability": 0.5,
42
- "similarity_boost": 0.5
49
+ def _generate_and_play(self, text: str):
50
+ try:
51
+ if self.api_key:
52
+ url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
53
+ headers = {
54
+ "Accept": "audio/mpeg",
55
+ "Content-Type": "application/json",
56
+ "xi-api-key": self.api_key
43
57
  }
44
- }
45
-
46
- with httpx.Client() as client:
47
- try:
58
+ data = {
59
+ "text": text,
60
+ "model_id": "eleven_multilingual_v2",
61
+ "voice_settings": {
62
+ "stability": 0.5,
63
+ "similarity_boost": 0.5
64
+ }
65
+ }
66
+
67
+ with httpx.Client() as client:
68
+ logger.debug(f"Calling ElevenLabs API for Voice ID: {self.voice_id}")
48
69
  response = client.post(url, json=data, headers=headers, timeout=10.0)
49
70
  response.raise_for_status()
50
71
 
72
+ if self._stop_event.is_set():
73
+ return
74
+
51
75
  with open(self.temp_file, "wb") as f:
52
76
  f.write(response.content)
53
77
 
54
- # Play the downloaded audio
55
- self.start_time = time.time()
56
- self.process = subprocess.Popen(
57
- ["afplay", self.temp_file],
58
- stdout=subprocess.DEVNULL,
59
- stderr=subprocess.DEVNULL
60
- )
61
- except Exception as e:
62
- print(f"ElevenLabs API Error: {e}")
63
- # Fallback to macOS say
78
+ if self._stop_event.is_set():
79
+ return
80
+
81
+ with self._lock:
82
+ # Play the downloaded audio
83
+ self.start_time = time.time()
84
+ self.process = subprocess.Popen(
85
+ ["afplay", self.temp_file],
86
+ stdout=subprocess.DEVNULL,
87
+ stderr=subprocess.DEVNULL
88
+ )
89
+ else:
90
+ logger.warning("No ELEVENLABS_API_KEY found, falling back to 'say'")
91
+ if self._stop_event.is_set(): return
92
+ with self._lock:
64
93
  self.start_time = time.time()
65
94
  self.process = subprocess.Popen(
66
95
  ["say", text],
67
96
  stdout=subprocess.DEVNULL,
68
97
  stderr=subprocess.DEVNULL
69
98
  )
70
- else:
71
- print("Warning: No ELEVENLABS_API_KEY found, falling back to 'say'")
72
- self.start_time = time.time()
73
- self.process = subprocess.Popen(
74
- ["say", text],
75
- stdout=subprocess.DEVNULL,
76
- stderr=subprocess.DEVNULL
77
- )
99
+ except Exception as e:
100
+ logger.error(f"ElevenLabs Error: {e}")
101
+ if self._stop_event.is_set(): return
102
+ # Fallback to macOS say
103
+ with self._lock:
104
+ self.start_time = time.time()
105
+ self.process = subprocess.Popen(
106
+ ["say", text],
107
+ stdout=subprocess.DEVNULL,
108
+ stderr=subprocess.DEVNULL
109
+ )
110
+ finally:
111
+ with self._lock:
112
+ self._is_preparing = False
78
113
 
79
114
  def play_frame(self, frame: VirtualAudioFrame):
80
115
  pass
@@ -83,35 +118,50 @@ class ElevenLabsSpeaker(ISpeaker):
83
118
  pass
84
119
 
85
120
  def is_speaking(self) -> bool:
86
- if self.process is None:
87
- return False
88
- is_running = self.process.poll() is None
89
- if not is_running:
90
- self.current_text = ""
91
- self.words = []
92
- self.process = None
93
- return is_running
121
+ with self._lock:
122
+ if self._is_preparing:
123
+ return True
124
+ if self.process is None:
125
+ return False
126
+ is_running = self.process.poll() is None
127
+ if not is_running:
128
+ self.current_text = ""
129
+ self.words = []
130
+ self.process = None
131
+ return is_running
94
132
 
95
133
  def has_started_audio(self) -> bool:
96
- return self.is_speaking()
134
+ with self._lock:
135
+ if self.process is None:
136
+ return False
137
+ return self.process.poll() is None
97
138
 
98
139
  def flush(self) -> str:
99
- if not self.is_speaking():
100
- return ""
101
-
102
- # Immediately kill the playback process
103
- self.process.kill()
104
-
105
- # Explicitly wait for the process to terminate and reap it
106
- self.process.wait()
107
-
108
- elapsed_ms = (time.time() - self.start_time) * 1000
109
- words_spoken = int(elapsed_ms * self.words_per_ms)
110
-
111
- spoken = " ".join(self.words[:words_spoken])
112
-
113
- self.current_text = ""
114
- self.words = []
115
- self.process = None
140
+ # Signal the thread to stop if it's still downloading
141
+ self._stop_event.set()
116
142
 
117
- return spoken
143
+ with self._lock:
144
+ if not self.is_speaking():
145
+ self._is_preparing = False
146
+ return ""
147
+
148
+ # Immediately kill the playback process if it exists
149
+ if self.process:
150
+ self.process.kill()
151
+ self.process.wait()
152
+
153
+ # If we were still preparing and playback had not started yet (start_time == 0), zero words have been spoken
154
+ if self.start_time == 0:
155
+ words_spoken = 0
156
+ else:
157
+ elapsed_ms = (time.time() - self.start_time) * 1000
158
+ words_spoken = int(elapsed_ms * self.words_per_ms)
159
+
160
+ spoken = " ".join(self.words[:words_spoken])
161
+
162
+ self.current_text = ""
163
+ self.words = []
164
+ self.process = None
165
+ self._is_preparing = False
166
+
167
+ return spoken
@@ -64,12 +64,15 @@ class LiveMicrophone(IMicrophone):
64
64
  raw_bytes = self.q.get(timeout=0.1) # Block briefly to act as clock
65
65
  # If we didn't get 320 bytes, that's weird but we handle it
66
66
  if len(raw_bytes) < self.chunk * 2:
67
+ logger.warning(f"LiveMicrophone read_frame got only {len(raw_bytes)} bytes instead of {self.chunk * 2}")
67
68
  return VirtualAudioFrame(10, False, False, "", b"")
68
69
  return VirtualAudioFrame(10, False, False, "", raw_bytes)
69
70
  except queue.Empty:
71
+ logger.error("LiveMicrophone queue is EMPTY on read! (PyAudio might have crashed or stopped feeding data)")
70
72
  # If queue is empty, yield silence frame
71
73
  return VirtualAudioFrame(10, False, False, "", b"")
72
74
 
73
75
  def close(self):
74
76
  self.stop_stream()
75
77
  self.p.terminate()
78
+
@@ -80,7 +80,7 @@ def pre_download_models():
80
80
 
81
81
  def run_audio_daemon():
82
82
  """Runs the CoreEngine in a persistent background thread."""
83
- global engine, mic, speaker, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress
83
+ global engine, mic, speaker, vad, stt, last_active_timestamp, daemon_status, daemon_status_message, daemon_progress
84
84
 
85
85
  # Pre-download models so the daemon status reflects exactly what is happening
86
86
  pre_download_models()
@@ -118,8 +118,13 @@ def run_audio_daemon():
118
118
 
119
119
  try:
120
120
  while True:
121
+ current_engine = engine
122
+ if current_engine is None or mic is None:
123
+ time.sleep(0.1)
124
+ continue
125
+
121
126
  # If dormant, check for commands from FastAPI
122
- if engine.state == State.EXECUTING:
127
+ if current_engine.state == State.EXECUTING:
123
128
  try:
124
129
  cmd = mcp_command_queue.get(timeout=0.1) # Blocks briefly
125
130
 
@@ -127,15 +132,15 @@ def run_audio_daemon():
127
132
  mic.start_stream()
128
133
  if hasattr(vad, "set_active"):
129
134
  vad.set_active(True)
130
- engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
131
- engine.expect_reply = cmd.get("expect_reply", True)
135
+ current_engine.start_conversation(cmd.get("text", ""), standby_mode=cmd.get("standby_mode", False))
136
+ current_engine.expect_reply = cmd.get("expect_reply", True)
132
137
 
133
138
  except queue.Empty:
134
139
  pass
135
140
  else:
136
- engine.tick()
141
+ current_engine.tick()
137
142
  # Once we drop back to EXECUTING, we finished the conversation loop
138
- if engine.state == State.EXECUTING:
143
+ if current_engine.state == State.EXECUTING:
139
144
  mic.stop_stream()
140
145
  if hasattr(vad, "set_active"):
141
146
  vad.set_active(False)
@@ -231,18 +236,16 @@ async def reload_config():
231
236
  # 1. Stop the current engine
232
237
  if engine:
233
238
  engine.state = State.EXECUTING
234
- if mic:
235
- mic.close()
239
+ # Do NOT close the mic during hot-swap to prevent macOS CoreAudio permission drop!
240
+ # if mic:
241
+ # mic.close()
236
242
 
237
243
  # 1b. CRITICAL: Explicitly obliterate old models from VRAM to prevent Out-Of-Memory (OOM) crashes on hot-swaps
238
244
  import gc
239
- try:
240
- del speaker
241
- del vad
242
- del stt
243
- del engine
244
- except NameError:
245
- pass
245
+ speaker = None
246
+ vad = None
247
+ stt = None
248
+ engine = None
246
249
 
247
250
  gc.collect()
248
251
 
@@ -265,7 +268,12 @@ async def reload_config():
265
268
  cfg = compose(config_name="config")
266
269
 
267
270
  # 3. Instantiate the new models on the fly
268
- mic = instantiate(cfg.microphone)
271
+ if mic is None:
272
+ logger.info("Microphone was None during hot-swap, instantiating a new one.")
273
+ mic = instantiate(cfg.microphone)
274
+ else:
275
+ logger.info("Preserving existing Microphone instance during hot-swap.")
276
+
269
277
  speaker = instantiate(cfg.speaker)
270
278
  vad = instantiate(cfg.vad)
271
279
  stt = instantiate(cfg.stt)
Binary file