voice-mcp-server 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +193 -0
  3. package/build/index.js +51 -0
  4. package/config/config.yaml +25 -0
  5. package/config/microphone/live_mic.yaml +1 -0
  6. package/config/speaker/elevenlabs_speaker.yaml +3 -0
  7. package/config/speaker/kokoro_speaker.yaml +3 -0
  8. package/config/stt/mlx_whisper_large_v3.yaml +2 -0
  9. package/config/vad/ptt_vad.yaml +8 -0
  10. package/config/vad/silero_vad.yaml +7 -0
  11. package/package.json +40 -0
  12. package/requirements.txt +126 -0
  13. package/src/adapters_real/__init__.py +0 -0
  14. package/src/adapters_real/__pycache__/__init__.cpython-312.pyc +0 -0
  15. package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
  16. package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
  17. package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
  18. package/src/adapters_real/__pycache__/queue_llm.cpython-312.pyc +0 -0
  19. package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
  20. package/src/adapters_real/echo_llm.py +28 -0
  21. package/src/adapters_real/elevenlabs_speaker.py +117 -0
  22. package/src/adapters_real/kokoro_speaker.py +122 -0
  23. package/src/adapters_real/live_mic.py +64 -0
  24. package/src/adapters_real/live_speaker.py +66 -0
  25. package/src/adapters_real/ptt_vad.py +36 -0
  26. package/src/adapters_real/queue_llm.py +36 -0
  27. package/src/adapters_real/silero_vad.py +43 -0
  28. package/src/adapters_real/wav_mic.py +17 -0
  29. package/src/adapters_real/whisper_stt.py +32 -0
  30. package/src/daemon/__init__.py +0 -0
  31. package/src/daemon/audio_server.py +363 -0
  32. package/src/index.ts +63 -0
  33. package/src/mcp_server.py +254 -0
  34. package/src/simulation/__init__.py +0 -0
  35. package/src/simulation/__pycache__/__init__.cpython-312.pyc +0 -0
  36. package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
  37. package/src/simulation/__pycache__/models.cpython-312.pyc +0 -0
  38. package/src/simulation/__pycache__/ports.cpython-312.pyc +0 -0
  39. package/src/simulation/adapters.py +131 -0
  40. package/src/simulation/engine.py +242 -0
  41. package/src/simulation/models.py +25 -0
  42. package/src/simulation/ports.py +57 -0
  43. package/src/simulation/tests/__init__.py +0 -0
  44. package/src/simulation/tests/test_scenarios.py +510 -0
  45. package/tsconfig.json +15 -0
@@ -0,0 +1,122 @@
1
+ import os
2
+ import time
3
+ import subprocess
4
+ import torch
5
+ import soundfile as sf
6
+ from kokoro import KPipeline
7
+ from simulation.ports import ISpeaker
8
+ from simulation.models import VirtualAudioFrame
9
+
10
class KokoroSpeaker(ISpeaker):
    """Speaker that synthesizes text with the local Kokoro TTS model and
    plays it through macOS `afplay` (falling back to `say` on failure).

    Word progress is estimated from a fixed words-per-minute rate rather
    than the real audio timeline, so flush() results are approximate.
    """

    def __init__(self, wpm=150, voice="af_heart"):  # 'af_heart' is a highly expressive American Female voice
        self.wpm = wpm
        # Estimated speech rate, used by flush() to guess how much was spoken.
        self.words_per_ms = (wpm / 60) / 1000
        self.current_text = ""
        self.words = []
        self.process = None  # playback subprocess while speaking, else None
        self.start_time = 0
        self.temp_file = "/tmp/kokoro_output.wav"

        print(f"[DEBUG SPEAKER] Loading local Kokoro TTS model (Voice: {voice})...")
        # Prefer hardware acceleration: Apple Metal (MPS), then CUDA, then CPU.
        if torch.backends.mps.is_available():
            self.device = "mps"
        elif torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        # Initialize the pipeline (downloads weights first time ~300MB).
        # lang_code 'a' selects American English.
        self.pipeline = KPipeline(lang_code='a', device=self.device)
        self.voice = voice
        print(f"[DEBUG SPEAKER] Kokoro TTS loaded successfully on {self.device}.")

    def _stop_playback(self):
        """Kill and reap any in-flight playback process. Safe to call twice."""
        if self.process is not None:
            try:
                self.process.kill()
                self.process.wait()
            except OSError:
                pass  # process already exited and was reaped
            self.process = None

    def speak(self, text: str):
        """Generate audio for `text` and start asynchronous playback."""
        if not text.strip():
            return

        # BUGFIX: stop any utterance still playing so two playback processes
        # never overlap and the previous Popen handle is not leaked unreaped.
        self._stop_playback()

        self.current_text = text
        self.words = text.split()

        try:
            print(f"[DEBUG SPEAKER] Generating Kokoro audio for: {text[:50]}...")
            # Generate the audio locally.
            generator = self.pipeline(
                text, voice=self.voice,
                speed=1.0, split_pattern=r'\n+'
            )

            # The pipeline yields per-segment audio; concatenate all segments.
            audio_segments = [audio for _gs, _ps, audio in generator]

            if not audio_segments:
                print("[DEBUG SPEAKER] Kokoro generated empty audio.")
                return

            final_audio = torch.cat(audio_segments, dim=0).cpu().numpy()

            # Save to temporary file at 24kHz (Kokoro's default sample rate).
            sf.write(self.temp_file, final_audio, 24000)
            print("[DEBUG SPEAKER] Audio generated, starting playback.")

            # Play the generated audio using afplay (macOS native).
            self.start_time = time.time()
            self.process = subprocess.Popen(
                ["afplay", self.temp_file],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )

        except Exception as e:
            print(f"[DEBUG SPEAKER] Kokoro Generation Error: {e}")
            # Fallback to the macOS built-in `say` voice.
            self.start_time = time.time()
            self.process = subprocess.Popen(
                ["say", text],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )

    def play_frame(self, frame: VirtualAudioFrame):
        # Real playback is driven by the subprocess, not by simulated frames.
        pass

    def tick(self, ms: int):
        # No-op: wall-clock time drives real playback.
        pass

    def is_speaking(self) -> bool:
        """True while the playback process is running; resets state on exit."""
        if self.process is None:
            return False
        is_running = self.process.poll() is None
        if not is_running:
            self.current_text = ""
            self.words = []
            self.process = None
        return is_running

    def has_started_audio(self) -> bool:
        # Approximation: audio is considered started as soon as playback runs.
        return self.is_speaking()

    def flush(self) -> str:
        """Interrupt playback and return the words estimated spoken so far."""
        if not self.is_speaking():
            return ""

        # Immediately kill the playback process and reap it.
        self._stop_playback()

        elapsed_ms = (time.time() - self.start_time) * 1000
        words_spoken = int(elapsed_ms * self.words_per_ms)

        spoken = " ".join(self.words[:words_spoken])

        self.current_text = ""
        self.words = []

        return spoken
@@ -0,0 +1,64 @@
1
+ import pyaudio
2
+ import queue
3
+ from simulation.ports import IMicrophone
4
+ from simulation.models import VirtualAudioFrame
5
+
6
class LiveMicrophone(IMicrophone):
    """Captures live 16-bit mono audio from the default input device via PyAudio.

    A PortAudio callback thread pushes raw chunks onto a bounded queue;
    read_frame() drains that queue on the consumer side.
    """

    def __init__(self, rate=16000, chunk=160):
        self.rate = rate
        self.chunk = chunk
        self.q = queue.Queue(maxsize=100)
        self.p = pyaudio.PyAudio()
        self.stream = None

    def start_stream(self):
        """Open and start the capture stream (no-op if already running)."""
        if self.stream is not None:
            return

        # Drop any stale audio captured during a previous session.
        while True:
            try:
                self.q.get_nowait()
            except queue.Empty:
                break

        self.stream = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.rate,
            input=True,
            frames_per_buffer=self.chunk,
            stream_callback=self._callback
        )
        self.stream.start_stream()

    def stop_stream(self):
        """Stop and close the capture stream, if one is open."""
        if self.stream is None:
            return
        self.stream.stop_stream()
        self.stream.close()
        self.stream = None

    def _callback(self, in_data, frame_count, time_info, status):
        # Runs on the PortAudio thread: never block here.
        try:
            self.q.put_nowait(in_data)
        except queue.Full:
            pass  # prefer dropping frames over stalling the audio thread
        return (None, pyaudio.paContinue)

    def read_frame(self) -> VirtualAudioFrame:
        """Return the next ~10ms frame, or a silent frame when no audio is ready."""
        silence = VirtualAudioFrame(10, False, False, "", b"")
        if self.stream is None:
            return silence
        try:
            data = self.q.get(timeout=0.1)  # brief block acts as the clock
        except queue.Empty:
            # No captured audio available yet: yield silence.
            return silence
        # An undersized chunk is unexpected; treat it as silence.
        if len(data) < self.chunk * 2:
            return silence
        return VirtualAudioFrame(10, False, False, "", data)

    def close(self):
        """Shut down the stream and release PyAudio."""
        self.stop_stream()
        self.p.terminate()
@@ -0,0 +1,66 @@
1
+ import subprocess
2
+ import time
3
+ from simulation.ports import ISpeaker
4
+ from simulation.models import VirtualAudioFrame
5
+
6
class LiveSpeaker(ISpeaker):
    """Speaker adapter that voices text through the macOS `say` command.

    Word progress is estimated from a fixed words-per-minute rate, so the
    value returned by flush() is approximate.
    """

    def __init__(self, wpm=150):
        self.wpm = wpm
        # Estimated speech rate, used by flush() to guess how much was spoken.
        self.words_per_ms = (wpm / 60) / 1000
        self.current_text = ""
        self.words = []
        self.process = None  # `say` subprocess while speaking, else None
        self.start_time = 0

    def speak(self, text: str):
        """Start voicing `text` asynchronously via `say`."""
        # BUGFIX: stop any utterance still playing so two `say` processes
        # never overlap and the previous Popen handle is not leaked unreaped.
        if self.process is not None:
            try:
                self.process.kill()
                self.process.wait()
            except OSError:
                pass  # process already exited
            self.process = None

        self.current_text = text
        self.words = text.split()
        self.start_time = time.time()

        # Start say command non-blocking
        self.process = subprocess.Popen(
            ["say", text],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )

    def play_frame(self, frame: VirtualAudioFrame):
        # Real playback is driven by the subprocess, not by simulated frames.
        pass

    def tick(self, ms: int):
        pass  # Clock is driven by the mic in a real environment

    def is_speaking(self) -> bool:
        """True while `say` is running; resets speaking state once it exits."""
        if self.process is None:
            return False
        # Poll returns None if process is still running
        is_running = self.process.poll() is None
        if not is_running:
            self.current_text = ""
            self.words = []
            self.process = None
        return is_running

    def has_started_audio(self) -> bool:
        return self.is_speaking()  # Approximation for macOS say

    def flush(self) -> str:
        """Interrupt playback and return the words estimated spoken so far."""
        if not self.is_speaking():
            return ""

        # Immediately kill the say process
        self.process.kill()

        # Explicitly wait for the process to terminate and reap it
        self.process.wait()

        elapsed_ms = (time.time() - self.start_time) * 1000
        words_spoken = int(elapsed_ms * self.words_per_ms)

        spoken = " ".join(self.words[:words_spoken])

        self.current_text = ""
        self.words = []
        self.process = None

        return spoken
@@ -0,0 +1,36 @@
1
+ from pynput import keyboard
2
+ from simulation.ports import IVAD
3
+ from simulation.models import VirtualAudioFrame
4
+
5
class PushToTalkVAD(IVAD):
    """Walkie-talkie style VAD: 'speech' is whatever arrives while a hotkey
    is held down. A background pynput listener tracks the key state.
    """

    def __init__(self, key_name="shift", **kwargs):
        self.is_pressed = False
        print(f"[DEBUG VAD] Initializing Push-To-Talk VAD. Walkie-Talkie Hotkey: '{key_name}'")

        # Translate the configured name into a pynput Key object.
        key_map = {
            "shift": keyboard.Key.shift,
            "shift_r": keyboard.Key.shift_r,
            "ctrl": keyboard.Key.ctrl,
            "alt": keyboard.Key.alt,
            "cmd": keyboard.Key.cmd,
            "space": keyboard.Key.space
        }

        # Unknown names fall back to the left shift key.
        self.hotkey = key_map.get(key_name.lower(), keyboard.Key.shift)

        self.listener = keyboard.Listener(
            on_press=self._on_press, on_release=self._on_release
        )
        self.listener.start()

    def _on_press(self, key):
        # Listener-thread callback: mark the hotkey as held.
        if key == self.hotkey:
            self.is_pressed = True

    def _on_release(self, key):
        # Listener-thread callback: mark the hotkey as released.
        if key == self.hotkey:
            self.is_pressed = False

    def analyze(self, frame: VirtualAudioFrame) -> float:
        """Return 1.0 (certain speech) while the hotkey is held, else 0.0."""
        return 1.0 if self.is_pressed else 0.0
@@ -0,0 +1,36 @@
1
+ import queue
2
+ from simulation.ports import ILLMBridge
3
+
4
class QueueLLMBridge(ILLMBridge):
    """LLM bridge that talks to the MCP server through a pair of queues.

    Queue roles are named from the MCP server's perspective: `res_q`
    carries results (voice transcripts) TO the server, while `cmd_q`
    carries the server's commands back to this bridge.
    """

    def __init__(self, command_queue: queue.Queue, result_queue: queue.Queue):
        self.cmd_q = command_queue
        self.res_q = result_queue
        self.is_requesting = False  # True while a request is awaiting a reply

    def call_mcp_tool(self, context: dict) -> dict:
        """Send `context` to the server and block (polling) until it replies."""
        # BUGFIX: import once per call instead of once per loop iteration.
        import time

        self.start_request(context)
        while True:
            resp = self.get_response()
            if resp is not None:
                return resp
            time.sleep(0.01)  # small back-off to avoid a hot spin

    def start_request(self, context: dict):
        """Queue `context` for the MCP server without waiting for a reply."""
        self.is_requesting = True
        # Send the transcript to the MCP server
        self.res_q.put(context)

    def tick(self, ms: int):
        # No simulated-time behavior for this bridge.
        pass

    def get_response(self) -> dict | None:
        """Non-blocking poll for the server's next command; None if not ready."""
        if not self.is_requesting:
            return None
        try:
            # Non-blocking check for next command from MCP server
            cmd = self.cmd_q.get_nowait()
        except queue.Empty:
            return None
        self.is_requesting = False
        return cmd
@@ -0,0 +1,43 @@
1
+ import numpy as np
2
+ import torch
3
+ from silero_vad import load_silero_vad
4
+ from simulation.ports import IVAD
5
+ from simulation.models import VirtualAudioFrame
6
+
7
class RealSileroVAD(IVAD):
    """Voice-activity detector wrapping the pretrained Silero VAD model.

    Incoming 16kHz int16 PCM frames are buffered until a full 512-sample
    model window is available; the most recent speech probability is
    returned on every call.
    """

    def __init__(self, **kwargs):
        # Pick the fastest available torch device: Apple MPS, CUDA, or CPU.
        if torch.backends.mps.is_available():
            self.device = torch.device("mps")
            print("[DEBUG VAD] Initializing Silero VAD on Apple Metal GPU (MPS)")
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.model = load_silero_vad().to(self.device)
        self.buffer = b""     # pending PCM bytes not yet a full model window
        self.last_prob = 0.0  # most recent speech probability

    def analyze(self, frame: VirtualAudioFrame) -> float:
        """Fold `frame` into the window buffer and return the latest speech probability."""
        if not frame.raw_bytes:
            return 0.0

        self.buffer += frame.raw_bytes

        # The model consumes windows of 512 samples = 1024 bytes of int16.
        if len(self.buffer) < 1024:
            return self.last_prob

        window, self.buffer = self.buffer[:1024], self.buffer[1024:]

        # int16 PCM -> float32 in [-1, 1), then onto the model's device.
        samples = np.frombuffer(window, dtype=np.int16).astype(np.float32) / 32768.0
        tensor = torch.from_numpy(samples).to(self.device)

        # silero-vad model natively handles the state!
        self.last_prob = float(self.model(tensor, 16000).item())
        return self.last_prob
@@ -0,0 +1,17 @@
1
+ import wave
2
+ from simulation.ports import IMicrophone
3
+ from simulation.models import VirtualAudioFrame
4
+
5
class WavFileMicrophone(IMicrophone):
    """Microphone adapter that replays a pre-recorded 16kHz/16-bit/mono WAV file.

    Yields 10ms frames; once the file is exhausted it keeps returning
    silent (empty-payload) frames.
    """

    def __init__(self, filepath: str):
        self.wf = wave.open(filepath, 'rb')
        # The rest of the pipeline assumes raw 16kHz 16-bit mono PCM.
        if self.wf.getnchannels() != 1 or self.wf.getsampwidth() != 2 or self.wf.getframerate() != 16000:
            # BUGFIX: don't leak the open file handle when validation fails.
            self.wf.close()
            raise ValueError("Wav file must be 16kHz, 16-bit, mono")

    def read_frame(self) -> VirtualAudioFrame:
        """Return the next 10ms frame, or a silent frame at end of file."""
        # 16kHz * 2 bytes/sample * 1 channel * 0.010s = 320 bytes
        # 160 frames at 2 bytes = 320 bytes
        raw_bytes = self.wf.readframes(160)
        if len(raw_bytes) < 320:
            # End of file (or a short tail): emit silence, not a partial frame.
            return VirtualAudioFrame(10, False, False, "", b"")
        return VirtualAudioFrame(10, False, False, "", raw_bytes)

    def close(self):
        """Release the underlying WAV file handle (matches LiveMicrophone.close)."""
        self.wf.close()
@@ -0,0 +1,32 @@
1
+ import numpy as np
2
+ import mlx_whisper
3
+ from typing import List
4
+
5
+ from simulation.ports import ISTT
6
+ from simulation.models import VirtualAudioFrame
7
+
8
class RealWhisperSTT(ISTT):
    """Speech-to-text adapter using Apple-Silicon-optimized MLX Whisper."""

    def __init__(self, model_size="mlx-community/whisper-large-v3-mlx"):
        self.model_size = model_size
        print(f"[DEBUG STT] Preparing MLX Whisper model ({model_size}) for Apple Silicon...")
        # MLX lazily loads and compiles the model on first inference; the
        # print above just signals that the MLX backend is in use.

    def transcribe(self, frames: List[VirtualAudioFrame]) -> str:
        """Transcribe the PCM payloads of `frames`; return "" on empty input or error."""
        pcm = b"".join(frame.raw_bytes for frame in frames if frame.raw_bytes)
        if not pcm:
            return ""

        # int16 PCM (expected from the microphone) -> float32 in [-1, 1)
        # as Whisper expects.
        audio_data = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0

        print(f"[DEBUG STT] Transcribing {len(audio_data)} samples with Apple MLX Whisper ({self.model_size})...")

        try:
            # Language is pinned to English for this deployment.
            result = mlx_whisper.transcribe(audio_data, path_or_hf_repo=self.model_size, language="en")
            text = result.get("text", "").strip()
            print(f"[DEBUG STT] MLX Transcription result: {text}")
            return text
        except Exception as e:
            # Best-effort: log the failure and fall back to an empty transcript.
            print(f"[DEBUG STT] MLX Whisper transcription error: {e}")
            return ""
File without changes