voice-mcp-server 0.1.25 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -3
- package/config/config.yaml +1 -1
- package/config/vad/ptt_vad.yaml +1 -1
- package/package.json +5 -1
- package/requirements.txt +1 -0
- package/src/adapters_real/elevenlabs_speaker.py +1 -1
- package/src/adapters_real/kokoro_speaker.py +7 -6
- package/src/adapters_real/live_mic.py +15 -4
- package/src/adapters_real/ptt_sidecar.swift +156 -0
- package/src/adapters_real/ptt_vad.py +143 -25
- package/src/adapters_real/whisper_stt.py +5 -4
- package/src/daemon/audio_server.py +47 -13
- package/src/logger.py +29 -0
- package/src/mcp_server.py +113 -65
- package/src/simulation/engine.py +12 -1
- package/src/simulation/tests/test_abort_daemon.py +109 -0
- package/src/simulation/tests/test_mcp_cancellation.py +83 -0
- package/src/simulation/tests/test_ptt_vad.py +81 -0
- package/src/adapters_real/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/queue_llm.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/models.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/ports.cpython-312.pyc +0 -0
package/README.md
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
**Give your AI agents a voice, real ears, and the ability to handle interruptions in real-time.**
|
|
6
6
|
|
|
7
7
|
[](https://www.npmjs.com/package/voice-mcp-server)
|
|
8
|
+
[](https://github.com/erickvs/voice-mcp-server/actions)
|
|
8
9
|
[](#-target-environment)
|
|
9
10
|
[](https://python.org)
|
|
10
11
|
[](https://modelcontextprotocol.io/)
|
|
@@ -66,7 +67,7 @@ The system is built on a highly modular adapter pattern configured via `hydra` Y
|
|
|
66
67
|
| | `elevenlabs_speaker` | Premium cloud-based ultra-realistic voices. |
|
|
67
68
|
| **🎙️ Microphones** | `live_mic` | Direct hardware integration via PyAudio. |
|
|
68
69
|
| **🤫 VAD (Activity)** | `silero_vad` | Conversational mode powered by Silero, heavily optimized for 1-second barge-ins. *(Note: **Headphones are strictly required** for this mode to prevent the AI from hearing its own audio output and endlessly interrupting itself).* |
|
|
69
|
-
| | `ptt_vad` | Manual Push-to-Talk / Walkie-Talkie mode. **(Default: Hold '
|
|
70
|
+
| | `ptt_vad` | Manual Push-to-Talk / Walkie-Talkie mode. **(Default: Hold 'Right Option' to talk)** |
|
|
70
71
|
| **📝 STT (Transcription)**| `mlx_whisper_large_v3`| Blazing fast local transcription leveraging Apple's MLX framework. |
|
|
71
72
|
|
|
72
73
|
-----
|
|
@@ -149,7 +150,34 @@ Simply use `voice-mcp-server` as the command in your configuration.
|
|
|
149
150
|
> [!NOTE]
|
|
150
151
|
> **First Run Performance:** The very first time you invoke the voice tool, it will take a few minutes to initialize the Python environment and download the heavy ML weights (~4GB). **The tools will not be available until this background setup completes.** You can monitor progress in your terminal logs. *Depending on your AI client, you may need to restart the application/CLI for the tools to appear after setup.*
|
|
151
152
|
|
|
152
|
-
### 4.
|
|
153
|
+
### 4. Customizing the Voice (ElevenLabs)
|
|
154
|
+
|
|
155
|
+
If you prefer to use **ElevenLabs** for ultra-realistic cloud TTS instead of the default local Kokoro engine, you can easily configure it using Environment Variables!
|
|
156
|
+
|
|
157
|
+
> [!WARNING]
|
|
158
|
+
> **Privacy Notice:** By configuring and using ElevenLabs, the text generated by your LLM will be transmitted over the internet to ElevenLabs' servers for audio rendering. This data is subject to ElevenLabs' own privacy policies and terms of service. If you require absolute privacy and air-gapped security, do not configure this key and continue using the default local Kokoro engine.
|
|
159
|
+
|
|
160
|
+
When adding the server to your MCP Client (like `claude_desktop_config.json`), simply provide your API key and your preferred Voice ID in the `env` object:
|
|
161
|
+
|
|
162
|
+
```json
|
|
163
|
+
{
|
|
164
|
+
"mcpServers": {
|
|
165
|
+
"voice-mcp-server": {
|
|
166
|
+
"command": "voice-mcp-server",
|
|
167
|
+
"args": [],
|
|
168
|
+
"env": {
|
|
169
|
+
"ELEVENLABS_API_KEY": "sk_your_api_key_here",
|
|
170
|
+
"ELEVENLABS_VOICE_ID": "aEO01A4wXwd1O8GPgGlF"
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
*(If you are using Gemini CLI or Claude Code, you can simply `export` these variables in your terminal profile like `.zshrc`!)*
|
|
177
|
+
|
|
178
|
+
Once configured, simply tell your AI: *"Switch your audio engine to use the elevenlabs_speaker adapter."*
|
|
179
|
+
|
|
180
|
+
### 5. Uninstalling
|
|
153
181
|
|
|
154
182
|
If you wish to completely remove the server and its downloaded ML models from your system to free up space:
|
|
155
183
|
|
|
@@ -192,7 +220,7 @@ If you wish to contribute to the project or run it from source, follow these ste
|
|
|
192
220
|
|
|
193
221
|
Once connected, test the server by sending this prompt to your AI:
|
|
194
222
|
|
|
195
|
-
> *"Let's test your voice capabilities! Please use the
|
|
223
|
+
> *"Let's test your voice capabilities! Please introduce yourself, seamlessly tell me how to use the Right Option key to interact with you, and then start telling me a long story about a brave robot. I will practice using the Right Option key to interrupt you mid-story. When I interrupt, stop the story instantly, acknowledge my interruption naturally, and ask what we should work on instead."*
|
|
196
224
|
|
|
197
225
|
-----
|
|
198
226
|
|
package/config/config.yaml
CHANGED
|
@@ -9,7 +9,7 @@ defaults:
|
|
|
9
9
|
- speaker: kokoro_speaker
|
|
10
10
|
|
|
11
11
|
# Available VADs:
|
|
12
|
-
# - ptt_vad: Walkie-Talkie mode (Hold '
|
|
12
|
+
# - ptt_vad: Walkie-Talkie mode (Hold 'Right Option' to talk. Instant response. Ignores TV/noise).
|
|
13
13
|
# - silero_vad: Conversational AI mode (Listens automatically. Tuned for 1-second barge-ins).
|
|
14
14
|
- vad: ptt_vad
|
|
15
15
|
|
package/config/vad/ptt_vad.yaml
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "voice-mcp-server",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "An MCP server to allow LLMs to speak and listen via bidirectional voice loops",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"type": "module",
|
|
@@ -30,6 +30,10 @@
|
|
|
30
30
|
],
|
|
31
31
|
"author": "Erick Vazquez Santillan",
|
|
32
32
|
"license": "MIT",
|
|
33
|
+
"repository": {
|
|
34
|
+
"type": "git",
|
|
35
|
+
"url": "git+https://github.com/erickvs/voice-mcp-server.git"
|
|
36
|
+
},
|
|
33
37
|
"dependencies": {
|
|
34
38
|
"@modelcontextprotocol/sdk": "^1.5.0"
|
|
35
39
|
},
|
package/requirements.txt
CHANGED
|
@@ -16,7 +16,7 @@ class ElevenLabsSpeaker(ISpeaker):
|
|
|
16
16
|
self.words = []
|
|
17
17
|
self.process = None
|
|
18
18
|
self.start_time = 0
|
|
19
|
-
self.voice_id = voice_id
|
|
19
|
+
self.voice_id = os.getenv("ELEVENLABS_VOICE_ID", voice_id)
|
|
20
20
|
self.api_key = os.getenv("ELEVENLABS_API_KEY")
|
|
21
21
|
self.temp_file = "/tmp/elevenlabs_output.mp3"
|
|
22
22
|
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from logger import logger
|
|
1
2
|
import os
|
|
2
3
|
import time
|
|
3
4
|
import subprocess
|
|
@@ -17,7 +18,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
17
18
|
self.start_time = 0
|
|
18
19
|
self.temp_file = "/tmp/kokoro_output.wav"
|
|
19
20
|
|
|
20
|
-
|
|
21
|
+
logger.info(f"Loading local Kokoro TTS model (Voice: {voice})...")
|
|
21
22
|
# Load the pipeline. Since you are on M4 Max, we will try to use MPS if available
|
|
22
23
|
if torch.backends.mps.is_available():
|
|
23
24
|
self.device = "mps"
|
|
@@ -30,7 +31,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
30
31
|
# We use lang_code 'a' for American English
|
|
31
32
|
self.pipeline = KPipeline(lang_code='a', device=self.device)
|
|
32
33
|
self.voice = voice
|
|
33
|
-
|
|
34
|
+
logger.info(f"Kokoro TTS loaded successfully on {self.device}.")
|
|
34
35
|
|
|
35
36
|
def speak(self, text: str):
|
|
36
37
|
if not text.strip():
|
|
@@ -40,7 +41,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
40
41
|
self.words = text.split()
|
|
41
42
|
|
|
42
43
|
try:
|
|
43
|
-
|
|
44
|
+
logger.debug(f"Generating Kokoro audio for: {text[:50]}...")
|
|
44
45
|
# Generate the audio locally
|
|
45
46
|
generator = self.pipeline(
|
|
46
47
|
text, voice=self.voice, # <= change voice here
|
|
@@ -54,14 +55,14 @@ class KokoroSpeaker(ISpeaker):
|
|
|
54
55
|
audio_segments.append(audio)
|
|
55
56
|
|
|
56
57
|
if not audio_segments:
|
|
57
|
-
|
|
58
|
+
logger.warning("Kokoro generated empty audio.")
|
|
58
59
|
return
|
|
59
60
|
|
|
60
61
|
final_audio = torch.cat(audio_segments, dim=0).cpu().numpy()
|
|
61
62
|
|
|
62
63
|
# Save to temporary file at 24kHz (Kokoro's default sample rate)
|
|
63
64
|
sf.write(self.temp_file, final_audio, 24000)
|
|
64
|
-
|
|
65
|
+
logger.debug("Audio generated, starting playback.")
|
|
65
66
|
|
|
66
67
|
# Play the generated audio using afplay (macOS native)
|
|
67
68
|
self.start_time = time.time()
|
|
@@ -72,7 +73,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
72
73
|
)
|
|
73
74
|
|
|
74
75
|
except Exception as e:
|
|
75
|
-
|
|
76
|
+
logger.error(f"Kokoro Generation Error: {e}")
|
|
76
77
|
# Fallback to macOS say
|
|
77
78
|
self.start_time = time.time()
|
|
78
79
|
self.process = subprocess.Popen(
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from logger import logger
|
|
1
2
|
import pyaudio
|
|
2
3
|
import queue
|
|
3
4
|
from simulation.ports import IMicrophone
|
|
@@ -10,6 +11,7 @@ class LiveMicrophone(IMicrophone):
|
|
|
10
11
|
self.q = queue.Queue(maxsize=100)
|
|
11
12
|
self.p = pyaudio.PyAudio()
|
|
12
13
|
self.stream = None
|
|
14
|
+
logger.info(f"Initialized LiveMicrophone with rate={rate}, chunk={chunk}")
|
|
13
15
|
|
|
14
16
|
def start_stream(self):
|
|
15
17
|
if self.stream is not None:
|
|
@@ -31,12 +33,21 @@ class LiveMicrophone(IMicrophone):
|
|
|
31
33
|
stream_callback=self._callback
|
|
32
34
|
)
|
|
33
35
|
self.stream.start_stream()
|
|
36
|
+
logger.info("LiveMicrophone stream started")
|
|
34
37
|
|
|
35
38
|
def stop_stream(self):
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
39
|
+
stream = self.stream
|
|
40
|
+
self.stream = None
|
|
41
|
+
if stream is not None:
|
|
42
|
+
try:
|
|
43
|
+
stream.stop_stream()
|
|
44
|
+
except OSError as e:
|
|
45
|
+
logger.debug(f"Ignored PyAudio OSError during stop_stream: {e}")
|
|
46
|
+
try:
|
|
47
|
+
stream.close()
|
|
48
|
+
except Exception:
|
|
49
|
+
pass
|
|
50
|
+
logger.info("LiveMicrophone stream stopped")
|
|
40
51
|
|
|
41
52
|
def _callback(self, in_data, frame_count, time_info, status):
|
|
42
53
|
try:
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import IOKit.hid
|
|
3
|
+
import AudioToolbox
|
|
4
|
+
import AppKit
|
|
5
|
+
|
|
6
|
+
var isPTTActive = false
|
|
7
|
+
var isCtrlPressed = false
|
|
8
|
+
var pingID: SystemSoundID = 0
|
|
9
|
+
var popID: SystemSoundID = 0
|
|
10
|
+
|
|
11
|
+
var idleTimer: Timer?
|
|
12
|
+
let IDLE_TIMEOUT: TimeInterval = 900 // 15 minutes
|
|
13
|
+
|
|
14
|
+
func resetIdleTimer() {
|
|
15
|
+
idleTimer?.invalidate()
|
|
16
|
+
idleTimer = Timer.scheduledTimer(withTimeInterval: IDLE_TIMEOUT, repeats: false) { _ in
|
|
17
|
+
print("💤 [SWIFT] Sidecar idle for \(Int(IDLE_TIMEOUT / 60)) minutes. Exiting to save resources.")
|
|
18
|
+
exit(0)
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
// Load uncompressed audio for 0ms latency
|
|
23
|
+
let pingURL = URL(fileURLWithPath: "/System/Library/Sounds/Morse.aiff") as CFURL
|
|
24
|
+
let popURL = URL(fileURLWithPath: "/System/Library/Sounds/Pop.aiff") as CFURL
|
|
25
|
+
AudioServicesCreateSystemSoundID(pingURL, &pingID)
|
|
26
|
+
AudioServicesCreateSystemSoundID(popURL, &popID)
|
|
27
|
+
|
|
28
|
+
func sendSocketMessage(code: UInt8) -> Bool {
|
|
29
|
+
let fd = socket(AF_UNIX, SOCK_STREAM, 0)
|
|
30
|
+
guard fd >= 0 else { return false }
|
|
31
|
+
defer { close(fd) }
|
|
32
|
+
|
|
33
|
+
var addr = sockaddr_un()
|
|
34
|
+
addr.sun_family = sa_family_t(AF_UNIX)
|
|
35
|
+
|
|
36
|
+
let path = "/tmp/voice_mcp_ptt.sock"
|
|
37
|
+
let pathSize = Int(MemoryLayout.size(ofValue: addr.sun_path))
|
|
38
|
+
_ = withUnsafeMutablePointer(to: &addr.sun_path.0) { ptr in
|
|
39
|
+
path.withCString { cstr in
|
|
40
|
+
strncpy(ptr, cstr, pathSize)
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
let len = socklen_t(MemoryLayout<sockaddr_un>.size)
|
|
45
|
+
let connectResult = withUnsafePointer(to: &addr) {
|
|
46
|
+
$0.withMemoryRebound(to: sockaddr.self, capacity: 1) { connect(fd, $0, len) }
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
if connectResult == 0 {
|
|
50
|
+
var byte: UInt8 = code
|
|
51
|
+
write(fd, &byte, 1)
|
|
52
|
+
return true
|
|
53
|
+
}
|
|
54
|
+
return false
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
func isTerminalFrontmost() -> Bool {
|
|
58
|
+
guard let frontApp = NSWorkspace.shared.frontmostApplication else { return false }
|
|
59
|
+
let bundleID = frontApp.bundleIdentifier ?? ""
|
|
60
|
+
// Add common terminal emulators and editors
|
|
61
|
+
let allowedTerminals = [
|
|
62
|
+
"com.apple.Terminal",
|
|
63
|
+
"com.googlecode.iterm2",
|
|
64
|
+
"dev.warp.Warp-Stable",
|
|
65
|
+
"co.zeit.hyper",
|
|
66
|
+
"com.mitchellh.ghostty",
|
|
67
|
+
"net.kovidgoyal.kitty",
|
|
68
|
+
"org.alacritty",
|
|
69
|
+
"com.anthropic.claudedesktop",
|
|
70
|
+
"com.microsoft.VSCode",
|
|
71
|
+
"com.todesktop.Cursor"
|
|
72
|
+
]
|
|
73
|
+
return allowedTerminals.contains(bundleID)
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
var lastPressTime: TimeInterval = 0
|
|
77
|
+
var lastReleaseTime: TimeInterval = 0
|
|
78
|
+
let DOUBLE_TAP_THRESHOLD: TimeInterval = 0.4 // 400 milliseconds
|
|
79
|
+
|
|
80
|
+
let hidCallback: IOHIDValueCallback = { context, result, sender, value in
|
|
81
|
+
let element = IOHIDValueGetElement(value)
|
|
82
|
+
let usagePage = IOHIDElementGetUsagePage(element)
|
|
83
|
+
let usage = IOHIDElementGetUsage(element)
|
|
84
|
+
let intValue = IOHIDValueGetIntegerValue(value)
|
|
85
|
+
|
|
86
|
+
// 0x07 = HID Keyboard/Keypad usage page
|
|
87
|
+
if usagePage == 0x07 {
|
|
88
|
+
let isPressed = (intValue == 1)
|
|
89
|
+
|
|
90
|
+
// 0xE6 = Right Option
|
|
91
|
+
if usage == 0xE6 {
|
|
92
|
+
// Only process events if our terminal is the active window!
|
|
93
|
+
if isTerminalFrontmost() {
|
|
94
|
+
let now = Date().timeIntervalSince1970
|
|
95
|
+
|
|
96
|
+
if isPressed && !isPTTActive {
|
|
97
|
+
resetIdleTimer()
|
|
98
|
+
|
|
99
|
+
// Check for Double-Tap!
|
|
100
|
+
// If the time since the LAST release is very short, and the time
|
|
101
|
+
// since the LAST press is also very short, this is the second press of a double-tap.
|
|
102
|
+
if (now - lastReleaseTime) < DOUBLE_TAP_THRESHOLD && (now - lastPressTime) < DOUBLE_TAP_THRESHOLD {
|
|
103
|
+
// Abort signal!
|
|
104
|
+
if sendSocketMessage(code: 2) {
|
|
105
|
+
print("🚨 [SWIFT] -> DOUBLE TAP DETECTED! Transmitted 0x02 (Abort)")
|
|
106
|
+
AudioServicesPlaySystemSound(popID) // Play pop to confirm abort
|
|
107
|
+
}
|
|
108
|
+
// Reset timestamps so we don't accidentally triple-tap
|
|
109
|
+
lastPressTime = 0
|
|
110
|
+
lastReleaseTime = 0
|
|
111
|
+
return
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Normal Single Press
|
|
115
|
+
lastPressTime = now
|
|
116
|
+
if sendSocketMessage(code: 1) {
|
|
117
|
+
isPTTActive = true
|
|
118
|
+
AudioServicesPlaySystemSound(pingID)
|
|
119
|
+
print("[SWIFT] -> Transmitted 0x01 (Press)")
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
} else if !isPressed && isPTTActive {
|
|
123
|
+
lastReleaseTime = now
|
|
124
|
+
isPTTActive = false
|
|
125
|
+
|
|
126
|
+
// Normal Release
|
|
127
|
+
_ = sendSocketMessage(code: 0)
|
|
128
|
+
AudioServicesPlaySystemSound(popID)
|
|
129
|
+
print("[SWIFT] -> Transmitted 0x00 (Release)")
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
let manager = IOHIDManagerCreate(kCFAllocatorDefault, IOOptionBits(kIOHIDOptionsTypeNone))
|
|
137
|
+
let deviceMatch: [String: Any] = ["DeviceUsagePage": 1, "DeviceUsage": 6]
|
|
138
|
+
IOHIDManagerSetDeviceMatching(manager, deviceMatch as CFDictionary)
|
|
139
|
+
IOHIDManagerRegisterInputValueCallback(manager, hidCallback, nil)
|
|
140
|
+
IOHIDManagerScheduleWithRunLoop(manager, CFRunLoopGetMain(), CFRunLoopMode.defaultMode.rawValue)
|
|
141
|
+
|
|
142
|
+
let openResult = IOHIDManagerOpen(manager, IOOptionBits(kIOHIDOptionsTypeNone))
|
|
143
|
+
if openResult != kIOReturnSuccess {
|
|
144
|
+
print("❌ FATAL: macOS blocked hardware access.")
|
|
145
|
+
print("👉 ACTION REQUIRED: Open System Settings -> Privacy & Security -> Input Monitoring.")
|
|
146
|
+
print("👉 Add your Terminal application, toggle it ON, completely restart the terminal, and try again.")
|
|
147
|
+
exit(1)
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
print("✅ [SWIFT] Sidecar Online with Context-Aware Focus Filter.")
|
|
151
|
+
print("🎧 [SWIFT] Listening natively for Right Option (Hardware Matrix 0xE6)...")
|
|
152
|
+
print("🔒 [SWIFT] Mic will ONLY open if a Terminal window is currently active.")
|
|
153
|
+
|
|
154
|
+
resetIdleTimer() // Start the idle timer initially
|
|
155
|
+
|
|
156
|
+
CFRunLoopRun()
|
|
@@ -1,36 +1,154 @@
|
|
|
1
|
-
from
|
|
1
|
+
from logger import logger
|
|
2
|
+
import threading
|
|
3
|
+
import subprocess
|
|
4
|
+
import socket
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import atexit
|
|
8
|
+
import http.client
|
|
2
9
|
from simulation.ports import IVAD
|
|
3
10
|
from simulation.models import VirtualAudioFrame
|
|
4
11
|
|
|
12
|
+
SOCKET_PATH = "/tmp/voice_mcp_ptt.sock"
|
|
13
|
+
|
|
14
|
+
class UDSHTTPConnection(http.client.HTTPConnection):
|
|
15
|
+
def __init__(self, socket_path, timeout=300.0):
|
|
16
|
+
super().__init__("localhost", timeout=timeout)
|
|
17
|
+
self.socket_path = socket_path
|
|
18
|
+
|
|
19
|
+
def connect(self):
|
|
20
|
+
self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
21
|
+
self.sock.settimeout(self.timeout)
|
|
22
|
+
self.sock.connect(self.socket_path)
|
|
23
|
+
|
|
5
24
|
class PushToTalkVAD(IVAD):
|
|
6
|
-
def __init__(self, key_name="
|
|
7
|
-
self.
|
|
8
|
-
|
|
25
|
+
def __init__(self, key_name="right_option", **kwargs):
|
|
26
|
+
self.lock = threading.Lock()
|
|
27
|
+
self.is_ptt_active = False
|
|
9
28
|
|
|
10
|
-
|
|
11
|
-
key_map = {
|
|
12
|
-
"shift": keyboard.Key.shift,
|
|
13
|
-
"shift_r": keyboard.Key.shift_r,
|
|
14
|
-
"ctrl": keyboard.Key.ctrl,
|
|
15
|
-
"alt": keyboard.Key.alt,
|
|
16
|
-
"cmd": keyboard.Key.cmd,
|
|
17
|
-
"space": keyboard.Key.space
|
|
18
|
-
}
|
|
29
|
+
logger.info("Initializing Push-To-Talk VAD via Swift Sidecar.")
|
|
19
30
|
|
|
20
|
-
self.
|
|
31
|
+
self.sidecar_process = None
|
|
32
|
+
self.server_socket = None
|
|
33
|
+
self.listener_thread = None
|
|
34
|
+
self._stop_event = threading.Event()
|
|
35
|
+
|
|
36
|
+
self._start_sidecar()
|
|
37
|
+
atexit.register(self._cleanup)
|
|
21
38
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
39
|
+
def set_active(self, active: bool):
|
|
40
|
+
if active and self.server_socket is None:
|
|
41
|
+
self._start_server()
|
|
42
|
+
elif not active and self.server_socket is not None:
|
|
43
|
+
self._stop_server()
|
|
25
44
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
45
|
+
def _start_server(self):
|
|
46
|
+
self._stop_event.clear()
|
|
47
|
+
if os.path.exists(SOCKET_PATH):
|
|
48
|
+
try:
|
|
49
|
+
os.remove(SOCKET_PATH)
|
|
50
|
+
except OSError:
|
|
51
|
+
pass
|
|
52
|
+
|
|
53
|
+
self.server_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
54
|
+
self.server_socket.bind(SOCKET_PATH)
|
|
55
|
+
self.server_socket.listen(1)
|
|
56
|
+
logger.debug(f"PTT socket created at {SOCKET_PATH}")
|
|
57
|
+
|
|
58
|
+
self.listener_thread = threading.Thread(target=self._listen_loop, daemon=True)
|
|
59
|
+
self.listener_thread.start()
|
|
60
|
+
|
|
61
|
+
def _stop_server(self):
|
|
62
|
+
self._stop_event.set()
|
|
63
|
+
if self.server_socket:
|
|
64
|
+
try:
|
|
65
|
+
self.server_socket.close()
|
|
66
|
+
except Exception:
|
|
67
|
+
pass
|
|
68
|
+
self.server_socket = None
|
|
69
|
+
|
|
70
|
+
if self.listener_thread:
|
|
71
|
+
self.listener_thread.join(timeout=1.0)
|
|
72
|
+
self.listener_thread = None
|
|
73
|
+
|
|
74
|
+
if os.path.exists(SOCKET_PATH):
|
|
75
|
+
try:
|
|
76
|
+
os.remove(SOCKET_PATH)
|
|
77
|
+
except OSError:
|
|
78
|
+
pass
|
|
79
|
+
|
|
80
|
+
with self.lock:
|
|
81
|
+
self.is_ptt_active = False
|
|
82
|
+
|
|
83
|
+
def _start_sidecar(self):
|
|
84
|
+
try:
|
|
85
|
+
output = subprocess.check_output(["pgrep", "-x", "ptt_sidecar"])
|
|
86
|
+
if len(output.strip()) > 0:
|
|
87
|
+
logger.debug("Swift Sidecar is already running.")
|
|
88
|
+
return
|
|
89
|
+
except subprocess.CalledProcessError:
|
|
90
|
+
pass
|
|
91
|
+
|
|
92
|
+
sidecar_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ptt_sidecar")
|
|
93
|
+
if not os.path.exists(sidecar_path):
|
|
94
|
+
logger.info(f"Compiling Swift Sidecar at {sidecar_path}...")
|
|
95
|
+
swift_src = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ptt_sidecar.swift")
|
|
96
|
+
subprocess.run(["swiftc", swift_src, "-o", sidecar_path])
|
|
97
|
+
|
|
98
|
+
if os.path.exists(sidecar_path):
|
|
99
|
+
self.sidecar_process = subprocess.Popen(
|
|
100
|
+
[sidecar_path],
|
|
101
|
+
stdout=sys.stdout,
|
|
102
|
+
stderr=sys.stderr,
|
|
103
|
+
start_new_session=True
|
|
104
|
+
)
|
|
105
|
+
logger.info("Swift Sidecar started.")
|
|
106
|
+
else:
|
|
107
|
+
logger.error("Failed to start Swift Sidecar, executable not found.")
|
|
29
108
|
|
|
30
|
-
|
|
31
|
-
self.
|
|
109
|
+
def _listen_loop(self):
|
|
110
|
+
while not self._stop_event.is_set():
|
|
111
|
+
try:
|
|
112
|
+
if not self.server_socket:
|
|
113
|
+
break
|
|
114
|
+
conn, _ = self.server_socket.accept()
|
|
115
|
+
with conn:
|
|
116
|
+
while not self._stop_event.is_set():
|
|
117
|
+
data = conn.recv(1)
|
|
118
|
+
if not data:
|
|
119
|
+
break
|
|
120
|
+
|
|
121
|
+
with self.lock:
|
|
122
|
+
if data == b'\x01':
|
|
123
|
+
logger.info("Mic Alive (Right Option Pressed) - Received 0x01")
|
|
124
|
+
self.is_ptt_active = True
|
|
125
|
+
elif data == b'\x00':
|
|
126
|
+
logger.info("Mic Dead (Right Option Released) - Received 0x00")
|
|
127
|
+
self.is_ptt_active = False
|
|
128
|
+
elif data == b'\x02':
|
|
129
|
+
logger.info("Abort (Esc/Ctrl+C Pressed) - Received 0x02. Triggering /abort")
|
|
130
|
+
try:
|
|
131
|
+
daemon_sock = os.path.expanduser("~/Library/Application Support/VoiceMCP/daemon.sock")
|
|
132
|
+
conn_uds = UDSHTTPConnection(daemon_sock, timeout=1.0)
|
|
133
|
+
conn_uds.request("POST", "/abort", body=None, headers={})
|
|
134
|
+
conn_uds.getresponse().read()
|
|
135
|
+
conn_uds.close()
|
|
136
|
+
except Exception as e:
|
|
137
|
+
logger.error(f"Failed to trigger /abort natively: {e}")
|
|
138
|
+
except Exception as e:
|
|
139
|
+
pass
|
|
32
140
|
|
|
33
141
|
def analyze(self, frame: VirtualAudioFrame) -> float:
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
142
|
+
with self.lock:
|
|
143
|
+
return 1.0 if self.is_ptt_active else 0.0
|
|
144
|
+
|
|
145
|
+
def _cleanup(self):
|
|
146
|
+
self._stop_server()
|
|
147
|
+
if self.sidecar_process:
|
|
148
|
+
try:
|
|
149
|
+
self.sidecar_process.terminate()
|
|
150
|
+
except Exception:
|
|
151
|
+
pass
|
|
152
|
+
|
|
153
|
+
def __del__(self):
|
|
154
|
+
self._cleanup()
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from logger import logger
|
|
1
2
|
import numpy as np
|
|
2
3
|
import mlx_whisper
|
|
3
4
|
from typing import List
|
|
@@ -8,7 +9,7 @@ from simulation.models import VirtualAudioFrame
|
|
|
8
9
|
class RealWhisperSTT(ISTT):
|
|
9
10
|
def __init__(self, model_size="mlx-community/whisper-large-v3-mlx"):
|
|
10
11
|
self.model_size = model_size
|
|
11
|
-
|
|
12
|
+
logger.info(f"Preparing MLX Whisper model ({model_size}) for Apple Silicon...")
|
|
12
13
|
# MLX will lazily load and compile the model on the first inference, but we log here to indicate we are using the MLX backend.
|
|
13
14
|
|
|
14
15
|
def transcribe(self, frames: List[VirtualAudioFrame]) -> str:
|
|
@@ -19,14 +20,14 @@ class RealWhisperSTT(ISTT):
|
|
|
19
20
|
# Convert 16-bit PCM (expected from microphone) to float32 [-1.0, 1.0] expected by Whisper
|
|
20
21
|
audio_data = np.frombuffer(raw_bytes, dtype=np.int16).astype(np.float32) / 32768.0
|
|
21
22
|
|
|
22
|
-
|
|
23
|
+
logger.debug(f"Transcribing {len(audio_data)} samples with Apple MLX Whisper ({self.model_size})...")
|
|
23
24
|
|
|
24
25
|
try:
|
|
25
26
|
# We explicitly set English since you are speaking English, and fp16 for Metal acceleration
|
|
26
27
|
result = mlx_whisper.transcribe(audio_data, path_or_hf_repo=self.model_size, language="en")
|
|
27
28
|
text = result.get("text", "").strip()
|
|
28
|
-
|
|
29
|
+
logger.debug(f"MLX Transcription result: {text}")
|
|
29
30
|
return text
|
|
30
31
|
except Exception as e:
|
|
31
|
-
|
|
32
|
+
logger.error(f"MLX Whisper transcription error: {e}")
|
|
32
33
|
return ""
|