voice-mcp-server 0.1.24 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/config/config.yaml +1 -1
- package/config/vad/ptt_vad.yaml +1 -1
- package/package.json +1 -1
- package/requirements.txt +1 -0
- package/src/__pycache__/logger.cpython-312.pyc +0 -0
- package/src/__pycache__/mcp_server.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
- package/src/adapters_real/kokoro_speaker.py +7 -6
- package/src/adapters_real/live_mic.py +15 -4
- package/src/adapters_real/ptt_sidecar +0 -0
- package/src/adapters_real/ptt_sidecar.swift +156 -0
- package/src/adapters_real/ptt_vad.py +143 -25
- package/src/adapters_real/whisper_stt.py +5 -4
- package/src/daemon/__pycache__/audio_server.cpython-312.pyc +0 -0
- package/src/daemon/audio_server.py +52 -15
- package/src/logger.py +29 -0
- package/src/mcp_server.py +143 -15
- package/src/simulation/__pycache__/adapters.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
- package/src/simulation/engine.py +67 -19
- package/src/simulation/tests/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/simulation/tests/__pycache__/test_ptt_vad.cpython-312-pytest-7.4.2.pyc +0 -0
- package/src/simulation/tests/__pycache__/test_scenarios.cpython-312-pytest-7.4.2.pyc +0 -0
- package/src/simulation/tests/test_abort_daemon.py +109 -0
- package/src/simulation/tests/test_mcp_cancellation.py +83 -0
- package/src/simulation/tests/test_ptt_vad.py +81 -0
package/README.md
CHANGED
|
@@ -66,7 +66,7 @@ The system is built on a highly modular adapter pattern configured via `hydra` Y
|
|
|
66
66
|
| | `elevenlabs_speaker` | Premium cloud-based ultra-realistic voices. |
|
|
67
67
|
| **🎙️ Microphones** | `live_mic` | Direct hardware integration via PyAudio. |
|
|
68
68
|
| **🤫 VAD (Activity)** | `silero_vad` | Conversational mode powered by Silero, heavily optimized for 1-second barge-ins. *(Note: **Headphones are strictly required** for this mode to prevent the AI from hearing its own audio output and endlessly interrupting itself).* |
|
|
69
|
-
| | `ptt_vad` | Manual Push-to-Talk / Walkie-Talkie mode. **(Default: Hold '
|
|
69
|
+
| | `ptt_vad` | Manual Push-to-Talk / Walkie-Talkie mode. **(Default: Hold 'Right Option' to talk)** |
|
|
70
70
|
| **📝 STT (Transcription)**| `mlx_whisper_large_v3`| Blazing fast local transcription leveraging Apple's MLX framework. |
|
|
71
71
|
|
|
72
72
|
-----
|
|
@@ -192,7 +192,7 @@ If you wish to contribute to the project or run it from source, follow these ste
|
|
|
192
192
|
|
|
193
193
|
Once connected, test the server by sending this prompt to your AI:
|
|
194
194
|
|
|
195
|
-
> *"Let's test your voice capabilities! Please use the
|
|
195
|
+
> *"Let's test your voice capabilities! Please introduce yourself, seamlessly tell me how to use the Right Option key to interact with you, and then start telling me a long story about a brave robot. I will practice using the Right Option key to interrupt you mid-story. When I interrupt, stop the story instantly, acknowledge my interruption naturally, and ask what we should work on instead."*
|
|
196
196
|
|
|
197
197
|
-----
|
|
198
198
|
|
package/config/config.yaml
CHANGED
|
@@ -9,7 +9,7 @@ defaults:
|
|
|
9
9
|
- speaker: kokoro_speaker
|
|
10
10
|
|
|
11
11
|
# Available VADs:
|
|
12
|
-
# - ptt_vad: Walkie-Talkie mode (Hold '
|
|
12
|
+
# - ptt_vad: Walkie-Talkie mode (Hold 'Right Option' to talk. Instant response. Ignores TV/noise).
|
|
13
13
|
# - silero_vad: Conversational AI mode (Listens automatically. Tuned for 1-second barge-ins).
|
|
14
14
|
- vad: ptt_vad
|
|
15
15
|
|
package/config/vad/ptt_vad.yaml
CHANGED
package/package.json
CHANGED
package/requirements.txt
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from logger import logger
|
|
1
2
|
import os
|
|
2
3
|
import time
|
|
3
4
|
import subprocess
|
|
@@ -17,7 +18,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
17
18
|
self.start_time = 0
|
|
18
19
|
self.temp_file = "/tmp/kokoro_output.wav"
|
|
19
20
|
|
|
20
|
-
|
|
21
|
+
logger.info(f"Loading local Kokoro TTS model (Voice: {voice})...")
|
|
21
22
|
# Load the pipeline. Since you are on M4 Max, we will try to use MPS if available
|
|
22
23
|
if torch.backends.mps.is_available():
|
|
23
24
|
self.device = "mps"
|
|
@@ -30,7 +31,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
30
31
|
# We use lang_code 'a' for American English
|
|
31
32
|
self.pipeline = KPipeline(lang_code='a', device=self.device)
|
|
32
33
|
self.voice = voice
|
|
33
|
-
|
|
34
|
+
logger.info(f"Kokoro TTS loaded successfully on {self.device}.")
|
|
34
35
|
|
|
35
36
|
def speak(self, text: str):
|
|
36
37
|
if not text.strip():
|
|
@@ -40,7 +41,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
40
41
|
self.words = text.split()
|
|
41
42
|
|
|
42
43
|
try:
|
|
43
|
-
|
|
44
|
+
logger.debug(f"Generating Kokoro audio for: {text[:50]}...")
|
|
44
45
|
# Generate the audio locally
|
|
45
46
|
generator = self.pipeline(
|
|
46
47
|
text, voice=self.voice, # <= change voice here
|
|
@@ -54,14 +55,14 @@ class KokoroSpeaker(ISpeaker):
|
|
|
54
55
|
audio_segments.append(audio)
|
|
55
56
|
|
|
56
57
|
if not audio_segments:
|
|
57
|
-
|
|
58
|
+
logger.warning("Kokoro generated empty audio.")
|
|
58
59
|
return
|
|
59
60
|
|
|
60
61
|
final_audio = torch.cat(audio_segments, dim=0).cpu().numpy()
|
|
61
62
|
|
|
62
63
|
# Save to temporary file at 24kHz (Kokoro's default sample rate)
|
|
63
64
|
sf.write(self.temp_file, final_audio, 24000)
|
|
64
|
-
|
|
65
|
+
logger.debug("Audio generated, starting playback.")
|
|
65
66
|
|
|
66
67
|
# Play the generated audio using afplay (macOS native)
|
|
67
68
|
self.start_time = time.time()
|
|
@@ -72,7 +73,7 @@ class KokoroSpeaker(ISpeaker):
|
|
|
72
73
|
)
|
|
73
74
|
|
|
74
75
|
except Exception as e:
|
|
75
|
-
|
|
76
|
+
logger.error(f"Kokoro Generation Error: {e}")
|
|
76
77
|
# Fallback to macOS say
|
|
77
78
|
self.start_time = time.time()
|
|
78
79
|
self.process = subprocess.Popen(
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from logger import logger
|
|
1
2
|
import pyaudio
|
|
2
3
|
import queue
|
|
3
4
|
from simulation.ports import IMicrophone
|
|
@@ -10,6 +11,7 @@ class LiveMicrophone(IMicrophone):
|
|
|
10
11
|
self.q = queue.Queue(maxsize=100)
|
|
11
12
|
self.p = pyaudio.PyAudio()
|
|
12
13
|
self.stream = None
|
|
14
|
+
logger.info(f"Initialized LiveMicrophone with rate={rate}, chunk={chunk}")
|
|
13
15
|
|
|
14
16
|
def start_stream(self):
|
|
15
17
|
if self.stream is not None:
|
|
@@ -31,12 +33,21 @@ class LiveMicrophone(IMicrophone):
|
|
|
31
33
|
stream_callback=self._callback
|
|
32
34
|
)
|
|
33
35
|
self.stream.start_stream()
|
|
36
|
+
logger.info("LiveMicrophone stream started")
|
|
34
37
|
|
|
35
38
|
def stop_stream(self):
    """Stop and close the current PyAudio stream, if there is one.

    ``self.stream`` is cleared before the stream is touched, so a second
    call is a no-op. Errors raised while stopping or closing are logged at
    debug level and otherwise ignored.
    """
    stream = self.stream
    self.stream = None
    if stream is None:
        return

    try:
        stream.stop_stream()
    except OSError as e:
        logger.debug(f"Ignored PyAudio OSError during stop_stream: {e}")

    try:
        stream.close()
    except Exception as e:
        # Best-effort close, but leave a trace instead of failing silently.
        logger.debug(f"Ignored error while closing PyAudio stream: {e}")

    logger.info("LiveMicrophone stream stopped")
|
|
40
51
|
|
|
41
52
|
def _callback(self, in_data, frame_count, time_info, status):
|
|
42
53
|
try:
|
|
Binary file
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import IOKit.hid
|
|
3
|
+
import AudioToolbox
|
|
4
|
+
import AppKit
|
|
5
|
+
|
|
6
|
+
// True between a transmitted press (0x01) and its matching release (0x00).
var isPTTActive: Bool = false
// NOTE(review): not referenced anywhere else in this file — confirm before removing.
var isCtrlPressed: Bool = false
// System sound handles, filled in by AudioServicesCreateSystemSoundID below.
var pingID: SystemSoundID = 0
var popID: SystemSoundID = 0

// Pending idle-shutdown timer (nil until resetIdleTimer() first runs).
var idleTimer: Timer? = nil
let IDLE_TIMEOUT: TimeInterval = 15 * 60 // 15 minutes, in seconds
|
|
13
|
+
|
|
14
|
+
/// Restarts the idle countdown.
///
/// Any pending timer is cancelled and a fresh one-shot timer is scheduled;
/// if it fires (no reset for `IDLE_TIMEOUT` seconds) the process exits with
/// status 0.
func resetIdleTimer() {
    if let pending = idleTimer {
        pending.invalidate()
    }

    let shutdownTimer = Timer.scheduledTimer(withTimeInterval: IDLE_TIMEOUT, repeats: false) { _ in
        print("💤 [SWIFT] Sidecar idle for \(Int(IDLE_TIMEOUT / 60)) minutes. Exiting to save resources.")
        exit(0)
    }
    idleTimer = shutdownTimer
}
|
|
21
|
+
|
|
22
|
+
// Register the feedback sounds once at startup so that playing them later
// is a single AudioServicesPlaySystemSound call with no file loading.
let pingURL = URL(fileURLWithPath: "/System/Library/Sounds/Morse.aiff") as CFURL
let popURL = URL(fileURLWithPath: "/System/Library/Sounds/Pop.aiff") as CFURL

// The sounds are only audible feedback, so a failed registration is
// reported but does not stop the sidecar.
if AudioServicesCreateSystemSoundID(pingURL, &pingID) != kAudioServicesNoError {
    print("⚠️ [SWIFT] Could not load press sound: /System/Library/Sounds/Morse.aiff")
}
if AudioServicesCreateSystemSoundID(popURL, &popID) != kAudioServicesNoError {
    print("⚠️ [SWIFT] Could not load release sound: /System/Library/Sounds/Pop.aiff")
}
|
|
27
|
+
|
|
28
|
+
/// Sends a single event byte to the Python listener over its Unix socket.
///
/// A fresh connection to `/tmp/voice_mcp_ptt.sock` is opened for every
/// event and closed again before returning.
///
/// - Parameter code: Event byte to transmit (this file sends 0 = release,
///   1 = press, 2 = double-tap abort).
/// - Returns: `true` only if the connection succeeded AND the byte was
///   actually written; `false` on any failure (no listener, socket error,
///   failed write).
func sendSocketMessage(code: UInt8) -> Bool {
    let fd = socket(AF_UNIX, SOCK_STREAM, 0)
    guard fd >= 0 else { return false }
    defer { close(fd) }

    // If the listener closes its end first, write() must fail with EPIPE
    // instead of delivering SIGPIPE, which would terminate the sidecar.
    var noSigPipe: Int32 = 1
    _ = setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &noSigPipe, socklen_t(MemoryLayout<Int32>.size))

    var addr = sockaddr_un()
    addr.sun_family = sa_family_t(AF_UNIX)

    let path = "/tmp/voice_mcp_ptt.sock"
    let capacity = MemoryLayout.size(ofValue: addr.sun_path)
    // Leave room for the terminating NUL byte.
    guard path.utf8.count < capacity else { return false }

    // Copy through a pointer to the whole sun_path buffer rather than to
    // its first element only.
    withUnsafeMutablePointer(to: &addr.sun_path) { tuplePtr in
        tuplePtr.withMemoryRebound(to: CChar.self, capacity: capacity) { dst in
            path.withCString { src in
                _ = strncpy(dst, src, capacity - 1)
            }
        }
    }

    let len = socklen_t(MemoryLayout<sockaddr_un>.size)
    let connectResult = withUnsafePointer(to: &addr) {
        $0.withMemoryRebound(to: sockaddr.self, capacity: 1) { connect(fd, $0, len) }
    }
    guard connectResult == 0 else { return false }

    var byte: UInt8 = code
    return write(fd, &byte, 1) == 1
}
|
|
56
|
+
|
|
57
|
+
/// Reports whether the frontmost application is one of the allow-listed
/// terminal emulators / editors.
///
/// - Returns: `false` when there is no frontmost application, when it has
///   no bundle identifier, or when the identifier is not in the list.
func isTerminalFrontmost() -> Bool {
    // Bundle identifiers of the terminals and editors that may trigger PTT.
    let allowedBundleIDs: Set<String> = [
        "com.apple.Terminal",
        "com.googlecode.iterm2",
        "dev.warp.Warp-Stable",
        "co.zeit.hyper",
        "com.mitchellh.ghostty",
        "net.kovidgoyal.kitty",
        "org.alacritty",
        "com.anthropic.claudedesktop",
        "com.microsoft.VSCode",
        "com.todesktop.Cursor",
    ]

    guard let activeID = NSWorkspace.shared.frontmostApplication?.bundleIdentifier else {
        return false
    }
    return allowedBundleIDs.contains(activeID)
}
|
|
75
|
+
|
|
76
|
+
// Timestamps of the most recent press / release, used for double-tap
// detection. Values come from the monotonic system uptime clock; 0 means
// "no recent event".
var lastPressTime: TimeInterval = 0
var lastReleaseTime: TimeInterval = 0
let DOUBLE_TAP_THRESHOLD: TimeInterval = 0.4 // 400 milliseconds

/// HID input-value callback implementing push-to-talk on Right Option.
///
/// - Press (while idle, and only when an allow-listed app is frontmost):
///   sends 0x01, or 0x02 if the press completes a double-tap.
/// - Release (while active): always sends 0x00, even if focus has moved to
///   another application while the key was held — otherwise the listener
///   would never learn that the key was released.
let hidCallback: IOHIDValueCallback = { context, result, sender, value in
    let element = IOHIDValueGetElement(value)

    // Usage page 0x07 is the HID Keyboard/Keypad page; usage 0xE6 on that
    // page is Right Alt, i.e. Right Option on Apple keyboards.
    guard IOHIDElementGetUsagePage(element) == 0x07,
          IOHIDElementGetUsage(element) == 0xE6 else { return }

    let isPressed = (IOHIDValueGetIntegerValue(value) == 1)
    // Monotonic clock: unaffected by wall-clock adjustments.
    let now = ProcessInfo.processInfo.systemUptime

    if !isPressed {
        // Normal Release — deliberately NOT gated on the focus filter.
        guard isPTTActive else { return }
        lastReleaseTime = now
        isPTTActive = false

        _ = sendSocketMessage(code: 0)
        AudioServicesPlaySystemSound(popID)
        print("[SWIFT] -> Transmitted 0x00 (Release)")
        return
    }

    // Only start a new press if we are idle and our terminal is the active window.
    guard !isPTTActive, isTerminalFrontmost() else { return }
    resetIdleTimer()

    // Double-tap: both the previous release and the previous press happened
    // within the threshold, so this is the second press of a quick pair.
    let sinceRelease = now - lastReleaseTime
    let sincePress = now - lastPressTime
    if sinceRelease < DOUBLE_TAP_THRESHOLD && sincePress < DOUBLE_TAP_THRESHOLD {
        // Abort signal!
        if sendSocketMessage(code: 2) {
            print("🚨 [SWIFT] -> DOUBLE TAP DETECTED! Transmitted 0x02 (Abort)")
            AudioServicesPlaySystemSound(popID) // Play pop to confirm abort
        }
        // Reset timestamps so a third quick tap is not read as another double-tap.
        lastPressTime = 0
        lastReleaseTime = 0
        return
    }

    // Normal Single Press
    lastPressTime = now
    if sendSocketMessage(code: 1) {
        isPTTActive = true
        AudioServicesPlaySystemSound(pingID)
        print("[SWIFT] -> Transmitted 0x01 (Press)")
    }
}
|
|
135
|
+
|
|
136
|
+
// --- HID manager setup -------------------------------------------------

let manager = IOHIDManagerCreate(kCFAllocatorDefault, IOOptionBits(kIOHIDOptionsTypeNone))

// Match devices reporting usage page 1 / usage 6 (HID Generic Desktop / Keyboard).
let deviceMatch: [String: Any] = ["DeviceUsagePage": 1, "DeviceUsage": 6]
IOHIDManagerSetDeviceMatching(manager, deviceMatch as CFDictionary)

// Deliver every input value to hidCallback on the main run loop.
IOHIDManagerRegisterInputValueCallback(manager, hidCallback, nil)
IOHIDManagerScheduleWithRunLoop(manager, CFRunLoopGetMain(), CFRunLoopMode.defaultMode.rawValue)

let openResult = IOHIDManagerOpen(manager, IOOptionBits(kIOHIDOptionsTypeNone))
guard openResult == kIOReturnSuccess else {
    // Opening failed: tell the user how to grant access, then exit with status 1.
    for line in [
        "❌ FATAL: macOS blocked hardware access.",
        "👉 ACTION REQUIRED: Open System Settings -> Privacy & Security -> Input Monitoring.",
        "👉 Add your Terminal application, toggle it ON, completely restart the terminal, and try again.",
    ] {
        print(line)
    }
    exit(1)
}

for line in [
    "✅ [SWIFT] Sidecar Online with Context-Aware Focus Filter.",
    "🎧 [SWIFT] Listening natively for Right Option (Hardware Matrix 0xE6)...",
    "🔒 [SWIFT] Mic will ONLY open if a Terminal window is currently active.",
] {
    print(line)
}

resetIdleTimer() // Start the idle timer initially

// Blocks here, servicing HID callbacks and the idle timer.
CFRunLoopRun()
|
|
@@ -1,36 +1,154 @@
|
|
|
1
|
-
from
|
|
1
|
+
from logger import logger
|
|
2
|
+
import threading
|
|
3
|
+
import subprocess
|
|
4
|
+
import socket
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import atexit
|
|
8
|
+
import http.client
|
|
2
9
|
from simulation.ports import IVAD
|
|
3
10
|
from simulation.models import VirtualAudioFrame
|
|
4
11
|
|
|
12
|
+
SOCKET_PATH = "/tmp/voice_mcp_ptt.sock"
|
|
13
|
+
|
|
14
|
+
class UDSHTTPConnection(http.client.HTTPConnection):
    """HTTP connection that talks over a Unix domain socket instead of TCP.

    Args:
        socket_path: Filesystem path of the Unix socket to connect to.
        timeout: Socket timeout in seconds, applied to connect and I/O.
    """

    def __init__(self, socket_path, timeout=300.0):
        # The host is only a placeholder; connect() ignores it.
        super().__init__("localhost", timeout=timeout)
        self.socket_path = socket_path

    def connect(self):
        """Open the Unix socket and install it as this connection's socket.

        Raises:
            OSError: If the socket cannot be connected. The half-opened
                socket is closed first and ``self.sock`` is left untouched.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            sock.settimeout(self.timeout)
            sock.connect(self.socket_path)
        except OSError:
            # Do not leak the descriptor when the peer is not there.
            sock.close()
            raise
        self.sock = sock
|
|
23
|
+
|
|
5
24
|
class PushToTalkVAD(IVAD):
    """Push-to-talk activity detector driven by the Swift ``ptt_sidecar``.

    While active, the instance listens on the Unix socket at ``SOCKET_PATH``
    and reads one byte per event: ``0x01`` marks the key as pressed,
    ``0x00`` as released, and ``0x02`` requests an abort, which is forwarded
    to the daemon as ``POST /abort``. ``analyze`` reports 1.0 while the key
    is held and 0.0 otherwise.
    """

    def __init__(self, key_name="right_option", **kwargs):
        """Launch the sidecar (if needed) and register cleanup at exit.

        Args:
            key_name: Accepted for configuration compatibility; not read by
                this implementation.
            **kwargs: Ignored extra configuration values.
        """
        self.lock = threading.Lock()  # guards is_ptt_active
        self.is_ptt_active = False

        logger.info("Initializing Push-To-Talk VAD via Swift Sidecar.")

        self.sidecar_process = None
        self.server_socket = None
        self.listener_thread = None
        self._stop_event = threading.Event()

        self._start_sidecar()
        atexit.register(self._cleanup)

    def set_active(self, active: bool):
        """Start the event listener when ``active``, stop it otherwise."""
        if active and self.server_socket is None:
            self._start_server()
        elif not active and self.server_socket is not None:
            self._stop_server()

    @staticmethod
    def _remove_socket_file():
        """Delete the socket file if present; failures are ignored."""
        try:
            os.remove(SOCKET_PATH)
        except OSError:
            pass

    def _start_server(self):
        """Bind the Unix socket and start the background listener thread.

        Raises:
            OSError: If the socket cannot be bound or put into listening
                mode. No half-initialised socket is kept in that case.
        """
        self._stop_event.clear()
        # A stale socket file from a previous run would make bind() fail.
        self._remove_socket_file()

        server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            server.bind(SOCKET_PATH)
            server.listen(1)
        except OSError:
            server.close()
            raise
        # Publish the socket only once it is really listening, so that
        # set_active() never mistakes a failed start for a running server.
        self.server_socket = server
        logger.debug(f"PTT socket created at {SOCKET_PATH}")

        self.listener_thread = threading.Thread(target=self._listen_loop, daemon=True)
        self.listener_thread.start()

    def _stop_server(self):
        """Stop the listener, remove the socket file and clear the PTT flag."""
        self._stop_event.set()
        if self.server_socket:
            try:
                self.server_socket.close()
            except Exception:
                pass
            self.server_socket = None

        if self.listener_thread:
            # Bounded wait: the thread is a daemon and may still be blocked.
            self.listener_thread.join(timeout=1.0)
            self.listener_thread = None

        self._remove_socket_file()

        with self.lock:
            self.is_ptt_active = False

    def _start_sidecar(self):
        """Start the ``ptt_sidecar`` binary, compiling it first if missing.

        Does nothing when ``pgrep`` reports a running ``ptt_sidecar``.
        Failures to compile or launch are logged rather than raised.
        """
        try:
            output = subprocess.check_output(["pgrep", "-x", "ptt_sidecar"])
            if output.strip():
                logger.debug("Swift Sidecar is already running.")
                return
        except (subprocess.CalledProcessError, OSError):
            # Non-zero exit means no match; OSError means pgrep could not
            # be executed. Either way, try to start the sidecar ourselves.
            pass

        here = os.path.dirname(os.path.abspath(__file__))
        sidecar_path = os.path.join(here, "ptt_sidecar")
        if not os.path.exists(sidecar_path):
            logger.info(f"Compiling Swift Sidecar at {sidecar_path}...")
            swift_src = os.path.join(here, "ptt_sidecar.swift")
            try:
                result = subprocess.run(["swiftc", swift_src, "-o", sidecar_path])
                if result.returncode != 0:
                    logger.error(f"swiftc exited with status {result.returncode} while compiling {swift_src}")
            except OSError as e:
                logger.error(f"Could not run swiftc: {e}")

        if not os.path.exists(sidecar_path):
            logger.error("Failed to start Swift Sidecar, executable not found.")
            return

        try:
            self.sidecar_process = subprocess.Popen(
                [sidecar_path],
                stdout=sys.stdout,
                stderr=sys.stderr,
                start_new_session=True,
            )
        except OSError as e:
            logger.error(f"Failed to start Swift Sidecar: {e}")
            return
        logger.info("Swift Sidecar started.")

    def _request_abort(self):
        """Send ``POST /abort`` to the daemon socket; errors are logged."""
        conn_uds = None
        try:
            daemon_sock = os.path.expanduser("~/Library/Application Support/VoiceMCP/daemon.sock")
            conn_uds = UDSHTTPConnection(daemon_sock, timeout=1.0)
            conn_uds.request("POST", "/abort", body=None, headers={})
            conn_uds.getresponse().read()
        except Exception as e:
            logger.error(f"Failed to trigger /abort natively: {e}")
        finally:
            if conn_uds is not None:
                conn_uds.close()

    def _handle_event(self, data):
        """Apply one event byte received from the sidecar."""
        if data == b'\x01':
            logger.info("Mic Alive (Right Option Pressed) - Received 0x01")
            with self.lock:
                self.is_ptt_active = True
        elif data == b'\x00':
            logger.info("Mic Dead (Right Option Released) - Received 0x00")
            with self.lock:
                self.is_ptt_active = False
        elif data == b'\x02':
            logger.info("Abort (Right Option Double-Tap) - Received 0x02. Triggering /abort")
            # Done without holding self.lock: the request can take up to
            # its timeout and must not stall analyze().
            self._request_abort()

    def _listen_loop(self):
        """Accept sidecar connections and process their bytes until stopped."""
        while not self._stop_event.is_set():
            server = self.server_socket
            if not server:
                break
            try:
                conn, _ = server.accept()
                with conn:
                    while not self._stop_event.is_set():
                        data = conn.recv(1)
                        if not data:
                            break
                        self._handle_event(data)
            except Exception as e:
                if self._stop_event.is_set():
                    # Expected: _stop_server() closed the socket under us.
                    break
                logger.debug(f"PTT listener socket error: {e}")
                # Brief pause so a persistent error cannot spin the CPU;
                # returns immediately once a stop is requested.
                self._stop_event.wait(0.05)

    def analyze(self, frame: VirtualAudioFrame) -> float:
        """Return 1.0 while the push-to-talk key is held, else 0.0.

        The audio ``frame`` itself is not inspected.
        """
        with self.lock:
            return 1.0 if self.is_ptt_active else 0.0

    def _cleanup(self):
        """Stop the listener and terminate a sidecar started by this object."""
        self._stop_server()
        if self.sidecar_process:
            try:
                self.sidecar_process.terminate()
            except Exception:
                pass
            # May be called again (atexit and __del__); terminate only once.
            self.sidecar_process = None

    def __del__(self):
        self._cleanup()
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from logger import logger
|
|
1
2
|
import numpy as np
|
|
2
3
|
import mlx_whisper
|
|
3
4
|
from typing import List
|
|
@@ -8,7 +9,7 @@ from simulation.models import VirtualAudioFrame
|
|
|
8
9
|
class RealWhisperSTT(ISTT):
|
|
9
10
|
def __init__(self, model_size="mlx-community/whisper-large-v3-mlx"):
|
|
10
11
|
self.model_size = model_size
|
|
11
|
-
|
|
12
|
+
logger.info(f"Preparing MLX Whisper model ({model_size}) for Apple Silicon...")
|
|
12
13
|
# MLX will lazily load and compile the model on the first inference; the log message above only indicates that the MLX backend is being used.
|
|
13
14
|
|
|
14
15
|
def transcribe(self, frames: List[VirtualAudioFrame]) -> str:
|
|
@@ -19,14 +20,14 @@ class RealWhisperSTT(ISTT):
|
|
|
19
20
|
# Convert 16-bit PCM (expected from microphone) to float32 [-1.0, 1.0] expected by Whisper
|
|
20
21
|
audio_data = np.frombuffer(raw_bytes, dtype=np.int16).astype(np.float32) / 32768.0
|
|
21
22
|
|
|
22
|
-
|
|
23
|
+
logger.debug(f"Transcribing {len(audio_data)} samples with Apple MLX Whisper ({self.model_size})...")
|
|
23
24
|
|
|
24
25
|
try:
|
|
25
26
|
# We explicitly set English since you are speaking English, and fp16 for Metal acceleration
|
|
26
27
|
result = mlx_whisper.transcribe(audio_data, path_or_hf_repo=self.model_size, language="en")
|
|
27
28
|
text = result.get("text", "").strip()
|
|
28
|
-
|
|
29
|
+
logger.debug(f"MLX Transcription result: {text}")
|
|
29
30
|
return text
|
|
30
31
|
except Exception as e:
|
|
31
|
-
|
|
32
|
+
logger.error(f"MLX Whisper transcription error: {e}")
|
|
32
33
|
return ""
|
|
Binary file
|