voice-mcp-server 0.1.25 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/README.md +2 -2
  2. package/config/config.yaml +1 -1
  3. package/config/vad/ptt_vad.yaml +1 -1
  4. package/package.json +1 -1
  5. package/requirements.txt +1 -0
  6. package/src/__pycache__/logger.cpython-312.pyc +0 -0
  7. package/src/__pycache__/mcp_server.cpython-312.pyc +0 -0
  8. package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
  9. package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
  10. package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
  11. package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
  12. package/src/adapters_real/kokoro_speaker.py +7 -6
  13. package/src/adapters_real/live_mic.py +15 -4
  14. package/src/adapters_real/ptt_sidecar +0 -0
  15. package/src/adapters_real/ptt_sidecar.swift +156 -0
  16. package/src/adapters_real/ptt_vad.py +143 -25
  17. package/src/adapters_real/whisper_stt.py +5 -4
  18. package/src/daemon/__pycache__/audio_server.cpython-312.pyc +0 -0
  19. package/src/daemon/audio_server.py +47 -13
  20. package/src/logger.py +29 -0
  21. package/src/mcp_server.py +113 -65
  22. package/src/simulation/__pycache__/adapters.cpython-312.pyc +0 -0
  23. package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
  24. package/src/simulation/engine.py +12 -1
  25. package/src/simulation/tests/__pycache__/__init__.cpython-312.pyc +0 -0
  26. package/src/simulation/tests/__pycache__/test_ptt_vad.cpython-312-pytest-7.4.2.pyc +0 -0
  27. package/src/simulation/tests/__pycache__/test_scenarios.cpython-312-pytest-7.4.2.pyc +0 -0
  28. package/src/simulation/tests/test_abort_daemon.py +109 -0
  29. package/src/simulation/tests/test_mcp_cancellation.py +83 -0
  30. package/src/simulation/tests/test_ptt_vad.py +81 -0
package/README.md CHANGED
@@ -66,7 +66,7 @@ The system is built on a highly modular adapter pattern configured via `hydra` Y
66
66
  | | `elevenlabs_speaker` | Premium cloud-based ultra-realistic voices. |
67
67
  | **🎙️ Microphones** | `live_mic` | Direct hardware integration via PyAudio. |
68
68
  | **🤫 VAD (Activity)** | `silero_vad` | Conversational mode powered by Silero, heavily optimized for 1-second barge-ins. *(Note: **Headphones are strictly required** for this mode to prevent the AI from hearing its own audio output and endlessly interrupting itself).* |
69
- | | `ptt_vad` | Manual Push-to-Talk / Walkie-Talkie mode. **(Default: Hold 'Shift' to talk)** |
69
+ | | `ptt_vad` | Manual Push-to-Talk / Walkie-Talkie mode. **(Default: Hold 'Right Option' to talk)** |
70
70
  | **📝 STT (Transcription)**| `mlx_whisper_large_v3`| Blazing fast local transcription leveraging Apple's MLX framework. |
71
71
 
72
72
  -----
@@ -192,7 +192,7 @@ If you wish to contribute to the project or run it from source, follow these ste
192
192
 
193
193
  Once connected, test the server by sending this prompt to your AI:
194
194
 
195
- > *"Let's test your voice capabilities! Please use the `voice_converse` tool to introduce yourself and tell me a story about a brave robot. If I interrupt you while you are speaking, stop the story and acknowledge my interruption in your next response."*
195
+ > *"Let's test your voice capabilities! Please introduce yourself, seamlessly tell me how to use the Right Option key to interact with you, and then start telling me a long story about a brave robot. I will practice using the Right Option key to interrupt you mid-story. When I interrupt, stop the story instantly, acknowledge my interruption naturally, and ask what we should work on instead."*
196
196
 
197
197
  -----
198
198
 
@@ -9,7 +9,7 @@ defaults:
9
9
  - speaker: kokoro_speaker
10
10
 
11
11
  # Available VADs:
12
- # - ptt_vad: Walkie-Talkie mode (Hold 'Shift' to talk. Instant response. Ignores TV/noise).
12
+ # - ptt_vad: Walkie-Talkie mode (Hold 'Right Option' to talk. Instant response. Ignores TV/noise).
13
13
  # - silero_vad: Conversational AI mode (Listens automatically. Tuned for 1-second barge-ins).
14
14
  - vad: ptt_vad
15
15
 
@@ -1,5 +1,5 @@
1
1
  _target_: adapters_real.ptt_vad.PushToTalkVAD
2
- key_name: "shift"
2
+ key_name: "right_option"
3
3
 
4
4
  # PTT specific tuning (tuned for instant response)
5
5
  vad_probability_threshold: 0.50
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voice-mcp-server",
3
- "version": "0.1.25",
3
+ "version": "0.2.0",
4
4
  "description": "An MCP server to allow LLMs to speak and listen via bidirectional voice loops",
5
5
  "main": "build/index.js",
6
6
  "type": "module",
package/requirements.txt CHANGED
@@ -97,6 +97,7 @@ shellingham==1.5.4
97
97
  silero-vad==6.2.1
98
98
  six==1.17.0
99
99
  smart_open==7.5.1
100
+ sounddevice==0.5.5
100
101
  soundfile==0.13.1
101
102
  spacy==3.8.14
102
103
  spacy-curated-transformers==0.3.1
@@ -1,3 +1,4 @@
1
+ from logger import logger
1
2
  import os
2
3
  import time
3
4
  import subprocess
@@ -17,7 +18,7 @@ class KokoroSpeaker(ISpeaker):
17
18
  self.start_time = 0
18
19
  self.temp_file = "/tmp/kokoro_output.wav"
19
20
 
20
- print(f"[DEBUG SPEAKER] Loading local Kokoro TTS model (Voice: {voice})...")
21
+ logger.info(f"Loading local Kokoro TTS model (Voice: {voice})...")
21
22
  # Load the pipeline. Since you are on M4 Max, we will try to use MPS if available
22
23
  if torch.backends.mps.is_available():
23
24
  self.device = "mps"
@@ -30,7 +31,7 @@ class KokoroSpeaker(ISpeaker):
30
31
  # We use lang_code 'a' for American English
31
32
  self.pipeline = KPipeline(lang_code='a', device=self.device)
32
33
  self.voice = voice
33
- print(f"[DEBUG SPEAKER] Kokoro TTS loaded successfully on {self.device}.")
34
+ logger.info(f"Kokoro TTS loaded successfully on {self.device}.")
34
35
 
35
36
  def speak(self, text: str):
36
37
  if not text.strip():
@@ -40,7 +41,7 @@ class KokoroSpeaker(ISpeaker):
40
41
  self.words = text.split()
41
42
 
42
43
  try:
43
- print(f"[DEBUG SPEAKER] Generating Kokoro audio for: {text[:50]}...")
44
+ logger.debug(f"Generating Kokoro audio for: {text[:50]}...")
44
45
  # Generate the audio locally
45
46
  generator = self.pipeline(
46
47
  text, voice=self.voice, # <= change voice here
@@ -54,14 +55,14 @@ class KokoroSpeaker(ISpeaker):
54
55
  audio_segments.append(audio)
55
56
 
56
57
  if not audio_segments:
57
- print("[DEBUG SPEAKER] Kokoro generated empty audio.")
58
+ logger.warning("Kokoro generated empty audio.")
58
59
  return
59
60
 
60
61
  final_audio = torch.cat(audio_segments, dim=0).cpu().numpy()
61
62
 
62
63
  # Save to temporary file at 24kHz (Kokoro's default sample rate)
63
64
  sf.write(self.temp_file, final_audio, 24000)
64
- print("[DEBUG SPEAKER] Audio generated, starting playback.")
65
+ logger.debug("Audio generated, starting playback.")
65
66
 
66
67
  # Play the generated audio using afplay (macOS native)
67
68
  self.start_time = time.time()
@@ -72,7 +73,7 @@ class KokoroSpeaker(ISpeaker):
72
73
  )
73
74
 
74
75
  except Exception as e:
75
- print(f"[DEBUG SPEAKER] Kokoro Generation Error: {e}")
76
+ logger.error(f"Kokoro Generation Error: {e}")
76
77
  # Fallback to macOS say
77
78
  self.start_time = time.time()
78
79
  self.process = subprocess.Popen(
@@ -1,3 +1,4 @@
1
+ from logger import logger
1
2
  import pyaudio
2
3
  import queue
3
4
  from simulation.ports import IMicrophone
@@ -10,6 +11,7 @@ class LiveMicrophone(IMicrophone):
10
11
  self.q = queue.Queue(maxsize=100)
11
12
  self.p = pyaudio.PyAudio()
12
13
  self.stream = None
14
+ logger.info(f"Initialized LiveMicrophone with rate={rate}, chunk={chunk}")
13
15
 
14
16
  def start_stream(self):
15
17
  if self.stream is not None:
@@ -31,12 +33,21 @@ class LiveMicrophone(IMicrophone):
31
33
  stream_callback=self._callback
32
34
  )
33
35
  self.stream.start_stream()
36
+ logger.info("LiveMicrophone stream started")
34
37
 
35
38
  def stop_stream(self):
36
- if self.stream is not None:
37
- self.stream.stop_stream()
38
- self.stream.close()
39
- self.stream = None
39
+ stream = self.stream
40
+ self.stream = None
41
+ if stream is not None:
42
+ try:
43
+ stream.stop_stream()
44
+ except OSError as e:
45
+ logger.debug(f"Ignored PyAudio OSError during stop_stream: {e}")
46
+ try:
47
+ stream.close()
48
+ except Exception:
49
+ pass
50
+ logger.info("LiveMicrophone stream stopped")
40
51
 
41
52
  def _callback(self, in_data, frame_count, time_info, status):
42
53
  try:
Binary file
@@ -0,0 +1,156 @@
1
+ import Foundation
2
+ import IOKit.hid
3
+ import AudioToolbox
4
+ import AppKit
5
+
6
+ var isPTTActive = false
7
+ var isCtrlPressed = false
8
+ var pingID: SystemSoundID = 0
9
+ var popID: SystemSoundID = 0
10
+
11
+ var idleTimer: Timer?
12
+ let IDLE_TIMEOUT: TimeInterval = 900 // 15 minutes
13
+
14
+ func resetIdleTimer() {
15
+ idleTimer?.invalidate()
16
+ idleTimer = Timer.scheduledTimer(withTimeInterval: IDLE_TIMEOUT, repeats: false) { _ in
17
+ print("💤 [SWIFT] Sidecar idle for \(Int(IDLE_TIMEOUT / 60)) minutes. Exiting to save resources.")
18
+ exit(0)
19
+ }
20
+ }
21
+
22
+ // Load uncompressed audio for 0ms latency
23
+ let pingURL = URL(fileURLWithPath: "/System/Library/Sounds/Morse.aiff") as CFURL
24
+ let popURL = URL(fileURLWithPath: "/System/Library/Sounds/Pop.aiff") as CFURL
25
+ AudioServicesCreateSystemSoundID(pingURL, &pingID)
26
+ AudioServicesCreateSystemSoundID(popURL, &popID)
27
+
28
+ func sendSocketMessage(code: UInt8) -> Bool {
29
+ let fd = socket(AF_UNIX, SOCK_STREAM, 0)
30
+ guard fd >= 0 else { return false }
31
+ defer { close(fd) }
32
+
33
+ var addr = sockaddr_un()
34
+ addr.sun_family = sa_family_t(AF_UNIX)
35
+
36
+ let path = "/tmp/voice_mcp_ptt.sock"
37
+ let pathSize = Int(MemoryLayout.size(ofValue: addr.sun_path))
38
+ _ = withUnsafeMutablePointer(to: &addr.sun_path.0) { ptr in
39
+ path.withCString { cstr in
40
+ strncpy(ptr, cstr, pathSize)
41
+ }
42
+ }
43
+
44
+ let len = socklen_t(MemoryLayout<sockaddr_un>.size)
45
+ let connectResult = withUnsafePointer(to: &addr) {
46
+ $0.withMemoryRebound(to: sockaddr.self, capacity: 1) { connect(fd, $0, len) }
47
+ }
48
+
49
+ if connectResult == 0 {
50
+ var byte: UInt8 = code
51
+ write(fd, &byte, 1)
52
+ return true
53
+ }
54
+ return false
55
+ }
56
+
57
+ func isTerminalFrontmost() -> Bool {
58
+ guard let frontApp = NSWorkspace.shared.frontmostApplication else { return false }
59
+ let bundleID = frontApp.bundleIdentifier ?? ""
60
+ // Add common terminal emulators and editors
61
+ let allowedTerminals = [
62
+ "com.apple.Terminal",
63
+ "com.googlecode.iterm2",
64
+ "dev.warp.Warp-Stable",
65
+ "co.zeit.hyper",
66
+ "com.mitchellh.ghostty",
67
+ "net.kovidgoyal.kitty",
68
+ "org.alacritty",
69
+ "com.anthropic.claudedesktop",
70
+ "com.microsoft.VSCode",
71
+ "com.todesktop.Cursor"
72
+ ]
73
+ return allowedTerminals.contains(bundleID)
74
+ }
75
+
76
+ var lastPressTime: TimeInterval = 0
77
+ var lastReleaseTime: TimeInterval = 0
78
+ let DOUBLE_TAP_THRESHOLD: TimeInterval = 0.4 // 400 milliseconds
79
+
80
+ let hidCallback: IOHIDValueCallback = { context, result, sender, value in
81
+ let element = IOHIDValueGetElement(value)
82
+ let usagePage = IOHIDElementGetUsagePage(element)
83
+ let usage = IOHIDElementGetUsage(element)
84
+ let intValue = IOHIDValueGetIntegerValue(value)
85
+
86
+ // 0x07 = HID Keyboard/Keypad usage page (Generic Desktop is page 0x01)
87
+ if usagePage == 0x07 {
88
+ let isPressed = (intValue == 1)
89
+
90
+ // 0xE6 = Right Option
91
+ if usage == 0xE6 {
92
+ // Only process events if our terminal is the active window!
93
+ if isTerminalFrontmost() {
94
+ let now = Date().timeIntervalSince1970
95
+
96
+ if isPressed && !isPTTActive {
97
+ resetIdleTimer()
98
+
99
+ // Check for Double-Tap!
100
+ // If the time since the LAST release is very short, and the time
101
+ // since the LAST press is also very short, this is the second press of a double-tap.
102
+ if (now - lastReleaseTime) < DOUBLE_TAP_THRESHOLD && (now - lastPressTime) < DOUBLE_TAP_THRESHOLD {
103
+ // Abort signal!
104
+ if sendSocketMessage(code: 2) {
105
+ print("🚨 [SWIFT] -> DOUBLE TAP DETECTED! Transmitted 0x02 (Abort)")
106
+ AudioServicesPlaySystemSound(popID) // Play pop to confirm abort
107
+ }
108
+ // Reset timestamps so we don't accidentally triple-tap
109
+ lastPressTime = 0
110
+ lastReleaseTime = 0
111
+ return
112
+ }
113
+
114
+ // Normal Single Press
115
+ lastPressTime = now
116
+ if sendSocketMessage(code: 1) {
117
+ isPTTActive = true
118
+ AudioServicesPlaySystemSound(pingID)
119
+ print("[SWIFT] -> Transmitted 0x01 (Press)")
120
+ }
121
+
122
+ } else if !isPressed && isPTTActive {
123
+ lastReleaseTime = now
124
+ isPTTActive = false
125
+
126
+ // Normal Release
127
+ _ = sendSocketMessage(code: 0)
128
+ AudioServicesPlaySystemSound(popID)
129
+ print("[SWIFT] -> Transmitted 0x00 (Release)")
130
+ }
131
+ }
132
+ }
133
+ }
134
+ }
135
+
136
+ let manager = IOHIDManagerCreate(kCFAllocatorDefault, IOOptionBits(kIOHIDOptionsTypeNone))
137
+ let deviceMatch: [String: Any] = ["DeviceUsagePage": 1, "DeviceUsage": 6]
138
+ IOHIDManagerSetDeviceMatching(manager, deviceMatch as CFDictionary)
139
+ IOHIDManagerRegisterInputValueCallback(manager, hidCallback, nil)
140
+ IOHIDManagerScheduleWithRunLoop(manager, CFRunLoopGetMain(), CFRunLoopMode.defaultMode.rawValue)
141
+
142
+ let openResult = IOHIDManagerOpen(manager, IOOptionBits(kIOHIDOptionsTypeNone))
143
+ if openResult != kIOReturnSuccess {
144
+ print("❌ FATAL: macOS blocked hardware access.")
145
+ print("👉 ACTION REQUIRED: Open System Settings -> Privacy & Security -> Input Monitoring.")
146
+ print("👉 Add your Terminal application, toggle it ON, completely restart the terminal, and try again.")
147
+ exit(1)
148
+ }
149
+
150
+ print("✅ [SWIFT] Sidecar Online with Context-Aware Focus Filter.")
151
+ print("🎧 [SWIFT] Listening natively for Right Option (Hardware Matrix 0xE6)...")
152
+ print("🔒 [SWIFT] Mic will ONLY open if a Terminal window is currently active.")
153
+
154
+ resetIdleTimer() // Start the idle timer initially
155
+
156
+ CFRunLoopRun()
@@ -1,36 +1,154 @@
1
- from pynput import keyboard
1
+ from logger import logger
2
+ import threading
3
+ import subprocess
4
+ import socket
5
+ import os
6
+ import sys
7
+ import atexit
8
+ import http.client
2
9
  from simulation.ports import IVAD
3
10
  from simulation.models import VirtualAudioFrame
4
11
 
12
+ SOCKET_PATH = "/tmp/voice_mcp_ptt.sock"
13
+
14
+ class UDSHTTPConnection(http.client.HTTPConnection):
15
+ def __init__(self, socket_path, timeout=300.0):
16
+ super().__init__("localhost", timeout=timeout)
17
+ self.socket_path = socket_path
18
+
19
+ def connect(self):
20
+ self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
21
+ self.sock.settimeout(self.timeout)
22
+ self.sock.connect(self.socket_path)
23
+
5
24
  class PushToTalkVAD(IVAD):
6
- def __init__(self, key_name="shift", **kwargs):
7
- self.is_pressed = False
8
- print(f"[DEBUG VAD] Initializing Push-To-Talk VAD. Walkie-Talkie Hotkey: '{key_name}'")
25
+ def __init__(self, key_name="right_option", **kwargs):
26
+ self.lock = threading.Lock()
27
+ self.is_ptt_active = False
9
28
 
10
- # Map string names to pynput Key objects
11
- key_map = {
12
- "shift": keyboard.Key.shift,
13
- "shift_r": keyboard.Key.shift_r,
14
- "ctrl": keyboard.Key.ctrl,
15
- "alt": keyboard.Key.alt,
16
- "cmd": keyboard.Key.cmd,
17
- "space": keyboard.Key.space
18
- }
29
+ logger.info("Initializing Push-To-Talk VAD via Swift Sidecar.")
19
30
 
20
- self.hotkey = key_map.get(key_name.lower(), keyboard.Key.shift)
31
+ self.sidecar_process = None
32
+ self.server_socket = None
33
+ self.listener_thread = None
34
+ self._stop_event = threading.Event()
35
+
36
+ self._start_sidecar()
37
+ atexit.register(self._cleanup)
21
38
 
22
- def on_press(key):
23
- if key == self.hotkey:
24
- self.is_pressed = True
39
+ def set_active(self, active: bool):
40
+ if active and self.server_socket is None:
41
+ self._start_server()
42
+ elif not active and self.server_socket is not None:
43
+ self._stop_server()
25
44
 
26
- def on_release(key):
27
- if key == self.hotkey:
28
- self.is_pressed = False
45
+ def _start_server(self):
46
+ self._stop_event.clear()
47
+ if os.path.exists(SOCKET_PATH):
48
+ try:
49
+ os.remove(SOCKET_PATH)
50
+ except OSError:
51
+ pass
52
+
53
+ self.server_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
54
+ self.server_socket.bind(SOCKET_PATH)
55
+ self.server_socket.listen(1)
56
+ logger.debug(f"PTT socket created at {SOCKET_PATH}")
57
+
58
+ self.listener_thread = threading.Thread(target=self._listen_loop, daemon=True)
59
+ self.listener_thread.start()
60
+
61
+ def _stop_server(self):
62
+ self._stop_event.set()
63
+ if self.server_socket:
64
+ try:
65
+ self.server_socket.close()
66
+ except Exception:
67
+ pass
68
+ self.server_socket = None
69
+
70
+ if self.listener_thread:
71
+ self.listener_thread.join(timeout=1.0)
72
+ self.listener_thread = None
73
+
74
+ if os.path.exists(SOCKET_PATH):
75
+ try:
76
+ os.remove(SOCKET_PATH)
77
+ except OSError:
78
+ pass
79
+
80
+ with self.lock:
81
+ self.is_ptt_active = False
82
+
83
+ def _start_sidecar(self):
84
+ try:
85
+ output = subprocess.check_output(["pgrep", "-x", "ptt_sidecar"])
86
+ if len(output.strip()) > 0:
87
+ logger.debug("Swift Sidecar is already running.")
88
+ return
89
+ except subprocess.CalledProcessError:
90
+ pass
91
+
92
+ sidecar_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ptt_sidecar")
93
+ if not os.path.exists(sidecar_path):
94
+ logger.info(f"Compiling Swift Sidecar at {sidecar_path}...")
95
+ swift_src = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ptt_sidecar.swift")
96
+ subprocess.run(["swiftc", swift_src, "-o", sidecar_path])
97
+
98
+ if os.path.exists(sidecar_path):
99
+ self.sidecar_process = subprocess.Popen(
100
+ [sidecar_path],
101
+ stdout=sys.stdout,
102
+ stderr=sys.stderr,
103
+ start_new_session=True
104
+ )
105
+ logger.info("Swift Sidecar started.")
106
+ else:
107
+ logger.error("Failed to start Swift Sidecar, executable not found.")
29
108
 
30
- self.listener = keyboard.Listener(on_press=on_press, on_release=on_release)
31
- self.listener.start()
109
+ def _listen_loop(self):
110
+ while not self._stop_event.is_set():
111
+ try:
112
+ if not self.server_socket:
113
+ break
114
+ conn, _ = self.server_socket.accept()
115
+ with conn:
116
+ while not self._stop_event.is_set():
117
+ data = conn.recv(1)
118
+ if not data:
119
+ break
120
+
121
+ with self.lock:
122
+ if data == b'\x01':
123
+ logger.info("Mic Alive (Right Option Pressed) - Received 0x01")
124
+ self.is_ptt_active = True
125
+ elif data == b'\x00':
126
+ logger.info("Mic Dead (Right Option Released) - Received 0x00")
127
+ self.is_ptt_active = False
128
+ elif data == b'\x02':
129
+ logger.info("Abort (Esc/Ctrl+C Pressed) - Received 0x02. Triggering /abort")
130
+ try:
131
+ daemon_sock = os.path.expanduser("~/Library/Application Support/VoiceMCP/daemon.sock")
132
+ conn_uds = UDSHTTPConnection(daemon_sock, timeout=1.0)
133
+ conn_uds.request("POST", "/abort", body=None, headers={})
134
+ conn_uds.getresponse().read()
135
+ conn_uds.close()
136
+ except Exception as e:
137
+ logger.error(f"Failed to trigger /abort natively: {e}")
138
+ except Exception as e:
139
+ pass
32
140
 
33
141
  def analyze(self, frame: VirtualAudioFrame) -> float:
34
- # If the key is held down, we return 1.0 (100% certainty of speech).
35
- # If the key is released, we return 0.0 (0% certainty of speech).
36
- return 1.0 if self.is_pressed else 0.0
142
+ with self.lock:
143
+ return 1.0 if self.is_ptt_active else 0.0
144
+
145
+ def _cleanup(self):
146
+ self._stop_server()
147
+ if self.sidecar_process:
148
+ try:
149
+ self.sidecar_process.terminate()
150
+ except Exception:
151
+ pass
152
+
153
+ def __del__(self):
154
+ self._cleanup()
@@ -1,3 +1,4 @@
1
+ from logger import logger
1
2
  import numpy as np
2
3
  import mlx_whisper
3
4
  from typing import List
@@ -8,7 +9,7 @@ from simulation.models import VirtualAudioFrame
8
9
  class RealWhisperSTT(ISTT):
9
10
  def __init__(self, model_size="mlx-community/whisper-large-v3-mlx"):
10
11
  self.model_size = model_size
11
- print(f"[DEBUG STT] Preparing MLX Whisper model ({model_size}) for Apple Silicon...")
12
+ logger.info(f"Preparing MLX Whisper model ({model_size}) for Apple Silicon...")
12
13
  # MLX will lazily load and compile the model on the first inference, but we log here to indicate we are using the MLX backend.
13
14
 
14
15
  def transcribe(self, frames: List[VirtualAudioFrame]) -> str:
@@ -19,14 +20,14 @@ class RealWhisperSTT(ISTT):
19
20
  # Convert 16-bit PCM (expected from microphone) to float32 [-1.0, 1.0] expected by Whisper
20
21
  audio_data = np.frombuffer(raw_bytes, dtype=np.int16).astype(np.float32) / 32768.0
21
22
 
22
- print(f"[DEBUG STT] Transcribing {len(audio_data)} samples with Apple MLX Whisper ({self.model_size})...")
23
+ logger.debug(f"Transcribing {len(audio_data)} samples with Apple MLX Whisper ({self.model_size})...")
23
24
 
24
25
  try:
25
26
  # We explicitly set English since you are speaking English, and fp16 for Metal acceleration
26
27
  result = mlx_whisper.transcribe(audio_data, path_or_hf_repo=self.model_size, language="en")
27
28
  text = result.get("text", "").strip()
28
- print(f"[DEBUG STT] MLX Transcription result: {text}")
29
+ logger.debug(f"MLX Transcription result: {text}")
29
30
  return text
30
31
  except Exception as e:
31
- print(f"[DEBUG STT] MLX Whisper transcription error: {e}")
32
+ logger.error(f"MLX Whisper transcription error: {e}")
32
33
  return ""