voice-mcp-server 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +193 -0
- package/build/index.js +51 -0
- package/config/config.yaml +25 -0
- package/config/microphone/live_mic.yaml +1 -0
- package/config/speaker/elevenlabs_speaker.yaml +3 -0
- package/config/speaker/kokoro_speaker.yaml +3 -0
- package/config/stt/mlx_whisper_large_v3.yaml +2 -0
- package/config/vad/ptt_vad.yaml +8 -0
- package/config/vad/silero_vad.yaml +7 -0
- package/package.json +40 -0
- package/requirements.txt +126 -0
- package/src/adapters_real/__init__.py +0 -0
- package/src/adapters_real/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/queue_llm.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
- package/src/adapters_real/echo_llm.py +28 -0
- package/src/adapters_real/elevenlabs_speaker.py +117 -0
- package/src/adapters_real/kokoro_speaker.py +122 -0
- package/src/adapters_real/live_mic.py +64 -0
- package/src/adapters_real/live_speaker.py +66 -0
- package/src/adapters_real/ptt_vad.py +36 -0
- package/src/adapters_real/queue_llm.py +36 -0
- package/src/adapters_real/silero_vad.py +43 -0
- package/src/adapters_real/wav_mic.py +17 -0
- package/src/adapters_real/whisper_stt.py +32 -0
- package/src/daemon/__init__.py +0 -0
- package/src/daemon/audio_server.py +363 -0
- package/src/index.ts +63 -0
- package/src/mcp_server.py +254 -0
- package/src/simulation/__init__.py +0 -0
- package/src/simulation/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/models.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/ports.cpython-312.pyc +0 -0
- package/src/simulation/adapters.py +131 -0
- package/src/simulation/engine.py +242 -0
- package/src/simulation/models.py +25 -0
- package/src/simulation/ports.py +57 -0
- package/src/simulation/tests/__init__.py +0 -0
- package/src/simulation/tests/test_scenarios.py +510 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import subprocess
|
|
4
|
+
import torch
|
|
5
|
+
import soundfile as sf
|
|
6
|
+
from kokoro import KPipeline
|
|
7
|
+
from simulation.ports import ISpeaker
|
|
8
|
+
from simulation.models import VirtualAudioFrame
|
|
9
|
+
|
|
10
|
+
class KokoroSpeaker(ISpeaker):
    """TTS speaker backed by the local Kokoro model, played via macOS `afplay`.

    Speech progress is *estimated* from wall-clock time at a fixed
    words-per-minute rate; flush() uses that estimate to report how much text
    was spoken before playback was interrupted.
    """

    def __init__(self, wpm=150, voice="af_heart"):  # 'af_heart' is a highly expressive American Female voice
        self.wpm = wpm
        self.words_per_ms = (wpm / 60) / 1000  # words spoken per millisecond
        self.current_text = ""
        self.words = []
        self.process = None  # afplay/say subprocess while audio is playing
        self.start_time = 0
        self.temp_file = "/tmp/kokoro_output.wav"

        print(f"[DEBUG SPEAKER] Loading local Kokoro TTS model (Voice: {voice})...")
        # Prefer hardware acceleration when available (Apple MPS, then CUDA).
        if torch.backends.mps.is_available():
            self.device = "mps"
        elif torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        # Initialize the pipeline (downloads weights first time ~300MB)
        # We use lang_code 'a' for American English
        self.pipeline = KPipeline(lang_code='a', device=self.device)
        self.voice = voice
        print(f"[DEBUG SPEAKER] Kokoro TTS loaded successfully on {self.device}.")

    def _stop_current_playback(self):
        # BUGFIX: if speak() is called while audio is still playing, the old
        # subprocess was orphaned and never reaped (zombie). Kill and wait here.
        if self.process is not None and self.process.poll() is None:
            self.process.kill()
            self.process.wait()
        self.process = None

    def speak(self, text: str):
        """Generate audio for `text` with Kokoro and start non-blocking playback.

        Falls back to the macOS `say` command if generation fails.
        """
        if not text.strip():
            return

        self._stop_current_playback()
        self.current_text = text
        self.words = text.split()

        try:
            print(f"[DEBUG SPEAKER] Generating Kokoro audio for: {text[:50]}...")
            # Generate the audio locally
            generator = self.pipeline(
                text, voice=self.voice,  # <= change voice here
                speed=1.0, split_pattern=r'\n+'
            )

            # The pipeline yields (graphemes, phonemes, audio) segments;
            # concatenate every audio segment into one waveform.
            audio_segments = [audio for _gs, _ps, audio in generator]

            if not audio_segments:
                print("[DEBUG SPEAKER] Kokoro generated empty audio.")
                return

            final_audio = torch.cat(audio_segments, dim=0).cpu().numpy()

            # Save to temporary file at 24kHz (Kokoro's default sample rate)
            sf.write(self.temp_file, final_audio, 24000)
            print("[DEBUG SPEAKER] Audio generated, starting playback.")

            # Play the generated audio using afplay (macOS native)
            self.start_time = time.time()
            self.process = subprocess.Popen(
                ["afplay", self.temp_file],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )

        except Exception as e:
            print(f"[DEBUG SPEAKER] Kokoro Generation Error: {e}")
            # Fallback to macOS say
            self.start_time = time.time()
            self.process = subprocess.Popen(
                ["say", text],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )

    def play_frame(self, frame: VirtualAudioFrame):
        pass  # Real playback goes through afplay, not frame-by-frame.

    def tick(self, ms: int):
        pass  # Clock is driven externally in a real environment.

    def is_speaking(self) -> bool:
        """Return True while the playback subprocess is still running."""
        if self.process is None:
            return False
        is_running = self.process.poll() is None
        if not is_running:
            self.current_text = ""
            self.words = []
            self.process = None
        return is_running

    def has_started_audio(self) -> bool:
        return self.is_speaking()

    def flush(self) -> str:
        """Stop playback immediately and return the estimated spoken prefix."""
        if not self.is_speaking():
            return ""

        # Immediately kill the playback process
        self.process.kill()
        # Explicitly wait for the process to terminate and reap it
        self.process.wait()

        # Estimate how many words were actually spoken from elapsed time.
        elapsed_ms = (time.time() - self.start_time) * 1000
        words_spoken = int(elapsed_ms * self.words_per_ms)
        spoken = " ".join(self.words[:words_spoken])

        self.current_text = ""
        self.words = []
        self.process = None

        return spoken
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import pyaudio
|
|
2
|
+
import queue
|
|
3
|
+
from simulation.ports import IMicrophone
|
|
4
|
+
from simulation.models import VirtualAudioFrame
|
|
5
|
+
|
|
6
|
+
class LiveMicrophone(IMicrophone):
    """Microphone adapter that streams 16-bit mono PCM from the system mic.

    Audio arrives on PyAudio's callback thread and is handed over through a
    bounded queue; read_frame() drains that queue one chunk at a time.
    """

    def __init__(self, rate=16000, chunk=160):
        self.rate = rate
        self.chunk = chunk
        self.q = queue.Queue(maxsize=100)
        self.p = pyaudio.PyAudio()
        self.stream = None

    def start_stream(self):
        """Open and start the input stream, discarding any stale audio first."""
        if self.stream is not None:
            return

        # Drop leftover chunks buffered during a previous capture session.
        while True:
            try:
                self.q.get_nowait()
            except queue.Empty:
                break

        self.stream = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.rate,
            input=True,
            frames_per_buffer=self.chunk,
            stream_callback=self._callback,
        )
        self.stream.start_stream()

    def stop_stream(self):
        """Stop and dispose of the input stream if one is open."""
        if self.stream is None:
            return
        self.stream.stop_stream()
        self.stream.close()
        self.stream = None

    def _callback(self, in_data, frame_count, time_info, status):
        # Runs on PyAudio's audio thread: never block here — on overflow the
        # frame is simply dropped.
        try:
            self.q.put_nowait(in_data)
        except queue.Full:
            pass
        return (None, pyaudio.paContinue)

    def read_frame(self) -> VirtualAudioFrame:
        """Return the next 10ms frame, or a silent frame when none is ready."""
        if self.stream is None:
            return VirtualAudioFrame(10, False, False, "", b"")

        try:
            data = self.q.get(timeout=0.1)  # brief block doubles as the clock
        except queue.Empty:
            # Nothing captured in time: yield silence.
            return VirtualAudioFrame(10, False, False, "", b"")

        # A short read (expected chunk * 2 bytes of int16) is treated as silence.
        if len(data) < self.chunk * 2:
            return VirtualAudioFrame(10, False, False, "", b"")
        return VirtualAudioFrame(10, False, False, "", data)

    def close(self):
        """Stop the stream and release the PyAudio engine."""
        self.stop_stream()
        self.p.terminate()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import time
|
|
3
|
+
from simulation.ports import ISpeaker
|
|
4
|
+
from simulation.models import VirtualAudioFrame
|
|
5
|
+
|
|
6
|
+
class LiveSpeaker(ISpeaker):
    """Speaker adapter that voices text through the macOS `say` command.

    Speech progress is *estimated* from wall-clock time at a fixed
    words-per-minute rate; flush() uses that estimate to report how much
    text was spoken before playback was interrupted.
    """

    def __init__(self, wpm=150):
        self.wpm = wpm
        self.words_per_ms = (wpm / 60) / 1000  # words spoken per millisecond
        self.current_text = ""
        self.words = []
        self.process = None  # `say` subprocess while speaking
        self.start_time = 0

    def speak(self, text: str):
        """Begin speaking `text` without blocking the caller."""
        # BUGFIX: if speak() is called while a previous utterance is still
        # playing, the old `say` process was orphaned and never reaped
        # (left as a zombie). Kill and wait for it first.
        if self.process is not None and self.process.poll() is None:
            self.process.kill()
            self.process.wait()

        self.current_text = text
        self.words = text.split()
        self.start_time = time.time()

        # Start say command non-blocking
        self.process = subprocess.Popen(
            ["say", text],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )

    def play_frame(self, frame: VirtualAudioFrame):
        pass

    def tick(self, ms: int):
        pass  # Clock is driven by the mic in a real environment

    def is_speaking(self) -> bool:
        """Return True while the `say` process is still running."""
        if self.process is None:
            return False
        # Poll returns None if process is still running
        is_running = self.process.poll() is None
        if not is_running:
            self.current_text = ""
            self.words = []
            self.process = None
        return is_running

    def has_started_audio(self) -> bool:
        return self.is_speaking()  # Approximation for macOS say

    def flush(self) -> str:
        """Stop speech immediately; return the estimated already-spoken prefix."""
        if not self.is_speaking():
            return ""

        # Immediately kill the say process
        self.process.kill()
        # Explicitly wait for the process to terminate and reap it
        self.process.wait()

        # Estimate how many words fit in the elapsed playback time.
        elapsed_ms = (time.time() - self.start_time) * 1000
        words_spoken = int(elapsed_ms * self.words_per_ms)
        spoken = " ".join(self.words[:words_spoken])

        self.current_text = ""
        self.words = []
        self.process = None

        return spoken
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from pynput import keyboard
|
|
2
|
+
from simulation.ports import IVAD
|
|
3
|
+
from simulation.models import VirtualAudioFrame
|
|
4
|
+
|
|
5
|
+
class PushToTalkVAD(IVAD):
    """Walkie-talkie style VAD: 'speech' is whatever happens while a hotkey is held."""

    def __init__(self, key_name="shift", **kwargs):
        self.is_pressed = False
        print(f"[DEBUG VAD] Initializing Push-To-Talk VAD. Walkie-Talkie Hotkey: '{key_name}'")

        # Map string names to pynput Key objects
        key_map = {
            "shift": keyboard.Key.shift,
            "shift_r": keyboard.Key.shift_r,
            "ctrl": keyboard.Key.ctrl,
            "alt": keyboard.Key.alt,
            "cmd": keyboard.Key.cmd,
            "space": keyboard.Key.space
        }
        # Unknown names fall back to the left shift key.
        self.hotkey = key_map.get(key_name.lower(), keyboard.Key.shift)

        # Global keyboard listener runs on its own thread.
        self.listener = keyboard.Listener(
            on_press=self._on_press,
            on_release=self._on_release,
        )
        self.listener.start()

    def _on_press(self, key):
        # Track only the configured hotkey; ignore everything else.
        if key == self.hotkey:
            self.is_pressed = True

    def _on_release(self, key):
        if key == self.hotkey:
            self.is_pressed = False

    def analyze(self, frame: VirtualAudioFrame) -> float:
        """Return 1.0 (certain speech) while the hotkey is held, else 0.0."""
        return 1.0 if self.is_pressed else 0.0
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import queue
|
|
2
|
+
from simulation.ports import ILLMBridge
|
|
3
|
+
|
|
4
|
+
class QueueLLMBridge(ILLMBridge):
    """LLM bridge that exchanges messages with the MCP server via queues.

    Transcripts/contexts are published on `result_queue`; the server's next
    command is consumed from `command_queue`.
    """

    def __init__(self, command_queue: queue.Queue, result_queue: queue.Queue):
        self.cmd_q = command_queue
        self.res_q = result_queue
        self.is_requesting = False  # True while a request awaits its response

    def call_mcp_tool(self, context: dict) -> dict:
        """Send `context` and block (polling every 10ms) until a response arrives."""
        import time  # hoisted out of the loop (was re-imported every iteration)
        self.start_request(context)
        while True:
            resp = self.get_response()
            if resp is not None:
                return resp
            time.sleep(0.01)  # yield the CPU between polls

    def start_request(self, context: dict):
        """Publish `context` to the MCP server without waiting for a reply."""
        self.is_requesting = True
        # Send the transcript to the MCP server
        self.res_q.put(context)

    def tick(self, ms: int):
        pass

    def get_response(self) -> dict | None:
        """Non-blocking: return the server's next command, or None if not ready."""
        if not self.is_requesting:
            return None
        try:
            # Non-blocking check for next command from MCP server
            cmd = self.cmd_q.get_nowait()
        except queue.Empty:
            return None
        self.is_requesting = False
        return cmd
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import torch
|
|
3
|
+
from silero_vad import load_silero_vad
|
|
4
|
+
from simulation.ports import IVAD
|
|
5
|
+
from simulation.models import VirtualAudioFrame
|
|
6
|
+
|
|
7
|
+
class RealSileroVAD(IVAD):
    """Voice-activity detector backed by the Silero VAD model.

    Incoming PCM16 frames are buffered until a full 512-sample (1024-byte)
    window is available, then scored; the most recent speech probability is
    returned on every call.
    """

    def __init__(self, **kwargs):
        # Determine if Apple Silicon (MPS) or CUDA is available for hardware acceleration
        if torch.backends.mps.is_available():
            self.device = torch.device("mps")
            print("[DEBUG VAD] Initializing Silero VAD on Apple Metal GPU (MPS)")
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.model = load_silero_vad().to(self.device)
        self.buffer = b""       # pending PCM16 bytes not yet scored
        self.last_prob = 0.0    # most recent speech probability

    def analyze(self, frame: VirtualAudioFrame) -> float:
        """Accumulate `frame` and return the latest speech probability in [0, 1]."""
        if not frame.raw_bytes:
            return 0.0

        self.buffer += frame.raw_bytes

        # 512 samples of int16 = 1024 bytes per model window.
        # BUGFIX: drain ALL complete windows (was a single `if`): with frames
        # larger than one window the buffer would otherwise grow without bound
        # and the reported probability would lag further and further behind.
        while len(self.buffer) >= 1024:
            chunk = self.buffer[:1024]
            self.buffer = self.buffer[1024:]

            audio_int16 = np.frombuffer(chunk, dtype=np.int16)
            audio_float32 = audio_int16.astype(np.float32) / 32768.0

            # Move the tensor to the active GPU/CPU device
            tensor = torch.from_numpy(audio_float32).to(self.device)
            # silero-vad model natively handles the state!
            prob = self.model(tensor, 16000).item()
            self.last_prob = float(prob)

        return self.last_prob
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import wave
|
|
2
|
+
from simulation.ports import IMicrophone
|
|
3
|
+
from simulation.models import VirtualAudioFrame
|
|
4
|
+
|
|
5
|
+
class WavFileMicrophone(IMicrophone):
    """Microphone adapter that replays a pre-recorded WAV file.

    The file must be 16kHz, 16-bit, mono so each 160-frame read corresponds
    to exactly 10ms of audio.
    """

    def __init__(self, filepath: str):
        self.wf = wave.open(filepath, 'rb')
        if self.wf.getnchannels() != 1 or self.wf.getsampwidth() != 2 or self.wf.getframerate() != 16000:
            raise ValueError("Wav file must be 16kHz, 16-bit, mono")

    def read_frame(self) -> VirtualAudioFrame:
        # 16kHz * 2 bytes/sample * 1 channel * 0.010s = 320 bytes
        # 160 frames at 2 bytes = 320 bytes
        raw_bytes = self.wf.readframes(160)
        # A short read means end-of-file (or a trailing partial frame): silence.
        if len(raw_bytes) < 320:
            return VirtualAudioFrame(10, False, False, "", b"")
        return VirtualAudioFrame(10, False, False, "", raw_bytes)

    def close(self):
        """Release the WAV file handle (previously leaked; also matches the
        `close()` provided by LiveMicrophone — presumably part of the
        IMicrophone contract; confirm against simulation.ports)."""
        self.wf.close()
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import mlx_whisper
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from simulation.ports import ISTT
|
|
6
|
+
from simulation.models import VirtualAudioFrame
|
|
7
|
+
|
|
8
|
+
class RealWhisperSTT(ISTT):
    """Speech-to-text adapter using the Apple-Silicon (MLX) Whisper backend."""

    def __init__(self, model_size="mlx-community/whisper-large-v3-mlx"):
        self.model_size = model_size
        print(f"[DEBUG STT] Preparing MLX Whisper model ({model_size}) for Apple Silicon...")
        # MLX loads and compiles the model lazily on the first inference; the
        # print above only signals that the MLX backend has been selected.

    def transcribe(self, frames: List[VirtualAudioFrame]) -> str:
        """Transcribe the PCM16 audio carried by `frames` into English text."""
        raw_bytes = b"".join(f.raw_bytes for f in frames if f.raw_bytes)
        if not raw_bytes:
            return ""

        # Whisper expects float32 samples in [-1.0, 1.0]; the mic delivers
        # 16-bit signed PCM, so rescale by 1/32768.
        pcm = np.frombuffer(raw_bytes, dtype=np.int16)
        audio_data = pcm.astype(np.float32) / 32768.0

        print(f"[DEBUG STT] Transcribing {len(audio_data)} samples with Apple MLX Whisper ({self.model_size})...")

        try:
            # Language is fixed to English to skip auto-detection.
            result = mlx_whisper.transcribe(audio_data, path_or_hf_repo=self.model_size, language="en")
            text = result.get("text", "").strip()
            print(f"[DEBUG STT] MLX Transcription result: {text}")
            return text
        except Exception as e:
            print(f"[DEBUG STT] MLX Whisper transcription error: {e}")
            return ""
|
|
File without changes
|