audio-transcript-mcp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audio_transcript_mcp/__init__.py +3 -0
- audio_transcript_mcp/__main__.py +5 -0
- audio_transcript_mcp/audio_utils.py +15 -0
- audio_transcript_mcp/backends/__init__.py +12 -0
- audio_transcript_mcp/backends/deepgram.py +91 -0
- audio_transcript_mcp/backends/whisper.py +182 -0
- audio_transcript_mcp/recorder/__init__.py +5 -0
- audio_transcript_mcp/recorder/opus.py +64 -0
- audio_transcript_mcp/server.py +448 -0
- audio_transcript_mcp-0.1.0.dist-info/METADATA +226 -0
- audio_transcript_mcp-0.1.0.dist-info/RECORD +14 -0
- audio_transcript_mcp-0.1.0.dist-info/WHEEL +4 -0
- audio_transcript_mcp-0.1.0.dist-info/entry_points.txt +2 -0
- audio_transcript_mcp-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Audio format conversion utilities."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def float32_to_int16(data: bytes) -> bytes:
    """Re-encode raw float32 PCM samples as raw int16 PCM samples.

    Every sample is multiplied by 32767 and clamped to the int16 range
    [-32768, 32767] before the cast, so out-of-range input saturates
    instead of wrapping around.
    """
    samples = np.frombuffer(data, dtype=np.float32)
    scaled = samples * 32767
    clamped = np.clip(scaled, -32768, 32767)
    return clamped.astype(np.int16).tobytes()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def stereo_to_mono_f32(data: bytes, channels: int = 2) -> bytes:
    """Collapse interleaved multi-channel float32 PCM into one channel.

    The buffer is viewed as rows of ``channels`` samples and each row is
    replaced by its arithmetic mean. The result is returned as raw
    float32 bytes.
    """
    frames = np.frombuffer(data, dtype=np.float32).reshape(-1, channels)
    mono = frames.mean(axis=1)
    return mono.astype(np.float32).tobytes()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""STT backend factory."""
|
|
2
|
+
|
|
3
|
+
from audio_transcript_mcp.backends.deepgram import DeepgramBackend
|
|
4
|
+
from audio_transcript_mcp.backends.whisper import WhisperBackend
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def create_backend(backend_type, label, sample_rate, channels, is_float32, buf, config):
    """Instantiate the STT backend selected by ``backend_type``.

    ``"local"`` selects ``WhisperBackend``; every other value selects
    ``DeepgramBackend``. All remaining arguments are forwarded unchanged
    to the chosen constructor.
    """
    backend_cls = WhisperBackend if backend_type == "local" else DeepgramBackend
    return backend_cls(label, sample_rate, channels, is_float32, buf, config)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Deepgram WebSocket STT backend."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import threading
|
|
5
|
+
|
|
6
|
+
from audio_transcript_mcp.audio_utils import float32_to_int16
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DeepgramBackend:
    """Streams audio to Deepgram via WebSocket. One instance per audio source.

    Lifecycle: ``connect()`` opens the socket and starts a daemon receiver
    thread, ``send()`` pushes raw PCM chunks, ``close()`` ends the stream.
    Final (``is_final``) transcripts are appended to ``buf`` under ``label``.
    """

    def __init__(self, label, sample_rate, channels, is_float32, buf, config):
        """Store stream parameters and Deepgram settings.

        Args:
            label: Speaker label passed to ``buf.add`` for every transcript.
            sample_rate: Sample rate in Hz, sent as the ``sample_rate`` query
                parameter.
            channels: Channel count, sent as the ``channels`` query parameter.
            is_float32: True when chunks given to ``send`` are float32 PCM and
                must be converted to int16 first.
            buf: Transcript sink exposing ``add(speaker, text)``.
            config: Mapping with optional ``deepgram_*`` keys; missing keys
                fall back to the defaults below.
        """
        self.label = label
        self.sample_rate = sample_rate
        self.channels = channels
        self._is_float32 = is_float32
        self._buf = buf
        self._language = config.get("deepgram_language", "ru")
        self._api_key = config.get("deepgram_api_key", "")
        self._model = config.get("deepgram_model", "nova-3")
        self._utterance_end_ms = config.get("deepgram_utterance_end_ms", "2500")
        self._endpointing = config.get("deepgram_endpointing", "500")
        self._ws = None
        self._recv_thread = None
        self._recv_stop = threading.Event()

    def connect(self):
        """Open the WebSocket and start the receiver thread.

        Returns:
            True on success, False when the connection could not be opened.
        """
        from urllib.parse import urlencode

        from websockets.sync.client import connect as ws_connect

        # urlencode keeps the parameter order and escapes values that come
        # from environment configuration (language, model, ...).
        query = urlencode({
            "encoding": "linear16",
            "sample_rate": self.sample_rate,
            "channels": self.channels,
            "language": self._language,
            "model": self._model,
            "punctuate": "true",
            "smart_format": "true",
            "interim_results": "true",
            "utterance_end_ms": self._utterance_end_ms,
            "endpointing": self._endpointing,
        })
        url = f"wss://api.deepgram.com/v1/listen?{query}"
        headers = {"Authorization": f"Token {self._api_key}"}
        try:
            self._ws = ws_connect(url, additional_headers=headers)
        except Exception:
            return False

        self._recv_stop.clear()
        self._recv_thread = threading.Thread(target=self._receiver, daemon=True)
        self._recv_thread.start()
        return True

    def send(self, data):
        """Send audio chunk. Converts float32 to int16 if needed.

        Returns:
            True when the chunk was handed to the socket, False when there is
            no open socket or the send failed (the caller treats this as a
            lost connection).
        """
        if self._is_float32:
            data = float32_to_int16(data)
        if self._ws is None:
            return False
        try:
            self._ws.send(data)
            return True
        except Exception:
            return False

    def close(self):
        """Signal end of stream, close the socket and stop the receiver."""
        self._recv_stop.set()
        ws = self._ws
        if ws is not None:
            # Two separate best-effort steps: a failed CloseStream message
            # (e.g. the peer already dropped) must not skip closing the socket.
            try:
                ws.send(json.dumps({"type": "CloseStream"}))
            except Exception:
                pass
            try:
                ws.close()
            except Exception:
                pass
        if self._recv_thread:
            self._recv_thread.join(timeout=2)

    def _receiver(self):
        """Receive loop: store every non-empty final transcript in the buffer.

        Runs until ``close()`` sets the stop event or the socket raises
        anything other than a receive timeout.
        """
        while not self._recv_stop.is_set():
            try:
                raw = self._ws.recv(timeout=1.0)
            except TimeoutError:
                continue
            except Exception:
                break
            if not isinstance(raw, str):
                continue
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if not isinstance(msg, dict) or not msg.get("is_final"):
                continue
            # Only transcript results carry a dict under "channel"; other
            # message types (e.g. UtteranceEnd, where "channel" is a list)
            # and results with no alternatives are skipped instead of
            # crashing this thread.
            channel = msg.get("channel")
            if not isinstance(channel, dict):
                continue
            alternatives = channel.get("alternatives")
            if not isinstance(alternatives, list) or not alternatives:
                continue
            best = alternatives[0]
            if not isinstance(best, dict):
                continue
            text = (best.get("transcript") or "").strip()
            if text:
                self._buf.add(self.label, text)
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Local faster-whisper STT backend."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
WHISPER_DEBUG = os.environ.get("WHISPER_DEBUG", "").lower() in ("1", "true", "yes")
|
|
10
|
+
|
|
11
|
+
from audio_transcript_mcp.audio_utils import stereo_to_mono_f32
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from faster_whisper import WhisperModel
|
|
15
|
+
except ImportError:
|
|
16
|
+
WhisperModel = None
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import soxr
|
|
20
|
+
except ImportError:
|
|
21
|
+
soxr = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class WhisperBackend:
    """Accumulates audio chunks and transcribes locally with faster-whisper.

    Incoming audio is converted to mono float32 and buffered. Whenever a
    full chunk is available it is transcribed, and the buffer advances by
    ``chunk - overlap`` so consecutive chunks share ``overlap`` seconds of
    audio. Words repeated because of that overlap are removed before the
    text is added to the transcript buffer.
    """

    # The model is shared by all instances; the lock serialises
    # loading and reloading.
    _model = None
    _model_lock = threading.Lock()

    def __init__(self, label, sample_rate, channels, is_float32, buf, config):
        """Store stream parameters and derive chunk sizes in bytes.

        Args:
            label: Speaker label passed to ``buf.add`` for transcripts.
            sample_rate: Sample rate in Hz of the incoming audio.
            channels: Channel count of the incoming audio.
            is_float32: True for float32 input, False for int16 input.
            buf: Transcript sink exposing ``add(speaker, text)``.
            config: Mapping with optional ``whisper_*`` keys.

        A non-positive chunk length falls back to 5 seconds. An overlap that
        is negative or not shorter than the chunk is disabled, because it
        would make the buffer never advance.
        """
        self.label = label
        self.sample_rate = sample_rate
        self.channels = channels
        self._is_float32 = is_float32
        self._buf = buf
        self._model_name = config.get("whisper_model", "large-v3")
        self._device = config.get("whisper_device", "cuda")
        self._language = config.get("whisper_language", "")

        chunk_sec = float(config.get("whisper_chunk_sec", 5.0))
        overlap_sec = float(config.get("whisper_overlap_sec", 2.0))
        if not chunk_sec > 0:
            chunk_sec = 5.0
        if not 0 <= overlap_sec < chunk_sec:
            overlap_sec = 0.0

        # Sizes are computed in whole samples first so every byte count is a
        # multiple of 4 (mono float32 = 4 bytes/sample) and np.frombuffer
        # never sees a partial sample.
        chunk_samples = max(1, int(chunk_sec * sample_rate))
        overlap_samples = int(overlap_sec * sample_rate)
        if overlap_samples >= chunk_samples:
            overlap_samples = 0
            overlap_sec = 0.0

        self._chunk_sec = chunk_sec
        self._overlap_sec = overlap_sec
        self._pcm_buf = bytearray()
        self._chunk_bytes = chunk_samples * 4
        self._overlap_bytes = overlap_samples * 4
        self._step_bytes = self._chunk_bytes - self._overlap_bytes
        self._prev_words: list[str] = []

    @classmethod
    def _ensure_model(cls, buf, model_name, device):
        """Load the shared model, or reload it if it was unloaded.

        Raises:
            RuntimeError: faster-whisper is not installed.
        """
        with cls._model_lock:
            if cls._model is not None and cls._model.model.model_is_loaded:
                return
            if cls._model is not None:
                buf.add("system", f"[Reloading whisper model to {device}...]")
                cls._model.model.load_model()
                buf.add("system", "[Whisper model reloaded]")
            else:
                if WhisperModel is None:
                    raise RuntimeError("faster-whisper is not installed")
                buf.add("system", f"[Loading whisper model '{model_name}' on {device}...]")
                cls._model = WhisperModel(
                    model_name,
                    device=device,
                    compute_type="float16" if device == "cuda" else "int8",
                )
                buf.add("system", "[Whisper model loaded]")

    def connect(self):
        """Make sure the model is loaded.

        Returns:
            True on success; False after reporting the failure to the buffer.
        """
        try:
            self._ensure_model(self._buf, self._model_name, self._device)
            return True
        except Exception as exc:
            self._buf.add(self.label, f"[ERROR: whisper load failed: {exc}]")
            return False

    def send(self, data):
        """Accumulate audio. Converts to mono float32 internally.

        At most one chunk is transcribed per call. Always returns True;
        transcription errors are reported to the buffer instead of raised.
        """
        if self._is_float32:
            if self.channels >= 2:
                data = stereo_to_mono_f32(data, self.channels)
        else:
            arr = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
            if self.channels >= 2:
                arr = arr.reshape(-1, self.channels).mean(axis=1)
            data = arr.tobytes()
        self._pcm_buf.extend(data)

        if len(self._pcm_buf) >= self._chunk_bytes:
            chunk = bytes(self._pcm_buf[:self._chunk_bytes])
            # Advance by one step only, keeping the overlap for the next chunk.
            del self._pcm_buf[:self._step_bytes]
            try:
                self._transcribe(chunk)
            except Exception as exc:
                self._buf.add(self.label, f"[TRANSCRIBE ERROR: {exc}]")
        return True

    def close(self):
        """Transcribe the remaining tail (if long enough) and drop the buffer."""
        try:
            # The buffer holds 4-byte samples, so ``sample_rate`` bytes is a
            # quarter of a second of audio; shorter tails are discarded.
            if len(self._pcm_buf) > self.sample_rate:
                self._transcribe(bytes(self._pcm_buf), is_tail=True)
        except Exception as exc:
            # Same reporting as send(): a failed tail must not propagate into
            # the caller's shutdown path.
            self._buf.add(self.label, f"[TRANSCRIBE ERROR: {exc}]")
        finally:
            self._pcm_buf.clear()

    def _transcribe(self, mono_f32, is_tail=False):
        """Transcribe one chunk of mono float32 PCM bytes.

        Args:
            mono_f32: Raw mono float32 samples at ``self.sample_rate``.
            is_tail: True for the final partial chunk; disables the rule that
                stops at segments starting inside the overlap region.

        Raises:
            RuntimeError: resampling is required but soxr is not installed.
        """
        audio = np.frombuffer(mono_f32, dtype=np.float32)
        if audio.size == 0:
            return
        duration_sec = len(audio) / self.sample_rate

        if self.sample_rate != 16000:
            if soxr is None:
                raise RuntimeError(
                    f"soxr is not installed; cannot resample {self.sample_rate} Hz audio to 16000 Hz"
                )
            audio = soxr.resample(audio, self.sample_rate, 16000, quality="HQ").astype(np.float32)

        peak = float(np.max(np.abs(audio)))
        rms = float(np.sqrt(np.mean(audio ** 2)))

        if WHISPER_DEBUG:
            self._buf.add("debug", f"[chunk {duration_sec:.1f}s | peak={peak:.4f} rms={rms:.4f}]")

        # Treat very quiet chunks as silence and skip the model entirely.
        if peak < 0.01:
            if WHISPER_DEBUG:
                self._buf.add("debug", "[skipped: silence]")
            return

        lang = self._language or None
        t0 = time.monotonic()
        segments, info = self._model.transcribe(
            audio,
            language=lang,
            beam_size=3,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500),
            condition_on_previous_text=False,
        )

        step_sec = self._chunk_sec - self._overlap_sec
        all_segs = []
        texts = []
        for seg in segments:
            all_segs.append(seg)
            # A segment starting inside the overlap region will be seen again
            # at the start of the next chunk, so stop here.
            if not is_tail and self._overlap_sec > 0 and seg.start >= step_sec:
                break
            if seg.no_speech_prob > 0.5:
                if WHISPER_DEBUG:
                    self._buf.add("debug", f"[hallucination filtered: nsp={seg.no_speech_prob:.2f}] {seg.text.strip()}")
                continue
            text = seg.text.strip()
            if text:
                texts.append(text)
        # Measured after the loop so the time spent consuming segments counts.
        elapsed = time.monotonic() - t0

        if WHISPER_DEBUG:
            seg_details = " | ".join(
                f"[{s.start:.1f}-{s.end:.1f} p={s.avg_logprob:.2f} nsp={s.no_speech_prob:.2f}] {s.text.strip()}"
                for s in all_segs
            )
            self._buf.add("debug",
                f"[whisper {elapsed:.2f}s | lang={info.language} prob={info.language_probability:.2f} "
                f"| segs={len(all_segs)} kept={len(texts)}] {seg_details}"
            )

        if not texts:
            return

        combined = " ".join(texts)
        words = combined.split()

        if self._prev_words and self._overlap_sec > 0:
            before = len(words)
            words = self._strip_overlap(self._prev_words, words)
            if WHISPER_DEBUG and before != len(words):
                self._buf.add("debug", f"[dedup: stripped {before - len(words)} words]")

        # Remember the full (un-stripped) text for the next overlap comparison.
        self._prev_words = combined.split()
        if words:
            self._buf.add(self.label, " ".join(words))

    @staticmethod
    def _strip_overlap(prev, new):
        """Remove the longest prefix of `new` that matches a suffix of `prev`.

        Words are compared case-insensitively with surrounding punctuation
        stripped; at most 12 words are considered.
        """
        def norm(w):
            return w.strip(".,!?;:\"'()[]{}«»\u2014\u2013-").lower()

        max_check = min(len(prev), len(new), 12)
        # Try the longest candidate first so the first hit is the answer.
        for n in range(max_check, 0, -1):
            if all(norm(a) == norm(b) for a, b in zip(prev[-n:], new[:n])):
                return new[n:]
        return new
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Stereo Opus streaming recorder using PyOgg."""
|
|
2
|
+
|
|
3
|
+
import pathlib
|
|
4
|
+
import threading
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class StereoOpusRecorder:
    """Merge two mono int16 @ 48 kHz streams into a single stereo Opus file.

    The "me" stream becomes the left channel and the "others" stream the
    right channel. ``write()`` may be called from several threads; a lock
    serialises access to the per-channel buffers and the writer.
    """

    FRAME_SAMPLES = 960  # 20ms at 48kHz

    def __init__(self, path: pathlib.Path):
        """Create the Opus encoder/writer for ``path`` and empty buffers."""
        from pyogg import OggOpusWriter, OpusBufferedEncoder

        encoder = OpusBufferedEncoder()
        encoder.set_application("audio")
        encoder.set_sampling_frequency(48000)
        encoder.set_channels(2)
        encoder.set_frame_size(20)  # ms

        self._writer = OggOpusWriter(str(path), encoder)
        self._bufs = {"me": bytearray(), "others": bytearray()}
        self._lock = threading.Lock()
        # One mono frame in bytes: int16 = 2 bytes/sample.
        self._frame_bytes = self.FRAME_SAMPLES * 2
        self.path = path

    def write(self, label: str, mono_i16: bytes):
        """Queue mono int16 PCM for channel ``label`` and emit complete frames."""
        lead_limit = 48000 * 2 * 5  # 5 seconds of mono int16 at 48 kHz, in bytes
        with self._lock:
            mine = self._bufs[label]
            mine.extend(mono_i16)
            # When this channel has run more than the limit ahead, fill the
            # lagging channel with silence up to the same length.
            for other_label, other in self._bufs.items():
                if other_label == label:
                    continue
                gap = len(mine) - len(other)
                if gap > lead_limit:
                    other.extend(b'\x00' * gap)
            self._flush()

    def _flush(self):
        """Write out every frame for which both channels have data."""
        size = self._frame_bytes
        me = self._bufs["me"]
        others = self._bufs["others"]
        while min(len(me), len(others)) >= size:
            left = np.frombuffer(bytes(me[:size]), dtype=np.int16)
            right = np.frombuffer(bytes(others[:size]), dtype=np.int16)
            del me[:size]
            del others[:size]
            # Interleave as L, R, L, R, ... sample pairs.
            interleaved = np.column_stack([left, right]).astype(np.int16)
            self._writer.write(memoryview(bytearray(interleaved.tobytes())))

    def close(self):
        """Pad both channels to a common whole-frame length, flush, and close."""
        with self._lock:
            longest = max(len(pending) for pending in self._bufs.values())
            if longest > 0:
                # Round the longest buffer up to a whole number of frames.
                frame_count = -(-longest // self._frame_bytes)
                target = frame_count * self._frame_bytes
                for pending in self._bufs.values():
                    shortfall = target - len(pending)
                    if shortfall > 0:
                        pending.extend(b'\x00' * shortfall)
                self._flush()
            self._writer.close()
|
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Real-time audio transcription MCP server for Claude Code.
|
|
3
|
+
|
|
4
|
+
Captures mic + system audio (WASAPI loopback) on Windows,
|
|
5
|
+
transcribes via Deepgram OR local faster-whisper model.
|
|
6
|
+
|
|
7
|
+
Backend switchable at runtime via set_backend() tool.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import gc
|
|
11
|
+
import os
|
|
12
|
+
import pathlib
|
|
13
|
+
import queue
|
|
14
|
+
import threading
|
|
15
|
+
import time
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import soxr
|
|
20
|
+
|
|
21
|
+
from mcp.server.fastmcp import FastMCP
|
|
22
|
+
|
|
23
|
+
from audio_transcript_mcp.backends import create_backend
|
|
24
|
+
from audio_transcript_mcp.backends.whisper import WhisperBackend
|
|
25
|
+
from audio_transcript_mcp.recorder import StereoOpusRecorder
|
|
26
|
+
|
|
27
|
+
# ── config ──────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
# Name of the active STT backend, lower-cased; reassigned at runtime by the
# set_backend() tool.
stt_backend = os.getenv("STT_BACKEND", "deepgram").lower()

# Settings handed to every backend instance, read once from the environment.
# Deepgram values stay strings; the whisper chunk/overlap lengths are floats
# (seconds).
CONFIG = {
    "deepgram_api_key": os.getenv("DEEPGRAM_API_KEY", ""),
    "deepgram_language": os.getenv("DEEPGRAM_LANGUAGE", "ru"),
    "deepgram_model": os.getenv("DEEPGRAM_MODEL", "nova-3"),
    "deepgram_utterance_end_ms": os.getenv("DEEPGRAM_UTTERANCE_END_MS", "2500"),
    "deepgram_endpointing": os.getenv("DEEPGRAM_ENDPOINTING", "500"),
    "whisper_model": os.getenv("WHISPER_MODEL", "large-v3"),
    "whisper_device": os.getenv("WHISPER_DEVICE", "cuda"),
    "whisper_language": os.getenv("WHISPER_LANGUAGE", ""),
    "whisper_chunk_sec": float(os.getenv("WHISPER_CHUNK_SEC", "5")),
    "whisper_overlap_sec": float(os.getenv("WHISPER_OVERLAP_SEC", "2")),
}

# Maximum age, in seconds, of entries kept in the transcript buffer.
BUFFER_MAX_AGE = int(os.getenv("TRANSCRIPT_MAX_AGE", "3600"))

# Root directory under which each session gets its own sub-directory.
TRANSCRIPT_DIR = pathlib.Path(
    os.getenv(
        "TRANSCRIPT_DIR",
        pathlib.Path.home() / ".audio-transcript-mcp" / "transcripts",
    )
)

# Seconds to wait before retrying a failed or lost backend connection.
RECONNECT_DELAY = 3
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ── transcript buffer ───────────────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
@dataclass
class Entry:
    """One line of the in-memory transcript."""

    # Unix timestamp (time.time()) taken when the entry was added.
    ts: float
    # Source label, e.g. "me", "others", "system" or "debug".
    speaker: str
    # Transcribed text, or a bracketed status/diagnostic message.
    text: str
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class Buffer:
    """Thread-safe, age-limited store of transcript entries.

    Entries are appended in arrival order. Entries older than ``max_age``
    seconds are discarded when a new entry is added; reads never discard.
    """

    def __init__(self, max_age: int = 3600):
        """Create an empty buffer keeping entries for ``max_age`` seconds."""
        self._entries: "list[Entry]" = []
        self._lock = threading.Lock()
        self._max_age = max_age

    def add(self, speaker: str, text: str):
        """Append an entry stamped with the current time, then expire old ones."""
        with self._lock:
            # One clock read, taken under the lock, serves both the timestamp
            # and the expiry cutoff and keeps entries in timestamp order.
            now = time.time()
            self._entries.append(Entry(now, speaker, text))
            self._prune(now)

    def _prune(self, now: float):
        """Drop the leading run of entries whose age has reached ``max_age``.

        Must be called with the lock held. Entries are stored oldest first,
        so only the head needs scanning and the common "nothing expired"
        case does no copying.
        """
        cutoff = now - self._max_age
        expired = 0
        for entry in self._entries:
            if entry.ts > cutoff:
                break
            expired += 1
        if expired:
            del self._entries[:expired]

    def since(self, ts: float) -> "list[Entry]":
        """Return the entries whose timestamp is at or after ``ts``."""
        with self._lock:
            return [e for e in self._entries if e.ts >= ts]

    def last(self, seconds: float) -> "list[Entry]":
        """Return the entries added within the last ``seconds`` seconds."""
        return self.since(time.time() - seconds)

    def all(self) -> "list[Entry]":
        """Return a copy of every stored entry."""
        with self._lock:
            return list(self._entries)

    def clear(self):
        """Remove every stored entry."""
        with self._lock:
            self._entries.clear()

    @staticmethod
    def format(entries: "list[Entry]") -> str:
        """Render entries as ``[HH:MM:SS] speaker: text`` lines (local time).

        Returns ``"(empty transcript)"`` when ``entries`` is empty.
        """
        if not entries:
            return "(empty transcript)"
        return "\n".join(
            f"[{time.strftime('%H:%M:%S', time.localtime(e.ts))}] {e.speaker}: {e.text}"
            for e in entries
        )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
buf = Buffer(BUFFER_MAX_AGE)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ── audio engine ────────────────────────────────────────────────────
|
|
100
|
+
|
|
101
|
+
class AudioEngine:
    """Captures mic + loopback via PyAudioWPatch callbacks,
    sends to chosen STT backend. Auto-reconnects on drop.

    ``start()`` spawns one daemon worker thread per source ("me" for the
    default input device, "others" for the default WASAPI loopback device).
    ``stop()`` signals the workers, closes the streams and joins the threads.
    When a recorder could be created, each worker also feeds it with mono
    int16 audio at 48 kHz.
    """

    def __init__(self):
        self._running = False
        self._stop = threading.Event()
        self._threads: list[threading.Thread] = []
        self._streams = []
        self._pa = None
        self._session_start: float = 0.0
        self._recorder: StereoOpusRecorder | None = None
        # Set here as well as in start() so code that reads it before the
        # first start() sees None instead of raising AttributeError.
        self._session_dir = None

    @property
    def running(self) -> bool:
        """True between a successful start() and the next stop()."""
        return self._running

    def start(self) -> str:
        """Start capture and transcription.

        Returns:
            A status message; messages starting with ``ERROR:`` mean capture
            was not started.
        """
        if self._running:
            return "Already listening."

        # Best-effort COM initialisation; fails harmlessly where
        # ctypes.windll does not exist.
        try:
            import ctypes
            ctypes.windll.ole32.CoInitializeEx(0, 0)
        except Exception:
            pass

        try:
            import pyaudiowpatch as pyaudio
        except ImportError:
            return "ERROR: PyAudioWPatch not installed."

        # Create PyAudio before flipping any state so a failure here does not
        # leave the engine claiming to be running.
        try:
            pa = pyaudio.PyAudio()
        except Exception as exc:
            return f"ERROR: audio subsystem init failed: {exc}"

        self._stop.clear()
        self._running = True
        self._session_start = time.time()
        self._pa = pa

        # A recorder left over from a previous session (e.g. a stop()/start()
        # restart that never closed it) must be finalised before it is
        # replaced, otherwise its file is never completed.
        leftover = self._recorder
        if leftover is not None:
            try:
                leftover.close()
            except Exception as exc:
                buf.add("system", f"[WARN: previous recording not closed cleanly: {exc}]")

        # Create per-session directory and open stereo opus recorder
        self._recorder = None
        self._session_dir = None
        if TRANSCRIPT_DIR:
            try:
                ts = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(self._session_start))
                self._session_dir = TRANSCRIPT_DIR / ts
                self._session_dir.mkdir(parents=True, exist_ok=True)
                self._recorder = StereoOpusRecorder(self._session_dir / "audio.opus")
            except Exception as exc:
                # Recording is optional: report the problem and carry on with
                # transcription only.
                buf.add("system", f"[WARN: audio recording disabled: {exc}]")

        for label, use_loopback in [("me", False), ("others", True)]:
            t = threading.Thread(
                target=self._worker,
                args=(label, use_loopback),
                daemon=True,
                name=f"{label}-worker",
            )
            self._threads.append(t)
            t.start()

        backend_name = "local (faster-whisper)" if stt_backend == "local" else "Deepgram"
        return f"Listening started (mic + system audio) via {backend_name}."

    def stop(self) -> str:
        """Stop capture: signal workers, close streams, join threads.

        The recorder is left open; closing it is the caller's job.
        """
        if not self._running:
            return "Not listening."
        self._stop.set()
        for s in self._streams:
            try:
                s.stop_stream()
                s.close()
            except Exception:
                pass
        self._streams.clear()
        for t in self._threads:
            t.join(timeout=10)
        self._threads.clear()
        if self._pa:
            try:
                self._pa.terminate()
            except Exception:
                pass
            self._pa = None
        self._running = False
        return "Listening stopped."

    @staticmethod
    def _to_mono_f32(data, channels, is_float32):
        """Decode one raw PCM chunk into a mono float32 array.

        int16 input is scaled by 1/32768; multi-channel input is averaged
        across channels.
        """
        if is_float32:
            arr = np.frombuffer(data, dtype=np.float32)
        else:
            arr = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
        if channels >= 2:
            arr = arr.reshape(-1, channels).mean(axis=1)
        return arr

    def _worker(self, label: str, use_loopback: bool):
        """Capture one source and feed it to an STT backend until stopped.

        Args:
            label: Speaker label for this source ("me" or "others").
            use_loopback: True to capture the default WASAPI loopback device,
                False to capture the default input device.
        """
        import pyaudiowpatch as pyaudio

        p = self._pa

        try:
            if use_loopback:
                device = p.get_default_wasapi_loopback()
            else:
                device = p.get_default_input_device_info()
        except Exception as exc:
            buf.add("system", f"[ERROR: device not found: {exc}]")
            return

        sample_rate = int(device["defaultSampleRate"])
        channels = max(1, int(device["maxInputChannels"]))
        pa_format = pyaudio.paFloat32 if use_loopback else pyaudio.paInt16
        chunk_frames = sample_rate // 10  # 100ms

        audio_q: queue.Queue[bytes | None] = queue.Queue(maxsize=100)

        def audio_callback(in_data, frame_count, time_info, status):
            if self._stop.is_set():
                return (None, pyaudio.paComplete)
            try:
                audio_q.put_nowait(in_data)
            except queue.Full:
                # Consumer is behind; drop the chunk rather than block the
                # audio callback.
                pass
            return (None, pyaudio.paContinue)

        try:
            stream = p.open(
                format=pa_format,
                channels=channels,
                rate=sample_rate,
                input=True,
                input_device_index=device["index"],
                frames_per_buffer=chunk_frames,
                stream_callback=audio_callback,
            )
            stream.start_stream()
            self._streams.append(stream)
        except Exception as exc:
            buf.add("system", f"[ERROR: audio open failed: {exc}]")
            return

        buf.add("system", f"[STARTED: {device['name']}, {sample_rate}Hz, {channels}ch]")

        is_float32 = pa_format == pyaudio.paFloat32

        # Stateful resampler for opus recording (avoids boundary artifacts)
        opus_resampler = None
        if self._recorder and sample_rate != 48000:
            opus_resampler = soxr.ResampleStream(sample_rate, 48000, 1, dtype="float32", quality="HQ")

        while not self._stop.is_set():
            backend = create_backend(stt_backend, label, sample_rate, channels, is_float32, buf, CONFIG)
            if not backend.connect():
                buf.add("system", f"[RECONNECT: backend connect failed, retry in {RECONNECT_DELAY}s]")
                self._stop.wait(RECONNECT_DELAY)
                continue

            # Drain stale audio
            while True:
                try:
                    audio_q.get_nowait()
                except queue.Empty:
                    break

            disconnected = False
            while not self._stop.is_set():
                try:
                    data = audio_q.get(timeout=0.5)
                except queue.Empty:
                    continue
                if data is None:
                    break

                # Record to stereo opus
                recorder = self._recorder
                if recorder:
                    try:
                        arr = self._to_mono_f32(data, channels, is_float32)
                        if opus_resampler is not None:
                            arr = opus_resampler.resample_chunk(arr)
                        mono_i16 = np.clip(arr * 32767, -32768, 32767).astype(np.int16)
                        recorder.write(label, mono_i16.tobytes())
                    except Exception:
                        # Recording is best-effort and runs about ten times a
                        # second; a failure must neither stop transcription
                        # nor flood the transcript with messages.
                        pass

                if not backend.send(data):
                    disconnected = True
                    break

            try:
                backend.close()
            except Exception as exc:
                # Keep going so the stream below is still closed.
                buf.add("system", f"[ERROR: backend close failed: {exc}]")

            if disconnected and not self._stop.is_set():
                buf.add("system", f"[RECONNECT: connection lost, retry in {RECONNECT_DELAY}s...]")
                self._stop.wait(RECONNECT_DELAY)

            if stt_backend == "local" and not disconnected:
                break

        # Flush remaining resampler state into opus recorder
        recorder = self._recorder
        if opus_resampler is not None and recorder:
            try:
                tail = opus_resampler.resample_chunk(np.empty(0, dtype=np.float32), last=True)
                if len(tail) > 0:
                    mono_i16 = np.clip(tail * 32767, -32768, 32767).astype(np.int16)
                    recorder.write(label, mono_i16.tobytes())
            except Exception:
                pass

        # stop() may already have closed this stream; closing twice is
        # tolerated by swallowing the error.
        try:
            stream.stop_stream()
            stream.close()
        except Exception:
            pass
        buf.add("system", "[STOPPED]")
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
engine = AudioEngine()
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
# ── MCP tools ───────────────────────────────────────────────────────
|
|
317
|
+
|
|
318
|
+
mcp = FastMCP("audio-transcript")
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
@mcp.tool()
async def start_listening() -> str:
    """Start capturing mic + system audio and transcribing via current backend.
    Returns status message.
    """
    # The Deepgram backend needs an API key; refuse early when it is missing
    # instead of starting capture.
    needs_key = stt_backend == "deepgram"
    if needs_key and not CONFIG["deepgram_api_key"]:
        return "ERROR: Set DEEPGRAM_API_KEY env var first."
    status = engine.start()
    return status
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@mcp.tool()
async def stop_listening() -> str:
    """Stop audio capture and transcription."""
    # Steps: stop the engine, unload the whisper model, write the session
    # transcript, close the Opus recording. Each step is best-effort and
    # appends its outcome to the returned status message.
    session_start = engine._session_start
    result = engine.stop()

    with WhisperBackend._model_lock:
        model = WhisperBackend._model
        if model is not None:
            try:
                model.model.unload_model()
            except Exception as exc:
                # A failed unload must not prevent saving the transcript and
                # audio below.
                result += f" (whisper unload failed: {exc})"
    gc.collect()

    # Save session transcript to file
    # getattr: the attribute may not exist when the engine was never started.
    session_dir = getattr(engine, "_session_dir", None)
    if session_start and session_dir:
        try:
            session_dir.mkdir(parents=True, exist_ok=True)
            path = session_dir / "transcript.txt"
            lines = []
            for e in buf.since(session_start):
                # Debug diagnostics stay out of the saved transcript.
                if e.speaker == "debug":
                    continue
                t = time.strftime("%H:%M:%S", time.localtime(e.ts))
                lines.append(f"[{t}] {e.speaker} — {e.text}")
            path.write_text("\n\n".join(lines) + "\n", encoding="utf-8")
            result += f" Transcript saved to {path}."
        except Exception as exc:
            result += f" (transcript save failed: {exc})"

    # Close stereo opus recorder
    recorder = engine._recorder
    if recorder:
        try:
            recorder.close()
            p = recorder.path
            size_kb = p.stat().st_size / 1024
            result += f" Audio saved to {p} ({size_kb:.0f} KB)."
        except Exception as exc:
            result += f" (opus save failed: {exc})"
        engine._recorder = None

    return result
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
@mcp.tool()
async def is_listening() -> str:
    """Check if audio capture is currently active."""
    # Reports the engine's running flag as "Listening: True/False".
    active = engine.running
    return f"Listening: {active}"
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
@mcp.tool()
async def get_transcript(last_seconds: float = 60) -> str:
    """Get transcript for the last N seconds.

    Args:
        last_seconds: how far back in seconds (default 60)
    """
    # Select the recent entries first, then render them as text.
    recent = buf.last(last_seconds)
    return buf.format(recent)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
@mcp.tool()
async def get_full_transcript() -> str:
    """Get the entire accumulated transcript (up to max buffer age)."""
    # Snapshot every buffered entry, then render the snapshot as text.
    everything = buf.all()
    return buf.format(everything)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
@mcp.tool()
async def get_transcript_since(timestamp: float) -> str:
    """Get transcript entries since a Unix timestamp.

    Args:
        timestamp: Unix epoch timestamp (e.g. from time.time())
    """
    # Entries stamped at or after `timestamp` are included.
    selected = buf.since(timestamp)
    return buf.format(selected)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
@mcp.tool()
async def clear_transcript() -> str:
    """Clear the transcript buffer."""
    # Only the in-memory buffer is emptied here.
    buf.clear()
    confirmation = "Transcript cleared."
    return confirmation
|
|
409
|
+
return "Transcript cleared."
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
@mcp.tool()
async def get_backend() -> str:
    """Get current STT backend name ("deepgram" or "local")."""
    # Reads the module-level selection that set_backend() updates.
    current = stt_backend
    return f"Current backend: {current}"
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
@mcp.tool()
async def set_backend(backend: str) -> str:
    """Switch STT backend. If currently listening, restarts capture automatically.

    Args:
        backend: "deepgram" or "local"
    """
    global stt_backend
    backend = backend.strip().lower()
    if backend not in ("deepgram", "local"):
        return f"ERROR: unknown backend '{backend}'. Use 'deepgram' or 'local'."
    if backend == stt_backend:
        return f"Already using '{stt_backend}'."
    # While listening, switching to Deepgram without an API key would restart
    # capture into an endless reconnect loop; refuse and keep the current
    # backend running instead.
    if engine.running and backend == "deepgram" and not CONFIG["deepgram_api_key"]:
        return "ERROR: Set DEEPGRAM_API_KEY env var first."
    old = stt_backend
    stt_backend = backend
    if engine.running:
        engine.stop()
        # Finalise the recording of the session being replaced; start()
        # opens a new recorder and would otherwise drop this one unclosed.
        recorder = engine._recorder
        if recorder is not None:
            try:
                recorder.close()
            except Exception:
                pass  # best-effort: the restart matters more than the old file
            engine._recorder = None
        msg = engine.start()
        return f"Switched {old} -> {stt_backend} and restarted. {msg}"
    return f"Switched {old} -> {stt_backend}. Start listening when ready."
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
# ── entrypoint ──────────────────────────────────────────────────────
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def main() -> None:
    """Start the MCP server by delegating to ``mcp.run()``."""
    mcp.run()


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: audio-transcript-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Real-time audio transcription MCP server for Claude Code
|
|
5
|
+
Project-URL: Homepage, https://github.com/llilakoblock/audio-transcript-mcp
|
|
6
|
+
Project-URL: Repository, https://github.com/llilakoblock/audio-transcript-mcp
|
|
7
|
+
Project-URL: Issues, https://github.com/llilakoblock/audio-transcript-mcp/issues
|
|
8
|
+
Author: llilakoblock
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: audio,deepgram,mcp,transcription,whisper
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: faster-whisper>=1.0
|
|
23
|
+
Requires-Dist: mcp[cli]>=1.2
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: pyaudiowpatch>=0.2.12
|
|
26
|
+
Requires-Dist: pyogg>=0.7
|
|
27
|
+
Requires-Dist: soxr>=0.3
|
|
28
|
+
Requires-Dist: websockets>=12.0
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# audio-transcript-mcp
|
|
32
|
+
|
|
33
|
+
Real-time audio transcription MCP server for Claude Code.
|
|
34
|
+
|
|
35
|
+
Captures **microphone + system audio** (WASAPI loopback on Windows) and transcribes via **Deepgram** (cloud) or **faster-whisper** (local, GPU/CPU).
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
- **Dual audio capture**: mic + system sound simultaneously
|
|
40
|
+
- **Two STT backends** switchable at runtime (Deepgram nova-3 / faster-whisper)
|
|
41
|
+
- **Stereo opus recording**: each session saves a stereo opus file (L=mic, R=system audio)
|
|
42
|
+
- **Per-session directories**: transcript + audio saved to `~/.audio-transcript-mcp/transcripts/<timestamp>/`
|
|
43
|
+
- Chunk overlap with text deduplication (no cut words at boundaries)
|
|
44
|
+
- Native float32 audio pipeline for whisper (no lossy int16 round-trip)
|
|
45
|
+
- High-quality stateful resampling via soxr (no boundary artifacts)
|
|
46
|
+
- Whisper hallucination filter (no_speech_prob + avg_logprob thresholds)
|
|
47
|
+
- Transcript buffer with time-based queries
|
|
48
|
+
- Auto-reconnect for Deepgram WebSocket
|
|
49
|
+
- GPU model unload/reload on stop/start (CUDA memory management)
|
|
50
|
+
|
|
51
|
+
## Architecture
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
┌─────────────┐ ┌──────────┐ ┌─────────────────┐
|
|
55
|
+
│ Mic (int16) ├────►│ │ │ STT Backend │
|
|
56
|
+
│ WASAPI │ │ Worker ├────►│ whisper / DG ├──► Transcript buffer
|
|
57
|
+
└─────────────┘ │ Thread │ └─────────────────┘
|
|
58
|
+
│ ├────►┌─────────────────┐
|
|
59
|
+
┌─────────────┐ │ │ │ StereoOpusRec │
|
|
60
|
+
│ System audio ├────►│ │ │ L=me R=others ├──► audio.opus
|
|
61
|
+
│ Loopback f32 │ └──────────┘ └─────────────────┘
|
|
62
|
+
└─────────────┘
|
|
63
|
+
|
|
64
|
+
Audio pipeline: native capture → stereo→mono → soxr resample → backend/opus
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Each audio source runs in its own worker thread. Audio is captured in the device's native format (float32 for loopback, int16 for mic), downmixed to mono, resampled with soxr, and then routed to both the STT backend and the stereo opus recorder.
|
|
68
|
+
|
|
69
|
+
## Requirements
|
|
70
|
+
|
|
71
|
+
- Python 3.10+
|
|
72
|
+
- Windows (WASAPI loopback for system audio capture); mic-only on macOS/Linux
|
|
73
|
+
- NVIDIA GPU recommended for local whisper backend
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
### From PyPI (recommended)
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install audio-transcript-mcp
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Or run without installing via `uvx`:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
uvx audio-transcript-mcp
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### From source
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
git clone https://github.com/llilakoblock/audio-transcript-mcp.git
|
|
93
|
+
cd audio-transcript-mcp
|
|
94
|
+
pip install -e .
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## MCP Configuration
|
|
98
|
+
|
|
99
|
+
Add to your `mcp.json` (Claude Code settings):
|
|
100
|
+
|
|
101
|
+
### Using PyPI install
|
|
102
|
+
|
|
103
|
+
```json
|
|
104
|
+
{
|
|
105
|
+
"audio-transcript": {
|
|
106
|
+
"type": "stdio",
|
|
107
|
+
"command": "audio-transcript-mcp",
|
|
108
|
+
"env": {
|
|
109
|
+
"STT_BACKEND": "local",
|
|
110
|
+
"DEEPGRAM_API_KEY": "your-deepgram-api-key",
|
|
111
|
+
"DEEPGRAM_LANGUAGE": "ru",
|
|
112
|
+
"DEEPGRAM_MODEL": "nova-3",
|
|
113
|
+
"DEEPGRAM_UTTERANCE_END_MS": "2500",
|
|
114
|
+
"DEEPGRAM_ENDPOINTING": "500",
|
|
115
|
+
"WHISPER_MODEL": "large-v3",
|
|
116
|
+
"WHISPER_DEVICE": "cuda",
|
|
117
|
+
"WHISPER_LANGUAGE": "ru",
|
|
118
|
+
"WHISPER_CHUNK_SEC": "10",
|
|
119
|
+
"WHISPER_OVERLAP_SEC": "2",
|
|
120
|
+
"TRANSCRIPT_MAX_AGE": "3600"
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Using uvx (no install needed)
|
|
127
|
+
|
|
128
|
+
```json
|
|
129
|
+
{
|
|
130
|
+
"audio-transcript": {
|
|
131
|
+
"type": "stdio",
|
|
132
|
+
"command": "uvx",
|
|
133
|
+
"args": ["audio-transcript-mcp"],
|
|
134
|
+
"env": {
|
|
135
|
+
"STT_BACKEND": "deepgram",
|
|
136
|
+
"DEEPGRAM_API_KEY": "your-deepgram-api-key"
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
> **Note:** System audio capture (loopback) uses WASAPI and is Windows-only. On macOS/Linux only microphone input works out of the box.
|
|
143
|
+
|
|
144
|
+
## Environment Variables
|
|
145
|
+
|
|
146
|
+
| Variable | Default | Description |
|
|
147
|
+
|---|---|---|
|
|
148
|
+
| `STT_BACKEND` | `deepgram` | `"deepgram"` (cloud) or `"local"` (faster-whisper) |
|
|
149
|
+
| `DEEPGRAM_API_KEY` | — | API key for Deepgram (required if backend=deepgram) |
|
|
150
|
+
| `DEEPGRAM_LANGUAGE` | `ru` | Language code for Deepgram |
|
|
151
|
+
| `DEEPGRAM_MODEL` | `nova-3` | Deepgram model (`nova-3`, `nova-2`, etc.) |
|
|
152
|
+
| `DEEPGRAM_UTTERANCE_END_MS` | `2500` | Silence duration (ms) before finalizing utterance |
|
|
153
|
+
| `DEEPGRAM_ENDPOINTING` | `500` | Endpointing sensitivity (ms) |
|
|
154
|
+
| `WHISPER_MODEL` | `large-v3` | Model size: `tiny`, `base`, `small`, `medium`, `large-v3` |
|
|
155
|
+
| `WHISPER_DEVICE` | `cuda` | `"cuda"` or `"cpu"` |
|
|
156
|
+
| `WHISPER_LANGUAGE` | — | Language hint for whisper (empty = auto-detect) |
|
|
157
|
+
| `WHISPER_CHUNK_SEC` | `5` | Audio chunk duration in seconds |
|
|
158
|
+
| `WHISPER_OVERLAP_SEC` | `2` | Overlap between consecutive chunks (avoids cut words) |
|
|
159
|
+
| `TRANSCRIPT_MAX_AGE` | `3600` | Max transcript buffer age in seconds |
|
|
160
|
+
| `TRANSCRIPT_DIR` | `~/.audio-transcript-mcp/transcripts` | Directory for per-session transcript/audio files |
|
|
161
|
+
|
|
162
|
+
## Session Output
|
|
163
|
+
|
|
164
|
+
Each recording session creates a timestamped directory:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
~/.audio-transcript-mcp/transcripts/
|
|
168
|
+
2026-03-06_23-24-48/
|
|
169
|
+
transcript.txt # Plain text transcript
|
|
170
|
+
audio.opus # Stereo opus (L=mic, R=system)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
The transcript is plain text:
|
|
174
|
+
```
|
|
175
|
+
[23:24:50] me — Hello, can you hear me?
|
|
176
|
+
|
|
177
|
+
[23:24:52] others — Yes, I can hear you fine.
|
|
178
|
+
|
|
179
|
+
[23:24:55] system — [STARTED: Microphone, 44100Hz, 2ch]
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## MCP Tools
|
|
183
|
+
|
|
184
|
+
| Tool | Description |
|
|
185
|
+
|---|---|
|
|
186
|
+
| `start_listening` | Start capturing mic + system audio and transcribing |
|
|
187
|
+
| `stop_listening` | Stop capture, save transcript and opus recording |
|
|
188
|
+
| `is_listening` | Check if capture is active |
|
|
189
|
+
| `get_transcript` | Get transcript for the last N seconds (default 60) |
|
|
190
|
+
| `get_full_transcript` | Get the entire accumulated transcript (up to the max buffer age, see `TRANSCRIPT_MAX_AGE`) |
|
|
191
|
+
| `get_transcript_since` | Get transcript since a Unix timestamp |
|
|
192
|
+
| `clear_transcript` | Clear the transcript buffer |
|
|
193
|
+
| `get_backend` | Show current STT backend |
|
|
194
|
+
| `set_backend` | Switch backend (`"deepgram"` / `"local"`) at runtime; if currently listening, capture is restarted automatically |
|
|
195
|
+
|
|
196
|
+
## Project Structure
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
src/audio_transcript_mcp/
|
|
200
|
+
__init__.py # Package version
|
|
201
|
+
__main__.py # python -m entry point
|
|
202
|
+
server.py # MCP server, AudioEngine, config
|
|
203
|
+
audio_utils.py # Format conversion (float32→int16, stereo→mono)
|
|
204
|
+
backends/
|
|
205
|
+
__init__.py # Backend factory
|
|
206
|
+
whisper.py # Local faster-whisper STT
|
|
207
|
+
deepgram.py # Deepgram WebSocket STT
|
|
208
|
+
recorder/
|
|
209
|
+
__init__.py
|
|
210
|
+
opus.py # StereoOpusRecorder (PyOgg)
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Releasing
|
|
214
|
+
|
|
215
|
+
Releases are automated via GitHub Actions:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
# Update version in src/audio_transcript_mcp/__init__.py
|
|
219
|
+
git tag v0.1.0
|
|
220
|
+
git push origin v0.1.0
|
|
221
|
+
# CI automatically builds, publishes to PyPI, and creates a GitHub Release
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## License
|
|
225
|
+
|
|
226
|
+
MIT
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
audio_transcript_mcp/__init__.py,sha256=Q552b2DKL8KTpkvEbL9XjWOHBi_lybCpeD1XyOO5C_0,71
|
|
2
|
+
audio_transcript_mcp/__main__.py,sha256=i-qaCM9yraBkuIO2Aa7JwQ-K-i7kiF4rkn6foylEEgk,111
|
|
3
|
+
audio_transcript_mcp/audio_utils.py,sha256=EtQVN3W9aGm-SSG2hxOZGjGBsEhH_X8Cgb7vyF_TNCk,550
|
|
4
|
+
audio_transcript_mcp/server.py,sha256=UQ5B7J1-Ptoz9JKjpkQO8ux0XeGZtzhtNHBq9EZPR2s,15474
|
|
5
|
+
audio_transcript_mcp/backends/__init__.py,sha256=sItMvJITF6TCT8BHa8_O5PWrPuFKbvseY7dwl7ri5UQ,522
|
|
6
|
+
audio_transcript_mcp/backends/deepgram.py,sha256=kdJwjkg1V-9C2CLtR_XyOATW_TP_x5xCzhD2uDsrpWI,3078
|
|
7
|
+
audio_transcript_mcp/backends/whisper.py,sha256=ict5STAU6XMe4PgTrSRS4Acopl2BodrujjwyT6SBxn0,6601
|
|
8
|
+
audio_transcript_mcp/recorder/__init__.py,sha256=fk6GkQjL5Bd2N4qZCzVplTPWG7DvJh0Y_bnTzO51S1g,130
|
|
9
|
+
audio_transcript_mcp/recorder/opus.py,sha256=wIqWoMVcrYfJquBXdlbz85Uf6ntJAps5uGBfwYOqDsQ,2607
|
|
10
|
+
audio_transcript_mcp-0.1.0.dist-info/METADATA,sha256=MoQmQqE2oneMEVtYa5NYqrmztxc9hyh5JTtJU_SFQBg,7941
|
|
11
|
+
audio_transcript_mcp-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
12
|
+
audio_transcript_mcp-0.1.0.dist-info/entry_points.txt,sha256=YcoWhvqE7zIWV-X9lTg4iJAJ6bi5Ge661dJOfl1wEC8,74
|
|
13
|
+
audio_transcript_mcp-0.1.0.dist-info/licenses/LICENSE,sha256=rKBHD0FlJSZl_ZqhA4Hr8k7L2aRoWaX8cTUsuGC_nSw,1069
|
|
14
|
+
audio_transcript_mcp-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 llilakoblock
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|