audio-transcript-mcp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """Real-time audio transcription MCP server."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
+ """Allow running as `python -m audio_transcript_mcp`."""
2
+
3
+ from audio_transcript_mcp.server import main
4
+
5
+ main()
@@ -0,0 +1,15 @@
1
+ """Audio format conversion utilities."""
2
+
3
+ import numpy as np
4
+
5
+
6
def float32_to_int16(data: bytes) -> bytes:
    """Re-encode raw float32 PCM samples as int16 PCM.

    Each sample is scaled by 32767 and clamped to the int16 range before the
    cast, so out-of-range input saturates instead of wrapping around.
    """
    samples = np.frombuffer(data, dtype=np.float32)
    scaled = samples * 32767
    clamped = np.clip(scaled, -32768, 32767)
    return clamped.astype(np.int16).tobytes()
10
+
11
+
12
def stereo_to_mono_f32(data: bytes, channels: int = 2) -> bytes:
    """Average interleaved float32 frames down to a single channel.

    ``data`` is read as float32 samples and grouped into frames of
    ``channels`` samples; the mean of each frame becomes one output sample.
    """
    interleaved = np.frombuffer(data, dtype=np.float32)
    frames = interleaved.reshape(-1, channels)
    mono = np.mean(frames, axis=1)
    return mono.astype(np.float32).tobytes()
@@ -0,0 +1,12 @@
1
+ """STT backend factory."""
2
+
3
+ from audio_transcript_mcp.backends.deepgram import DeepgramBackend
4
+ from audio_transcript_mcp.backends.whisper import WhisperBackend
5
+
6
+
7
def create_backend(backend_type, label, sample_rate, channels, is_float32, buf, config):
    """Build the STT backend selected by *backend_type*.

    ``"local"`` yields a ``WhisperBackend``; every other value yields a
    ``DeepgramBackend``. The remaining arguments are handed to the backend
    constructor unchanged.
    """
    backend_cls = WhisperBackend if backend_type == "local" else DeepgramBackend
    return backend_cls(label, sample_rate, channels, is_float32, buf, config)
@@ -0,0 +1,91 @@
1
+ """Deepgram WebSocket STT backend."""
2
+
3
+ import json
4
+ import threading
5
+
6
+ from audio_transcript_mcp.audio_utils import float32_to_int16
7
+
8
+
9
class DeepgramBackend:
    """Streams audio to Deepgram via WebSocket. One instance per audio source.

    ``connect()`` opens the socket and starts a daemon receiver thread that
    appends every final transcript to ``buf`` under ``label``. ``send()``
    forwards raw PCM (float32 input is converted to int16 first) and
    ``close()`` asks the service to end the stream and shuts the socket.
    """

    _ENDPOINT = "wss://api.deepgram.com/v1/listen"

    def __init__(self, label, sample_rate, channels, is_float32, buf, config):
        """Store stream parameters and read the ``deepgram_*`` keys of *config*."""
        self.label = label
        self.sample_rate = sample_rate
        self.channels = channels
        self._is_float32 = is_float32
        self._buf = buf
        self._language = config.get("deepgram_language", "ru")
        self._api_key = config.get("deepgram_api_key", "")
        self._model = config.get("deepgram_model", "nova-3")
        self._utterance_end_ms = config.get("deepgram_utterance_end_ms", "2500")
        self._endpointing = config.get("deepgram_endpointing", "500")
        self._ws = None
        self._recv_thread = None
        self._recv_stop = threading.Event()

    def _build_url(self):
        """Return the listen URL with every query parameter URL-encoded."""
        from urllib.parse import urlencode

        params = {
            "encoding": "linear16",
            "sample_rate": self.sample_rate,
            "channels": self.channels,
            "language": self._language,
            "model": self._model,
            "punctuate": "true",
            "smart_format": "true",
            "interim_results": "true",
            "utterance_end_ms": self._utterance_end_ms,
            "endpointing": self._endpointing,
        }
        return f"{self._ENDPOINT}?{urlencode(params)}"

    @staticmethod
    def _extract_final_text(msg):
        """Return the stripped transcript of a final result message, else ``""``.

        Messages that are not JSON objects, are not flagged ``is_final``, or
        carry no usable first alternative all yield the empty string.
        """
        if not isinstance(msg, dict) or not msg.get("is_final"):
            return ""
        channel = msg.get("channel")
        if not isinstance(channel, dict):
            return ""
        alternatives = channel.get("alternatives")
        if not isinstance(alternatives, list) or not alternatives:
            return ""
        first = alternatives[0]
        if not isinstance(first, dict):
            return ""
        transcript = first.get("transcript")
        return transcript.strip() if isinstance(transcript, str) else ""

    def connect(self):
        """Open the WebSocket and start the receiver thread.

        Returns:
            True on success, False if the connection could not be opened.
        """
        from websockets.sync.client import connect as ws_connect

        headers = {"Authorization": f"Token {self._api_key}"}
        try:
            self._ws = ws_connect(self._build_url(), additional_headers=headers)
        except Exception:
            return False

        self._recv_stop.clear()
        self._recv_thread = threading.Thread(target=self._receiver, daemon=True)
        self._recv_thread.start()
        return True

    def send(self, data):
        """Send audio chunk. Converts float32 to int16 if needed.

        Returns:
            True if the chunk was handed to the socket, False if there is no
            open socket or the send failed.
        """
        if self._ws is None:
            return False
        if self._is_float32:
            data = float32_to_int16(data)
        try:
            self._ws.send(data)
            return True
        except Exception:
            return False

    def close(self):
        """Stop the receiver, request end of stream and close the socket."""
        self._recv_stop.set()
        ws = self._ws
        if ws is not None:
            try:
                ws.send(json.dumps({"type": "CloseStream"}))
            except Exception:
                # The socket may already be dead; closing below is still required.
                pass
            try:
                ws.close()
            except Exception:
                pass
        if self._recv_thread:
            self._recv_thread.join(timeout=2)

    def _receiver(self):
        """Receive loop: push final transcripts into the buffer until stopped."""
        while not self._recv_stop.is_set():
            try:
                raw = self._ws.recv(timeout=1.0)
            except TimeoutError:
                # No message within the poll interval; re-check the stop flag.
                continue
            except Exception:
                break
            if not isinstance(raw, str):
                continue
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                continue
            text = self._extract_final_text(msg)
            if text:
                self._buf.add(self.label, text)
@@ -0,0 +1,182 @@
1
+ """Local faster-whisper STT backend."""
2
+
3
+ import os
4
+ import threading
5
+ import time
6
+
7
+ import numpy as np
8
+
9
+ WHISPER_DEBUG = os.environ.get("WHISPER_DEBUG", "").lower() in ("1", "true", "yes")
10
+
11
+ from audio_transcript_mcp.audio_utils import stereo_to_mono_f32
12
+
13
+ try:
14
+ from faster_whisper import WhisperModel
15
+ except ImportError:
16
+ WhisperModel = None
17
+
18
+ try:
19
+ import soxr
20
+ except ImportError:
21
+ soxr = None
22
+
23
+
24
class WhisperBackend:
    """Accumulates audio chunks and transcribes locally with faster-whisper.

    Incoming audio is converted to mono float32 and buffered. Whenever a full
    chunk is available it is transcribed, and the buffer advances by
    ``chunk - overlap`` so consecutive chunks share audio; repeated words at
    the start of a new chunk are removed by ``_strip_overlap``.
    """

    # The model is stored on the class, so all instances share one copy.
    _model = None
    _model_lock = threading.Lock()

    def __init__(self, label, sample_rate, channels, is_float32, buf, config):
        """Store stream parameters and read the ``whisper_*`` keys of *config*."""
        self.label = label
        self.sample_rate = sample_rate
        self.channels = channels
        self._is_float32 = is_float32
        self._buf = buf
        self._model_name = config.get("whisper_model", "large-v3")
        self._device = config.get("whisper_device", "cuda")
        self._language = config.get("whisper_language", "")
        self._chunk_sec = config.get("whisper_chunk_sec", 5.0)
        self._overlap_sec = config.get("whisper_overlap_sec", 2.0)
        # An overlap outside [0, chunk) would make the step zero or negative,
        # so the buffer would never advance; fall back to no overlap.
        if not 0 <= self._overlap_sec < self._chunk_sec:
            self._overlap_sec = 0.0
        self._pcm_buf = bytearray()
        # mono float32 = 4 bytes/sample; sizes are whole samples so slices of
        # the buffer always stay aligned for np.frombuffer.
        bytes_per_sample = 4
        self._chunk_bytes = int(self._chunk_sec * sample_rate) * bytes_per_sample
        self._overlap_bytes = int(self._overlap_sec * sample_rate) * bytes_per_sample
        self._step_bytes = self._chunk_bytes - self._overlap_bytes
        if self._step_bytes <= 0:
            self._overlap_bytes = 0
            self._step_bytes = self._chunk_bytes
        self._prev_words: list[str] = []

    @classmethod
    def _ensure_model(cls, buf, model_name, device):
        """Load the shared model, or reload it if it was unloaded.

        Progress messages are written to *buf* under the ``system`` speaker.

        Raises:
            RuntimeError: if faster-whisper could not be imported.
        """
        with cls._model_lock:
            if cls._model is not None and cls._model.model.model_is_loaded:
                return
            if cls._model is not None:
                buf.add("system", f"[Reloading whisper model to {device}...]")
                cls._model.model.load_model()
                buf.add("system", "[Whisper model reloaded]")
            else:
                if WhisperModel is None:
                    raise RuntimeError("faster-whisper is not installed")
                buf.add("system", f"[Loading whisper model '{model_name}' on {device}...]")
                cls._model = WhisperModel(
                    model_name,
                    device=device,
                    compute_type="float16" if device == "cuda" else "int8",
                )
                buf.add("system", "[Whisper model loaded]")

    def connect(self):
        """Make sure the model is loaded.

        Returns:
            True on success; False after logging the failure to the buffer.
        """
        try:
            self._ensure_model(self._buf, self._model_name, self._device)
            return True
        except Exception as exc:
            self._buf.add(self.label, f"[ERROR: whisper load failed: {exc}]")
            return False

    def send(self, data):
        """Accumulate audio. Converts to mono float32 internally.

        Transcribes one chunk once enough audio is buffered. Transcription
        errors are logged to the buffer; the method always returns True.
        """
        if self._is_float32:
            if self.channels >= 2:
                data = stereo_to_mono_f32(data, self.channels)
        else:
            arr = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
            if self.channels >= 2:
                arr = arr.reshape(-1, self.channels).mean(axis=1)
            data = arr.tobytes()
        self._pcm_buf.extend(data)

        if len(self._pcm_buf) >= self._chunk_bytes:
            chunk = bytes(self._pcm_buf[:self._chunk_bytes])
            # Keep the overlap region so the next chunk starts inside this one.
            del self._pcm_buf[:self._step_bytes]
            try:
                self._transcribe(chunk)
            except Exception as exc:
                self._buf.add(self.label, f"[TRANSCRIBE ERROR: {exc}]")
        return True

    def close(self):
        """Transcribe whatever is still buffered, then drop the buffer.

        Errors are logged instead of raised so the caller's cleanup continues.
        """
        # NOTE(review): this compares a byte count with sample_rate, i.e. about
        # 0.25 s of mono float32 audio - confirm that threshold is intended.
        if len(self._pcm_buf) > self.sample_rate:
            try:
                self._transcribe(bytes(self._pcm_buf), is_tail=True)
            except Exception as exc:
                self._buf.add(self.label, f"[TRANSCRIBE ERROR: {exc}]")
        self._pcm_buf.clear()

    def _transcribe(self, mono_f32, is_tail=False):
        """Transcribe one mono float32 chunk and append new text to the buffer.

        Args:
            mono_f32: raw mono float32 PCM bytes at ``self.sample_rate``.
            is_tail: True for the final partial chunk; segments that start in
                the overlap region are then kept instead of being deferred.

        Raises:
            RuntimeError: if resampling is needed but soxr could not be imported.
        """
        audio = np.frombuffer(mono_f32, dtype=np.float32)
        duration_sec = len(audio) / self.sample_rate

        if self.sample_rate != 16000:
            if soxr is None:
                raise RuntimeError("soxr is not installed (needed to resample to 16 kHz)")
            audio = soxr.resample(audio, self.sample_rate, 16000, quality="HQ").astype(np.float32)

        peak = float(np.max(np.abs(audio)))
        rms = float(np.sqrt(np.mean(audio ** 2)))

        if WHISPER_DEBUG:
            self._buf.add("debug", f"[chunk {duration_sec:.1f}s | peak={peak:.4f} rms={rms:.4f}]")

        # Skip chunks whose peak amplitude is below 0.01.
        if peak < 0.01:
            if WHISPER_DEBUG:
                self._buf.add("debug", "[skipped: silence]")
            return

        lang = self._language or None
        t0 = time.monotonic()
        segments, info = self._model.transcribe(
            audio,
            language=lang,
            beam_size=3,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500),
            condition_on_previous_text=False,
        )

        step_sec = self._chunk_sec - self._overlap_sec
        all_segs = []
        texts = []
        for seg in segments:
            all_segs.append(seg)
            # Segments starting inside the overlap are left for the next chunk.
            if not is_tail and self._overlap_sec > 0 and seg.start >= step_sec:
                break
            if seg.no_speech_prob > 0.5:
                if WHISPER_DEBUG:
                    self._buf.add("debug", f"[hallucination filtered: nsp={seg.no_speech_prob:.2f}] {seg.text.strip()}")
                continue
            text = seg.text.strip()
            if text:
                texts.append(text)
        elapsed = time.monotonic() - t0

        if WHISPER_DEBUG:
            seg_details = " | ".join(
                f"[{s.start:.1f}-{s.end:.1f} p={s.avg_logprob:.2f} nsp={s.no_speech_prob:.2f}] {s.text.strip()}"
                for s in all_segs
            )
            self._buf.add("debug",
                f"[whisper {elapsed:.2f}s | lang={info.language} prob={info.language_probability:.2f} "
                f"| segs={len(all_segs)} kept={len(texts)}] {seg_details}"
            )

        if not texts:
            return

        combined = " ".join(texts)
        words = combined.split()

        if self._prev_words and self._overlap_sec > 0:
            before = len(words)
            words = self._strip_overlap(self._prev_words, words)
            if WHISPER_DEBUG and before != len(words):
                self._buf.add("debug", f"[dedup: stripped {before - len(words)} words]")

        # Remember the un-stripped words: the next chunk overlaps this audio.
        self._prev_words = combined.split()
        if words:
            self._buf.add(self.label, " ".join(words))

    @staticmethod
    def _strip_overlap(prev, new):
        """Remove the longest prefix of `new` that matches a suffix of `prev`.

        Words are compared case-insensitively with surrounding punctuation
        removed; at most 12 words are considered.
        """
        def norm(w):
            return w.strip(".,!?;:\"'()[]{}«»\u2014\u2013-").lower()

        max_check = min(len(prev), len(new), 12)
        best = 0
        for n in range(1, max_check + 1):
            if all(norm(a) == norm(b) for a, b in zip(prev[-n:], new[:n])):
                best = n
        return new[best:] if best > 0 else new
@@ -0,0 +1,5 @@
1
+ """Audio recorder module."""
2
+
3
+ from audio_transcript_mcp.recorder.opus import StereoOpusRecorder
4
+
5
+ __all__ = ["StereoOpusRecorder"]
@@ -0,0 +1,64 @@
1
+ """Stereo Opus streaming recorder using PyOgg."""
2
+
3
+ import pathlib
4
+ import threading
5
+
6
+ import numpy as np
7
+
8
+
9
class StereoOpusRecorder:
    """Collects mono int16@48kHz from two sources into one stereo opus file.

    Left channel = "me" (microphone), Right channel = "others" (system audio).
    Both worker threads call write() concurrently; internal lock keeps it safe.
    """

    FRAME_SAMPLES = 960  # 20ms at 48kHz

    def __init__(self, path: pathlib.Path):
        """Open an Ogg Opus writer at *path* for 48 kHz stereo, 20 ms frames."""
        from pyogg import OggOpusWriter, OpusBufferedEncoder

        encoder = OpusBufferedEncoder()
        encoder.set_application("audio")
        encoder.set_sampling_frequency(48000)
        encoder.set_channels(2)
        encoder.set_frame_size(20)  # ms

        self._writer = OggOpusWriter(str(path), encoder)
        self._bufs = {"me": bytearray(), "others": bytearray()}
        self._lock = threading.Lock()
        # Bytes of ONE channel per frame (int16 = 2 bytes/sample).
        self._frame_bytes = self.FRAME_SAMPLES * 2
        self.path = path

    def write(self, label: str, mono_i16: bytes):
        """Queue mono int16 PCM for channel *label* and emit complete frames."""
        with self._lock:
            own = self._bufs[label]
            own.extend(mono_i16)
            # Five seconds of 48 kHz int16 audio, in bytes.
            max_lead = 48000 * 2 * 5
            for other_label, other in self._bufs.items():
                if other_label == label:
                    continue
                # A channel lagging by more than the limit is topped up with
                # silence so the leading channel can still be flushed.
                if len(other) + max_lead < len(own):
                    other.extend(b'\x00' * (len(own) - len(other)))
            self._flush()

    def _flush(self):
        """Write every frame for which both channels have data (lock held by caller)."""
        frame_len = self._frame_bytes
        me_buf = self._bufs["me"]
        others_buf = self._bufs["others"]
        while min(len(me_buf), len(others_buf)) >= frame_len:
            left = np.frombuffer(bytes(me_buf[:frame_len]), dtype=np.int16)
            right = np.frombuffer(bytes(others_buf[:frame_len]), dtype=np.int16)
            del me_buf[:frame_len]
            del others_buf[:frame_len]
            # Interleave as L, R, L, R, ...
            interleaved = np.stack((left, right), axis=1).astype(np.int16)
            self._writer.write(memoryview(bytearray(interleaved.tobytes())))

    def close(self):
        """Pad both channels to a common whole-frame length, flush, close the file."""
        with self._lock:
            longest = max(len(pending) for pending in self._bufs.values())
            if longest > 0:
                frame_len = self._frame_bytes
                # Round the longest buffer up to a whole number of frames.
                target = -(-longest // frame_len) * frame_len
                for pending in self._bufs.values():
                    shortfall = target - len(pending)
                    if shortfall > 0:
                        pending.extend(b'\x00' * shortfall)
                self._flush()
            self._writer.close()
@@ -0,0 +1,448 @@
1
+ #!/usr/bin/env python3
2
+ """Real-time audio transcription MCP server for Claude Code.
3
+
4
+ Captures mic + system audio (WASAPI loopback) on Windows,
5
+ transcribes via Deepgram OR local faster-whisper model.
6
+
7
+ Backend switchable at runtime via set_backend() tool.
8
+ """
9
+
10
+ import gc
11
+ import os
12
+ import pathlib
13
+ import queue
14
+ import threading
15
+ import time
16
+ from dataclasses import dataclass
17
+
18
+ import numpy as np
19
+ import soxr
20
+
21
+ from mcp.server.fastmcp import FastMCP
22
+
23
+ from audio_transcript_mcp.backends import create_backend
24
+ from audio_transcript_mcp.backends.whisper import WhisperBackend
25
+ from audio_transcript_mcp.recorder import StereoOpusRecorder
26
+
27
+ # ── config ──────────────────────────────────────────────────────────
28
+
29
# Active STT backend name; reassigned at runtime by the set_backend() tool.
stt_backend = os.environ.get("STT_BACKEND", "deepgram").lower()

# config key -> (environment variable, default) for settings kept as strings.
_STRING_SETTINGS = {
    "deepgram_api_key": ("DEEPGRAM_API_KEY", ""),
    "deepgram_language": ("DEEPGRAM_LANGUAGE", "ru"),
    "deepgram_model": ("DEEPGRAM_MODEL", "nova-3"),
    "deepgram_utterance_end_ms": ("DEEPGRAM_UTTERANCE_END_MS", "2500"),
    "deepgram_endpointing": ("DEEPGRAM_ENDPOINTING", "500"),
    "whisper_model": ("WHISPER_MODEL", "large-v3"),
    "whisper_device": ("WHISPER_DEVICE", "cuda"),
    "whisper_language": ("WHISPER_LANGUAGE", ""),
}

# Settings handed to the backend constructors.
CONFIG = {
    key: os.environ.get(env_name, fallback)
    for key, (env_name, fallback) in _STRING_SETTINGS.items()
}
# Chunk timing is numeric; a non-numeric value fails here, at import time.
CONFIG["whisper_chunk_sec"] = float(os.environ.get("WHISPER_CHUNK_SEC", "5"))
CONFIG["whisper_overlap_sec"] = float(os.environ.get("WHISPER_OVERLAP_SEC", "2"))

# Transcript entries older than this many seconds are pruned from the buffer.
BUFFER_MAX_AGE = int(os.environ.get("TRANSCRIPT_MAX_AGE", "3600"))
_default_transcript_dir = pathlib.Path.home() / ".audio-transcript-mcp" / "transcripts"
TRANSCRIPT_DIR = pathlib.Path(os.environ.get("TRANSCRIPT_DIR", _default_transcript_dir))
# Seconds to wait before retrying a failed or dropped backend connection.
RECONNECT_DELAY = 3
47
+
48
+
49
+ # ── transcript buffer ───────────────────────────────────────────────
50
+
51
@dataclass
class Entry:
    # One transcript line: Unix timestamp, speaker label and the text.
    ts: float
    speaker: str
    text: str


class Buffer:
    """Lock-protected list of transcript entries with age-based pruning."""

    def __init__(self, max_age: int = 3600):
        """Create an empty buffer keeping entries for *max_age* seconds."""
        self._entries: list[Entry] = []
        self._lock = threading.Lock()
        self._max_age = max_age

    def add(self, speaker: str, text: str):
        """Append an entry stamped with the current time, then drop expired ones."""
        with self._lock:
            self._entries.append(Entry(time.time(), speaker, text))
            oldest_allowed = time.time() - self._max_age
            self._entries = [entry for entry in self._entries if entry.ts > oldest_allowed]

    def since(self, ts: float) -> list[Entry]:
        """Return the entries whose timestamp is at or after *ts*."""
        with self._lock:
            return list(filter(lambda entry: entry.ts >= ts, self._entries))

    def last(self, seconds: float) -> list[Entry]:
        """Return the entries recorded within the last *seconds* seconds."""
        window_start = time.time() - seconds
        return self.since(window_start)

    def all(self) -> list[Entry]:
        """Return a copy of every entry currently held."""
        with self._lock:
            return self._entries.copy()

    def clear(self):
        """Remove every entry."""
        with self._lock:
            del self._entries[:]

    @staticmethod
    def format(entries: list[Entry]) -> str:
        """Render entries as ``[HH:MM:SS] speaker: text`` lines in local time."""
        if not entries:
            return "(empty transcript)"
        return "\n".join(
            f"[{time.strftime('%H:%M:%S', time.localtime(entry.ts))}] {entry.speaker}: {entry.text}"
            for entry in entries
        )
94
+
95
+
96
+ buf = Buffer(BUFFER_MAX_AGE)
97
+
98
+
99
+ # ── audio engine ────────────────────────────────────────────────────
100
+
101
class AudioEngine:
    """Captures mic + loopback via PyAudioWPatch callbacks,
    sends to chosen STT backend. Auto-reconnects on drop.

    ``start()`` spawns one worker thread per source ("me" = default input
    device, "others" = default WASAPI loopback). Each worker feeds captured
    audio to an STT backend and, when a recorder is open, to the stereo opus
    recorder. ``stop()`` signals the workers, closes the streams and joins.
    """

    def __init__(self):
        self._running = False
        self._stop = threading.Event()
        self._threads: list[threading.Thread] = []
        self._streams = []
        self._pa = None
        self._session_start: float = 0.0
        # Initialised here so it can be read before the first start().
        self._session_dir: pathlib.Path | None = None
        self._recorder: StereoOpusRecorder | None = None

    @property
    def running(self) -> bool:
        """True between a successful start() and the following stop()."""
        return self._running

    def start(self) -> str:
        """Start capture and transcription; return a human-readable status."""
        if self._running:
            return "Already listening."

        try:
            import ctypes
            ctypes.windll.ole32.CoInitializeEx(0, 0)
        except Exception:
            # Best effort: ctypes.windll does not exist off Windows.
            pass

        try:
            import pyaudiowpatch as pyaudio
        except ImportError:
            return "ERROR: PyAudioWPatch not installed."

        # Create the audio handle before flipping any state, so a failure here
        # does not leave the engine marked as running with no workers.
        try:
            pa = pyaudio.PyAudio()
        except Exception as exc:
            return f"ERROR: audio subsystem init failed: {exc}"

        self._stop.clear()
        self._running = True
        self._session_start = time.time()
        self._pa = pa

        # Create per-session directory and open stereo opus recorder
        self._recorder = None
        self._session_dir = None
        if TRANSCRIPT_DIR:
            try:
                ts = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(self._session_start))
                self._session_dir = TRANSCRIPT_DIR / ts
                self._session_dir.mkdir(parents=True, exist_ok=True)
                self._recorder = StereoOpusRecorder(self._session_dir / "audio.opus")
            except Exception as exc:
                # Transcription still works without the recording; say why it is off.
                buf.add("system", f"[WARNING: audio recording disabled: {exc}]")

        for label, use_loopback in [("me", False), ("others", True)]:
            t = threading.Thread(
                target=self._worker,
                args=(label, use_loopback),
                daemon=True,
                name=f"{label}-worker",
            )
            self._threads.append(t)
            t.start()

        backend_name = "local (faster-whisper)" if stt_backend == "local" else "Deepgram"
        return f"Listening started (mic + system audio) via {backend_name}."

    def stop(self) -> str:
        """Stop the workers and release audio resources; return a status string."""
        if not self._running:
            return "Not listening."
        self._stop.set()
        for s in self._streams:
            try:
                s.stop_stream()
                s.close()
            except Exception:
                # The owning worker may already have closed this stream.
                pass
        self._streams.clear()
        for t in self._threads:
            t.join(timeout=10)
        self._threads.clear()
        if self._pa:
            try:
                self._pa.terminate()
            except Exception:
                pass
            self._pa = None
        self._running = False
        return "Listening stopped."

    @staticmethod
    def _to_mono_f32(data, channels, is_float32):
        """Decode one captured chunk into a mono float32 array.

        int16 input is scaled by 1/32768; multi-channel input is averaged
        across channels.
        """
        if is_float32:
            arr = np.frombuffer(data, dtype=np.float32)
        else:
            arr = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
        if channels >= 2:
            arr = arr.reshape(-1, channels).mean(axis=1)
        return arr

    def _worker(self, label: str, use_loopback: bool):
        """Capture one source until stopped, feeding the backend and recorder.

        Args:
            label: speaker label for transcript entries and recorder channel.
            use_loopback: True for the default WASAPI loopback device
                (float32), False for the default input device (int16).
        """
        import pyaudiowpatch as pyaudio

        p = self._pa

        try:
            if use_loopback:
                device = p.get_default_wasapi_loopback()
            else:
                device = p.get_default_input_device_info()
        except Exception as exc:
            buf.add("system", f"[ERROR: device not found: {exc}]")
            return

        sample_rate = int(device["defaultSampleRate"])
        channels = max(1, int(device["maxInputChannels"]))
        pa_format = pyaudio.paFloat32 if use_loopback else pyaudio.paInt16
        chunk_frames = sample_rate // 10  # 100ms

        audio_q: queue.Queue[bytes | None] = queue.Queue(maxsize=100)

        def audio_callback(in_data, frame_count, time_info, status):
            if self._stop.is_set():
                return (None, pyaudio.paComplete)
            try:
                audio_q.put_nowait(in_data)
            except queue.Full:
                # Consumer is behind; drop this chunk rather than block the callback.
                pass
            return (None, pyaudio.paContinue)

        try:
            stream = p.open(
                format=pa_format,
                channels=channels,
                rate=sample_rate,
                input=True,
                input_device_index=device["index"],
                frames_per_buffer=chunk_frames,
                stream_callback=audio_callback,
            )
            stream.start_stream()
            self._streams.append(stream)
        except Exception as exc:
            buf.add("system", f"[ERROR: audio open failed: {exc}]")
            return

        buf.add("system", f"[STARTED: {device['name']}, {sample_rate}Hz, {channels}ch]")

        is_float32 = pa_format == pyaudio.paFloat32

        # Stateful resampler for opus recording (avoids boundary artifacts)
        opus_resampler = None
        if self._recorder and sample_rate != 48000:
            opus_resampler = soxr.ResampleStream(sample_rate, 48000, 1, dtype="float32", quality="HQ")

        while not self._stop.is_set():
            backend = create_backend(stt_backend, label, sample_rate, channels, is_float32, buf, CONFIG)
            if not backend.connect():
                buf.add("system", f"[RECONNECT: backend connect failed, retry in {RECONNECT_DELAY}s]")
                self._stop.wait(RECONNECT_DELAY)
                continue

            # Drain stale audio
            while not audio_q.empty():
                try:
                    audio_q.get_nowait()
                except queue.Empty:
                    break

            disconnected = False
            while not self._stop.is_set():
                try:
                    data = audio_q.get(timeout=0.5)
                except queue.Empty:
                    continue
                if data is None:
                    break

                # Record to stereo opus
                if self._recorder:
                    try:
                        arr = self._to_mono_f32(data, channels, is_float32)
                        if opus_resampler is not None:
                            arr = opus_resampler.resample_chunk(arr)
                        mono_i16 = np.clip(arr * 32767, -32768, 32767).astype(np.int16)
                        self._recorder.write(label, mono_i16.tobytes())
                    except Exception:
                        # Recording is best-effort and must not interrupt
                        # transcription; this runs for every 100 ms chunk.
                        pass

                if not backend.send(data):
                    disconnected = True
                    break

            # A failing close must not skip the stream cleanup below.
            try:
                backend.close()
            except Exception as exc:
                buf.add("system", f"[ERROR: backend close failed: {exc}]")

            if disconnected and not self._stop.is_set():
                buf.add("system", f"[RECONNECT: connection lost, retry in {RECONNECT_DELAY}s...]")
                self._stop.wait(RECONNECT_DELAY)

            if stt_backend == "local" and not disconnected:
                break

        # Flush remaining resampler state into opus recorder
        if opus_resampler is not None and self._recorder:
            try:
                tail = opus_resampler.resample_chunk(np.empty(0, dtype=np.float32), last=True)
                if len(tail) > 0:
                    mono_i16 = np.clip(tail * 32767, -32768, 32767).astype(np.int16)
                    self._recorder.write(label, mono_i16.tobytes())
            except Exception:
                pass

        try:
            stream.stop_stream()
            stream.close()
        except Exception:
            # stop() may have closed the stream already.
            pass
        buf.add("system", "[STOPPED]")
311
+
312
+
313
+ engine = AudioEngine()
314
+
315
+
316
+ # ── MCP tools ───────────────────────────────────────────────────────
317
+
318
+ mcp = FastMCP("audio-transcript")
319
+
320
+
321
@mcp.tool()
async def start_listening() -> str:
    """Start capturing mic + system audio and transcribing via current backend.
    Returns status message.
    """
    # Only the Deepgram backend needs an API key; every other case starts directly.
    if stt_backend != "deepgram" or CONFIG["deepgram_api_key"]:
        return engine.start()
    return "ERROR: Set DEEPGRAM_API_KEY env var first."
329
+
330
+
331
@mcp.tool()
async def stop_listening() -> str:
    """Stop audio capture and transcription."""
    session_start = engine._session_start
    # Remember whether a session was active: a repeated stop must not rewrite
    # the previous session's transcript from a buffer that may have been pruned.
    was_running = engine.running
    result = engine.stop()

    # Unload the shared whisper model, if one was ever created.
    with WhisperBackend._model_lock:
        if WhisperBackend._model is not None:
            WhisperBackend._model.model.unload_model()
            gc.collect()

    # Save session transcript to file.
    # getattr: _session_dir is only assigned by engine.start().
    session_dir = getattr(engine, "_session_dir", None)
    if was_running and session_start and session_dir:
        try:
            session_dir.mkdir(parents=True, exist_ok=True)
            entries = buf.since(session_start)
            path = session_dir / "transcript.txt"
            lines = []
            for e in entries:
                # Debug entries are kept out of the saved transcript.
                if e.speaker in ("debug",):
                    continue
                t = time.strftime("%H:%M:%S", time.localtime(e.ts))
                lines.append(f"[{t}] {e.speaker} — {e.text}")
            path.write_text("\n\n".join(lines) + "\n", encoding="utf-8")
            result += f" Transcript saved to {path}."
        except Exception as exc:
            result += f" (transcript save failed: {exc})"

    # Close stereo opus recorder
    if engine._recorder:
        try:
            engine._recorder.close()
            p = engine._recorder.path
            size_kb = p.stat().st_size / 1024
            result += f" Audio saved to {p} ({size_kb:.0f} KB)."
        except Exception as exc:
            result += f" (opus save failed: {exc})"
        engine._recorder = None

    return result
371
+
372
+
373
@mcp.tool()
async def is_listening() -> str:
    """Check if audio capture is currently active."""
    active = engine.running
    return f"Listening: {active}"
377
+
378
+
379
@mcp.tool()
async def get_transcript(last_seconds: float = 60) -> str:
    """Get transcript for the last N seconds.

    Args:
        last_seconds: how far back in seconds (default 60)
    """
    recent_entries = buf.last(last_seconds)
    return buf.format(recent_entries)
387
+
388
+
389
@mcp.tool()
async def get_full_transcript() -> str:
    """Get the entire accumulated transcript (up to max buffer age)."""
    every_entry = buf.all()
    return buf.format(every_entry)
393
+
394
+
395
@mcp.tool()
async def get_transcript_since(timestamp: float) -> str:
    """Get transcript entries since a Unix timestamp.

    Args:
        timestamp: Unix epoch timestamp (e.g. from time.time())
    """
    newer_entries = buf.since(timestamp)
    return buf.format(newer_entries)
403
+
404
+
405
@mcp.tool()
async def clear_transcript() -> str:
    """Clear the transcript buffer."""
    confirmation = "Transcript cleared."
    buf.clear()
    return confirmation
410
+
411
+
412
@mcp.tool()
async def get_backend() -> str:
    """Get current STT backend name ("deepgram" or "local")."""
    active_backend = stt_backend
    return f"Current backend: {active_backend}"
416
+
417
+
418
@mcp.tool()
async def set_backend(backend: str) -> str:
    """Switch STT backend. If currently listening, restarts capture automatically.

    Args:
        backend: "deepgram" or "local"
    """
    global stt_backend
    backend = backend.strip().lower()
    if backend not in ("deepgram", "local"):
        return f"ERROR: unknown backend '{backend}'. Use 'deepgram' or 'local'."
    if backend == stt_backend:
        return f"Already using '{stt_backend}'."

    restart = engine.running
    # Restarting goes straight to engine.start(), which does not check the
    # API key, so apply the same check start_listening() does.
    if restart and backend == "deepgram" and not CONFIG["deepgram_api_key"]:
        return "ERROR: Set DEEPGRAM_API_KEY env var first."

    old = stt_backend
    stt_backend = backend
    if restart:
        engine.stop()
        # engine.start() replaces the recorder, so close the current one first;
        # otherwise the session's opus file is dropped without being closed.
        recorder = engine._recorder
        if recorder is not None:
            try:
                recorder.close()
            except Exception as exc:
                buf.add("system", f"[WARNING: opus save failed: {exc}]")
            engine._recorder = None
        msg = engine.start()
        return f"Switched {old} -> {stt_backend} and restarted. {msg}"
    return f"Switched {old} -> {stt_backend}. Start listening when ready."
438
+
439
+
440
+ # ── entrypoint ──────────────────────────────────────────────────────
441
+
442
+
443
def main():
    """Console-script entry point: run the MCP server."""
    mcp.run()
445
+
446
+
447
+ if __name__ == "__main__":
448
+ main()
@@ -0,0 +1,226 @@
1
+ Metadata-Version: 2.4
2
+ Name: audio-transcript-mcp
3
+ Version: 0.1.0
4
+ Summary: Real-time audio transcription MCP server for Claude Code
5
+ Project-URL: Homepage, https://github.com/llilakoblock/audio-transcript-mcp
6
+ Project-URL: Repository, https://github.com/llilakoblock/audio-transcript-mcp
7
+ Project-URL: Issues, https://github.com/llilakoblock/audio-transcript-mcp/issues
8
+ Author: llilakoblock
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: audio,deepgram,mcp,transcription,whisper
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: faster-whisper>=1.0
23
+ Requires-Dist: mcp[cli]>=1.2
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: pyaudiowpatch>=0.2.12
26
+ Requires-Dist: pyogg>=0.7
27
+ Requires-Dist: soxr>=0.3
28
+ Requires-Dist: websockets>=12.0
29
+ Description-Content-Type: text/markdown
30
+
31
+ # audio-transcript-mcp
32
+
33
+ Real-time audio transcription MCP server for Claude Code.
34
+
35
+ Captures **microphone + system audio** (WASAPI loopback on Windows) and transcribes via **Deepgram** (cloud) or **faster-whisper** (local, GPU/CPU).
36
+
37
+ ## Features
38
+
39
+ - **Dual audio capture**: mic + system sound simultaneously
40
+ - **Two STT backends** switchable at runtime (Deepgram nova-3 / faster-whisper)
41
+ - **Stereo opus recording**: each session saves a stereo opus file (L=mic, R=system audio)
42
+ - **Per-session directories**: transcript + audio saved to `~/.audio-transcript-mcp/transcripts/<timestamp>/`
43
+ - Chunk overlap with text deduplication (no cut words at boundaries)
44
+ - Native float32 audio pipeline for whisper (no lossy int16 round-trip)
45
+ - High-quality stateful resampling via soxr (no boundary artifacts)
46
+ - Whisper hallucination filter (segments with no_speech_prob above 0.5 are dropped)
47
+ - Transcript buffer with time-based queries
48
+ - Auto-reconnect for Deepgram WebSocket
49
+ - GPU model unload/reload on stop/start (CUDA memory management)
50
+
51
+ ## Architecture
52
+
53
+ ```
54
+ ┌─────────────┐ ┌──────────┐ ┌─────────────────┐
55
+ │ Mic (int16) ├────►│ │ │ STT Backend │
56
+ │ WASAPI │ │ Worker ├────►│ whisper / DG ├──► Transcript buffer
57
+ └─────────────┘ │ Thread │ └─────────────────┘
58
+ │ ├────►┌─────────────────┐
59
+ ┌─────────────┐ │ │ │ StereoOpusRec │
60
+ │ System audio ├────►│ │ │ L=me R=others ├──► audio.opus
61
+ │ Loopback f32 │ └──────────┘ └─────────────────┘
62
+ └─────────────┘
63
+
64
+ Audio pipeline: native capture → stereo→mono → soxr resample → backend/opus
65
+ ```
66
+
67
+ Each audio source runs in its own worker thread. Audio is captured in the device's native format (float32 for loopback, int16 for mic) and routed to both the STT backend and the stereo opus recorder. The recorder and the whisper backend downmix it to mono; the Deepgram backend receives the captured channel layout as int16.
68
+
69
+ ## Requirements
70
+
71
+ - Python 3.10+
72
+ - Windows (WASAPI loopback for system audio capture); mic-only on macOS/Linux
73
+ - NVIDIA GPU recommended for local whisper backend
74
+
75
+ ## Installation
76
+
77
+ ### From PyPI (recommended)
78
+
79
+ ```bash
80
+ pip install audio-transcript-mcp
81
+ ```
82
+
83
+ Or run without installing via `uvx`:
84
+
85
+ ```bash
86
+ uvx audio-transcript-mcp
87
+ ```
88
+
89
+ ### From source
90
+
91
+ ```bash
92
+ git clone https://github.com/llilakoblock/audio-transcript-mcp.git
93
+ cd audio-transcript-mcp
94
+ pip install -e .
95
+ ```
96
+
97
+ ## MCP Configuration
98
+
99
+ Add to your `mcp.json` (Claude Code settings):
100
+
101
+ ### Using PyPI install
102
+
103
+ ```json
104
+ {
105
+ "audio-transcript": {
106
+ "type": "stdio",
107
+ "command": "audio-transcript-mcp",
108
+ "env": {
109
+ "STT_BACKEND": "local",
110
+ "DEEPGRAM_API_KEY": "your-deepgram-api-key",
111
+ "DEEPGRAM_LANGUAGE": "ru",
112
+ "DEEPGRAM_MODEL": "nova-3",
113
+ "DEEPGRAM_UTTERANCE_END_MS": "2500",
114
+ "DEEPGRAM_ENDPOINTING": "500",
115
+ "WHISPER_MODEL": "large-v3",
116
+ "WHISPER_DEVICE": "cuda",
117
+ "WHISPER_LANGUAGE": "ru",
118
+ "WHISPER_CHUNK_SEC": "10",
119
+ "WHISPER_OVERLAP_SEC": "2",
120
+ "TRANSCRIPT_MAX_AGE": "3600"
121
+ }
122
+ }
123
+ }
124
+ ```
125
+
126
+ ### Using uvx (no install needed)
127
+
128
+ ```json
129
+ {
130
+ "audio-transcript": {
131
+ "type": "stdio",
132
+ "command": "uvx",
133
+ "args": ["audio-transcript-mcp"],
134
+ "env": {
135
+ "STT_BACKEND": "deepgram",
136
+ "DEEPGRAM_API_KEY": "your-deepgram-api-key"
137
+ }
138
+ }
139
+ }
140
+ ```
141
+
142
+ > **Note:** System audio capture (loopback) uses WASAPI and is Windows-only. On macOS/Linux only microphone input works out of the box.
143
+
144
+ ## Environment Variables
145
+
146
+ | Variable | Default | Description |
147
+ |---|---|---|
148
+ | `STT_BACKEND` | `deepgram` | `"deepgram"` (cloud) or `"local"` (faster-whisper) |
149
+ | `DEEPGRAM_API_KEY` | — | API key for Deepgram (required if backend=deepgram) |
150
+ | `DEEPGRAM_LANGUAGE` | `ru` | Language code for Deepgram |
151
+ | `DEEPGRAM_MODEL` | `nova-3` | Deepgram model (`nova-3`, `nova-2`, etc.) |
152
+ | `DEEPGRAM_UTTERANCE_END_MS` | `2500` | Silence duration (ms) before finalizing utterance |
153
+ | `DEEPGRAM_ENDPOINTING` | `500` | Endpointing sensitivity (ms) |
154
+ | `WHISPER_MODEL` | `large-v3` | Model size: `tiny`, `base`, `small`, `medium`, `large-v3` |
155
+ | `WHISPER_DEVICE` | `cuda` | `"cuda"` or `"cpu"` |
156
+ | `WHISPER_LANGUAGE` | — | Language hint for whisper (empty = auto-detect) |
157
+ | `WHISPER_CHUNK_SEC` | `5` | Audio chunk duration in seconds |
158
+ | `WHISPER_OVERLAP_SEC` | `2` | Overlap between consecutive chunks (avoids cut words) |
159
+ | `TRANSCRIPT_MAX_AGE` | `3600` | Max transcript buffer age in seconds |
160
+ | `TRANSCRIPT_DIR` | `~/.audio-transcript-mcp/transcripts` | Directory for per-session transcript/audio files |
161
+
162
+ ## Session Output
163
+
164
+ Each recording session creates a timestamped directory:
165
+
166
+ ```
167
+ ~/.audio-transcript-mcp/transcripts/
168
+ 2026-03-06_23-24-48/
169
+ transcript.txt # Plain text transcript
170
+ audio.opus # Stereo opus (L=mic, R=system)
171
+ ```
172
+
173
+ The transcript is plain text:
174
+ ```
175
+ [23:24:50] me — Hello, can you hear me?
176
+
177
+ [23:24:52] others — Yes, I can hear you fine.
178
+
179
+ [23:24:55] system — [STARTED: Microphone, 44100Hz, 2ch]
180
+ ```
181
+
182
+ ## MCP Tools
183
+
184
+ | Tool | Description |
185
+ |---|---|
186
+ | `start_listening` | Start capturing mic + system audio and transcribing |
187
+ | `stop_listening` | Stop capture, save transcript and opus recording |
188
+ | `is_listening` | Check if capture is active |
189
+ | `get_transcript` | Get transcript for the last N seconds (default 60) |
190
+ | `get_full_transcript` | Get entire transcript buffer |
191
+ | `get_transcript_since` | Get transcript since a Unix timestamp |
192
+ | `clear_transcript` | Clear the transcript buffer |
193
+ | `get_backend` | Show current STT backend |
194
+ | `set_backend` | Switch backend (`"deepgram"` / `"local"`) at runtime |
195
+
196
+ ## Project Structure
197
+
198
+ ```
199
+ src/audio_transcript_mcp/
200
+ __init__.py # Package version
201
+ __main__.py # python -m entry point
202
+ server.py # MCP server, AudioEngine, config
203
+ audio_utils.py # Format conversion (float32↔int16, stereo→mono)
204
+ backends/
205
+ __init__.py # Backend factory
206
+ whisper.py # Local faster-whisper STT
207
+ deepgram.py # Deepgram WebSocket STT
208
+ recorder/
209
+ __init__.py
210
+ opus.py # StereoOpusRecorder (PyOgg)
211
+ ```
212
+
213
+ ## Releasing
214
+
215
+ Releases are automated via GitHub Actions:
216
+
217
+ ```bash
218
+ # Update version in src/audio_transcript_mcp/__init__.py
219
+ git tag v0.1.0
220
+ git push origin v0.1.0
221
+ # CI automatically builds, publishes to PyPI, and creates a GitHub Release
222
+ ```
223
+
224
+ ## License
225
+
226
+ MIT
@@ -0,0 +1,14 @@
1
+ audio_transcript_mcp/__init__.py,sha256=Q552b2DKL8KTpkvEbL9XjWOHBi_lybCpeD1XyOO5C_0,71
2
+ audio_transcript_mcp/__main__.py,sha256=i-qaCM9yraBkuIO2Aa7JwQ-K-i7kiF4rkn6foylEEgk,111
3
+ audio_transcript_mcp/audio_utils.py,sha256=EtQVN3W9aGm-SSG2hxOZGjGBsEhH_X8Cgb7vyF_TNCk,550
4
+ audio_transcript_mcp/server.py,sha256=UQ5B7J1-Ptoz9JKjpkQO8ux0XeGZtzhtNHBq9EZPR2s,15474
5
+ audio_transcript_mcp/backends/__init__.py,sha256=sItMvJITF6TCT8BHa8_O5PWrPuFKbvseY7dwl7ri5UQ,522
6
+ audio_transcript_mcp/backends/deepgram.py,sha256=kdJwjkg1V-9C2CLtR_XyOATW_TP_x5xCzhD2uDsrpWI,3078
7
+ audio_transcript_mcp/backends/whisper.py,sha256=ict5STAU6XMe4PgTrSRS4Acopl2BodrujjwyT6SBxn0,6601
8
+ audio_transcript_mcp/recorder/__init__.py,sha256=fk6GkQjL5Bd2N4qZCzVplTPWG7DvJh0Y_bnTzO51S1g,130
9
+ audio_transcript_mcp/recorder/opus.py,sha256=wIqWoMVcrYfJquBXdlbz85Uf6ntJAps5uGBfwYOqDsQ,2607
10
+ audio_transcript_mcp-0.1.0.dist-info/METADATA,sha256=MoQmQqE2oneMEVtYa5NYqrmztxc9hyh5JTtJU_SFQBg,7941
11
+ audio_transcript_mcp-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
12
+ audio_transcript_mcp-0.1.0.dist-info/entry_points.txt,sha256=YcoWhvqE7zIWV-X9lTg4iJAJ6bi5Ge661dJOfl1wEC8,74
13
+ audio_transcript_mcp-0.1.0.dist-info/licenses/LICENSE,sha256=rKBHD0FlJSZl_ZqhA4Hr8k7L2aRoWaX8cTUsuGC_nSw,1069
14
+ audio_transcript_mcp-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ audio-transcript-mcp = audio_transcript_mcp.server:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 llilakoblock
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.