audio-transcript-mcp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audio_transcript_mcp-0.1.0/.github/workflows/ci.yml +27 -0
- audio_transcript_mcp-0.1.0/.github/workflows/release.yml +34 -0
- audio_transcript_mcp-0.1.0/.gitignore +9 -0
- audio_transcript_mcp-0.1.0/CHANGELOG.md +18 -0
- audio_transcript_mcp-0.1.0/LICENSE +21 -0
- audio_transcript_mcp-0.1.0/PKG-INFO +226 -0
- audio_transcript_mcp-0.1.0/README.md +196 -0
- audio_transcript_mcp-0.1.0/pyproject.toml +47 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/__init__.py +3 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/__main__.py +5 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/audio_utils.py +15 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/backends/__init__.py +12 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/backends/deepgram.py +91 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/backends/whisper.py +182 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/recorder/__init__.py +5 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/recorder/opus.py +64 -0
- audio_transcript_mcp-0.1.0/src/audio_transcript_mcp/server.py +448 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [master]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [master]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
check:
|
|
11
|
+
runs-on: windows-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.12", "3.13"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- name: Install dependencies
|
|
21
|
+
run: |
|
|
22
|
+
pip install build ruff
|
|
23
|
+
pip install -e .
|
|
24
|
+
- name: Lint with ruff
|
|
25
|
+
run: ruff check src/
|
|
26
|
+
- name: Check build
|
|
27
|
+
run: python -m build
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish-pypi:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
id-token: write
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
- name: Build package
|
|
19
|
+
run: |
|
|
20
|
+
pip install build
|
|
21
|
+
python -m build
|
|
22
|
+
- name: Publish to PyPI
|
|
23
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
24
|
+
|
|
25
|
+
github-release:
|
|
26
|
+
needs: publish-pypi
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
permissions:
|
|
29
|
+
contents: write
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
- uses: softprops/action-gh-release@v2
|
|
33
|
+
with:
|
|
34
|
+
generate_release_notes: true
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 2026-03-06
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Dual audio capture: mic + system audio (WASAPI loopback)
|
|
7
|
+
- Deepgram nova-3 cloud STT backend with WebSocket streaming
|
|
8
|
+
- Local faster-whisper STT backend with GPU/CUDA support
|
|
9
|
+
- Runtime backend switching via MCP tool
|
|
10
|
+
- Stereo opus recording (L=mic, R=system audio) per session
|
|
11
|
+
- Per-session directories with transcript.txt and audio.opus
|
|
12
|
+
- Chunk overlap with text deduplication for whisper
|
|
13
|
+
- Whisper hallucination filter (no_speech_prob + avg_logprob)
|
|
14
|
+
- Stateful high-quality resampling via soxr
|
|
15
|
+
- Auto-reconnect for Deepgram WebSocket
|
|
16
|
+
- GPU model unload/reload on stop/start
|
|
17
|
+
- Configurable via environment variables
|
|
18
|
+
- PyPI package with `audio-transcript-mcp` CLI entry point
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 llilakoblock
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: audio-transcript-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Real-time audio transcription MCP server for Claude Code
|
|
5
|
+
Project-URL: Homepage, https://github.com/llilakoblock/audio-transcript-mcp
|
|
6
|
+
Project-URL: Repository, https://github.com/llilakoblock/audio-transcript-mcp
|
|
7
|
+
Project-URL: Issues, https://github.com/llilakoblock/audio-transcript-mcp/issues
|
|
8
|
+
Author: llilakoblock
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: audio,deepgram,mcp,transcription,whisper
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: faster-whisper>=1.0
|
|
23
|
+
Requires-Dist: mcp[cli]>=1.2
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: pyaudiowpatch>=0.2.12
|
|
26
|
+
Requires-Dist: pyogg>=0.7
|
|
27
|
+
Requires-Dist: soxr>=0.3
|
|
28
|
+
Requires-Dist: websockets>=12.0
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# audio-transcript-mcp
|
|
32
|
+
|
|
33
|
+
Real-time audio transcription MCP server for Claude Code.
|
|
34
|
+
|
|
35
|
+
Captures **microphone + system audio** (WASAPI loopback on Windows) and transcribes via **Deepgram** (cloud) or **faster-whisper** (local, GPU/CPU).
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
- **Dual audio capture**: mic + system sound simultaneously
|
|
40
|
+
- **Two STT backends** switchable at runtime (Deepgram nova-3 / faster-whisper)
|
|
41
|
+
- **Stereo opus recording**: each session saves a stereo opus file (L=mic, R=system audio)
|
|
42
|
+
- **Per-session directories**: transcript + audio saved to `~/.audio-transcript-mcp/transcripts/<timestamp>/`
|
|
43
|
+
- Chunk overlap with text deduplication (no cut words at boundaries)
|
|
44
|
+
- Native float32 audio pipeline for whisper (no lossy int16 round-trip)
|
|
45
|
+
- High-quality stateful resampling via soxr (no boundary artifacts)
|
|
46
|
+
- Whisper hallucination filter (no_speech_prob + avg_logprob thresholds)
|
|
47
|
+
- Transcript buffer with time-based queries
|
|
48
|
+
- Auto-reconnect for Deepgram WebSocket
|
|
49
|
+
- GPU model unload/reload on stop/start (CUDA memory management)
|
|
50
|
+
|
|
51
|
+
## Architecture
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
┌─────────────┐ ┌──────────┐ ┌─────────────────┐
|
|
55
|
+
│ Mic (int16) ├────►│ │ │ STT Backend │
|
|
56
|
+
│ WASAPI │ │ Worker ├────►│ whisper / DG ├──► Transcript buffer
|
|
57
|
+
└─────────────┘ │ Thread │ └─────────────────┘
|
|
58
|
+
│ ├────►┌─────────────────┐
|
|
59
|
+
┌─────────────┐ │ │ │ StereoOpusRec │
|
|
60
|
+
│ System audio ├────►│ │ │ L=me R=others ├──► audio.opus
|
|
61
|
+
│ Loopback f32 │ └──────────┘ └─────────────────┘
|
|
62
|
+
└─────────────┘
|
|
63
|
+
|
|
64
|
+
Audio pipeline: native capture → stereo→mono → soxr resample → backend/opus
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Each audio source runs in its own worker thread. Audio is captured in the device's native format (float32 for loopback, int16 for mic), converted to mono, and routed to both the STT backend and the stereo opus recorder.
|
|
68
|
+
|
|
69
|
+
## Requirements
|
|
70
|
+
|
|
71
|
+
- Python 3.10+
|
|
72
|
+
- Windows (WASAPI loopback for system audio capture); mic-only on macOS/Linux
|
|
73
|
+
- NVIDIA GPU recommended for local whisper backend
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
### From PyPI (recommended)
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install audio-transcript-mcp
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Or run without installing via `uvx`:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
uvx audio-transcript-mcp
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### From source
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
git clone https://github.com/llilakoblock/audio-transcript-mcp.git
|
|
93
|
+
cd audio-transcript-mcp
|
|
94
|
+
pip install -e .
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## MCP Configuration
|
|
98
|
+
|
|
99
|
+
Add to your `mcp.json` (Claude Code settings):
|
|
100
|
+
|
|
101
|
+
### Using PyPI install
|
|
102
|
+
|
|
103
|
+
```json
|
|
104
|
+
{
|
|
105
|
+
"audio-transcript": {
|
|
106
|
+
"type": "stdio",
|
|
107
|
+
"command": "audio-transcript-mcp",
|
|
108
|
+
"env": {
|
|
109
|
+
"STT_BACKEND": "local",
|
|
110
|
+
"DEEPGRAM_API_KEY": "your-deepgram-api-key",
|
|
111
|
+
"DEEPGRAM_LANGUAGE": "ru",
|
|
112
|
+
"DEEPGRAM_MODEL": "nova-3",
|
|
113
|
+
"DEEPGRAM_UTTERANCE_END_MS": "2500",
|
|
114
|
+
"DEEPGRAM_ENDPOINTING": "500",
|
|
115
|
+
"WHISPER_MODEL": "large-v3",
|
|
116
|
+
"WHISPER_DEVICE": "cuda",
|
|
117
|
+
"WHISPER_LANGUAGE": "ru",
|
|
118
|
+
"WHISPER_CHUNK_SEC": "10",
|
|
119
|
+
"WHISPER_OVERLAP_SEC": "2",
|
|
120
|
+
"TRANSCRIPT_MAX_AGE": "3600"
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Using uvx (no install needed)
|
|
127
|
+
|
|
128
|
+
```json
|
|
129
|
+
{
|
|
130
|
+
"audio-transcript": {
|
|
131
|
+
"type": "stdio",
|
|
132
|
+
"command": "uvx",
|
|
133
|
+
"args": ["audio-transcript-mcp"],
|
|
134
|
+
"env": {
|
|
135
|
+
"STT_BACKEND": "deepgram",
|
|
136
|
+
"DEEPGRAM_API_KEY": "your-deepgram-api-key"
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
> **Note:** System audio capture (loopback) uses WASAPI and is Windows-only. On macOS/Linux, only microphone input works out of the box.
|
|
143
|
+
|
|
144
|
+
## Environment Variables
|
|
145
|
+
|
|
146
|
+
| Variable | Default | Description |
|
|
147
|
+
|---|---|---|
|
|
148
|
+
| `STT_BACKEND` | `deepgram` | `"deepgram"` (cloud) or `"local"` (faster-whisper) |
|
|
149
|
+
| `DEEPGRAM_API_KEY` | — | API key for Deepgram (required if backend=deepgram) |
|
|
150
|
+
| `DEEPGRAM_LANGUAGE` | `ru` | Language code for Deepgram |
|
|
151
|
+
| `DEEPGRAM_MODEL` | `nova-3` | Deepgram model (`nova-3`, `nova-2`, etc.) |
|
|
152
|
+
| `DEEPGRAM_UTTERANCE_END_MS` | `2500` | Silence duration (ms) before finalizing utterance |
|
|
153
|
+
| `DEEPGRAM_ENDPOINTING` | `500` | Endpointing sensitivity (ms) |
|
|
154
|
+
| `WHISPER_MODEL` | `large-v3` | Model size: `tiny`, `base`, `small`, `medium`, `large-v3` |
|
|
155
|
+
| `WHISPER_DEVICE` | `cuda` | `"cuda"` or `"cpu"` |
|
|
156
|
+
| `WHISPER_LANGUAGE` | — | Language hint for whisper (empty = auto-detect) |
|
|
157
|
+
| `WHISPER_CHUNK_SEC` | `5` | Audio chunk duration in seconds |
|
|
158
|
+
| `WHISPER_OVERLAP_SEC` | `2` | Overlap between consecutive chunks (avoids cut words) |
|
|
159
|
+
| `TRANSCRIPT_MAX_AGE` | `3600` | Max transcript buffer age in seconds |
|
|
160
|
+
| `TRANSCRIPT_DIR` | `~/.audio-transcript-mcp/transcripts` | Directory for per-session transcript/audio files |
|
|
161
|
+
|
|
162
|
+
## Session Output
|
|
163
|
+
|
|
164
|
+
Each recording session creates a timestamped directory:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
~/.audio-transcript-mcp/transcripts/
|
|
168
|
+
2026-03-06_23-24-48/
|
|
169
|
+
transcript.txt # Plain text transcript
|
|
170
|
+
audio.opus # Stereo opus (L=mic, R=system)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
The transcript is plain text:
|
|
174
|
+
```
|
|
175
|
+
[23:24:50] me — Hello, can you hear me?
|
|
176
|
+
|
|
177
|
+
[23:24:52] others — Yes, I can hear you fine.
|
|
178
|
+
|
|
179
|
+
[23:24:55] system — [STARTED: Microphone, 44100Hz, 2ch]
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## MCP Tools
|
|
183
|
+
|
|
184
|
+
| Tool | Description |
|
|
185
|
+
|---|---|
|
|
186
|
+
| `start_listening` | Start capturing mic + system audio and transcribing |
|
|
187
|
+
| `stop_listening` | Stop capture, save transcript and opus recording |
|
|
188
|
+
| `is_listening` | Check if capture is active |
|
|
189
|
+
| `get_transcript` | Get transcript for the last N seconds (default 60) |
|
|
190
|
+
| `get_full_transcript` | Get entire transcript buffer |
|
|
191
|
+
| `get_transcript_since` | Get transcript since a Unix timestamp |
|
|
192
|
+
| `clear_transcript` | Clear the transcript buffer |
|
|
193
|
+
| `get_backend` | Show current STT backend |
|
|
194
|
+
| `set_backend` | Switch backend (`"deepgram"` / `"local"`) at runtime |
|
|
195
|
+
|
|
196
|
+
## Project Structure
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
src/audio_transcript_mcp/
|
|
200
|
+
__init__.py # Package version
|
|
201
|
+
__main__.py # python -m entry point
|
|
202
|
+
server.py # MCP server, AudioEngine, config
|
|
203
|
+
audio_utils.py # Format conversion (float32→int16, stereo→mono)
|
|
204
|
+
backends/
|
|
205
|
+
__init__.py # Backend factory
|
|
206
|
+
whisper.py # Local faster-whisper STT
|
|
207
|
+
deepgram.py # Deepgram WebSocket STT
|
|
208
|
+
recorder/
|
|
209
|
+
__init__.py
|
|
210
|
+
opus.py # StereoOpusRecorder (PyOgg)
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Releasing
|
|
214
|
+
|
|
215
|
+
Releases are automated via GitHub Actions:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
# Update version in src/audio_transcript_mcp/__init__.py
|
|
219
|
+
git tag v0.1.0
|
|
220
|
+
git push origin v0.1.0
|
|
221
|
+
# CI automatically builds, publishes to PyPI, and creates a GitHub Release
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## License
|
|
225
|
+
|
|
226
|
+
MIT
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# audio-transcript-mcp
|
|
2
|
+
|
|
3
|
+
Real-time audio transcription MCP server for Claude Code.
|
|
4
|
+
|
|
5
|
+
Captures **microphone + system audio** (WASAPI loopback on Windows) and transcribes via **Deepgram** (cloud) or **faster-whisper** (local, GPU/CPU).
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Dual audio capture**: mic + system sound simultaneously
|
|
10
|
+
- **Two STT backends** switchable at runtime (Deepgram nova-3 / faster-whisper)
|
|
11
|
+
- **Stereo opus recording**: each session saves a stereo opus file (L=mic, R=system audio)
|
|
12
|
+
- **Per-session directories**: transcript + audio saved to `~/.audio-transcript-mcp/transcripts/<timestamp>/`
|
|
13
|
+
- Chunk overlap with text deduplication (no cut words at boundaries)
|
|
14
|
+
- Native float32 audio pipeline for whisper (no lossy int16 round-trip)
|
|
15
|
+
- High-quality stateful resampling via soxr (no boundary artifacts)
|
|
16
|
+
- Whisper hallucination filter (no_speech_prob + avg_logprob thresholds)
|
|
17
|
+
- Transcript buffer with time-based queries
|
|
18
|
+
- Auto-reconnect for Deepgram WebSocket
|
|
19
|
+
- GPU model unload/reload on stop/start (CUDA memory management)
|
|
20
|
+
|
|
21
|
+
## Architecture
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
┌─────────────┐ ┌──────────┐ ┌─────────────────┐
|
|
25
|
+
│ Mic (int16) ├────►│ │ │ STT Backend │
|
|
26
|
+
│ WASAPI │ │ Worker ├────►│ whisper / DG ├──► Transcript buffer
|
|
27
|
+
└─────────────┘ │ Thread │ └─────────────────┘
|
|
28
|
+
│ ├────►┌─────────────────┐
|
|
29
|
+
┌─────────────┐ │ │ │ StereoOpusRec │
|
|
30
|
+
│ System audio ├────►│ │ │ L=me R=others ├──► audio.opus
|
|
31
|
+
│ Loopback f32 │ └──────────┘ └─────────────────┘
|
|
32
|
+
└─────────────┘
|
|
33
|
+
|
|
34
|
+
Audio pipeline: native capture → stereo→mono → soxr resample → backend/opus
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Each audio source runs in its own worker thread. Audio is captured in the device's native format (float32 for loopback, int16 for mic), converted to mono, and routed to both the STT backend and the stereo opus recorder.
|
|
38
|
+
|
|
39
|
+
## Requirements
|
|
40
|
+
|
|
41
|
+
- Python 3.10+
|
|
42
|
+
- Windows (WASAPI loopback for system audio capture); mic-only on macOS/Linux
|
|
43
|
+
- NVIDIA GPU recommended for local whisper backend
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
### From PyPI (recommended)
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install audio-transcript-mcp
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Or run without installing via `uvx`:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
uvx audio-transcript-mcp
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### From source
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/llilakoblock/audio-transcript-mcp.git
|
|
63
|
+
cd audio-transcript-mcp
|
|
64
|
+
pip install -e .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## MCP Configuration
|
|
68
|
+
|
|
69
|
+
Add to your `mcp.json` (Claude Code settings):
|
|
70
|
+
|
|
71
|
+
### Using PyPI install
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"audio-transcript": {
|
|
76
|
+
"type": "stdio",
|
|
77
|
+
"command": "audio-transcript-mcp",
|
|
78
|
+
"env": {
|
|
79
|
+
"STT_BACKEND": "local",
|
|
80
|
+
"DEEPGRAM_API_KEY": "your-deepgram-api-key",
|
|
81
|
+
"DEEPGRAM_LANGUAGE": "ru",
|
|
82
|
+
"DEEPGRAM_MODEL": "nova-3",
|
|
83
|
+
"DEEPGRAM_UTTERANCE_END_MS": "2500",
|
|
84
|
+
"DEEPGRAM_ENDPOINTING": "500",
|
|
85
|
+
"WHISPER_MODEL": "large-v3",
|
|
86
|
+
"WHISPER_DEVICE": "cuda",
|
|
87
|
+
"WHISPER_LANGUAGE": "ru",
|
|
88
|
+
"WHISPER_CHUNK_SEC": "10",
|
|
89
|
+
"WHISPER_OVERLAP_SEC": "2",
|
|
90
|
+
"TRANSCRIPT_MAX_AGE": "3600"
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Using uvx (no install needed)
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{
|
|
100
|
+
"audio-transcript": {
|
|
101
|
+
"type": "stdio",
|
|
102
|
+
"command": "uvx",
|
|
103
|
+
"args": ["audio-transcript-mcp"],
|
|
104
|
+
"env": {
|
|
105
|
+
"STT_BACKEND": "deepgram",
|
|
106
|
+
"DEEPGRAM_API_KEY": "your-deepgram-api-key"
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
> **Note:** System audio capture (loopback) uses WASAPI and is Windows-only. On macOS/Linux, only microphone input works out of the box.
|
|
113
|
+
|
|
114
|
+
## Environment Variables
|
|
115
|
+
|
|
116
|
+
| Variable | Default | Description |
|
|
117
|
+
|---|---|---|
|
|
118
|
+
| `STT_BACKEND` | `deepgram` | `"deepgram"` (cloud) or `"local"` (faster-whisper) |
|
|
119
|
+
| `DEEPGRAM_API_KEY` | — | API key for Deepgram (required if backend=deepgram) |
|
|
120
|
+
| `DEEPGRAM_LANGUAGE` | `ru` | Language code for Deepgram |
|
|
121
|
+
| `DEEPGRAM_MODEL` | `nova-3` | Deepgram model (`nova-3`, `nova-2`, etc.) |
|
|
122
|
+
| `DEEPGRAM_UTTERANCE_END_MS` | `2500` | Silence duration (ms) before finalizing utterance |
|
|
123
|
+
| `DEEPGRAM_ENDPOINTING` | `500` | Endpointing sensitivity (ms) |
|
|
124
|
+
| `WHISPER_MODEL` | `large-v3` | Model size: `tiny`, `base`, `small`, `medium`, `large-v3` |
|
|
125
|
+
| `WHISPER_DEVICE` | `cuda` | `"cuda"` or `"cpu"` |
|
|
126
|
+
| `WHISPER_LANGUAGE` | — | Language hint for whisper (empty = auto-detect) |
|
|
127
|
+
| `WHISPER_CHUNK_SEC` | `5` | Audio chunk duration in seconds |
|
|
128
|
+
| `WHISPER_OVERLAP_SEC` | `2` | Overlap between consecutive chunks (avoids cut words) |
|
|
129
|
+
| `TRANSCRIPT_MAX_AGE` | `3600` | Max transcript buffer age in seconds |
|
|
130
|
+
| `TRANSCRIPT_DIR` | `~/.audio-transcript-mcp/transcripts` | Directory for per-session transcript/audio files |
|
|
131
|
+
|
|
132
|
+
## Session Output
|
|
133
|
+
|
|
134
|
+
Each recording session creates a timestamped directory:
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
~/.audio-transcript-mcp/transcripts/
|
|
138
|
+
2026-03-06_23-24-48/
|
|
139
|
+
transcript.txt # Plain text transcript
|
|
140
|
+
audio.opus # Stereo opus (L=mic, R=system)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The transcript is plain text:
|
|
144
|
+
```
|
|
145
|
+
[23:24:50] me — Hello, can you hear me?
|
|
146
|
+
|
|
147
|
+
[23:24:52] others — Yes, I can hear you fine.
|
|
148
|
+
|
|
149
|
+
[23:24:55] system — [STARTED: Microphone, 44100Hz, 2ch]
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## MCP Tools
|
|
153
|
+
|
|
154
|
+
| Tool | Description |
|
|
155
|
+
|---|---|
|
|
156
|
+
| `start_listening` | Start capturing mic + system audio and transcribing |
|
|
157
|
+
| `stop_listening` | Stop capture, save transcript and opus recording |
|
|
158
|
+
| `is_listening` | Check if capture is active |
|
|
159
|
+
| `get_transcript` | Get transcript for the last N seconds (default 60) |
|
|
160
|
+
| `get_full_transcript` | Get entire transcript buffer |
|
|
161
|
+
| `get_transcript_since` | Get transcript since a Unix timestamp |
|
|
162
|
+
| `clear_transcript` | Clear the transcript buffer |
|
|
163
|
+
| `get_backend` | Show current STT backend |
|
|
164
|
+
| `set_backend` | Switch backend (`"deepgram"` / `"local"`) at runtime |
|
|
165
|
+
|
|
166
|
+
## Project Structure
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
src/audio_transcript_mcp/
|
|
170
|
+
__init__.py # Package version
|
|
171
|
+
__main__.py # python -m entry point
|
|
172
|
+
server.py # MCP server, AudioEngine, config
|
|
173
|
+
audio_utils.py # Format conversion (float32→int16, stereo→mono)
|
|
174
|
+
backends/
|
|
175
|
+
__init__.py # Backend factory
|
|
176
|
+
whisper.py # Local faster-whisper STT
|
|
177
|
+
deepgram.py # Deepgram WebSocket STT
|
|
178
|
+
recorder/
|
|
179
|
+
__init__.py
|
|
180
|
+
opus.py # StereoOpusRecorder (PyOgg)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Releasing
|
|
184
|
+
|
|
185
|
+
Releases are automated via GitHub Actions:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# Update version in src/audio_transcript_mcp/__init__.py
|
|
189
|
+
git tag v0.1.0
|
|
190
|
+
git push origin v0.1.0
|
|
191
|
+
# CI automatically builds, publishes to PyPI, and creates a GitHub Release
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## License
|
|
195
|
+
|
|
196
|
+
MIT
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "audio-transcript-mcp"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Real-time audio transcription MCP server for Claude Code"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "llilakoblock" }]
|
|
13
|
+
keywords = ["mcp", "audio", "transcription", "whisper", "deepgram"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"PyAudioWPatch>=0.2.12",
|
|
27
|
+
"websockets>=12.0",
|
|
28
|
+
"mcp[cli]>=1.2",
|
|
29
|
+
"faster-whisper>=1.0",
|
|
30
|
+
"numpy>=1.24",
|
|
31
|
+
"soxr>=0.3",
|
|
32
|
+
"PyOgg>=0.7",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
audio-transcript-mcp = "audio_transcript_mcp.server:main"
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/llilakoblock/audio-transcript-mcp"
|
|
40
|
+
Repository = "https://github.com/llilakoblock/audio-transcript-mcp"
|
|
41
|
+
Issues = "https://github.com/llilakoblock/audio-transcript-mcp/issues"
|
|
42
|
+
|
|
43
|
+
[tool.hatch.version]
|
|
44
|
+
path = "src/audio_transcript_mcp/__init__.py"
|
|
45
|
+
|
|
46
|
+
[tool.hatch.build.targets.wheel]
|
|
47
|
+
packages = ["src/audio_transcript_mcp"]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Audio format conversion utilities."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def float32_to_int16(data: bytes) -> bytes:
    """Re-encode a buffer of float32 PCM samples as int16 PCM.

    Each sample is multiplied by 32767, limited to the int16 range
    [-32768, 32767], and then cast to int16 (the cast drops the
    fractional part).

    Args:
        data: Raw bytes interpreted as a flat sequence of float32 samples.

    Returns:
        The converted samples serialized as int16 bytes.
    """
    samples = np.frombuffer(data, dtype=np.float32)
    scaled = samples * 32767
    # Clip before casting so out-of-range samples saturate instead of wrapping.
    bounded = np.clip(scaled, -32768, 32767)
    pcm16 = bounded.astype(np.int16)
    return pcm16.tobytes()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def stereo_to_mono_f32(data: bytes, channels: int = 2) -> bytes:
    """Collapse interleaved multi-channel float32 audio into a single channel.

    The buffer is viewed as rows of ``channels`` float32 samples and each row
    is replaced by the arithmetic mean of its samples.

    Args:
        data: Raw bytes interpreted as float32 samples, ``channels`` per frame.
        channels: Number of samples that make up one frame (default 2).

    Returns:
        One float32 sample per frame, serialized as bytes.
    """
    interleaved = np.frombuffer(data, dtype=np.float32)
    # One row per frame, one column per channel.
    frames = interleaved.reshape(-1, channels)
    mono = np.mean(frames, axis=1)
    return mono.astype(np.float32).tobytes()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""STT backend factory."""
|
|
2
|
+
|
|
3
|
+
from audio_transcript_mcp.backends.deepgram import DeepgramBackend
|
|
4
|
+
from audio_transcript_mcp.backends.whisper import WhisperBackend
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def create_backend(backend_type, label, sample_rate, channels, is_float32, buf, config):
    """Instantiate the STT backend selected by ``backend_type``.

    ``"local"`` selects ``WhisperBackend``; every other value selects
    ``DeepgramBackend``. The remaining arguments are forwarded to the chosen
    class's constructor unchanged and in the same order.
    """
    backend_cls = WhisperBackend if backend_type == "local" else DeepgramBackend
    return backend_cls(label, sample_rate, channels, is_float32, buf, config)
|