audio-transcript-mcp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [master]
6
+ pull_request:
7
+ branches: [master]
8
+
9
+ jobs:
10
+ check:
11
+ runs-on: windows-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.12", "3.13"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+ - name: Install dependencies
21
+ run: |
22
+ pip install build ruff
23
+ pip install -e .
24
+ - name: Lint with ruff
25
+ run: ruff check src/
26
+ - name: Check build
27
+ run: python -m build
@@ -0,0 +1,34 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ publish-pypi:
9
+ runs-on: ubuntu-latest
10
+ permissions:
11
+ contents: read
12
+ id-token: write
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+ - name: Build package
19
+ run: |
20
+ pip install build
21
+ python -m build
22
+ - name: Publish to PyPI
23
+ uses: pypa/gh-action-pypi-publish@release/v1
24
+
25
+ github-release:
26
+ needs: publish-pypi
27
+ runs-on: ubuntu-latest
28
+ permissions:
29
+ contents: write
30
+ steps:
31
+ - uses: actions/checkout@v4
32
+ - uses: softprops/action-gh-release@v2
33
+ with:
34
+ generate_release_notes: true
@@ -0,0 +1,9 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .env
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ .claude/
9
+ uv.lock
@@ -0,0 +1,18 @@
1
+ # Changelog
2
+
3
+ ## [0.1.0] - 2026-03-06
4
+
5
+ ### Added
6
+ - Dual audio capture: mic + system audio (WASAPI loopback)
7
+ - Deepgram nova-3 cloud STT backend with WebSocket streaming
8
+ - Local faster-whisper STT backend with GPU/CUDA support
9
+ - Runtime backend switching via MCP tool
10
+ - Stereo opus recording (L=mic, R=system audio) per session
11
+ - Per-session directories with transcript.txt and audio.opus
12
+ - Chunk overlap with text deduplication for whisper
13
+ - Whisper hallucination filter (no_speech_prob + avg_logprob)
14
+ - Stateful high-quality resampling via soxr
15
+ - Auto-reconnect for Deepgram WebSocket
16
+ - GPU model unload/reload on stop/start
17
+ - Configurable via environment variables
18
+ - PyPI package with `audio-transcript-mcp` CLI entry point
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 llilakoblock
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,226 @@
1
+ Metadata-Version: 2.4
2
+ Name: audio-transcript-mcp
3
+ Version: 0.1.0
4
+ Summary: Real-time audio transcription MCP server for Claude Code
5
+ Project-URL: Homepage, https://github.com/llilakoblock/audio-transcript-mcp
6
+ Project-URL: Repository, https://github.com/llilakoblock/audio-transcript-mcp
7
+ Project-URL: Issues, https://github.com/llilakoblock/audio-transcript-mcp/issues
8
+ Author: llilakoblock
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: audio,deepgram,mcp,transcription,whisper
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: faster-whisper>=1.0
23
+ Requires-Dist: mcp[cli]>=1.2
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: pyaudiowpatch>=0.2.12
26
+ Requires-Dist: pyogg>=0.7
27
+ Requires-Dist: soxr>=0.3
28
+ Requires-Dist: websockets>=12.0
29
+ Description-Content-Type: text/markdown
30
+
31
+ # audio-transcript-mcp
32
+
33
+ Real-time audio transcription MCP server for Claude Code.
34
+
35
+ Captures **microphone + system audio** (WASAPI loopback on Windows) and transcribes via **Deepgram** (cloud) or **faster-whisper** (local, GPU/CPU).
36
+
37
+ ## Features
38
+
39
+ - **Dual audio capture**: mic + system sound simultaneously
40
+ - **Two STT backends** switchable at runtime (Deepgram nova-3 / faster-whisper)
41
+ - **Stereo opus recording**: each session saves a stereo opus file (L=mic, R=system audio)
42
+ - **Per-session directories**: transcript + audio saved to `~/.audio-transcript-mcp/transcripts/<timestamp>/`
43
+ - Chunk overlap with text deduplication (no words cut off at chunk boundaries)
44
+ - Native float32 audio pipeline for whisper (no lossy int16 round-trip)
45
+ - High-quality stateful resampling via soxr (no boundary artifacts)
46
+ - Whisper hallucination filter (no_speech_prob + avg_logprob thresholds)
47
+ - Transcript buffer with time-based queries
48
+ - Auto-reconnect for Deepgram WebSocket
49
+ - GPU model unload/reload on stop/start (CUDA memory management)
50
+
51
+ ## Architecture
52
+
53
+ ```
54
+ ┌─────────────┐ ┌──────────┐ ┌─────────────────┐
55
+ │ Mic (int16) ├────►│ │ │ STT Backend │
56
+ │ WASAPI │ │ Worker ├────►│ whisper / DG ├──► Transcript buffer
57
+ └─────────────┘ │ Thread │ └─────────────────┘
58
+ │ ├────►┌─────────────────┐
59
+ ┌─────────────┐ │ │ │ StereoOpusRec │
60
+ │ System audio ├────►│ │ │ L=me R=others ├──► audio.opus
61
+ │ Loopback f32 │ └──────────┘ └─────────────────┘
62
+ └─────────────┘
63
+
64
+ Audio pipeline: native capture → stereo→mono → soxr resample → backend/opus
65
+ ```
66
+
67
+ Each audio source runs in its own worker thread. Audio is captured in the device's native format (float32 for loopback, int16 for mic), converted to mono, resampled with soxr, and routed to both the STT backend and the stereo opus recorder.
68
+
69
+ ## Requirements
70
+
71
+ - Python 3.10+
72
+ - Windows (WASAPI loopback for system audio capture); mic-only on macOS/Linux
73
+ - NVIDIA GPU recommended for local whisper backend
74
+
75
+ ## Installation
76
+
77
+ ### From PyPI (recommended)
78
+
79
+ ```bash
80
+ pip install audio-transcript-mcp
81
+ ```
82
+
83
+ Or run without installing via `uvx`:
84
+
85
+ ```bash
86
+ uvx audio-transcript-mcp
87
+ ```
88
+
89
+ ### From source
90
+
91
+ ```bash
92
+ git clone https://github.com/llilakoblock/audio-transcript-mcp.git
93
+ cd audio-transcript-mcp
94
+ pip install -e .
95
+ ```
96
+
97
+ ## MCP Configuration
98
+
99
+ Add to your `mcp.json` (Claude Code settings):
100
+
101
+ ### Using PyPI install
102
+
103
+ ```json
104
+ {
105
+ "audio-transcript": {
106
+ "type": "stdio",
107
+ "command": "audio-transcript-mcp",
108
+ "env": {
109
+ "STT_BACKEND": "local",
110
+ "DEEPGRAM_API_KEY": "your-deepgram-api-key",
111
+ "DEEPGRAM_LANGUAGE": "ru",
112
+ "DEEPGRAM_MODEL": "nova-3",
113
+ "DEEPGRAM_UTTERANCE_END_MS": "2500",
114
+ "DEEPGRAM_ENDPOINTING": "500",
115
+ "WHISPER_MODEL": "large-v3",
116
+ "WHISPER_DEVICE": "cuda",
117
+ "WHISPER_LANGUAGE": "ru",
118
+ "WHISPER_CHUNK_SEC": "10",
119
+ "WHISPER_OVERLAP_SEC": "2",
120
+ "TRANSCRIPT_MAX_AGE": "3600"
121
+ }
122
+ }
123
+ }
124
+ ```
125
+
126
+ ### Using uvx (no install needed)
127
+
128
+ ```json
129
+ {
130
+ "audio-transcript": {
131
+ "type": "stdio",
132
+ "command": "uvx",
133
+ "args": ["audio-transcript-mcp"],
134
+ "env": {
135
+ "STT_BACKEND": "deepgram",
136
+ "DEEPGRAM_API_KEY": "your-deepgram-api-key"
137
+ }
138
+ }
139
+ }
140
+ ```
141
+
142
+ > **Note:** System audio capture (loopback) uses WASAPI and is Windows-only. On macOS/Linux only microphone input works out of the box.
143
+
144
+ ## Environment Variables
145
+
146
+ | Variable | Default | Description |
147
+ |---|---|---|
148
+ | `STT_BACKEND` | `deepgram` | `"deepgram"` (cloud) or `"local"` (faster-whisper) |
149
+ | `DEEPGRAM_API_KEY` | — | API key for Deepgram (required if backend=deepgram) |
150
+ | `DEEPGRAM_LANGUAGE` | `ru` | Language code for Deepgram |
151
+ | `DEEPGRAM_MODEL` | `nova-3` | Deepgram model (`nova-3`, `nova-2`, etc.) |
152
+ | `DEEPGRAM_UTTERANCE_END_MS` | `2500` | Silence duration (ms) before finalizing utterance |
153
+ | `DEEPGRAM_ENDPOINTING` | `500` | Endpointing sensitivity (ms) |
154
+ | `WHISPER_MODEL` | `large-v3` | Model size: `tiny`, `base`, `small`, `medium`, `large-v3` |
155
+ | `WHISPER_DEVICE` | `cuda` | `"cuda"` or `"cpu"` |
156
+ | `WHISPER_LANGUAGE` | — | Language hint for whisper (empty = auto-detect) |
157
+ | `WHISPER_CHUNK_SEC` | `5` | Audio chunk duration in seconds |
158
+ | `WHISPER_OVERLAP_SEC` | `2` | Overlap between consecutive chunks (avoids cut words) |
159
+ | `TRANSCRIPT_MAX_AGE` | `3600` | Max transcript buffer age in seconds |
160
+ | `TRANSCRIPT_DIR` | `~/.audio-transcript-mcp/transcripts` | Directory for per-session transcript/audio files |
161
+
162
+ ## Session Output
163
+
164
+ Each recording session creates a timestamped directory:
165
+
166
+ ```
167
+ ~/.audio-transcript-mcp/transcripts/
168
+ 2026-03-06_23-24-48/
169
+ transcript.txt # Plain text transcript
170
+ audio.opus # Stereo opus (L=mic, R=system)
171
+ ```
172
+
173
+ The transcript is plain text:
174
+ ```
175
+ [23:24:50] me — Hello, can you hear me?
176
+
177
+ [23:24:52] others — Yes, I can hear you fine.
178
+
179
+ [23:24:55] system — [STARTED: Microphone, 44100Hz, 2ch]
180
+ ```
181
+
182
+ ## MCP Tools
183
+
184
+ | Tool | Description |
185
+ |---|---|
186
+ | `start_listening` | Start capturing mic + system audio and transcribing |
187
+ | `stop_listening` | Stop capture, save transcript and opus recording |
188
+ | `is_listening` | Check if capture is active |
189
+ | `get_transcript` | Get transcript for the last N seconds (default 60) |
190
+ | `get_full_transcript` | Get entire transcript buffer |
191
+ | `get_transcript_since` | Get transcript since a Unix timestamp |
192
+ | `clear_transcript` | Clear the transcript buffer |
193
+ | `get_backend` | Show current STT backend |
194
+ | `set_backend` | Switch backend (`"deepgram"` / `"local"`) at runtime |
195
+
196
+ ## Project Structure
197
+
198
+ ```
199
+ src/audio_transcript_mcp/
200
+ __init__.py # Package version
201
+ __main__.py # python -m entry point
202
+ server.py # MCP server, AudioEngine, config
203
+ audio_utils.py # Format conversion (float32↔int16, stereo→mono)
204
+ backends/
205
+ __init__.py # Backend factory
206
+ whisper.py # Local faster-whisper STT
207
+ deepgram.py # Deepgram WebSocket STT
208
+ recorder/
209
+ __init__.py
210
+ opus.py # StereoOpusRecorder (PyOgg)
211
+ ```
212
+
213
+ ## Releasing
214
+
215
+ Releases are automated via GitHub Actions:
216
+
217
+ ```bash
218
+ # Update version in src/audio_transcript_mcp/__init__.py
219
+ git tag v0.1.0
220
+ git push origin v0.1.0
221
+ # CI automatically builds, publishes to PyPI, and creates a GitHub Release
222
+ ```
223
+
224
+ ## License
225
+
226
+ MIT
@@ -0,0 +1,196 @@
1
+ # audio-transcript-mcp
2
+
3
+ Real-time audio transcription MCP server for Claude Code.
4
+
5
+ Captures **microphone + system audio** (WASAPI loopback on Windows) and transcribes via **Deepgram** (cloud) or **faster-whisper** (local, GPU/CPU).
6
+
7
+ ## Features
8
+
9
+ - **Dual audio capture**: mic + system sound simultaneously
10
+ - **Two STT backends** switchable at runtime (Deepgram nova-3 / faster-whisper)
11
+ - **Stereo opus recording**: each session saves a stereo opus file (L=mic, R=system audio)
12
+ - **Per-session directories**: transcript + audio saved to `~/.audio-transcript-mcp/transcripts/<timestamp>/`
13
+ - Chunk overlap with text deduplication (no words cut off at chunk boundaries)
14
+ - Native float32 audio pipeline for whisper (no lossy int16 round-trip)
15
+ - High-quality stateful resampling via soxr (no boundary artifacts)
16
+ - Whisper hallucination filter (no_speech_prob + avg_logprob thresholds)
17
+ - Transcript buffer with time-based queries
18
+ - Auto-reconnect for Deepgram WebSocket
19
+ - GPU model unload/reload on stop/start (CUDA memory management)
20
+
21
+ ## Architecture
22
+
23
+ ```
24
+ ┌─────────────┐ ┌──────────┐ ┌─────────────────┐
25
+ │ Mic (int16) ├────►│ │ │ STT Backend │
26
+ │ WASAPI │ │ Worker ├────►│ whisper / DG ├──► Transcript buffer
27
+ └─────────────┘ │ Thread │ └─────────────────┘
28
+ │ ├────►┌─────────────────┐
29
+ ┌─────────────┐ │ │ │ StereoOpusRec │
30
+ │ System audio ├────►│ │ │ L=me R=others ├──► audio.opus
31
+ │ Loopback f32 │ └──────────┘ └─────────────────┘
32
+ └─────────────┘
33
+
34
+ Audio pipeline: native capture → stereo→mono → soxr resample → backend/opus
35
+ ```
36
+
37
+ Each audio source runs in its own worker thread. Audio is captured in the device's native format (float32 for loopback, int16 for mic), converted to mono, resampled with soxr, and routed to both the STT backend and the stereo opus recorder.
38
+
39
+ ## Requirements
40
+
41
+ - Python 3.10+
42
+ - Windows (WASAPI loopback for system audio capture); mic-only on macOS/Linux
43
+ - NVIDIA GPU recommended for local whisper backend
44
+
45
+ ## Installation
46
+
47
+ ### From PyPI (recommended)
48
+
49
+ ```bash
50
+ pip install audio-transcript-mcp
51
+ ```
52
+
53
+ Or run without installing via `uvx`:
54
+
55
+ ```bash
56
+ uvx audio-transcript-mcp
57
+ ```
58
+
59
+ ### From source
60
+
61
+ ```bash
62
+ git clone https://github.com/llilakoblock/audio-transcript-mcp.git
63
+ cd audio-transcript-mcp
64
+ pip install -e .
65
+ ```
66
+
67
+ ## MCP Configuration
68
+
69
+ Add to your `mcp.json` (Claude Code settings):
70
+
71
+ ### Using PyPI install
72
+
73
+ ```json
74
+ {
75
+ "audio-transcript": {
76
+ "type": "stdio",
77
+ "command": "audio-transcript-mcp",
78
+ "env": {
79
+ "STT_BACKEND": "local",
80
+ "DEEPGRAM_API_KEY": "your-deepgram-api-key",
81
+ "DEEPGRAM_LANGUAGE": "ru",
82
+ "DEEPGRAM_MODEL": "nova-3",
83
+ "DEEPGRAM_UTTERANCE_END_MS": "2500",
84
+ "DEEPGRAM_ENDPOINTING": "500",
85
+ "WHISPER_MODEL": "large-v3",
86
+ "WHISPER_DEVICE": "cuda",
87
+ "WHISPER_LANGUAGE": "ru",
88
+ "WHISPER_CHUNK_SEC": "10",
89
+ "WHISPER_OVERLAP_SEC": "2",
90
+ "TRANSCRIPT_MAX_AGE": "3600"
91
+ }
92
+ }
93
+ }
94
+ ```
95
+
96
+ ### Using uvx (no install needed)
97
+
98
+ ```json
99
+ {
100
+ "audio-transcript": {
101
+ "type": "stdio",
102
+ "command": "uvx",
103
+ "args": ["audio-transcript-mcp"],
104
+ "env": {
105
+ "STT_BACKEND": "deepgram",
106
+ "DEEPGRAM_API_KEY": "your-deepgram-api-key"
107
+ }
108
+ }
109
+ }
110
+ ```
111
+
112
+ > **Note:** System audio capture (loopback) uses WASAPI and is Windows-only. On macOS/Linux only microphone input works out of the box.
113
+
114
+ ## Environment Variables
115
+
116
+ | Variable | Default | Description |
117
+ |---|---|---|
118
+ | `STT_BACKEND` | `deepgram` | `"deepgram"` (cloud) or `"local"` (faster-whisper) |
119
+ | `DEEPGRAM_API_KEY` | — | API key for Deepgram (required if backend=deepgram) |
120
+ | `DEEPGRAM_LANGUAGE` | `ru` | Language code for Deepgram |
121
+ | `DEEPGRAM_MODEL` | `nova-3` | Deepgram model (`nova-3`, `nova-2`, etc.) |
122
+ | `DEEPGRAM_UTTERANCE_END_MS` | `2500` | Silence duration (ms) before finalizing utterance |
123
+ | `DEEPGRAM_ENDPOINTING` | `500` | Endpointing sensitivity (ms) |
124
+ | `WHISPER_MODEL` | `large-v3` | Model size: `tiny`, `base`, `small`, `medium`, `large-v3` |
125
+ | `WHISPER_DEVICE` | `cuda` | `"cuda"` or `"cpu"` |
126
+ | `WHISPER_LANGUAGE` | — | Language hint for whisper (empty = auto-detect) |
127
+ | `WHISPER_CHUNK_SEC` | `5` | Audio chunk duration in seconds |
128
+ | `WHISPER_OVERLAP_SEC` | `2` | Overlap between consecutive chunks (avoids cut words) |
129
+ | `TRANSCRIPT_MAX_AGE` | `3600` | Max transcript buffer age in seconds |
130
+ | `TRANSCRIPT_DIR` | `~/.audio-transcript-mcp/transcripts` | Directory for per-session transcript/audio files |
131
+
132
+ ## Session Output
133
+
134
+ Each recording session creates a timestamped directory:
135
+
136
+ ```
137
+ ~/.audio-transcript-mcp/transcripts/
138
+ 2026-03-06_23-24-48/
139
+ transcript.txt # Plain text transcript
140
+ audio.opus # Stereo opus (L=mic, R=system)
141
+ ```
142
+
143
+ The transcript is plain text:
144
+ ```
145
+ [23:24:50] me — Hello, can you hear me?
146
+
147
+ [23:24:52] others — Yes, I can hear you fine.
148
+
149
+ [23:24:55] system — [STARTED: Microphone, 44100Hz, 2ch]
150
+ ```
151
+
152
+ ## MCP Tools
153
+
154
+ | Tool | Description |
155
+ |---|---|
156
+ | `start_listening` | Start capturing mic + system audio and transcribing |
157
+ | `stop_listening` | Stop capture, save transcript and opus recording |
158
+ | `is_listening` | Check if capture is active |
159
+ | `get_transcript` | Get transcript for the last N seconds (default 60) |
160
+ | `get_full_transcript` | Get entire transcript buffer |
161
+ | `get_transcript_since` | Get transcript since a Unix timestamp |
162
+ | `clear_transcript` | Clear the transcript buffer |
163
+ | `get_backend` | Show current STT backend |
164
+ | `set_backend` | Switch backend (`"deepgram"` / `"local"`) at runtime |
165
+
166
+ ## Project Structure
167
+
168
+ ```
169
+ src/audio_transcript_mcp/
170
+ __init__.py # Package version
171
+ __main__.py # python -m entry point
172
+ server.py # MCP server, AudioEngine, config
173
+ audio_utils.py # Format conversion (float32↔int16, stereo→mono)
174
+ backends/
175
+ __init__.py # Backend factory
176
+ whisper.py # Local faster-whisper STT
177
+ deepgram.py # Deepgram WebSocket STT
178
+ recorder/
179
+ __init__.py
180
+ opus.py # StereoOpusRecorder (PyOgg)
181
+ ```
182
+
183
+ ## Releasing
184
+
185
+ Releases are automated via GitHub Actions:
186
+
187
+ ```bash
188
+ # Update version in src/audio_transcript_mcp/__init__.py
189
+ git tag v0.1.0
190
+ git push origin v0.1.0
191
+ # CI automatically builds, publishes to PyPI, and creates a GitHub Release
192
+ ```
193
+
194
+ ## License
195
+
196
+ MIT
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "audio-transcript-mcp"
7
+ dynamic = ["version"]
8
+ description = "Real-time audio transcription MCP server for Claude Code"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "llilakoblock" }]
13
+ keywords = ["mcp", "audio", "transcription", "whisper", "deepgram"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
24
+ ]
25
+ dependencies = [
26
+ "PyAudioWPatch>=0.2.12",
27
+ "websockets>=12.0",
28
+ "mcp[cli]>=1.2",
29
+ "faster-whisper>=1.0",
30
+ "numpy>=1.24",
31
+ "soxr>=0.3",
32
+ "PyOgg>=0.7",
33
+ ]
34
+
35
+ [project.scripts]
36
+ audio-transcript-mcp = "audio_transcript_mcp.server:main"
37
+
38
+ [project.urls]
39
+ Homepage = "https://github.com/llilakoblock/audio-transcript-mcp"
40
+ Repository = "https://github.com/llilakoblock/audio-transcript-mcp"
41
+ Issues = "https://github.com/llilakoblock/audio-transcript-mcp/issues"
42
+
43
+ [tool.hatch.version]
44
+ path = "src/audio_transcript_mcp/__init__.py"
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["src/audio_transcript_mcp"]
@@ -0,0 +1,3 @@
1
+ """Real-time audio transcription MCP server."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
+ """Allow running as `python -m audio_transcript_mcp`."""
2
+
3
+ from audio_transcript_mcp.server import main
4
+
5
+ main()
@@ -0,0 +1,15 @@
1
+ """Audio format conversion utilities."""
2
+
3
+ import numpy as np
4
+
5
+
6
def float32_to_int16(data: bytes) -> bytes:
    """Convert float32 PCM bytes to int16 PCM bytes.

    Args:
        data: Raw buffer of native-endian float32 samples. Its length must
            be a multiple of 4 bytes.

    Returns:
        Raw int16 bytes: each sample is scaled by 32767, clamped to the
        int16 range and truncated toward zero. NaN samples become 0 and
        infinite samples become the corresponding full-scale value.

    Raises:
        ValueError: If ``len(data)`` is not a multiple of the float32 item
            size (raised by ``np.frombuffer``).
    """
    arr = np.frombuffer(data, dtype=np.float32)
    # NaN passes through np.clip untouched, and casting NaN to int16 is
    # undefined (NumPy warns and the result is platform-dependent), so map it
    # to silence first. Infinities are pinned to full scale, which is what the
    # clip below would produce for them anyway.
    scaled = np.nan_to_num(
        arr * 32767, nan=0.0, posinf=32767.0, neginf=-32768.0
    )
    return np.clip(scaled, -32768, 32767).astype(np.int16).tobytes()
10
+
11
+
12
def stereo_to_mono_f32(data: bytes, channels: int = 2) -> bytes:
    """Average interleaved float32 channels down to a single mono channel.

    Args:
        data: Raw buffer of interleaved native-endian float32 samples.
        channels: Number of interleaved channels per frame (default 2).

    Returns:
        Raw float32 bytes holding one sample per input frame, each being the
        arithmetic mean of that frame's channel samples.

    Raises:
        ValueError: If the buffer cannot be viewed as float32 samples or the
            sample count cannot be split into frames of ``channels`` samples
            (raised by NumPy).
    """
    samples = np.frombuffer(data, dtype=np.float32)
    # One row per frame, one column per channel.
    frames = samples.reshape(-1, channels)
    mono = np.mean(frames, axis=1)
    return mono.astype(np.float32).tobytes()
@@ -0,0 +1,12 @@
1
+ """STT backend factory."""
2
+
3
+ from audio_transcript_mcp.backends.deepgram import DeepgramBackend
4
+ from audio_transcript_mcp.backends.whisper import WhisperBackend
5
+
6
+
7
def create_backend(backend_type, label, sample_rate, channels, is_float32, buf, config):
    """Instantiate the STT backend selected by ``backend_type``.

    ``"local"`` selects ``WhisperBackend``; every other value selects
    ``DeepgramBackend``. The remaining arguments are forwarded to the chosen
    backend's constructor unchanged and in the same order.

    Returns:
        The newly constructed backend instance.
    """
    # Anything that is not exactly "local" falls through to Deepgram.
    backend_cls = WhisperBackend if backend_type == "local" else DeepgramBackend
    return backend_cls(label, sample_rate, channels, is_float32, buf, config)