audio-transcriber 0.5.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audio_transcriber-0.5.40/LICENSE +20 -0
- audio_transcriber-0.5.40/MANIFEST.in +1 -0
- audio_transcriber-0.5.40/PKG-INFO +177 -0
- audio_transcriber-0.5.40/README.md +156 -0
- audio_transcriber-0.5.40/audio_transcriber/__init__.py +18 -0
- audio_transcriber-0.5.40/audio_transcriber/audio_transcriber.py +429 -0
- audio_transcriber-0.5.40/audio_transcriber/audio_transcriber_mcp.py +200 -0
- audio_transcriber-0.5.40/audio_transcriber.egg-info/PKG-INFO +177 -0
- audio_transcriber-0.5.40/audio_transcriber.egg-info/SOURCES.txt +14 -0
- audio_transcriber-0.5.40/audio_transcriber.egg-info/dependency_links.txt +1 -0
- audio_transcriber-0.5.40/audio_transcriber.egg-info/entry_points.txt +3 -0
- audio_transcriber-0.5.40/audio_transcriber.egg-info/requires.txt +5 -0
- audio_transcriber-0.5.40/audio_transcriber.egg-info/top_level.txt +2 -0
- audio_transcriber-0.5.40/pyproject.toml +37 -0
- audio_transcriber-0.5.40/requirements.txt +5 -0
- audio_transcriber-0.5.40/setup.cfg +4 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2012-2023 Audel Rouhi
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include README.md
include requirements.txt
recursive-include audio_transcriber *.py
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: audio-transcriber
|
|
3
|
+
Version: 0.5.40
|
|
4
|
+
Summary: Transcribe your .wav .mp4 .mp3 .flac files to text or record your own audio!
|
|
5
|
+
Author-email: Audel Rouhi <knucklessg1@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: torch>=1.13.1
|
|
16
|
+
Requires-Dist: transformers>=4.25.1
|
|
17
|
+
Requires-Dist: pyaudio>=0.2.13
|
|
18
|
+
Requires-Dist: openai-whisper>=20250625
|
|
19
|
+
Requires-Dist: setuptools-rust>=1.12.0
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# Audio-Transcriber
|
|
23
|
+
|
|
24
|
+

|
|
25
|
+

|
|
26
|
+

|
|
27
|
+

|
|
28
|
+

|
|
29
|
+

|
|
30
|
+

|
|
31
|
+
|
|
32
|
+

|
|
33
|
+

|
|
34
|
+

|
|
35
|
+

|
|
36
|
+
|
|
37
|
+

|
|
38
|
+

|
|
39
|
+

|
|
40
|
+

|
|
41
|
+

|
|
42
|
+

|
|
43
|
+
|
|
44
|
+
*Version: 0.5.40*
|
|
45
|
+
|
|
46
|
+
Transcribe your .wav .mp4 .mp3 .flac files to text or record your own audio!
|
|
47
|
+
|
|
48
|
+
This repository is actively maintained - Contributions are welcome!
|
|
49
|
+
|
|
50
|
+
Contribution Opportunities:
|
|
51
|
+
- Support new models
|
|
52
|
+
|
|
53
|
+
Wrapped around [OpenAI Whisper](https://pypi.org/project/openai-whisper)
|
|
54
|
+
|
|
55
|
+
<details>
|
|
56
|
+
<summary><b>Usage:</b></summary>
|
|
57
|
+
|
|
58
|
+
| Short Flag | Long Flag | Description |
|
|
59
|
+
|------------|-------------|---------------------------------------------------------------|
|
|
60
|
+
| -h | --help | See Usage |
|
|
61
|
+
| -b | --bitrate | Bitrate to use during recording |
|
|
62
|
+
| -c | --channels | Number of channels to use during recording |
|
|
63
|
+
| -d | --directory | Directory to save recording |
|
|
64
|
+
| -e | --export | Export txt, srt, and vtt files |
|
|
65
|
+
| -f | --file | File to transcribe |
|
|
66
|
+
| -l | --language | Language to transcribe |
|
|
67
|
+
| -m | --model | Model to use: <tiny, base, small, medium, large> |
|
|
68
|
+
| -n | --name | Name of recording |
|
|
69
|
+
| -r | --record | Specify number of seconds to record from microphone |
|
|
70
|
+
|
|
71
|
+
</details>
|
|
72
|
+
|
|
73
|
+
<details>
|
|
74
|
+
<summary><b>Example:</b></summary>
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
audio-transcriber --file '~/Downloads/Federal_Reserve.mp4' --model 'large'
|
|
78
|
+
audio-transcriber --record 60 --directory '~/Downloads/' --name 'my_recording.wav' --model 'tiny'
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
</details>
|
|
83
|
+
|
|
84
|
+
<details>
|
|
85
|
+
<summary><b>Model Information:</b></summary>
|
|
86
|
+
|
|
87
|
+
[Courtesy of and Credits to OpenAI: Whisper.ai](https://github.com/openai/whisper/blob/main/README.md)
|
|
88
|
+
|
|
89
|
+
| Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
|
|
90
|
+
|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
|
|
91
|
+
| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
|
|
92
|
+
| base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
|
|
93
|
+
| small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
|
|
94
|
+
| medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
|
|
95
|
+
| large | 1550 M | N/A | `large` | ~10 GB | 1x |
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
</details>
|
|
99
|
+
|
|
100
|
+
<details>
|
|
101
|
+
<summary><b>Installation Instructions:</b></summary>
|
|
102
|
+
|
|
103
|
+
## Use with AI
|
|
104
|
+
|
|
105
|
+
Configure `mcp.json`
|
|
106
|
+
|
|
107
|
+
Recommended: Store secrets in environment variables with lookup in JSON file.
|
|
108
|
+
|
|
109
|
+
For Testing Only: Plain text storage will also work, although **not** recommended.
|
|
110
|
+
|
|
111
|
+
```json
|
|
112
|
+
{
|
|
113
|
+
"mcpServers": {
|
|
114
|
+
"audio_transcriber": {
|
|
115
|
+
"command": "uv",
|
|
116
|
+
"args": [
|
|
117
|
+
"run",
|
|
118
|
+
"--with",
|
|
119
|
+
"audio-transcriber",
|
|
120
|
+
"audio-transcriber-mcp"
|
|
121
|
+
],
|
|
122
|
+
"env": {
|
|
123
|
+
"WHISPER_MODEL": "medium", // Optional
|
|
124
|
+
"TRANSCRIBE_DIRECTORY": "~/Downloads" // Optional
|
|
125
|
+
},
|
|
126
|
+
"timeout": 200000
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Deploy MCP Server as a container
|
|
133
|
+
```bash
|
|
134
|
+
docker pull knucklessg1/audio-transcriber:latest
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Modify the `compose.yml`
|
|
138
|
+
|
|
139
|
+
```compose
|
|
140
|
+
services:
|
|
141
|
+
audio-transcriber:
|
|
142
|
+
image: knucklessg1/audio-transcriber:latest
|
|
143
|
+
environment:
|
|
144
|
+
- HOST=0.0.0.0
|
|
145
|
+
- PORT=8021
|
|
146
|
+
ports:
|
|
147
|
+
- 8021:8021
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Install Python Package
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
python -m pip install audio-transcriber
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
or
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
uv pip install --upgrade audio-transcriber
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
##### Ubuntu Dependencies
|
|
163
|
+
```bash
|
|
164
|
+
apt install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
</details>
|
|
168
|
+
|
|
169
|
+
<details>
|
|
170
|
+
<summary><b>Repository Owners:</b></summary>
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
<img width="100%" height="180em" src="https://github-readme-stats.vercel.app/api?username=Knucklessg1&show_icons=true&hide_border=true&&count_private=true&include_all_commits=true" />
|
|
174
|
+
|
|
175
|
+

|
|
176
|
+

|
|
177
|
+
</details>
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# Audio-Transcriber
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+

|
|
9
|
+

|
|
10
|
+
|
|
11
|
+

|
|
12
|
+

|
|
13
|
+

|
|
14
|
+

|
|
15
|
+
|
|
16
|
+

|
|
17
|
+

|
|
18
|
+

|
|
19
|
+

|
|
20
|
+

|
|
21
|
+

|
|
22
|
+
|
|
23
|
+
*Version: 0.5.40*
|
|
24
|
+
|
|
25
|
+
Transcribe your .wav .mp4 .mp3 .flac files to text or record your own audio!
|
|
26
|
+
|
|
27
|
+
This repository is actively maintained - Contributions are welcome!
|
|
28
|
+
|
|
29
|
+
Contribution Opportunities:
|
|
30
|
+
- Support new models
|
|
31
|
+
|
|
32
|
+
Wrapped around [OpenAI Whisper](https://pypi.org/project/openai-whisper)
|
|
33
|
+
|
|
34
|
+
<details>
|
|
35
|
+
<summary><b>Usage:</b></summary>
|
|
36
|
+
|
|
37
|
+
| Short Flag | Long Flag | Description |
|
|
38
|
+
|------------|-------------|---------------------------------------------------------------|
|
|
39
|
+
| -h | --help | See Usage |
|
|
40
|
+
| -b | --bitrate | Bitrate to use during recording |
|
|
41
|
+
| -c | --channels | Number of channels to use during recording |
|
|
42
|
+
| -d | --directory | Directory to save recording |
|
|
43
|
+
| -e | --export | Export txt, srt, and vtt files |
|
|
44
|
+
| -f | --file | File to transcribe |
|
|
45
|
+
| -l | --language | Language to transcribe |
|
|
46
|
+
| -m | --model | Model to use: <tiny, base, small, medium, large> |
|
|
47
|
+
| -n | --name | Name of recording |
|
|
48
|
+
| -r | --record | Specify number of seconds to record from microphone |
|
|
49
|
+
|
|
50
|
+
</details>
|
|
51
|
+
|
|
52
|
+
<details>
|
|
53
|
+
<summary><b>Example:</b></summary>
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
audio-transcriber --file '~/Downloads/Federal_Reserve.mp4' --model 'large'
|
|
57
|
+
audio-transcriber --record 60 --directory '~/Downloads/' --name 'my_recording.wav' --model 'tiny'
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
</details>
|
|
62
|
+
|
|
63
|
+
<details>
|
|
64
|
+
<summary><b>Model Information:</b></summary>
|
|
65
|
+
|
|
66
|
+
[Courtesy of and Credits to OpenAI: Whisper.ai](https://github.com/openai/whisper/blob/main/README.md)
|
|
67
|
+
|
|
68
|
+
| Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
|
|
69
|
+
|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
|
|
70
|
+
| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
|
|
71
|
+
| base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
|
|
72
|
+
| small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
|
|
73
|
+
| medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
|
|
74
|
+
| large | 1550 M | N/A | `large` | ~10 GB | 1x |
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
</details>
|
|
78
|
+
|
|
79
|
+
<details>
|
|
80
|
+
<summary><b>Installation Instructions:</b></summary>
|
|
81
|
+
|
|
82
|
+
## Use with AI
|
|
83
|
+
|
|
84
|
+
Configure `mcp.json`
|
|
85
|
+
|
|
86
|
+
Recommended: Store secrets in environment variables with lookup in JSON file.
|
|
87
|
+
|
|
88
|
+
For Testing Only: Plain text storage will also work, although **not** recommended.
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{
|
|
92
|
+
"mcpServers": {
|
|
93
|
+
"audio_transcriber": {
|
|
94
|
+
"command": "uv",
|
|
95
|
+
"args": [
|
|
96
|
+
"run",
|
|
97
|
+
"--with",
|
|
98
|
+
"audio-transcriber",
|
|
99
|
+
"audio-transcriber-mcp"
|
|
100
|
+
],
|
|
101
|
+
"env": {
|
|
102
|
+
"WHISPER_MODEL": "medium", // Optional
|
|
103
|
+
"TRANSCRIBE_DIRECTORY": "~/Downloads" // Optional
|
|
104
|
+
},
|
|
105
|
+
"timeout": 200000
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Deploy MCP Server as a container
|
|
112
|
+
```bash
|
|
113
|
+
docker pull knucklessg1/audio-transcriber:latest
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Modify the `compose.yml`
|
|
117
|
+
|
|
118
|
+
```compose
|
|
119
|
+
services:
|
|
120
|
+
audio-transcriber:
|
|
121
|
+
image: knucklessg1/audio-transcriber:latest
|
|
122
|
+
environment:
|
|
123
|
+
- HOST=0.0.0.0
|
|
124
|
+
- PORT=8021
|
|
125
|
+
ports:
|
|
126
|
+
- 8021:8021
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Install Python Package
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
python -m pip install audio-transcriber
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
or
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
uv pip install --upgrade audio-transcriber
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
##### Ubuntu Dependencies
|
|
142
|
+
```bash
|
|
143
|
+
apt install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
</details>
|
|
147
|
+
|
|
148
|
+
<details>
|
|
149
|
+
<summary><b>Repository Owners:</b></summary>
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
<img width="100%" height="180em" src="https://github-readme-stats.vercel.app/api?username=Knucklessg1&show_icons=true&hide_border=true&&count_private=true&include_all_commits=true" />
|
|
153
|
+
|
|
154
|
+

|
|
155
|
+

|
|
156
|
+
</details>
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/usr/bin/env python
# coding: utf-8
"""
audio-transcriber

Transcribe your .wav .mp4 .mp3 .flac files to text using AI!
"""
# NOTE: the docstring above must stay the first statement in the module;
# placed after the imports it is just a discarded string expression and
# ``audio_transcriber.__doc__`` stays ``None``.

from audio_transcriber.audio_transcriber import (
    main,
    AudioTranscriber,
    setup_logging,
)

__all__ = ["main", "AudioTranscriber", "setup_logging"]

# The MCP server depends on ``fastmcp`` and ``pydantic``, which are not among
# the package's declared requirements. Importing it unconditionally would make
# ``import audio_transcriber`` (and therefore the plain CLI) fail on installs
# that lack those extras, so the MCP entry point is exported only when its
# dependencies can be imported.
try:
    from audio_transcriber.audio_transcriber_mcp import audio_transcriber_mcp
except ImportError:
    pass
else:
    __all__.insert(0, "audio_transcriber_mcp")
|
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# coding: utf-8
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import datetime
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import sys
|
|
9
|
+
import threading
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Iterator, List, Optional, TextIO, Union
|
|
12
|
+
|
|
13
|
+
import pyaudio
|
|
14
|
+
import whisper
|
|
15
|
+
import wave
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AudioTranscriber:
    """Record microphone audio and transcribe audio files with OpenAI Whisper.

    Attributes:
        chunk: Number of frames read from the input stream per call.
        format: PyAudio sample format used for recording (16-bit integers).
        channels: Number of input channels used when recording.
        rate: Sample rate in Hz used when recording.
        file_path: Audio file that is written by ``save_stream`` and read by
            ``transcribe``.
        title: Stem of ``file_path``; used as the base name of exports.
        directory: Parent directory of ``file_path``; exports are written here.
        stop: Flag that ends a recording loop when set to ``True``.
        model_name: The model name passed to the constructor.
        model: The loaded Whisper model.
        device_index: Input device index, or ``None`` to let PyAudio pick the
            default device when the stream is opened.
    """

    def __init__(
        self,
        model: str = "base",
        channels: int = 1,
        rate: int = 16000,  # Whisper recommends 16kHz for better accuracy
        file_name: str = "output.wav",
        directory: Union[str, Path] = Path.cwd(),
        file: Optional[Union[str, Path]] = None,
        device: Optional[int] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Load the Whisper model and prepare (but do not open) audio input.

        Args:
            model: Name of the Whisper model to load.
            channels: Number of channels to record.
            rate: Sample rate in Hz to record at.
            file_name: Name of the recording, used when ``file`` is not given.
            directory: Directory of the recording, used when ``file`` is not
                given.
            file: Existing audio file to transcribe; takes precedence over
                ``directory`` / ``file_name``.
            device: Input device index; ``None`` selects the system default.
            logger: Logger to report to; defaults to this module's logger.
        """
        # The logger is assigned first so the helpers called below can use it.
        self.logger = logger or logging.getLogger(__name__)
        self.chunk = 1024
        self.format = pyaudio.paInt16
        self.channels = channels
        self.rate = rate
        self.pyaudio_instance = pyaudio.PyAudio()
        self.stream = None
        self.frames: List[bytes] = []
        target = Path(file) if file else Path(directory) / file_name
        # Paths that reach us quoted (or from configuration) still carry a
        # literal "~"; expand it so "~/Downloads/x.wav" resolves as intended.
        self.file_path = target.expanduser()
        self.title = self.file_path.stem
        self.directory = self.file_path.parent
        self.stop = False
        # Kept separately so later checks do not depend on the loaded model
        # object exposing its own name.
        self.model_name = model
        self.model = whisper.load_model(model)
        # "device or default" would silently discard the valid index 0.
        self.device_index = (
            device if device is not None else self._get_default_device()
        )
        self._check_ffmpeg()

    def _get_default_device(self) -> Optional[int]:
        """Return the default input device index, or ``None`` if there is none.

        A machine without a microphone can still transcribe existing files, so
        a missing input device is reported as a warning instead of aborting
        construction. Opening the stream later with ``None`` lets PyAudio make
        (and report on) its own default choice.
        """
        try:
            return self.pyaudio_instance.get_default_input_device_info()["index"]
        except OSError as err:
            self.logger.warning(
                "No default input device available (%s); recording is "
                "unavailable until a device is specified.",
                err,
            )
            return None

    def _check_ffmpeg(self) -> None:
        """Check if ffmpeg is installed; log warning if not."""
        import shutil

        if not shutil.which("ffmpeg"):
            self.logger.warning(
                "ffmpeg not found. Install it for better audio format support. "
                "See https://ffmpeg.org/download.html for instructions."
            )

    def initiate_stream(self) -> None:
        """Open the audio input stream with the configured parameters."""
        self.stream = self.pyaudio_instance.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            input=True,
            frames_per_buffer=self.chunk,
            input_device_index=self.device_index,
        )

    def record(self, seconds: int = 0) -> None:
        """Record audio for a fixed duration or until stopped.

        Args:
            seconds: Number of seconds to record. ``0`` (or less) records
                until ``stop`` is set or the user presses Ctrl+C.

        The stream must have been opened with ``initiate_stream`` first.
        Previously recorded frames are discarded.
        """
        self.logger.info("Recording started...")
        self.frames = []
        self.stop = False
        if seconds > 0:
            for _ in range(int((self.rate / self.chunk) * seconds)):
                if self.stop:
                    break
                self.frames.append(self.stream.read(self.chunk))
        else:
            self.logger.info("Recording indefinitely until interrupted (Ctrl+C)...")
            worker = threading.Thread(target=self._unlimited_record, daemon=True)
            worker.start()
            try:
                # Wait on the worker with a short timeout instead of spinning
                # on the flag: same responsiveness to Ctrl+C and to ``stop``,
                # without pinning a CPU core. The loop also ends if the worker
                # dies, rather than waiting forever.
                while not self.stop and worker.is_alive():
                    worker.join(timeout=0.1)
            except KeyboardInterrupt:
                pass
            finally:
                self.stop = True
                # Give the in-flight read a chance to finish so the caller does
                # not close the stream underneath the worker.
                worker.join(timeout=2.0)
        self.logger.info("Recording stopped.")

    def _unlimited_record(self) -> None:
        """Worker loop: append chunks to ``frames`` until ``stop`` is set."""
        while not self.stop:
            self.frames.append(self.stream.read(self.chunk))

    def stop_stream(self) -> None:
        """Stop recording, close the stream if open, and release PyAudio."""
        self.stop = True
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            # Forget the closed stream so a repeated call does not touch it.
            self.stream = None
        self.pyaudio_instance.terminate()

    def save_stream(self) -> None:
        """Write the recorded frames to ``file_path`` as a WAV file.

        Logs a warning and writes nothing when no frames were recorded.
        """
        if not self.frames:
            self.logger.warning("No audio frames to save.")
            return
        # Create the target directory on demand so a new --directory works.
        self.file_path.parent.mkdir(parents=True, exist_ok=True)
        with wave.open(str(self.file_path), "wb") as wave_file:
            wave_file.setnchannels(self.channels)
            wave_file.setsampwidth(self.pyaudio_instance.get_sample_size(self.format))
            wave_file.setframerate(self.rate)
            wave_file.writeframes(b"".join(self.frames))
        self.logger.info(f"Audio saved to {self.file_path}")

    def transcribe(
        self,
        language: Optional[str] = None,
        task: str = "transcribe",
        fp16: bool = True,
        word_timestamps: bool = False,
        temperature: float = 0.0,
        initial_prompt: Optional[str] = None,
        verbose: bool = False,
    ) -> dict:
        """Transcribe ``file_path`` with the loaded Whisper model.

        Args:
            language: Language code; forwarded as-is (``None`` leaves the
                choice to Whisper).
            task: ``"transcribe"`` or ``"translate"``.
            fp16: Forwarded to Whisper's decoding options.
            word_timestamps: Forwarded to the model's ``transcribe`` call.
            temperature: Sampling temperature forwarded to the model.
            initial_prompt: Optional prompt forwarded to the model.
            verbose: Forwarded to the model; also logs the resulting text.

        Returns:
            The dictionary returned by the model's ``transcribe`` method.

        Raises:
            ValueError: If translation is requested with a turbo model.
        """
        if task == "translate" and self.model_name.startswith("turbo"):
            raise ValueError(
                "Turbo model does not support translation. Use a multilingual model like 'medium' or 'large'."
            )

        start_time = datetime.datetime.now()
        self.logger.info(
            f"Started transcription at {start_time} for file: {self.file_path}"
        )

        # ``word_timestamps`` and ``initial_prompt`` are parameters of Whisper's
        # transcribe(), not DecodingOptions fields, so everything is passed
        # straight to transcribe(); it forwards the remaining keywords
        # (language, task, fp16) to the decoder itself.
        result = self.model.transcribe(
            str(self.file_path),
            verbose=verbose,
            temperature=temperature,
            initial_prompt=initial_prompt,
            word_timestamps=word_timestamps,
            language=language,
            task=task,
            fp16=fp16,
        )

        end_time = datetime.datetime.now()
        self.logger.info(
            f"Ended transcription at {end_time}. Time elapsed: {end_time - start_time}"
        )
        if verbose:
            self.logger.info(f"Transcription result: {result['text']}")

        return result

    def export(
        self,
        result: dict,
        formats: List[str],
    ) -> None:
        """Write the transcription next to the audio file in each format.

        Args:
            result: Transcription result; must contain a ``"segments"`` list
                whose items have ``"start"``, ``"end"`` and ``"text"``.
            formats: Any of ``"txt"``, ``"vtt"``, ``"srt"``, ``"json"``.
                Unknown formats are skipped with a warning.
        """
        segment_writers = {
            "txt": self._write_txt,
            "vtt": self._write_vtt,
            "srt": self._write_srt,
        }
        segments = result["segments"]
        for fmt in formats:
            if fmt != "json" and fmt not in segment_writers:
                self.logger.warning(f"Unsupported export format: {fmt}")
                # Nothing was written, so do not report an export.
                continue
            export_path = self.directory / f"{self.title}.{fmt}"
            with open(export_path, "w", encoding="utf-8") as f:
                if fmt == "json":
                    json.dump(result, f, indent=4, ensure_ascii=False)
                else:
                    segment_writers[fmt](segments, f)
            self.logger.info(f"Exported to {export_path}")

    @staticmethod
    def _srt_format_timestamp(seconds: float) -> str:
        """Format ``seconds`` as an SRT timestamp (``HH:MM:SS,mmm``)."""
        return AudioTranscriber._format_timestamp(
            seconds, always_include_hours=True, decimal_marker=","
        )

    def _write_srt(self, transcript: Iterator[dict], file: TextIO) -> None:
        """Write the segments to ``file`` as numbered SRT cues."""
        for count, segment in enumerate(transcript, start=1):
            start = self._srt_format_timestamp(segment["start"])
            end = self._srt_format_timestamp(segment["end"])
            # "-->" separates cue times, so it must not appear in cue text.
            text = segment["text"].replace("-->", "->").strip()
            print(f"{count}\n{start} --> {end}\n{text}\n", file=file, flush=True)

    @staticmethod
    def _write_txt(transcript: Iterator[dict], file: TextIO) -> None:
        """Write the stripped text of each segment to ``file``, one per line."""
        for segment in transcript:
            print(segment["text"].strip(), file=file, flush=True)

    @staticmethod
    def _write_vtt(transcript: Iterator[dict], file: TextIO) -> None:
        """Write the segments to ``file`` as WebVTT cues."""
        print("WEBVTT\n", file=file)
        for segment in transcript:
            start = AudioTranscriber._format_timestamp(segment["start"])
            end = AudioTranscriber._format_timestamp(segment["end"])
            # "-->" separates cue times, so it must not appear in cue text.
            text = segment["text"].strip().replace("-->", "->")
            print(f"{start} --> {end}\n{text}\n", file=file, flush=True)

    @staticmethod
    def _format_timestamp(
        seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
    ) -> str:
        """Format ``seconds`` as ``[HH:]MM:SS<marker>mmm``.

        Args:
            seconds: Non-negative offset in seconds.
            always_include_hours: Emit the hours field even when it is zero.
            decimal_marker: Separator between seconds and milliseconds.
        """
        assert seconds >= 0, "non-negative timestamp expected"
        total_ms = round(seconds * 1000.0)
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        seconds_int, milliseconds = divmod(remainder, 1_000)
        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{seconds_int:02d}{decimal_marker}{milliseconds:03d}"
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def setup_logging(
    verbose: bool = False, log_file: Optional[str] = None
) -> logging.Logger:
    """Configure and return this module's logger.

    The function is safe to call more than once: it reuses the console handler
    it attached earlier and never attaches a second file handler for the same
    path, so repeated calls do not produce duplicated log lines.

    Args:
        verbose: Emit INFO records on the console; otherwise WARNING and up.
        log_file: Optional path of a log file that receives INFO records.

    Returns:
        The configured logger (named after this module).
    """
    import os  # local import: only needed to normalise the log file path

    logger = logging.getLogger(__name__)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    # Console handler: FileHandler subclasses StreamHandler, so it has to be
    # excluded explicitly when looking for an existing console handler.
    console = next(
        (
            handler
            for handler in logger.handlers
            if isinstance(handler, logging.StreamHandler)
            and not isinstance(handler, logging.FileHandler)
        ),
        None,
    )
    if console is None:
        console = logging.StreamHandler()
        logger.addHandler(console)
    console.setLevel(logging.INFO if verbose else logging.WARNING)
    console.setFormatter(formatter)

    # File handler if specified
    if log_file:
        # FileHandler stores the absolute path in ``baseFilename``.
        target = os.path.abspath(os.fspath(log_file))
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == target
            for handler in logger.handlers
        )
        if not already_attached:
            fh = logging.FileHandler(log_file)
            fh.setLevel(logging.INFO)
            fh.setFormatter(formatter)
            logger.addHandler(fh)

    # Handlers only see what the logger lets through. The file handler is set
    # to INFO, so the logger must pass INFO whenever a file handler is
    # attached; the console handler's own level keeps the console quiet.
    has_file_handler = any(
        isinstance(handler, logging.FileHandler) for handler in logger.handlers
    )
    logger.setLevel(
        logging.INFO if (verbose or has_file_handler) else logging.WARNING
    )

    return logger
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: transcribe files or record and transcribe.

    Args:
        argv: Argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]``, so existing console-script callers are
            unaffected.

    With ``--file`` every given file is transcribed with a single loaded
    model; otherwise audio is recorded from the microphone, saved, and then
    transcribed. The process exits with status 1 when an input file is missing
    or nothing was recorded.
    """
    parser = argparse.ArgumentParser(
        description="Audio Transcriber: Record and transcribe audio using OpenAI Whisper.",
        epilog="Examples:\n"
        "  python audio_transcriber.py --file path/to/audio.mp3 --model large --task translate --language ja\n"
        "  python audio_transcriber.py --record 60 --directory ./recordings --name my_recording.wav --verbose",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # The short flags and the --bitrate spelling are the ones documented in the
    # README usage table; the long options are unchanged.
    parser.add_argument(
        "-m",
        "--model",
        default="base",
        choices=[
            "tiny",
            "base",
            "small",
            "medium",
            "large",
            "turbo",
            "tiny.en",
            "base.en",
            "small.en",
            "medium.en",
        ],
        help="Whisper model to use (default: base)",
    )
    parser.add_argument(
        "-c",
        "--channels",
        type=int,
        default=1,
        help="Number of audio channels (default: 1)",
    )
    parser.add_argument(
        "-b",
        "--rate",
        "--bitrate",
        dest="rate",
        type=int,
        default=16000,
        help="Sample rate for recording (default: 16000)",
    )
    parser.add_argument(
        "-d",
        "--directory",
        type=Path,
        default=Path.cwd(),
        help="Directory to save recordings/exports (default: current dir)",
    )
    parser.add_argument(
        "-n",
        "--name",
        default="output.wav",
        help="Name of the output file (default: output.wav)",
    )
    parser.add_argument(
        "-f",
        "--file",
        type=Path,
        nargs="*",
        help="Path(s) to audio file(s) to transcribe (skips recording)",
    )
    parser.add_argument(
        "-r",
        "--record",
        type=int,
        default=0,
        help="Seconds to record (0 for unlimited until Ctrl+C; default: 0)",
    )
    parser.add_argument(
        "--device", type=int, help="Input device index (default: system default)"
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language code (e.g., 'en', 'fr'; auto-detected if omitted)",
    )
    parser.add_argument(
        "--task",
        default="transcribe",
        choices=["transcribe", "translate"],
        help="Task: transcribe or translate to English (default: transcribe)",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Use FP16 for faster inference (default: False)",
    )
    parser.add_argument(
        "--word-timestamps",
        action="store_true",
        help="Include word-level timestamps in output (default: False)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Temperature for sampling diversity (default: 0.0)",
    )
    parser.add_argument(
        "--initial-prompt", help="Initial text prompt to guide transcription"
    )
    parser.add_argument(
        "-e",
        "--export",
        nargs="*",
        choices=["txt", "vtt", "srt", "json"],
        default=None,
        help="Export formats (e.g., --export txt srt); "
        "with no formats given, exports txt, srt and vtt",
    )
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
    parser.add_argument("--log-file", help="Path to log file")

    args = parser.parse_args(argv)

    # ``None`` means the flag was absent; an empty list means a bare
    # ``--export``, which the README documents as "export txt, srt, and vtt".
    if args.export is None:
        export_formats: List[str] = []
    elif not args.export:
        export_formats = ["txt", "srt", "vtt"]
    else:
        export_formats = args.export

    logger = setup_logging(args.verbose, args.log_file)

    def _transcribe_and_export(transcriber: "AudioTranscriber") -> None:
        """Transcribe the transcriber's current file, show and export it."""
        result = transcriber.transcribe(
            language=args.language,
            task=args.task,
            fp16=args.fp16,
            word_timestamps=args.word_timestamps,
            temperature=args.temperature,
            initial_prompt=args.initial_prompt,
            verbose=args.verbose,
        )
        if not args.verbose:
            # In verbose mode the text is already logged by transcribe();
            # otherwise this is the only place the user sees the transcript.
            print(result["text"].strip())
        if export_formats:
            transcriber.export(result, export_formats)

    if args.file:
        # Batch transcription. Quoted "~" paths are not expanded by the shell.
        files = [path.expanduser() for path in args.file]
        missing = [path for path in files if not path.exists()]
        if missing:
            # Fail before the (potentially slow) model load, and report every
            # missing file at once.
            for path in missing:
                logger.error(f"File not found: {path}")
            sys.exit(1)

        transcriber = None
        try:
            for file_path in files:
                if transcriber is None:
                    transcriber = AudioTranscriber(
                        model=args.model,
                        channels=args.channels,
                        rate=args.rate,
                        file=file_path,
                        device=args.device,
                        logger=logger,
                    )
                else:
                    # Reuse the loaded model: re-point the transcriber at the
                    # next file the same way the constructor derives these.
                    transcriber.file_path = file_path
                    transcriber.title = file_path.stem
                    transcriber.directory = file_path.parent
                _transcribe_and_export(transcriber)
        finally:
            if transcriber is not None:
                # No stream is open in file mode; this releases PyAudio.
                transcriber.stop_stream()
    else:
        # Recording mode
        transcriber = AudioTranscriber(
            model=args.model,
            channels=args.channels,
            rate=args.rate,
            file_name=args.name,
            directory=args.directory.expanduser(),
            device=args.device,
            logger=logger,
        )
        try:
            transcriber.initiate_stream()
            transcriber.record(seconds=args.record)
        finally:
            # Always release the audio device, even if recording failed.
            transcriber.stop_stream()
        if not transcriber.frames:
            logger.error("No audio was recorded; nothing to transcribe.")
            sys.exit(1)
        transcriber.save_stream()
        _transcribe_and_export(transcriber)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# coding: utf-8
|
|
3
|
+
|
|
4
|
+
import getopt
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from audio_transcriber import AudioTranscriber, setup_logging
|
|
10
|
+
from fastmcp import FastMCP, Context
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
# Module-level setup for the MCP server: a verbose logger that also writes to
# a log file (path relative to the working directory), and the FastMCP server
# object itself.
_MCP_LOG_FILE = "audio_transcriber_mcp.log"
logger = setup_logging(verbose=True, log_file=_MCP_LOG_FILE)

_MCP_SERVER_NAME = "AudioTranscriberServer"
mcp = FastMCP(name=_MCP_SERVER_NAME)
|
|
17
|
+
|
|
18
|
+
# Environment variables for defaults
# WHISPER_MODEL selects the default model name ("base" when unset).
environment_model = os.environ.get("WHISPER_MODEL", "base")
# TRANSCRIBE_DIRECTORY selects the default working directory (the user's
# Downloads folder when unset). The value comes from configuration rather than
# a shell, so a leading "~" (as in the documented "~/Downloads" example) is
# expanded here instead of being treated as a literal directory name.
environment_directory = os.path.expanduser(
    os.environ.get("TRANSCRIBE_DIRECTORY", str(Path.home() / "Downloads"))
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# MCP tool: transcribe an existing audio file, or record from the microphone
# for a fixed number of seconds and transcribe the recording.
#
# Returns the transcribed text (result["text"]). Any failure is logged and
# re-raised as RuntimeError with the original exception chained as its cause.
# The docstring below is left untouched on the assumption that FastMCP
# publishes it to clients as the tool description -- TODO confirm.
@mcp.tool(
    annotations={
        "title": "Transcribe Audio",
        "readOnlyHint": False,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
    tags={"audio_processing"},
)
async def transcribe_audio(
    audio_file: Optional[str] = Field(
        description="Path to the audio file to transcribe. If provided, transcription is performed on this file.",
        default=None,
    ),
    record_seconds: int = Field(
        description="Number of seconds to record audio from microphone. Must be positive if no audio_file is provided. 0 or negative not supported for recording in this context.",
        default=0,
    ),
    directory: Optional[str] = Field(
        description="Directory for saving recordings or exports.",
        default=environment_directory,
    ),
    model: str = Field(
        description="Whisper model to use (e.g., 'base', 'small', 'turbo').",
        default=environment_model,
    ),
    language: Optional[str] = Field(
        description="Language code for transcription (e.g., 'en', 'fr'). Auto-detected if not specified.",
        default=None,
    ),
    task: str = Field(
        description="Task to perform: 'transcribe' or 'translate' (to English).",
        default="transcribe",
    ),
    fp16: bool = Field(description="Use FP16 for faster inference.", default=True),
    word_timestamps: bool = Field(
        description="Include word-level timestamps in the output.", default=False
    ),
    temperature: float = Field(
        description="Temperature for sampling diversity (0.0 for deterministic).",
        default=0.0,
    ),
    initial_prompt: Optional[str] = Field(
        description="Initial text prompt to guide the transcription.", default=None
    ),
    # Optional because the default is None (no export requested).
    export_formats: Optional[List[str]] = Field(
        description="Formats to export the transcription (e.g., ['txt', 'srt']).",
        default=None,
    ),
    ctx: Context = Field(
        description="MCP context for progress reporting.", default=None
    ),
) -> str:
    """Transcribes audio from a provided file or by recording from the microphone."""
    logger.info(
        f"Starting transcription: audio_file={audio_file}, record_seconds={record_seconds}, "
        f"directory={directory}, model={model}, language={language}, task={task}"
    )

    try:
        if not audio_file and record_seconds <= 0:
            raise ValueError(
                "Either audio_file must be provided or record_seconds must be positive."
            )

        # Validate the input file before building the transcriber so a bad
        # path fails fast. "~" is expanded so home-relative paths work.
        file_path = None
        if audio_file:
            file_path = Path(audio_file).expanduser()
            if not file_path.exists():
                raise ValueError(f"Audio file not found: {audio_file}")

        # A client may explicitly send null for the optional directory; fall
        # back to the configured default instead of crashing in Path(None).
        output_directory = Path(directory or environment_directory).expanduser()

        # Create transcriber instance
        transcriber = AudioTranscriber(
            model=model,
            directory=output_directory,
            file=str(file_path) if file_path else None,
            logger=logger,
        )

        # Report initial progress
        if ctx:
            await ctx.report_progress(progress=0, total=100)
            logger.debug("Reported initial progress: 0/100")

        if file_path is None:
            # Recording mode (only fixed duration supported)
            logger.info(f"Starting recording for {record_seconds} seconds.")
            transcriber.initiate_stream()
            try:
                # Coarse progress for recording (sync call, so limited granularity)
                transcriber.record(seconds=record_seconds)
            finally:
                # Always stop the stream, even when recording fails, so the
                # audio device is not left open in this long-lived process.
                transcriber.stop_stream()
            transcriber.save_stream()

            if ctx:
                await ctx.report_progress(
                    progress=40, total=100
                )  # Arbitrary midpoint after recording
                logger.debug("Reported progress after recording: 40/100")

        # Perform transcription
        logger.info("Starting Whisper transcription.")
        result = transcriber.transcribe(
            language=language,
            task=task,
            fp16=fp16,
            word_timestamps=word_timestamps,
            temperature=temperature,
            initial_prompt=initial_prompt,
            verbose=True,  # Enable verbose for logging details
        )

        if ctx:
            await ctx.report_progress(progress=90, total=100)
            logger.debug("Reported progress after transcription: 90/100")

        # Export if requested
        if export_formats:
            transcriber.export(result, formats=export_formats)
            logger.info(f"Exported transcription to formats: {export_formats}")

        # Report completion
        if ctx:
            await ctx.report_progress(progress=100, total=100)
            logger.debug("Reported final progress: 100/100")

        logger.info("Transcription completed successfully.")
        return result["text"]
    except Exception as e:
        logger.error(f"Failed to transcribe audio: {str(e)}")
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Failed to transcribe audio: {str(e)}") from e
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def audio_transcriber_mcp(argv):
    """Parse command-line options and start the MCP server.

    Options:
        -h, --help         Print usage and exit.
        -t, --transport    "stdio" (default) or "http".
        -H, --host         Host passed to the http transport (default "0.0.0.0").
        -p, --port         Port passed to the http transport, 0-65535 (default 8000).

    Args:
        argv: Argument list without the program name (e.g. ``sys.argv[1:]``).

    Exits with status 2 on unrecognised options, and with status 1 on an
    invalid port or an unsupported transport.
    """
    usage = (
        "Usage: audio-transcriber-mcp [-h] [-t stdio|http] [-H HOST] [-p PORT]"
    )
    transport = "stdio"
    host = "0.0.0.0"
    port = 8000
    try:
        # "-h" is reserved for --help, so the short form of --host is "-H".
        # Declaring "h" twice in the spec made getopt always resolve "-h" to
        # the first (argument-less) entry, leaving --host without a short flag.
        opts, _ = getopt.getopt(
            argv,
            "ht:H:p:",
            ["help", "transport=", "host=", "port="],
        )
    except getopt.GetoptError as err:
        # Tell the user what was wrong instead of exiting silently.
        print(f"Error: {err}", file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-t", "--transport"):
            transport = arg
        elif opt in ("-H", "--host"):
            host = arg
        elif opt in ("-p", "--port"):
            try:
                port = int(arg)
            except ValueError:
                print(f"Error: Port {arg} is not a valid integer.", file=sys.stderr)
                sys.exit(1)
            if not (0 <= port <= 65535):
                print(
                    f"Error: Port {arg} is out of valid range (0-65535).",
                    file=sys.stderr,
                )
                sys.exit(1)
    if transport == "stdio":
        mcp.run(transport="stdio")
    elif transport == "http":
        mcp.run(transport="http", host=host, port=port)
    else:
        logger.error("Transport not supported")
        sys.exit(1)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def main():
    """Console-script entry point: hand the command-line arguments to the server launcher."""
    cli_arguments = sys.argv[1:]  # drop the program name
    audio_transcriber_mcp(cli_arguments)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
if __name__ == "__main__":
|
|
200
|
+
audio_transcriber_mcp(sys.argv[1:])
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: audio-transcriber
|
|
3
|
+
Version: 0.5.40
|
|
4
|
+
Summary: Transcribe your .wav .mp4 .mp3 .flac files to text or record your own audio!
|
|
5
|
+
Author-email: Audel Rouhi <knucklessg1@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: torch>=1.13.1
|
|
16
|
+
Requires-Dist: transformers>=4.25.1
|
|
17
|
+
Requires-Dist: pyaudio>=0.2.13
|
|
18
|
+
Requires-Dist: openai-whisper>=20250625
|
|
19
|
+
Requires-Dist: setuptools-rust>=1.12.0
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# Audio-Transcriber
|
|
23
|
+
|
|
24
|
+

|
|
25
|
+

|
|
26
|
+

|
|
27
|
+

|
|
28
|
+

|
|
29
|
+

|
|
30
|
+

|
|
31
|
+
|
|
32
|
+

|
|
33
|
+

|
|
34
|
+

|
|
35
|
+

|
|
36
|
+
|
|
37
|
+

|
|
38
|
+

|
|
39
|
+

|
|
40
|
+

|
|
41
|
+

|
|
42
|
+

|
|
43
|
+
|
|
44
|
+
*Version: 0.5.40*
|
|
45
|
+
|
|
46
|
+
Transcribe your .wav .mp4 .mp3 .flac files to text or record your own audio!
|
|
47
|
+
|
|
48
|
+
This repository is actively maintained - Contributions are welcome!
|
|
49
|
+
|
|
50
|
+
Contribution Opportunities:
|
|
51
|
+
- Support new models
|
|
52
|
+
|
|
53
|
+
Wrapped around [OpenAI Whisper](https://pypi.org/project/openai-whisper)
|
|
54
|
+
|
|
55
|
+
<details>
|
|
56
|
+
<summary><b>Usage:</b></summary>
|
|
57
|
+
|
|
58
|
+
| Short Flag | Long Flag | Description |
|
|
59
|
+
|------------|-------------|---------------------------------------------------------------|
|
|
60
|
+
| -h | --help | See Usage |
|
|
61
|
+
| -b | --bitrate | Bitrate to use during recording |
|
|
62
|
+
| -c | --channels | Number of channels to use during recording |
|
|
63
|
+
| -d | --directory | Directory to save recording |
|
|
64
|
+
| -e | --export | Export txt, srt, and vtt files |
|
|
65
|
+
| -f | --file | File to transcribe |
|
|
66
|
+
| -l | --language | Language to transcribe |
|
|
67
|
+
| -m | --model | Model to use: <tiny, base, small, medium, large> |
|
|
68
|
+
| -n | --name | Name of recording |
|
|
69
|
+
| -r | --record | Specify number of seconds to record from microphone |
|
|
70
|
+
|
|
71
|
+
</details>
|
|
72
|
+
|
|
73
|
+
<details>
|
|
74
|
+
<summary><b>Example:</b></summary>
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
audio-transcriber --file '~/Downloads/Federal_Reserve.mp4' --model 'large'
|
|
78
|
+
audio-transcriber --record 60 --directory '~/Downloads/' --name 'my_recording.wav' --model 'tiny'
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
</details>
|
|
83
|
+
|
|
84
|
+
<details>
|
|
85
|
+
<summary><b>Model Information:</b></summary>
|
|
86
|
+
|
|
87
|
+
[Courtesy of and Credits to OpenAI: Whisper.ai](https://github.com/openai/whisper/blob/main/README.md)
|
|
88
|
+
|
|
89
|
+
| Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
|
|
90
|
+
|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
|
|
91
|
+
| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
|
|
92
|
+
| base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
|
|
93
|
+
| small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
|
|
94
|
+
| medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
|
|
95
|
+
| large | 1550 M | N/A | `large` | ~10 GB | 1x |
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
</details>
|
|
99
|
+
|
|
100
|
+
<details>
|
|
101
|
+
<summary><b>Installation Instructions:</b></summary>
|
|
102
|
+
|
|
103
|
+
## Use with AI
|
|
104
|
+
|
|
105
|
+
Configure `mcp.json`
|
|
106
|
+
|
|
107
|
+
Recommended: Store secrets in environment variables with lookup in JSON file.
|
|
108
|
+
|
|
109
|
+
For Testing Only: Plain text storage will also work, although **not** recommended.
|
|
110
|
+
|
|
111
|
+
```json
|
|
112
|
+
{
|
|
113
|
+
"mcpServers": {
|
|
114
|
+
"audio_transcriber": {
|
|
115
|
+
"command": "uv",
|
|
116
|
+
"args": [
|
|
117
|
+
"run",
|
|
118
|
+
"--with",
|
|
119
|
+
"audio-transcriber",
|
|
120
|
+
"audio-transcriber-mcp"
|
|
121
|
+
],
|
|
122
|
+
"env": {
|
|
123
|
+
"WHISPER_MODEL": "medium", // Optional
|
|
124
|
+
"TRANSCRIBE_DIRECTORY": "~/Downloads" // Optional
|
|
125
|
+
},
|
|
126
|
+
"timeout": 200000
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Deploy MCP Server as a container
|
|
133
|
+
```bash
|
|
134
|
+
docker pull knucklessg1/audio-transcriber:latest
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Modify the `compose.yml`
|
|
138
|
+
|
|
139
|
+
```yaml
|
|
140
|
+
services:
|
|
141
|
+
audio-transcriber:
|
|
142
|
+
image: knucklessg1/audio-transcriber:latest
|
|
143
|
+
environment:
|
|
144
|
+
- HOST=0.0.0.0
|
|
145
|
+
- PORT=8021
|
|
146
|
+
ports:
|
|
147
|
+
- 8021:8021
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Install Python Package
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
python -m pip install audio-transcriber
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
or
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
uv pip install --upgrade audio-transcriber
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
##### Ubuntu Dependencies
|
|
163
|
+
```bash
|
|
164
|
+
apt install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
</details>
|
|
168
|
+
|
|
169
|
+
<details>
|
|
170
|
+
<summary><b>Repository Owners:</b></summary>
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
<img width="100%" height="180em" src="https://github-readme-stats.vercel.app/api?username=Knucklessg1&show_icons=true&hide_border=true&&count_private=true&include_all_commits=true" />
|
|
174
|
+
|
|
175
|
+

|
|
176
|
+

|
|
177
|
+
</details>
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
requirements.txt
|
|
6
|
+
audio_transcriber/__init__.py
|
|
7
|
+
audio_transcriber/audio_transcriber.py
|
|
8
|
+
audio_transcriber/audio_transcriber_mcp.py
|
|
9
|
+
audio_transcriber.egg-info/PKG-INFO
|
|
10
|
+
audio_transcriber.egg-info/SOURCES.txt
|
|
11
|
+
audio_transcriber.egg-info/dependency_links.txt
|
|
12
|
+
audio_transcriber.egg-info/entry_points.txt
|
|
13
|
+
audio_transcriber.egg-info/requires.txt
|
|
14
|
+
audio_transcriber.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=80.9.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "audio-transcriber"
|
|
7
|
+
version = "0.5.40"
|
|
8
|
+
description = "Transcribe your .wav .mp4 .mp3 .flac files to text or record your own audio!"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{ name = "Audel Rouhi", email = "knucklessg1@gmail.com" }]
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "License :: OSI Approved :: MIT License",
    "Environment :: Console",
    "Operating System :: POSIX :: Linux",
    "Programming Language :: Python :: 3",
]
|
|
19
|
+
requires-python = ">=3.8"
|
|
20
|
+
dependencies = [
|
|
21
|
+
"torch>=1.13.1",
|
|
22
|
+
"transformers>=4.25.1",
|
|
23
|
+
"pyaudio>=0.2.13",
|
|
24
|
+
"openai-whisper>=20250625",
|
|
25
|
+
"setuptools-rust>=1.12.0"
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
audio-transcriber = "audio_transcriber.audio_transcriber:main"
|
|
30
|
+
audio-transcriber-mcp = "audio_transcriber.audio_transcriber_mcp:main"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["."]
|
|
34
|
+
|
|
35
|
+
[tool.setuptools]
|
|
36
|
+
include-package-data = true
|
|
37
|
+
package-data = { "audio_transcriber" = ["audio_transcriber"] }
|