mlx-whisperx 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_whisperx-0.1.0/LICENSE +21 -0
- mlx_whisperx-0.1.0/PKG-INFO +497 -0
- mlx_whisperx-0.1.0/README.md +468 -0
- mlx_whisperx-0.1.0/mlx_whisperx/__init__.py +22 -0
- mlx_whisperx-0.1.0/mlx_whisperx/__main__.py +6 -0
- mlx_whisperx-0.1.0/mlx_whisperx/_compat.py +16 -0
- mlx_whisperx-0.1.0/mlx_whisperx/_language.py +98 -0
- mlx_whisperx-0.1.0/mlx_whisperx/alignment.py +558 -0
- mlx_whisperx-0.1.0/mlx_whisperx/audio.py +62 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/__init__.py +5 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/__init__.py +29 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/_version.py +5 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/assets/gpt2.tiktoken +50256 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/assets/mel_filters.npz +0 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/assets/multilingual.tiktoken +50257 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/audio.py +194 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/cli.py +265 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/decoding.py +953 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/languages.py +120 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/load_models.py +68 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/timing.py +349 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/tokenizer.py +303 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/torch_whisper.py +339 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/transcribe.py +594 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/version.py +5 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/whisper.py +306 -0
- mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/writers.py +293 -0
- mlx_whisperx-0.1.0/mlx_whisperx/cli.py +163 -0
- mlx_whisperx-0.1.0/mlx_whisperx/diarize.py +179 -0
- mlx_whisperx-0.1.0/mlx_whisperx/log_utils.py +29 -0
- mlx_whisperx-0.1.0/mlx_whisperx/pipeline.py +447 -0
- mlx_whisperx-0.1.0/mlx_whisperx/schema.py +55 -0
- mlx_whisperx-0.1.0/mlx_whisperx/transcribe.py +67 -0
- mlx_whisperx-0.1.0/mlx_whisperx/vads/__init__.py +17 -0
- mlx_whisperx-0.1.0/mlx_whisperx/vads/_pyannote_impl.py +135 -0
- mlx_whisperx-0.1.0/mlx_whisperx/vads/pyannote.py +3 -0
- mlx_whisperx-0.1.0/mlx_whisperx/vads/silero.py +109 -0
- mlx_whisperx-0.1.0/mlx_whisperx/vads/vad.py +57 -0
- mlx_whisperx-0.1.0/mlx_whisperx/writers.py +292 -0
- mlx_whisperx-0.1.0/mlx_whisperx.egg-info/PKG-INFO +497 -0
- mlx_whisperx-0.1.0/mlx_whisperx.egg-info/SOURCES.txt +52 -0
- mlx_whisperx-0.1.0/mlx_whisperx.egg-info/dependency_links.txt +1 -0
- mlx_whisperx-0.1.0/mlx_whisperx.egg-info/entry_points.txt +2 -0
- mlx_whisperx-0.1.0/mlx_whisperx.egg-info/requires.txt +22 -0
- mlx_whisperx-0.1.0/mlx_whisperx.egg-info/top_level.txt +1 -0
- mlx_whisperx-0.1.0/pyproject.toml +47 -0
- mlx_whisperx-0.1.0/setup.cfg +4 -0
- mlx_whisperx-0.1.0/tests/test_cli.py +101 -0
- mlx_whisperx-0.1.0/tests/test_language.py +73 -0
- mlx_whisperx-0.1.0/tests/test_load_models.py +95 -0
- mlx_whisperx-0.1.0/tests/test_optional_imports.py +32 -0
- mlx_whisperx-0.1.0/tests/test_pipeline.py +97 -0
- mlx_whisperx-0.1.0/tests/test_pyproject.py +17 -0
- mlx_whisperx-0.1.0/tests/test_schema.py +8 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 mlx-whisperx contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mlx-whisperx
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: WhisperX-style transcription pipeline using an internal mlx-whisper ASR backend.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: mlx>=0.11
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: numba
|
|
12
|
+
Requires-Dist: more-itertools
|
|
13
|
+
Requires-Dist: tiktoken
|
|
14
|
+
Requires-Dist: huggingface-hub
|
|
15
|
+
Requires-Dist: scipy
|
|
16
|
+
Requires-Dist: pandas
|
|
17
|
+
Requires-Dist: torch
|
|
18
|
+
Requires-Dist: torchaudio
|
|
19
|
+
Requires-Dist: transformers
|
|
20
|
+
Requires-Dist: nltk
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Provides-Extra: diarize
|
|
23
|
+
Requires-Dist: pyannote-audio; extra == "diarize"
|
|
24
|
+
Provides-Extra: full
|
|
25
|
+
Requires-Dist: pyannote-audio; extra == "full"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# mlx-whisperx
|
|
31
|
+
|
|
32
|
+
`mlx-whisperx` is a WhisperX-style transcription pipeline for Apple Silicon. It uses a vendored `mlx-whisper` ASR backend, then optionally applies WhisperX forced alignment and pyannote diarization.
|
|
33
|
+
|
|
34
|
+
The project is intended to provide a practical local pipeline with WhisperX-like JSON, subtitle, and text outputs while keeping ASR execution on MLX.
|
|
35
|
+
|
|
36
|
+
## Why This Project Exists
|
|
37
|
+
|
|
38
|
+
This project adds WhisperX-like functionality to an `mlx-whisper` workflow. The goal is to keep ASR inference on MLX for Apple Silicon while providing the pipeline pieces people commonly use from WhisperX: VAD chunking, forced alignment, word timestamps, diarization hooks, and familiar JSON/subtitle outputs.
|
|
39
|
+
|
|
40
|
+
The implementation borrows ideas and code from both upstream projects:
|
|
41
|
+
|
|
42
|
+
- WhisperX, for the pipeline structure, alignment workflow, diarization integration, and output conventions.
|
|
43
|
+
- `mlx-whisper`, for the Apple Silicon ASR backend and model execution path.
|
|
44
|
+
|
|
45
|
+
This repository vendors and adapts code where needed so the pieces work together as a standalone `mlx-whisperx` package.
|
|
46
|
+
|
|
47
|
+
## Pipeline
|
|
48
|
+
|
|
49
|
+
```text
|
|
50
|
+
audio -> VAD -> mlx-whisper ASR -> forced alignment -> optional diarization -> writers
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Default behavior:
|
|
54
|
+
|
|
55
|
+
- ASR model: `mlx-community/whisper-turbo`
|
|
56
|
+
- VAD backend: Silero
|
|
57
|
+
- Decoding: beam search with `beam_size=5` and `temperature=0`
|
|
58
|
+
- Alignment: enabled for transcription
|
|
59
|
+
- Diarization: disabled unless `--diarize` is passed
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
Clone the repository and install it into a Python environment:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
git clone https://github.com/seedds/mlx-whisperx.git
|
|
67
|
+
cd mlx-whisperx
|
|
68
|
+
python -m pip install -e .
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Install diarization support only when you need pyannote:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
python -m pip install -e ".[diarize]"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Install everything this repository currently exposes:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
python -m pip install -e ".[full]"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
`ffmpeg` must be available on `PATH` because audio loading is handled through the ffmpeg CLI.
|
|
84
|
+
|
|
85
|
+
On macOS with Homebrew:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
brew install ffmpeg
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Optional pyannote VAD and diarization use pyannote models and may require a Hugging Face token, depending on the selected model.
|
|
92
|
+
|
|
93
|
+
## Usage
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
mlx-whisperx AUDIO [AUDIO ...] [OPTIONS]
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
By default, `mlx-whisperx`:
|
|
100
|
+
|
|
101
|
+
- uses `mlx-community/whisper-turbo`
|
|
102
|
+
- runs Silero VAD
|
|
103
|
+
- performs forced alignment for word timestamps
|
|
104
|
+
- writes outputs to the current directory
|
|
105
|
+
- writes every supported output format when `--output_format` is not specified
|
|
106
|
+
|
|
107
|
+
Common options:
|
|
108
|
+
|
|
109
|
+
- `--model`: local model path or Hugging Face repo
|
|
110
|
+
- `--language`: language code such as `en`, `ja`, or `fr`
|
|
111
|
+
- `--task`: `transcribe` or `translate`
|
|
112
|
+
- `--output_dir`: directory to write output files
|
|
113
|
+
- `--output_name`: custom basename for output files
|
|
114
|
+
- `--output_format`: `all`, `json`, `srt`, `vtt`, `txt`, `tsv`, or `aud`
|
|
115
|
+
- `--no_align`: skip forced alignment
|
|
116
|
+
- `--diarize`: run speaker diarization and attach speaker labels
|
|
117
|
+
- `--hf_token`: Hugging Face token for gated pyannote models
|
|
118
|
+
|
|
119
|
+
Example: output JSON
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
mlx-whisperx audio.wav \
|
|
123
|
+
--output_dir transcripts \
|
|
124
|
+
--output_name audio \
|
|
125
|
+
--output_format json
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
This writes `transcripts/audio.json`.
|
|
129
|
+
|
|
130
|
+
Example JSON output:
|
|
131
|
+
|
|
132
|
+
```json
|
|
133
|
+
{
|
|
134
|
+
"segments": [
|
|
135
|
+
{
|
|
136
|
+
"start": 0.0,
|
|
137
|
+
"end": 2.52,
|
|
138
|
+
"text": "Hello and welcome to mlx-whisperx.",
|
|
139
|
+
"words": [
|
|
140
|
+
{"word": "Hello", "start": 0.0, "end": 0.42, "score": 0.99},
|
|
141
|
+
{"word": "and", "start": 0.44, "end": 0.58, "score": 0.98},
|
|
142
|
+
{"word": "welcome", "start": 0.6, "end": 1.05, "score": 0.97},
|
|
143
|
+
{"word": "to", "start": 1.07, "end": 1.18, "score": 0.98},
|
|
144
|
+
{"word": "mlx-whisperx.", "start": 1.2, "end": 2.52, "score": 0.96}
|
|
145
|
+
]
|
|
146
|
+
}
|
|
147
|
+
],
|
|
148
|
+
"word_segments": [
|
|
149
|
+
{"word": "Hello", "start": 0.0, "end": 0.42, "score": 0.99},
|
|
150
|
+
{"word": "and", "start": 0.44, "end": 0.58, "score": 0.98}
|
|
151
|
+
],
|
|
152
|
+
"language": "en"
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Example: output SRT
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
mlx-whisperx audio.wav \
|
|
160
|
+
--output_dir subtitles \
|
|
161
|
+
--output_name audio \
|
|
162
|
+
--output_format srt \
|
|
163
|
+
--max_line_width 42 \
|
|
164
|
+
--max_line_count 2
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
This writes `subtitles/audio.srt`.
|
|
168
|
+
|
|
169
|
+
Example SRT output:
|
|
170
|
+
|
|
171
|
+
```srt
|
|
172
|
+
1
|
|
173
|
+
00:00:00,000 --> 00:00:02,520
|
|
174
|
+
Hello and welcome to
|
|
175
|
+
mlx-whisperx.
|
|
176
|
+
|
|
177
|
+
2
|
|
178
|
+
00:00:02,700 --> 00:00:05,100
|
|
179
|
+
This example shows subtitle
|
|
180
|
+
output.
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Complete example with optional parameters:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
mlx-whisperx audio.wav \
|
|
187
|
+
--model mlx-community/whisper-large-v3-turbo \
|
|
188
|
+
--model_dir ./models \
|
|
189
|
+
--model_cache_only False \
|
|
190
|
+
--device cpu \
|
|
191
|
+
--compute_type float32 \
|
|
192
|
+
--output_dir ./out \
|
|
193
|
+
--output_name meeting \
|
|
194
|
+
--output_format all \
|
|
195
|
+
--verbose True \
|
|
196
|
+
--log-level info \
|
|
197
|
+
--task transcribe \
|
|
198
|
+
--language en \
|
|
199
|
+
--align_model jonatasgrosman/wav2vec2-large-xlsr-53-english \
|
|
200
|
+
--interpolate_method nearest \
|
|
201
|
+
--return_char_alignments \
|
|
202
|
+
--vad_method pyannote \
|
|
203
|
+
--vad_onset 0.5 \
|
|
204
|
+
--vad_offset 0.363 \
|
|
205
|
+
--vad_model pyannote/segmentation-3.0 \
|
|
206
|
+
--chunk_size 30 \
|
|
207
|
+
--vad_dump_path ./out/meeting.vad.json \
|
|
208
|
+
--diarize \
|
|
209
|
+
--min_speakers 2 \
|
|
210
|
+
--max_speakers 4 \
|
|
211
|
+
--diarize_model pyannote/speaker-diarization-community-1 \
|
|
212
|
+
--speaker_embeddings \
|
|
213
|
+
--hf_token YOUR_HF_TOKEN \
|
|
214
|
+
--temperature 0.0 \
|
|
215
|
+
--temperature_increment_on_fallback 0.2 \
|
|
216
|
+
--best_of 5 \
|
|
217
|
+
--beam_size 5 \
|
|
218
|
+
--patience 1.0 \
|
|
219
|
+
--length_penalty 1.0 \
|
|
220
|
+
--suppress_tokens -1 \
|
|
221
|
+
--suppress_numerals \
|
|
222
|
+
--initial_prompt "Technical meeting about MLX WhisperX." \
|
|
223
|
+
--hotwords "MLX, WhisperX, pyannote, diarization" \
|
|
224
|
+
--condition_on_previous_text True \
|
|
225
|
+
--compression_ratio_threshold 2.4 \
|
|
226
|
+
--logprob_threshold -1.0 \
|
|
227
|
+
--no_speech_threshold 0.6 \
|
|
228
|
+
--max_line_width 42 \
|
|
229
|
+
--max_line_count 2 \
|
|
230
|
+
--max_words_per_line 8 \
|
|
231
|
+
--highlight_words False \
|
|
232
|
+
--print_progress True
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Notes:
|
|
236
|
+
|
|
237
|
+
- `--output_format all` writes `.txt`, `.vtt`, `.srt`, `.tsv`, `.json`, and `.aud`.
|
|
238
|
+
- `--max_line_count` only has an effect when `--max_line_width` is also set.
|
|
239
|
+
- `--highlight_words` applies to `srt` and `vtt`.
|
|
240
|
+
- `--hf_token` is only needed for gated pyannote models.
|
|
241
|
+
|
|
242
|
+
## Python API
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
from mlx_whisperx import transcribe
|
|
246
|
+
|
|
247
|
+
result = transcribe(
|
|
248
|
+
"audio.wav",
|
|
249
|
+
model="mlx-community/whisper-large-v3-turbo",
|
|
250
|
+
language="en",
|
|
251
|
+
)
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Print one transcript segment per line:
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
for segment in result["segments"]:
|
|
258
|
+
print(segment["text"].strip())
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
Print segment timestamps:
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
for segment in result["segments"]:
|
|
265
|
+
print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text'].strip()}")
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
Print word-level timestamps:
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
for word in result["word_segments"]:
|
|
272
|
+
print(f"[{word['start']:.2f} -> {word['end']:.2f}] {word['word']}")
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Common API options match the CLI names:
|
|
276
|
+
|
|
277
|
+
```python
|
|
278
|
+
result = transcribe(
|
|
279
|
+
"audio.wav",
|
|
280
|
+
model="mlx-community/whisper-turbo",
|
|
281
|
+
language="en",
|
|
282
|
+
beam_size=5,
|
|
283
|
+
temperature=0.0,
|
|
284
|
+
no_align=False,
|
|
285
|
+
diarize=False,
|
|
286
|
+
vad_method="silero",
|
|
287
|
+
)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
`language` accepts either canonical codes such as `en` or case-insensitive names and aliases such as `English` or `Portuguese`.
|
|
291
|
+
|
|
292
|
+
## Output Schema
|
|
293
|
+
|
|
294
|
+
JSON output follows the WhisperX-style shape:
|
|
295
|
+
|
|
296
|
+
```json
|
|
297
|
+
{
|
|
298
|
+
"segments": [
|
|
299
|
+
{
|
|
300
|
+
"start": 0.0,
|
|
301
|
+
"end": 2.5,
|
|
302
|
+
"text": "Example transcript text.",
|
|
303
|
+
"words": [
|
|
304
|
+
{"word": "Example", "start": 0.0, "end": 0.6, "score": 0.98}
|
|
305
|
+
]
|
|
306
|
+
}
|
|
307
|
+
],
|
|
308
|
+
"word_segments": [
|
|
309
|
+
{"word": "Example", "start": 0.0, "end": 0.6, "score": 0.98}
|
|
310
|
+
],
|
|
311
|
+
"language": "en"
|
|
312
|
+
}
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
When diarization is enabled, speaker labels are included where available:
|
|
316
|
+
|
|
317
|
+
```json
|
|
318
|
+
{"word": "Hello", "start": 0.0, "end": 0.4, "score": 0.99, "speaker": "SPEAKER_00"}
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
## CLI Reference
|
|
322
|
+
|
|
323
|
+
Basic options:
|
|
324
|
+
|
|
325
|
+
- `--model`: `mlx-whisper` model directory or Hugging Face repo.
|
|
326
|
+
- `--language`: language code or case-insensitive name/alias. If omitted, language is auto-detected by ASR.
|
|
327
|
+
- `--task`: `transcribe` or `translate`.
|
|
328
|
+
- `--output_format`: `all`, `srt`, `vtt`, `txt`, `tsv`, `json`, or `aud`.
|
|
329
|
+
- `--output_dir`: directory for output files.
|
|
330
|
+
- `--output_name`: custom output basename.
|
|
331
|
+
- `--verbose`: print transcript and logs.
|
|
332
|
+
|
|
333
|
+
English-only Whisper models such as `.en` checkpoints force `language=en` and do not support `--task translate`.
|
|
334
|
+
|
|
335
|
+
Decoding options:
|
|
336
|
+
|
|
337
|
+
- `--temperature`: sampling temperature. Default is `0.0`.
|
|
338
|
+
- `--beam_size`: beam size when `temperature=0`. Default is `5`.
|
|
339
|
+
- `--best_of`: number of candidates when sampling with `temperature > 0`.
|
|
340
|
+
- `--patience`: beam-search patience.
|
|
341
|
+
- `--length_penalty`: beam-search length penalty.
|
|
342
|
+
- `--suppress_tokens`: comma-separated token IDs to suppress.
|
|
343
|
+
- `--suppress_numerals`: suppress numeric and currency-symbol tokens.
|
|
344
|
+
- `--initial_prompt`: initial prompt for ASR.
|
|
345
|
+
- `--hotwords`: hint phrases appended to the prompt.
|
|
346
|
+
- `--condition_on_previous_text`: prompt each backend decoding window with the previously decoded text from the same VAD chunk.
|
|
347
|
+
|
|
348
|
+
Precision and model-cache options:
|
|
349
|
+
|
|
350
|
+
- `--compute_type float16`: force MLX ASR fp16. This is the default.
|
|
351
|
+
- `--compute_type float32`: force MLX ASR fp32.
|
|
352
|
+
- `--model_dir`: cache directory for ASR, alignment, pyannote VAD, and diarization models.
|
|
353
|
+
- `--model_cache_only`: use only cached ASR, alignment, pyannote VAD, and diarization models.
|
|
354
|
+
|
|
355
|
+
VAD options:
|
|
356
|
+
|
|
357
|
+
- `--vad_method silero`: default VAD backend.
|
|
358
|
+
- `--vad_method pyannote`: use pyannote VAD if your environment supports it.
|
|
359
|
+
- `--vad_onset`: VAD onset threshold.
|
|
360
|
+
- `--vad_offset`: VAD offset threshold.
|
|
361
|
+
- `--vad_model`: Hugging Face pyannote segmentation model used with `--vad_method pyannote`. Defaults to `pyannote/segmentation-3.0`.
|
|
362
|
+
- `--chunk_size`: merged VAD chunk size in seconds.
|
|
363
|
+
- `--no_vad`: transcribe the full file as one chunk.
|
|
364
|
+
- `--clip_timestamps`: comma-separated clip start/end pairs in seconds. Requires `--no_vad`.
|
|
365
|
+
- `--vad_dump_path`: write VAD chunks and settings to JSON.
|
|
366
|
+
|
|
367
|
+
Silero VAD loads from the local Torch Hub cache first. To force a local Silero checkout, set:
|
|
368
|
+
|
|
369
|
+
```bash
|
|
370
|
+
export MLX_WHISPERX_SILERO_VAD_PATH=/path/to/snakers4_silero-vad
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
Alignment options:
|
|
374
|
+
|
|
375
|
+
- `--no_align`: skip forced alignment.
|
|
376
|
+
- `--align_model`: override the alignment model.
|
|
377
|
+
- `--interpolate_method`: `nearest`, `linear`, or `ignore`.
|
|
378
|
+
- `--return_char_alignments`: include character alignments in JSON.
|
|
379
|
+
|
|
380
|
+
Diarization options:
|
|
381
|
+
|
|
382
|
+
- `--diarize`: assign speaker labels.
|
|
383
|
+
- `--diarize_model`: pyannote diarization model name.
|
|
384
|
+
- `--min_speakers`: minimum speaker count.
|
|
385
|
+
- `--max_speakers`: maximum speaker count.
|
|
386
|
+
- `--speaker_embeddings`: include speaker embeddings in JSON when available.
|
|
387
|
+
- `--hf_token`: Hugging Face token for gated pyannote models.
|
|
388
|
+
|
|
389
|
+
Subtitle options:
|
|
390
|
+
|
|
391
|
+
- `--max_line_width`: target subtitle line width.
|
|
392
|
+
- `--max_line_count`: maximum lines per subtitle cue. Requires `--max_line_width`.
|
|
393
|
+
- `--max_words_per_line`: maximum words per subtitle cue.
|
|
394
|
+
- `--highlight_words`: underline the active word in SRT/VTT output.
|
|
395
|
+
|
|
396
|
+
## Examples
|
|
397
|
+
|
|
398
|
+
Inspect VAD chunks before ASR:
|
|
399
|
+
|
|
400
|
+
```bash
|
|
401
|
+
mlx-whisperx audio.wav \
|
|
402
|
+
--output_format json \
|
|
403
|
+
--vad_dump_path audio.vad.json
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
Transcribe only selected clips without VAD chunking:
|
|
407
|
+
|
|
408
|
+
```bash
|
|
409
|
+
mlx-whisperx audio.wav \
|
|
410
|
+
--no_vad \
|
|
411
|
+
--clip_timestamps 0,15,30,45 \
|
|
412
|
+
--output_format json
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
Run deterministic beam search explicitly:
|
|
416
|
+
|
|
417
|
+
```bash
|
|
418
|
+
mlx-whisperx audio.wav \
|
|
419
|
+
--language en \
|
|
420
|
+
--temperature 0 \
|
|
421
|
+
--beam_size 5 \
|
|
422
|
+
--output_format json
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
Use temperature fallback:
|
|
426
|
+
|
|
427
|
+
```bash
|
|
428
|
+
mlx-whisperx audio.wav \
|
|
429
|
+
--temperature 0 \
|
|
430
|
+
--temperature_increment_on_fallback 0.2
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
Suppress numerals and currency symbols during decoding:
|
|
434
|
+
|
|
435
|
+
```bash
|
|
436
|
+
mlx-whisperx audio.wav --suppress_numerals --output_format json
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
Use pyannote VAD instead of the default Silero VAD:
|
|
440
|
+
|
|
441
|
+
```bash
|
|
442
|
+
mlx-whisperx audio.wav \
|
|
443
|
+
--vad_method pyannote \
|
|
444
|
+
--vad_model pyannote/segmentation-3.0 \
|
|
445
|
+
--hf_token YOUR_HF_TOKEN \
|
|
446
|
+
--output_format json
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
Skip forced alignment:
|
|
450
|
+
|
|
451
|
+
```bash
|
|
452
|
+
mlx-whisperx audio.wav --no_align --output_format json
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
Run diarization:
|
|
456
|
+
|
|
457
|
+
```bash
|
|
458
|
+
mlx-whisperx audio.wav \
|
|
459
|
+
--diarize \
|
|
460
|
+
--hf_token YOUR_HF_TOKEN \
|
|
461
|
+
--output_format json
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
Process multiple files:
|
|
465
|
+
|
|
466
|
+
```bash
|
|
467
|
+
mlx-whisperx first.wav second.wav third.wav --output_dir transcripts --output_format all
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
## Current Behavior and Limitations
|
|
471
|
+
|
|
472
|
+
- ASR decodes merged VAD chunks serially.
|
|
473
|
+
- There is no `batch_size` CLI or API option.
|
|
474
|
+
- `translate` skips forced alignment because alignment models are transcription-language specific.
|
|
475
|
+
- `clip_timestamps` is only supported with `--no_vad` because VAD chunking changes the timing base before ASR runs.
|
|
476
|
+
- Pyannote VAD and diarization depend on compatible PyTorch, torchaudio, and pyannote installations, plus Hugging Face model access when the selected model is gated.
|
|
477
|
+
- The vendored ASR backend lives under `mlx_whisperx.backend.mlx_whisper` so decoder behavior can be changed without modifying external reference repositories.
|
|
478
|
+
|
|
479
|
+
## Development Checks
|
|
480
|
+
|
|
481
|
+
Compile the package:
|
|
482
|
+
|
|
483
|
+
```bash
|
|
484
|
+
python -m compileall mlx_whisperx
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
Check CLI help:
|
|
488
|
+
|
|
489
|
+
```bash
|
|
490
|
+
python -m mlx_whisperx --help
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
Build a wheel:
|
|
494
|
+
|
|
495
|
+
```bash
|
|
496
|
+
python -m build
|
|
497
|
+
```
|