mlx-whisperx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. mlx_whisperx-0.1.0/LICENSE +21 -0
  2. mlx_whisperx-0.1.0/PKG-INFO +497 -0
  3. mlx_whisperx-0.1.0/README.md +468 -0
  4. mlx_whisperx-0.1.0/mlx_whisperx/__init__.py +22 -0
  5. mlx_whisperx-0.1.0/mlx_whisperx/__main__.py +6 -0
  6. mlx_whisperx-0.1.0/mlx_whisperx/_compat.py +16 -0
  7. mlx_whisperx-0.1.0/mlx_whisperx/_language.py +98 -0
  8. mlx_whisperx-0.1.0/mlx_whisperx/alignment.py +558 -0
  9. mlx_whisperx-0.1.0/mlx_whisperx/audio.py +62 -0
  10. mlx_whisperx-0.1.0/mlx_whisperx/backend/__init__.py +5 -0
  11. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/__init__.py +29 -0
  12. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/_version.py +5 -0
  13. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/assets/gpt2.tiktoken +50256 -0
  14. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/assets/mel_filters.npz +0 -0
  15. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/assets/multilingual.tiktoken +50257 -0
  16. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/audio.py +194 -0
  17. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/cli.py +265 -0
  18. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/decoding.py +953 -0
  19. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/languages.py +120 -0
  20. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/load_models.py +68 -0
  21. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/timing.py +349 -0
  22. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/tokenizer.py +303 -0
  23. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/torch_whisper.py +339 -0
  24. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/transcribe.py +594 -0
  25. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/version.py +5 -0
  26. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/whisper.py +306 -0
  27. mlx_whisperx-0.1.0/mlx_whisperx/backend/mlx_whisper/writers.py +293 -0
  28. mlx_whisperx-0.1.0/mlx_whisperx/cli.py +163 -0
  29. mlx_whisperx-0.1.0/mlx_whisperx/diarize.py +179 -0
  30. mlx_whisperx-0.1.0/mlx_whisperx/log_utils.py +29 -0
  31. mlx_whisperx-0.1.0/mlx_whisperx/pipeline.py +447 -0
  32. mlx_whisperx-0.1.0/mlx_whisperx/schema.py +55 -0
  33. mlx_whisperx-0.1.0/mlx_whisperx/transcribe.py +67 -0
  34. mlx_whisperx-0.1.0/mlx_whisperx/vads/__init__.py +17 -0
  35. mlx_whisperx-0.1.0/mlx_whisperx/vads/_pyannote_impl.py +135 -0
  36. mlx_whisperx-0.1.0/mlx_whisperx/vads/pyannote.py +3 -0
  37. mlx_whisperx-0.1.0/mlx_whisperx/vads/silero.py +109 -0
  38. mlx_whisperx-0.1.0/mlx_whisperx/vads/vad.py +57 -0
  39. mlx_whisperx-0.1.0/mlx_whisperx/writers.py +292 -0
  40. mlx_whisperx-0.1.0/mlx_whisperx.egg-info/PKG-INFO +497 -0
  41. mlx_whisperx-0.1.0/mlx_whisperx.egg-info/SOURCES.txt +52 -0
  42. mlx_whisperx-0.1.0/mlx_whisperx.egg-info/dependency_links.txt +1 -0
  43. mlx_whisperx-0.1.0/mlx_whisperx.egg-info/entry_points.txt +2 -0
  44. mlx_whisperx-0.1.0/mlx_whisperx.egg-info/requires.txt +22 -0
  45. mlx_whisperx-0.1.0/mlx_whisperx.egg-info/top_level.txt +1 -0
  46. mlx_whisperx-0.1.0/pyproject.toml +47 -0
  47. mlx_whisperx-0.1.0/setup.cfg +4 -0
  48. mlx_whisperx-0.1.0/tests/test_cli.py +101 -0
  49. mlx_whisperx-0.1.0/tests/test_language.py +73 -0
  50. mlx_whisperx-0.1.0/tests/test_load_models.py +95 -0
  51. mlx_whisperx-0.1.0/tests/test_optional_imports.py +32 -0
  52. mlx_whisperx-0.1.0/tests/test_pipeline.py +97 -0
  53. mlx_whisperx-0.1.0/tests/test_pyproject.py +17 -0
  54. mlx_whisperx-0.1.0/tests/test_schema.py +8 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 mlx-whisperx contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,497 @@
1
+ Metadata-Version: 2.4
2
+ Name: mlx-whisperx
3
+ Version: 0.1.0
4
+ Summary: WhisperX-style transcription pipeline using an internal mlx-whisper ASR backend.
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: mlx>=0.11
10
+ Requires-Dist: numpy
11
+ Requires-Dist: numba
12
+ Requires-Dist: more-itertools
13
+ Requires-Dist: tiktoken
14
+ Requires-Dist: huggingface-hub
15
+ Requires-Dist: scipy
16
+ Requires-Dist: pandas
17
+ Requires-Dist: torch
18
+ Requires-Dist: torchaudio
19
+ Requires-Dist: transformers
20
+ Requires-Dist: nltk
21
+ Requires-Dist: tqdm
22
+ Provides-Extra: diarize
23
+ Requires-Dist: pyannote-audio; extra == "diarize"
24
+ Provides-Extra: full
25
+ Requires-Dist: pyannote-audio; extra == "full"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # mlx-whisperx
31
+
32
+ `mlx-whisperx` is a WhisperX-style transcription pipeline for Apple Silicon. It uses a vendored `mlx-whisper` ASR backend, then optionally applies WhisperX forced alignment and pyannote diarization.
33
+
34
+ The project is intended to provide a practical local pipeline with WhisperX-like JSON, subtitle, and text outputs while keeping ASR execution on MLX.
35
+
36
+ ## Why This Project Exists
37
+
38
+ This project adds WhisperX-like functionality to an `mlx-whisper` workflow. The goal is to keep ASR inference on MLX for Apple Silicon while providing the pipeline pieces people commonly use from WhisperX: VAD chunking, forced alignment, word timestamps, diarization hooks, and familiar JSON/subtitle outputs.
39
+
40
+ The implementation borrows ideas and code from both upstream projects:
41
+
42
+ - WhisperX, for the pipeline structure, alignment workflow, diarization integration, and output conventions.
43
+ - `mlx-whisper`, for the Apple Silicon ASR backend and model execution path.
44
+
45
+ This repository vendors and adapts code where needed so the pieces work together as a standalone `mlx-whisperx` package.
46
+
47
+ ## Pipeline
48
+
49
+ ```text
50
+ audio -> VAD -> mlx-whisper ASR -> forced alignment -> optional diarization -> writers
51
+ ```
52
+
53
+ Default behavior:
54
+
55
+ - ASR model: `mlx-community/whisper-turbo`
56
+ - VAD backend: Silero
57
+ - Decoding: beam search with `beam_size=5` and `temperature=0`
58
+ - Alignment: enabled for transcription
59
+ - Diarization: disabled unless `--diarize` is passed
60
+
61
+ ## Installation
62
+
63
+ Clone the repository and install it into a Python environment:
64
+
65
+ ```bash
66
+ git clone https://github.com/seedds/mlx-whisperx.git
67
+ cd mlx-whisperx
68
+ python -m pip install -e .
69
+ ```
70
+
71
+ Install diarization support only when you need pyannote:
72
+
73
+ ```bash
74
+ python -m pip install -e ".[diarize]"
75
+ ```
76
+
77
+ Install every optional extra this repository currently exposes:
78
+
79
+ ```bash
80
+ python -m pip install -e ".[full]"
81
+ ```
82
+
83
+ `ffmpeg` must be available on `PATH` because audio loading is handled through the ffmpeg CLI.
84
+
85
+ On macOS with Homebrew:
86
+
87
+ ```bash
88
+ brew install ffmpeg
89
+ ```
90
+
91
+ Optional pyannote VAD and diarization use pyannote models and may require a Hugging Face token, depending on the selected model.
92
+
93
+ ## Usage
94
+
95
+ ```bash
96
+ mlx-whisperx AUDIO [AUDIO ...] [OPTIONS]
97
+ ```
98
+
99
+ By default, `mlx-whisperx`:
100
+
101
+ - uses `mlx-community/whisper-turbo`
102
+ - runs Silero VAD
103
+ - performs forced alignment for word timestamps
104
+ - writes outputs to the current directory
105
+ - writes every supported output format when `--output_format` is not specified
106
+
107
+ Common options:
108
+
109
+ - `--model`: local model path or Hugging Face repo
110
+ - `--language`: language code such as `en`, `ja`, or `fr`
111
+ - `--task`: `transcribe` or `translate`
112
+ - `--output_dir`: directory to write output files
113
+ - `--output_name`: custom basename for output files
114
+ - `--output_format`: `all`, `json`, `srt`, `vtt`, `txt`, `tsv`, or `aud`
115
+ - `--no_align`: skip forced alignment
116
+ - `--diarize`: run speaker diarization and attach speaker labels to the output
117
+ - `--hf_token`: Hugging Face token for gated pyannote models
118
+
119
+ Example: output JSON
120
+
121
+ ```bash
122
+ mlx-whisperx audio.wav \
123
+ --output_dir transcripts \
124
+ --output_name audio \
125
+ --output_format json
126
+ ```
127
+
128
+ This writes `transcripts/audio.json`.
129
+
130
+ Example JSON output:
131
+
132
+ ```json
133
+ {
134
+ "segments": [
135
+ {
136
+ "start": 0.0,
137
+ "end": 2.52,
138
+ "text": "Hello and welcome to mlx-whisperx.",
139
+ "words": [
140
+ {"word": "Hello", "start": 0.0, "end": 0.42, "score": 0.99},
141
+ {"word": "and", "start": 0.44, "end": 0.58, "score": 0.98},
142
+ {"word": "welcome", "start": 0.6, "end": 1.05, "score": 0.97},
143
+ {"word": "to", "start": 1.07, "end": 1.18, "score": 0.98},
144
+ {"word": "mlx-whisperx.", "start": 1.2, "end": 2.52, "score": 0.96}
145
+ ]
146
+ }
147
+ ],
148
+ "word_segments": [
149
+ {"word": "Hello", "start": 0.0, "end": 0.42, "score": 0.99},
150
+ {"word": "and", "start": 0.44, "end": 0.58, "score": 0.98}
151
+ ],
152
+ "language": "en"
153
+ }
154
+ ```
155
+
156
+ Example: output SRT
157
+
158
+ ```bash
159
+ mlx-whisperx audio.wav \
160
+ --output_dir subtitles \
161
+ --output_name audio \
162
+ --output_format srt \
163
+ --max_line_width 42 \
164
+ --max_line_count 2
165
+ ```
166
+
167
+ This writes `subtitles/audio.srt`.
168
+
169
+ Example SRT output:
170
+
171
+ ```srt
172
+ 1
173
+ 00:00:00,000 --> 00:00:02,520
174
+ Hello and welcome to
175
+ mlx-whisperx.
176
+
177
+ 2
178
+ 00:00:02,700 --> 00:00:05,100
179
+ This example shows subtitle
180
+ output.
181
+ ```
182
+
183
+ Complete example with optional parameters:
184
+
185
+ ```bash
186
+ mlx-whisperx audio.wav \
187
+ --model mlx-community/whisper-large-v3-turbo \
188
+ --model_dir ./models \
189
+ --model_cache_only False \
190
+ --device cpu \
191
+ --compute_type float32 \
192
+ --output_dir ./out \
193
+ --output_name meeting \
194
+ --output_format all \
195
+ --verbose True \
196
+ --log-level info \
197
+ --task transcribe \
198
+ --language en \
199
+ --align_model jonatasgrosman/wav2vec2-large-xlsr-53-english \
200
+ --interpolate_method nearest \
201
+ --return_char_alignments \
202
+ --vad_method pyannote \
203
+ --vad_onset 0.5 \
204
+ --vad_offset 0.363 \
205
+ --vad_model pyannote/segmentation-3.0 \
206
+ --chunk_size 30 \
207
+ --vad_dump_path ./out/meeting.vad.json \
208
+ --diarize \
209
+ --min_speakers 2 \
210
+ --max_speakers 4 \
211
+ --diarize_model pyannote/speaker-diarization-community-1 \
212
+ --speaker_embeddings \
213
+ --hf_token YOUR_HF_TOKEN \
214
+ --temperature 0.0 \
215
+ --temperature_increment_on_fallback 0.2 \
216
+ --best_of 5 \
217
+ --beam_size 5 \
218
+ --patience 1.0 \
219
+ --length_penalty 1.0 \
220
+ --suppress_tokens -1 \
221
+ --suppress_numerals \
222
+ --initial_prompt "Technical meeting about MLX WhisperX." \
223
+ --hotwords "MLX, WhisperX, pyannote, diarization" \
224
+ --condition_on_previous_text True \
225
+ --compression_ratio_threshold 2.4 \
226
+ --logprob_threshold -1.0 \
227
+ --no_speech_threshold 0.6 \
228
+ --max_line_width 42 \
229
+ --max_line_count 2 \
230
+ --max_words_per_line 8 \
231
+ --highlight_words False \
232
+ --print_progress True
233
+ ```
234
+
235
+ Notes:
236
+
237
+ - `--output_format all` writes `.txt`, `.vtt`, `.srt`, `.tsv`, `.json`, and `.aud`.
238
+ - `--max_line_count` only has an effect when `--max_line_width` is also set.
239
+ - `--highlight_words` applies to `srt` and `vtt`.
240
+ - `--hf_token` is only needed for gated pyannote models.
241
+
242
+ ## Python API
243
+
244
+ ```python
245
+ from mlx_whisperx import transcribe
246
+
247
+ result = transcribe(
248
+ "audio.wav",
249
+ model="mlx-community/whisper-large-v3-turbo",
250
+ language="en",
251
+ )
252
+ ```
253
+
254
+ Print one transcript segment per line:
255
+
256
+ ```python
257
+ for segment in result["segments"]:
258
+     print(segment["text"].strip())
259
+ ```
260
+
261
+ Print segment timestamps:
262
+
263
+ ```python
264
+ for segment in result["segments"]:
265
+     print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text'].strip()}")
266
+ ```
267
+
268
+ Print word-level timestamps:
269
+
270
+ ```python
271
+ for word in result["word_segments"]:
272
+     print(f"[{word['start']:.2f} -> {word['end']:.2f}] {word['word']}")
273
+ ```
274
+
275
+ Common API options match the CLI names:
276
+
277
+ ```python
278
+ result = transcribe(
279
+ "audio.wav",
280
+ model="mlx-community/whisper-turbo",
281
+ language="en",
282
+ beam_size=5,
283
+ temperature=0.0,
284
+ no_align=False,
285
+ diarize=False,
286
+ vad_method="silero",
287
+ )
288
+ ```
289
+
290
+ `language` accepts either canonical codes such as `en` or case-insensitive names and aliases such as `English` or `Portuguese`.
291
+
292
+ ## Output Schema
293
+
294
+ JSON output follows the WhisperX-style shape:
295
+
296
+ ```json
297
+ {
298
+ "segments": [
299
+ {
300
+ "start": 0.0,
301
+ "end": 2.5,
302
+ "text": "Example transcript text.",
303
+ "words": [
304
+ {"word": "Example", "start": 0.0, "end": 0.6, "score": 0.98}
305
+ ]
306
+ }
307
+ ],
308
+ "word_segments": [
309
+ {"word": "Example", "start": 0.0, "end": 0.6, "score": 0.98}
310
+ ],
311
+ "language": "en"
312
+ }
313
+ ```
314
+
315
+ When diarization is enabled, speaker labels are included where available:
316
+
317
+ ```json
318
+ {"word": "Hello", "start": 0.0, "end": 0.4, "score": 0.99, "speaker": "SPEAKER_00"}
319
+ ```
320
+
321
+ ## CLI Reference
322
+
323
+ Basic options:
324
+
325
+ - `--model`: `mlx-whisper` model directory or Hugging Face repo.
326
+ - `--language`: language code or case-insensitive name/alias. If omitted, language is auto-detected by ASR.
327
+ - `--task`: `transcribe` or `translate`.
328
+ - `--output_format`: `all`, `srt`, `vtt`, `txt`, `tsv`, `json`, or `aud`.
329
+ - `--output_dir`: directory for output files.
330
+ - `--output_name`: custom output basename.
331
+ - `--verbose`: print transcript and logs.
332
+
333
+ English-only Whisper models such as `.en` checkpoints force `language=en` and do not support `--task translate`.
334
+
335
+ Decoding options:
336
+
337
+ - `--temperature`: sampling temperature. Default is `0.0`.
338
+ - `--beam_size`: beam size when `temperature=0`. Default is `5`.
339
+ - `--best_of`: number of candidates when sampling with `temperature > 0`.
340
+ - `--patience`: beam-search patience.
341
+ - `--length_penalty`: beam-search length penalty.
342
+ - `--suppress_tokens`: comma-separated token IDs to suppress.
343
+ - `--suppress_numerals`: suppress numeric and currency-symbol tokens.
344
+ - `--initial_prompt`: initial prompt for ASR.
345
+ - `--hotwords`: hint phrases appended to the prompt.
346
+ - `--condition_on_previous_text`: condition each backend decoding window on the previously decoded text within the same VAD chunk.
347
+
348
+ Precision and model-cache options:
349
+
350
+ - `--compute_type float16`: force MLX ASR fp16. This is the default.
351
+ - `--compute_type float32`: force MLX ASR fp32.
352
+ - `--model_dir`: cache directory for ASR, alignment, pyannote VAD, and diarization models.
353
+ - `--model_cache_only`: restrict ASR, alignment, pyannote VAD, and diarization model loading to already-cached models.
354
+
355
+ VAD options:
356
+
357
+ - `--vad_method silero`: default VAD backend.
358
+ - `--vad_method pyannote`: use pyannote VAD if your environment supports it.
359
+ - `--vad_onset`: VAD onset threshold.
360
+ - `--vad_offset`: VAD offset threshold.
361
+ - `--vad_model`: Hugging Face pyannote segmentation model used with `--vad_method pyannote`. Defaults to `pyannote/segmentation-3.0`.
362
+ - `--chunk_size`: merged VAD chunk size in seconds.
363
+ - `--no_vad`: transcribe the full file as one chunk.
364
+ - `--clip_timestamps`: comma-separated clip start/end pairs in seconds. Requires `--no_vad`.
365
+ - `--vad_dump_path`: write VAD chunks and settings to JSON.
366
+
367
+ Silero VAD loads from the local Torch Hub cache first. To force a local Silero checkout, set:
368
+
369
+ ```bash
370
+ export MLX_WHISPERX_SILERO_VAD_PATH=/path/to/snakers4_silero-vad
371
+ ```
372
+
373
+ Alignment options:
374
+
375
+ - `--no_align`: skip forced alignment.
376
+ - `--align_model`: override the alignment model.
377
+ - `--interpolate_method`: `nearest`, `linear`, or `ignore`.
378
+ - `--return_char_alignments`: include character alignments in JSON.
379
+
380
+ Diarization options:
381
+
382
+ - `--diarize`: assign speaker labels.
383
+ - `--diarize_model`: pyannote diarization model name.
384
+ - `--min_speakers`: minimum speaker count.
385
+ - `--max_speakers`: maximum speaker count.
386
+ - `--speaker_embeddings`: include speaker embeddings in JSON when available.
387
+ - `--hf_token`: Hugging Face token for gated pyannote models.
388
+
389
+ Subtitle options:
390
+
391
+ - `--max_line_width`: target subtitle line width.
392
+ - `--max_line_count`: maximum lines per subtitle cue. Requires `--max_line_width`.
393
+ - `--max_words_per_line`: maximum words per subtitle cue.
394
+ - `--highlight_words`: underline the active word in SRT/VTT output.
395
+
396
+ ## Examples
397
+
398
+ Inspect VAD chunks before ASR:
399
+
400
+ ```bash
401
+ mlx-whisperx audio.wav \
402
+ --output_format json \
403
+ --vad_dump_path audio.vad.json
404
+ ```
405
+
406
+ Transcribe only selected clips without VAD chunking:
407
+
408
+ ```bash
409
+ mlx-whisperx audio.wav \
410
+ --no_vad \
411
+ --clip_timestamps 0,15,30,45 \
412
+ --output_format json
413
+ ```
414
+
415
+ Run deterministic beam search explicitly:
416
+
417
+ ```bash
418
+ mlx-whisperx audio.wav \
419
+ --language en \
420
+ --temperature 0 \
421
+ --beam_size 5 \
422
+ --output_format json
423
+ ```
424
+
425
+ Use temperature fallback:
426
+
427
+ ```bash
428
+ mlx-whisperx audio.wav \
429
+ --temperature 0 \
430
+ --temperature_increment_on_fallback 0.2
431
+ ```
432
+
433
+ Suppress numerals and currency symbols during decoding:
434
+
435
+ ```bash
436
+ mlx-whisperx audio.wav --suppress_numerals --output_format json
437
+ ```
438
+
439
+ Use pyannote VAD instead of the default Silero VAD:
440
+
441
+ ```bash
442
+ mlx-whisperx audio.wav \
443
+ --vad_method pyannote \
444
+ --vad_model pyannote/segmentation-3.0 \
445
+ --hf_token YOUR_HF_TOKEN \
446
+ --output_format json
447
+ ```
448
+
449
+ Skip forced alignment:
450
+
451
+ ```bash
452
+ mlx-whisperx audio.wav --no_align --output_format json
453
+ ```
454
+
455
+ Run diarization:
456
+
457
+ ```bash
458
+ mlx-whisperx audio.wav \
459
+ --diarize \
460
+ --hf_token YOUR_HF_TOKEN \
461
+ --output_format json
462
+ ```
463
+
464
+ Process multiple files:
465
+
466
+ ```bash
467
+ mlx-whisperx first.wav second.wav third.wav --output_dir transcripts --output_format all
468
+ ```
469
+
470
+ ## Current Behavior and Limitations
471
+
472
+ - ASR decodes merged VAD chunks serially.
473
+ - There is no `batch_size` CLI or API option.
474
+ - `translate` skips forced alignment because alignment models are transcription-language specific.
475
+ - `clip_timestamps` is only supported with `--no_vad` because VAD chunking changes the timing base before ASR runs.
476
+ - Pyannote VAD and diarization depend on compatible PyTorch, torchaudio, and pyannote installations, plus Hugging Face model access when the selected model is gated.
477
+ - The vendored ASR backend lives under `mlx_whisperx.backend.mlx_whisper` so decoder behavior can be changed without modifying external reference repositories.
478
+
479
+ ## Development Checks
480
+
481
+ Compile the package:
482
+
483
+ ```bash
484
+ python -m compileall -q mlx_whisperx
485
+ ```
486
+
487
+ Check CLI help:
488
+
489
+ ```bash
490
+ python -m mlx_whisperx --help
491
+ ```
492
+
493
+ Build a wheel:
494
+
495
+ ```bash
496
+ python -m build
497
+ ```