voxweave 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. voxweave-0.1.0/LICENSE +21 -0
  2. voxweave-0.1.0/PKG-INFO +447 -0
  3. voxweave-0.1.0/README.md +406 -0
  4. voxweave-0.1.0/pyproject.toml +109 -0
  5. voxweave-0.1.0/setup.cfg +4 -0
  6. voxweave-0.1.0/tests/test_align.py +150 -0
  7. voxweave-0.1.0/tests/test_asrfix.py +228 -0
  8. voxweave-0.1.0/tests/test_backend.py +1075 -0
  9. voxweave-0.1.0/tests/test_breakpoints.py +89 -0
  10. voxweave-0.1.0/tests/test_chunking.py +35 -0
  11. voxweave-0.1.0/tests/test_cli.py +225 -0
  12. voxweave-0.1.0/tests/test_config.py +113 -0
  13. voxweave-0.1.0/tests/test_config_gap.py +41 -0
  14. voxweave-0.1.0/tests/test_debug.py +54 -0
  15. voxweave-0.1.0/tests/test_gap_split.py +32 -0
  16. voxweave-0.1.0/tests/test_kinsoku.py +64 -0
  17. voxweave-0.1.0/tests/test_lang.py +36 -0
  18. voxweave-0.1.0/tests/test_pipeline.py +92 -0
  19. voxweave-0.1.0/tests/test_pipeline_vad.py +76 -0
  20. voxweave-0.1.0/tests/test_progress.py +48 -0
  21. voxweave-0.1.0/tests/test_realign.py +983 -0
  22. voxweave-0.1.0/tests/test_scenarios.py +83 -0
  23. voxweave-0.1.0/tests/test_smart_split.py +313 -0
  24. voxweave-0.1.0/tests/test_smart_split_cleanup.py +60 -0
  25. voxweave-0.1.0/tests/test_smart_split_gap.py +267 -0
  26. voxweave-0.1.0/tests/test_smart_split_glue.py +152 -0
  27. voxweave-0.1.0/tests/test_smart_split_stutter.py +100 -0
  28. voxweave-0.1.0/tests/test_snap_punct.py +127 -0
  29. voxweave-0.1.0/tests/test_songdet.py +300 -0
  30. voxweave-0.1.0/tests/test_timestamps.py +28 -0
  31. voxweave-0.1.0/tests/test_translate.py +332 -0
  32. voxweave-0.1.0/voxweave/__init__.py +1 -0
  33. voxweave-0.1.0/voxweave/__main__.py +6 -0
  34. voxweave-0.1.0/voxweave/asrfix.py +208 -0
  35. voxweave-0.1.0/voxweave/backend.py +1346 -0
  36. voxweave-0.1.0/voxweave/chunking.py +132 -0
  37. voxweave-0.1.0/voxweave/cli.py +446 -0
  38. voxweave-0.1.0/voxweave/config.py +254 -0
  39. voxweave-0.1.0/voxweave/core/__init__.py +5 -0
  40. voxweave-0.1.0/voxweave/core/breakpoints.py +198 -0
  41. voxweave-0.1.0/voxweave/core/conjunctions.py +619 -0
  42. voxweave-0.1.0/voxweave/core/gap_split.py +51 -0
  43. voxweave-0.1.0/voxweave/core/kinsoku.py +67 -0
  44. voxweave-0.1.0/voxweave/core/smart_split.py +971 -0
  45. voxweave-0.1.0/voxweave/debug.py +81 -0
  46. voxweave-0.1.0/voxweave/lang.py +52 -0
  47. voxweave-0.1.0/voxweave/pipeline.py +942 -0
  48. voxweave-0.1.0/voxweave/progress.py +34 -0
  49. voxweave-0.1.0/voxweave/realign.py +855 -0
  50. voxweave-0.1.0/voxweave/songdet.py +382 -0
  51. voxweave-0.1.0/voxweave/timestamps.py +9 -0
  52. voxweave-0.1.0/voxweave/translate.py +250 -0
  53. voxweave-0.1.0/voxweave/ui.py +236 -0
  54. voxweave-0.1.0/voxweave/vendor/__init__.py +7 -0
  55. voxweave-0.1.0/voxweave/vendor/attend.py +112 -0
  56. voxweave-0.1.0/voxweave/vendor/mel_band_roformer.py +471 -0
  57. voxweave-0.1.0/voxweave/vendor/vocals_mel_band_roformer.yaml +50 -0
  58. voxweave-0.1.0/voxweave.egg-info/PKG-INFO +447 -0
  59. voxweave-0.1.0/voxweave.egg-info/SOURCES.txt +61 -0
  60. voxweave-0.1.0/voxweave.egg-info/dependency_links.txt +1 -0
  61. voxweave-0.1.0/voxweave.egg-info/entry_points.txt +2 -0
  62. voxweave-0.1.0/voxweave.egg-info/requires.txt +28 -0
  63. voxweave-0.1.0/voxweave.egg-info/top_level.txt +1 -0
voxweave-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Hao Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
voxweave-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,447 @@
1
+ Metadata-Version: 2.4
2
+ Name: voxweave
3
+ Version: 0.1.0
4
+ Summary: BGM-robust subtitles for anime / film / clips: vocal separation + song-skip so ASR doesn't hallucinate on background music, OP/ED, or insert songs. Local-first Qwen3 ASR + forced alignment + edit-and-resync, CJK-aware.
5
+ Author: Hao Li
6
+ License-Expression: MIT
7
+ Keywords: subtitles,asr,forced-alignment,whisper,qwen,vtt
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
10
+ Classifier: Topic :: Text Processing :: Linguistic
11
+ Classifier: Environment :: GPU :: NVIDIA CUDA
12
+ Requires-Python: >=3.11
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: silero-vad>=5
16
+ Requires-Dist: soundfile>=0.12
17
+ Requires-Dist: numpy>=1.26
18
+ Requires-Dist: click>=8.1
19
+ Requires-Dist: rich>=13
20
+ Requires-Dist: qwen-asr>=0.0.4
21
+ Requires-Dist: einops>=0.8
22
+ Requires-Dist: rotary-embedding-torch>=0.6
23
+ Requires-Dist: beartype>=0.18
24
+ Requires-Dist: librosa>=0.10
25
+ Requires-Dist: pyyaml>=6
26
+ Requires-Dist: torchaudio>=2.1
27
+ Requires-Dist: ctc-forced-aligner>=1.0.2
28
+ Requires-Dist: onnxruntime-gpu>=1.20
29
+ Requires-Dist: unidecode>=1.3
30
+ Requires-Dist: pysbd>=0.3.4
31
+ Requires-Dist: panns-inference>=0.1.0
32
+ Requires-Dist: budoux>=0.8
33
+ Requires-Dist: jieba>=0.42
34
+ Requires-Dist: openai>=1.40
35
+ Provides-Extra: qwen
36
+ Provides-Extra: whisper
37
+ Requires-Dist: faster-whisper>=1.1; extra == "whisper"
38
+ Provides-Extra: all
39
+ Requires-Dist: voxweave[whisper]; extra == "all"
40
+ Dynamic: license-file
41
+
42
+ <div align="center">
43
+
44
+ <img src="resources/VoxWeave_icon.png" alt="VoxWeave" width="200"/>
45
+
46
+ # VoxWeave
47
+
48
+ **BGM-robust subtitles for anime, film, and clips.**
49
+
50
+ Vocal separation and song-skip so ASR never hallucinates on background music, OP/ED, or
51
+ insert songs. Local-first Qwen3 ASR, forced alignment, and edit-and-resync — CJK-aware.
52
+
53
+ ![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)
54
+ ![Python 3.11+](https://img.shields.io/badge/Python-3.11+-blue.svg)
55
+ ![CUDA cu128](https://img.shields.io/badge/CUDA-cu128-76B900?logo=nvidia&logoColor=white)
56
+ [![Buy Me A Coffee](https://img.shields.io/badge/Buy_Me_A_Coffee-FFDD00?logo=buymeacoffee&logoColor=black)](https://buymeacoffee.com/hali0515)
57
+
58
+ https://github.com/user-attachments/assets/e75b6dd3-fa37-4afe-89db-b6ee2c28f6bc
59
+
60
+ <sub>Sliced clip under heavy BGM · <code>voxweave Test.mp4</code> · Qwen3-ASR-1.7B</sub>
61
+
62
+ </div>
63
+
64
+ > [!NOTE]
65
+ > **100% local.** Separation, ASR, and forced alignment all run in-process with PyTorch on
66
+ > your GPU — no network endpoints, no audio leaves the machine. Weights download once on
67
+ > first run. (Translation and ASR-correction are the only optional features that call an
68
+ > external LLM, and only when you invoke them.)
69
+
70
+ VoxWeave derives from the WhisperX "edit-and-resync" workflow: transcribe once, then edit
71
+ the text and re-align it against the original audio for frame-accurate timestamps. Where it
72
+ differs is the front end — vocal separation and song-skip keep background music out of the
73
+ ASR, and a CJK-aware layout/alignment stack (MMS-300m for Japanese, BudouX/jieba for line
74
+ breaks) handles Chinese/Japanese/English as first-class.
75
+
76
+ ## Contents
77
+
78
+ - [Why VoxWeave](#why-voxweave)
79
+ - [Setup](#setup)
80
+ - [Quickstart](#quickstart)
81
+ - [Usage](#usage)
82
+ - [Transcribe (`voxweave <media>`)](#transcribe)
83
+ - [Re-align after editing (`align`)](#re-align-after-editing)
84
+ - [Re-layout offline (`split`)](#re-layout-offline)
85
+ - [ASR correction (`correct`)](#asr-correction)
86
+ - [Translate (`translate`)](#translate)
87
+ - [The edit-and-resync workflow](#the-edit-and-resync-workflow)
88
+ - [How it works](#how-it-works)
89
+ - [Configuration](#configuration)
90
+ - [Data contract](#data-contract)
91
+ - [Testing](#testing)
92
+ - [Support](#support)
93
+ - [License](#license)
94
+ - [Acknowledgments](#acknowledgments)
95
+
96
+ ## Why VoxWeave
97
+
98
+ - **BGM removal before ASR.** A Mel-Band Roformer vocal separator (pure torch, full-band
99
+ 44.1k) strips music first, so ASR doesn't transcribe lyrics or hallucinate on score.
100
+ - **Song-skip.** PANNs detects singing/music on the separated vocals and skips OP/ED and
101
+ insert songs before ASR — on by default, `--no-skip-songs` to keep them.
102
+ - **Local Qwen3 ASR + forced alignment.** Text and word-level timestamps in one pass, fully
103
+ on-device. A faster-whisper hybrid engine is available if you prefer Whisper text.
104
+ - **Edit-and-resync.** Fix the transcript by hand, then `align` re-derives timestamps from
105
+ the audio — timestamps are _never_ hand-written.
106
+ - **CJK-aware.** Japanese aligns with MMS-300m + uroman (zero-OOV, immune to the per-cue
107
+ drift that breaks wav2vec2-xlsr on rare kanji); line breaks use BudouX phrase atoms + jieba.
108
+ - **Optional LLM steps.** `correct` cleans up ASR typos/garbled names before alignment;
109
+ `translate` does whole-episode context-aware translation while preserving cue count.
110
+
111
+ ## Setup
112
+
113
+ Requires an **NVIDIA GPU** (Blackwell sm_120 / cu128 by default) and `ffmpeg` on PATH.
114
+
115
+ <details>
116
+ <summary><b>Install ffmpeg</b></summary>
117
+
118
+ ```bash
119
+ # Ubuntu / Debian
120
+ sudo apt update && sudo apt install ffmpeg
121
+ # Arch Linux
122
+ sudo pacman -S ffmpeg
123
+ # macOS (Homebrew)
124
+ brew install ffmpeg
125
+ ```
126
+
127
+ </details>
128
+
129
+ <details>
130
+ <summary><b>CUDA / PyTorch notes</b></summary>
131
+
132
+ The torch wheel is pinned to the **cu128** build (Blackwell sm_120) and installed into an
133
+ isolated `uv` tool venv. The CUDA toolkit does **not** need to be installed separately — the
134
+ cu128 wheel bundles the required runtime libraries; only an NVIDIA driver is required on the
135
+ host. To build for a different target, override per-invocation: `make install TORCH_BACKEND=cpu`.
136
+
137
+ </details>
138
+
139
+ **End-user install** (puts the global `voxweave` command on PATH):
140
+
141
+ ```bash
142
+ make install # = uv tool install --torch-backend=cu128 ".[all]"
143
+ make reinstall # after pulling new code
144
+ make uninstall
145
+ ```
146
+
147
+ The full local pipeline — vocal separation, ASR, forced alignment (incl. MMS-300m for
148
+ Japanese/CJK), layout, song-skip — plus CJK line-break and translation are baked into the
149
+ **core dependencies**, so a bare `uv tool install voxweave` already works out of the box.
150
+ `[all]` additionally pulls the faster-whisper hybrid engine.
151
+
152
+ <details>
153
+ <summary><b>Extras & what each pulls</b></summary>
154
+
155
+ - The core pulls `qwen-asr` (hard-pins `transformers==4.57.6` + `accelerate==1.12.0`) + a
156
+ pure-torch Mel-Band Roformer vendored in `voxweave.vendor` (the separator needs **no onnx/onnxruntime** —
157
+ `audio-separator` is intentionally avoided because it eagerly imports onnxruntime at the
158
+ top level) + MMS-300m forced aligner (`ctc-forced-aligner` + `onnxruntime-gpu`) + layout
159
+ (`pysbd`) + song-skip (`panns-inference`) + CJK break (`budoux` + `jieba`) + translation (`openai`).
160
+ - The only extras left are **`[whisper]`** (adds faster-whisper) and **`[all]`** (= core +
161
+ `[whisper]`). `[qwen]` remains as a no-op back-compat alias.
162
+ - Slim install without the whisper engine: `make install EXTRAS=qwen`.
163
+ - **Development**: `make dev` (= `uv sync --all-extras --dev`).
164
+
165
+ </details>
166
+
167
+ ## Quickstart
168
+
169
+ ```bash
170
+ # Transcribe a video to a timestamped VTT (+ a JSON source of truth)
171
+ voxweave episode.mkv
172
+
173
+ # ...edit episode.vtt by hand (fix wording, line breaks)...
174
+
175
+ # Re-align the edited text against the original audio
176
+ voxweave align episode.vtt
177
+
178
+ # Optionally translate the aligned subtitles to Chinese
179
+ voxweave translate episode.vtt --to zh
180
+ ```
181
+
182
+ ## Usage
183
+
184
+ ### Transcribe
185
+
186
+ `voxweave <media>` — separation → song-skip → VAD chunking → ASR + forced alignment →
187
+ smart_split → writes `<stem>.vtt` (editable) + `<stem>.json` (word-level timestamp source of
188
+ truth). Models load in-process (see `voxweave.backend`); the separator is released from VRAM
189
+ before ASR+alignment load, so peak usage is ≈ max(sep, asr) rather than their sum.
190
+
191
+ ```bash
192
+ voxweave episode.mkv
193
+ voxweave clip.mp4 --no-separate # clean speech (podcast/lecture): skip separation
194
+ voxweave episode.mkv --model qwen3-asr-1.7b # larger, more accurate ASR
195
+ ```
196
+
197
+ <details>
198
+ <summary><b>Options</b></summary>
199
+
200
+ | Option | Description |
201
+ | ------------------------------ | ------------------------------------------------------------------------------------------------------- |
202
+ | `--language` | Force language (ISO code or full name); default auto-detect. |
203
+ | `--no-separate` | Skip vocal separation (for clean speech) to save GPU time. |
204
+ | `--no-skip-songs` | Keep lyrics / transcribe purely musical content (song-skip is on by default). |
205
+ | `--model` | Local ASR model (default `Qwen3-ASR-0.6B`; `qwen3-asr-1.7b` is more accurate). |
206
+ | `--normalize` | Apply loudness normalization (`loudnorm`) to the 16k ASR input. |
207
+ | `--timestamps/--no-timestamps` | VTT carries word-level timestamps (default on); `--no-timestamps` writes a plain-text editing draft. |
208
+ | `--debug` | Write intermediate artifacts (full-band / vocals / per-chunk VAD + ASR + alignment) to `debug/<stem>/`. |
209
+
210
+ </details>
211
+
212
+ ### Re-align after editing
213
+
214
+ `voxweave align <vtt>` — takes the edited VTT text and **re-runs forced alignment against the
215
+ original audio**, overwriting the timestamped VTT and updating the JSON. Does not re-run ASR
216
+ or touch smart_split. Aligns on separated 16k vocals by default (prevents BGM interference);
217
+ prefers a cached `cache/<stem>.16k.flac`, otherwise re-separates and caches.
218
+
219
+ ```bash
220
+ voxweave align episode.vtt # finds episode.<ext> in the same dir
221
+ voxweave align episode.vtt --media original.mkv
222
+ voxweave align episode.vtt --no-separate # align on the original audio (clean sources)
223
+ ```
224
+
225
+ <details>
226
+ <summary><b>Options</b></summary>
227
+
228
+ | Option | Description |
229
+ | --------------- | ------------------------------------------------------------------ |
230
+ | `--media` | Source media path (default: same-name file in the same directory). |
231
+ | `--language` | Force language (ISO code or full name); default: read from JSON. |
232
+ | `--no-separate` | Align on the original audio instead of separated vocals. |
233
+ | `--normalize` | Apply `loudnorm` to the 16k alignment input. |
234
+
235
+ </details>
236
+
237
+ ### Re-layout offline
238
+
239
+ `voxweave split <json>` — re-run smart_split from `<stem>.json` **without any models** (adjust
240
+ line width / sentence breaks instantly).
241
+
242
+ ```bash
243
+ voxweave split episode.json --max-line-length 14 --max-lines 1
244
+ voxweave split episode.json --no-timestamps # plain-text editing draft
245
+ ```
246
+
247
+ ### ASR correction
248
+
249
+ `voxweave correct <vtt>` — optional **pre-align** LLM pass that fixes obvious ASR typos, split
250
+ words, and garbled proper nouns, producing a reviewable diff. Conservative substitution only
251
+ (no completion/rewrite), gated by a code check that the matched text equals the original
252
+ line-for-line. By default writes only a sidecar `<stem>.asrfix.vtt` + audit JSON — the
253
+ original VTT is untouched. Use `--apply` to overwrite, **then run `align`** to reassign timing.
254
+
255
+ ```bash
256
+ voxweave correct episode.vtt --glossary names.json # review the sidecar
257
+ voxweave correct episode.vtt --glossary names.json --apply
258
+ voxweave align episode.vtt
259
+ ```
260
+
261
+ <details>
262
+ <summary><b>Options</b></summary>
263
+
264
+ | Option | Description |
265
+ | ------------------------------ | ------------------------------------------------------------------------------------------------------------ |
266
+ | `--glossary` | Term/name glossary (`.json` → mapping; other → raw prompt). Strongly recommended for ambiguous proper nouns. |
267
+ | `--apply` | Overwrite the original VTT (default: sidecar only, for review). |
268
+ | `--model` | Correction model (default `VOXWEAVE_FIX_MODEL` env or `gpt-5.3-chat-latest`). |
269
+ | `--base-url` / `--api-key-env` | OpenAI-compatible endpoint + which env var holds the key. |
270
+
271
+ </details>
272
+
273
+ ### Translate
274
+
275
+ `voxweave translate <vtt>` — **after align**, translate each cue with whole-episode context,
276
+ preserving cue count, into `<stem>.<to>.vtt` (the original is left unchanged).
277
+
278
+ ```bash
279
+ voxweave translate episode.vtt --to zh
280
+ voxweave translate episode.vtt --to en --context "sci-fi, formal register" --glossary terms.json
281
+ ```
282
+
283
+ <details>
284
+ <summary><b>Options</b></summary>
285
+
286
+ | Option | Description |
287
+ | ------------------------------ | ------------------------------------------------------------------------------------ |
288
+ | `--to` | Target language code, written to `<stem>.<to>.vtt` (default `zh`). |
289
+ | `--context` | Show/tone context injected into the prompt. |
290
+ | `--glossary` | Term/name glossary (`.json` → mapping; other → raw prompt). |
291
+ | `--model` | Translation model (default `VOXWEAVE_TRANSLATE_MODEL` env or `gpt-5.3-chat-latest`). |
292
+ | `--base-url` / `--api-key-env` | OpenAI-compatible endpoint + which env var holds the key. |
293
+
294
+ </details>
295
+
296
+ Progress is rendered with rich: countable stages (demix windows / PANNs batches / per-chunk
297
+ ASR+alignment / align per-cue / translate streaming per-line) show a real `x/N` bar with
298
+ elapsed time; indeterminate stages (decode / file write) show a pulse bar. `-v/--verbose`
299
+ enables DEBUG logging.
300
+
301
+ ## The edit-and-resync workflow
302
+
303
+ ```
304
+ voxweave episode.mkv # 1. transcribe -> episode.vtt + episode.json
305
+ └─ (optional) correct # 2. LLM ASR fix -> episode.asrfix.vtt (--apply to commit)
306
+ edit episode.vtt by hand # 3. fix wording / line breaks
307
+ voxweave align episode.vtt # 4. re-derive timestamps from audio (overwrites VTT + JSON)
308
+ voxweave translate episode.vtt --to zh # 5. context-aware translation
309
+ ```
310
+
311
+ Timestamps are **always** derived from the audio by the forced aligner — you never hand-edit
312
+ them. Edit the text freely; `align` puts the timing back.
313
+
314
+ ## How it works
315
+
316
+ | Stage | What runs |
317
+ | --------------- | -------------------------------------------------------------------------------------------------------------------------- |
318
+ | **Separation** | Mel-Band Roformer (full-band 44.1k stereo, vendored pure-torch) isolates vocals; downsampled to 16k afterwards. |
319
+ | **Song-skip** | PANNs (route ii) flags singing/music on the separated vocals before ASR. |
320
+ | **Chunking** | Silero VAD splits speech into ≤120s chunks (longer risks ASR repetition-loop collapse). |
321
+ | **ASR + align** | Qwen3-ASR (default, text + units in one pass) / faster-whisper hybrid / dual-ASR fusion — the pipeline is engine-agnostic. |
322
+ | **Alignment** | `ja` → MMS-300m + uroman (full-file single pass, WhisperX-gold); `en` → wav2vec2-LV60K CTC per-cue; `zh`·`yue` → Qwen. |
323
+ | **Layout** | gap-aware `smart_split`: word-level gaps + BudouX phrase atoms + line-length, on a shared timeline forked per language. |
324
+
325
+ ## Configuration
326
+
327
+ Precedence: **CLI flag > env var > `~/.config/voxweave.conf` > built-in default.** A commented
328
+ default config is written on first run (an existing pre-rename `qsub.conf` is migrated automatically).
329
+
330
+ <details>
331
+ <summary><b>Environment variables</b></summary>
332
+
333
+ **Models**
334
+
335
+ - `VOXWEAVE_ASR_MODEL` (default `Qwen/Qwen3-ASR-0.6B`; same as `--model`)
336
+ - `VOXWEAVE_ALIGNER_MODEL` (default `Qwen/Qwen3-ForcedAligner-0.6B`)
337
+ - `VOXWEAVE_DEVICE` (default `cuda:0`)
338
+
339
+ All model weights are cached under `~/.cache/huggingface/hub` (auto-downloaded on first use), so a
340
+ container only needs to bind-mount that one directory. Each model exposes an env override to swap
341
+ the HF repo, or to point at an explicit local file (which, if it exists, skips the HF download):
342
+
343
+ - `VOXWEAVE_SEPARATOR_REPO` / `VOXWEAVE_SEPARATOR_REPO_FILE` (default `KimberleyJSN/melbandroformer` /
344
+ `MelBandRoformer.ckpt`), or `VOXWEAVE_SEPARATOR_CKPT` / `VOXWEAVE_SEPARATOR_CONFIG` for explicit
345
+ weights + matching yaml
346
+ - `VOXWEAVE_PANNS_REPO` / `VOXWEAVE_PANNS_REPO_FILE` (default `thelou1s/panns-inference` /
347
+ `Cnn14_mAP=0.431.pth`), or `VOXWEAVE_PANNS_CKPT` for an explicit checkpoint (song-skip CNN)
348
+ - `VOXWEAVE_MMS_REPO` / `VOXWEAVE_MMS_REPO_FILE` (default `deskpai/ctc_forced_aligner` /
349
+ `04ac86b67129634da93aea76e0147ef3.onnx`), or `VOXWEAVE_MMS_MODEL` for an explicit onnx path
350
+ (Japanese/CJK MMS-300m aligner)
351
+
352
+ **Tuning**
353
+
354
+ - `VOXWEAVE_MAX_CHUNK_SEC` (default 120; shorter chunks reduce ASR repetition loops on long segments)
355
+ - `VOXWEAVE_LOUDNORM` (default `loudnorm=I=-16:TP=-1.5:LRA=11`; the `-af` filter for `--normalize`)
356
+ - `VOXWEAVE_MIN_CUE_SEC` (default 0.8; minimum cue display duration in `align`)
357
+ - `VOXWEAVE_SNAP_VAD_THRESHOLD` (default 0.25; sensitive VAD used when repositioning
358
+ zero-duration units against the original audio)
359
+
360
+ </details>
361
+
362
+ <details>
363
+ <summary><b>Config file (<code>~/.config/voxweave.conf</code>, TOML)</b></summary>
364
+
365
+ Every key below is optional — delete a line to fall back to its built-in default. The values
366
+ shown are a usable starting point, not the defaults (the auto-written template has everything
367
+ commented out).
368
+
369
+ ```toml
370
+ # ~/.config/voxweave.conf — TOML
371
+ # Precedence: CLI flag > env var > this file > built-in default.
372
+
373
+ # Default ASR model (= --model). Short name (qwen3-asr-0.6b | qwen3-asr-1.7b) or full HF id.
374
+ # Special value "hybrid" (= --hybrid) -> dual-ASR fusion (whisper text + Qwen punctuation).
375
+ asr_model = "Qwen/Qwen3-ASR-1.7B" # built-in default: Qwen/Qwen3-ASR-0.6B
376
+
377
+ # Model load strategy:
378
+ # "peak" (default) — serial peak-shaving: all-chunk ASR -> release -> all-chunk align;
379
+ # ASR and aligner never co-reside, peak VRAM = max(models). Works on 8 GB.
380
+ # "sum" — concurrent per-chunk ASR+align; peak VRAM = sum(models), but skips two
381
+ # model swap round-trips (faster on large-VRAM cards).
382
+ load_strategy = "sum"
383
+
384
+ # dual-ASR fusion sub-models — only consulted when running with --hybrid.
385
+ [fusion]
386
+ whisper = "large-v3-turbo" # faster-whisper size: large-v3 (best) | large-v3-turbo (~5x faster)
387
+ qwen = "Qwen/Qwen3-ASR-1.7B" # punctuation model; must emit punctuation -> 1.7B, not 0.6B
388
+
389
+ # Per-language forced-alignment model. Key = ISO-639-1 code; unlisted languages use Qwen3-ForcedAligner.
390
+ # Values:
391
+ # "mms" — MMS-300m + uroman, full-file single pass (immune to per-cue drift; the gold standard).
392
+ # HF id — wav2vec2 CTC via HF transformers; weights land in ~/.cache/huggingface/hub (per-cue crop).
393
+ # bundle — torchaudio bundle name, e.g. "WAV2VEC2_ASR_LARGE_LV60K_960H" (same model, cached in ~/.cache/torch).
394
+ # "" — explicitly fall back to Qwen for that language.
395
+ [align]
396
+ en = "facebook/wav2vec2-large-960h-lv60-self" # English: LV60K-self CTC, per-cue crop (HF hub)
397
+ ja = "mms" # Japanese: MMS-300m + uroman full-file (= whisperx fork align_ctc)
398
+ # zh = "mms" # Chinese can also use MMS; default is Qwen (native CJK char-level)
399
+ # yue = "" # force Qwen for Cantonese
400
+ ```
401
+
402
+ </details>
403
+
404
+ ## Data contract
405
+
406
+ Each input produces two sibling files:
407
+
408
+ - **`<stem>.json`** — the source of truth: word/character-level segments, language, VAD speech.
409
+ - **`<stem>.vtt`** — editable subtitles. By default cues carry word-level timestamps (same
410
+ precision as `align` output, ready to use); `--no-timestamps` writes a plain-text editing
411
+ draft for hand-correction, which `align` re-times.
412
+
413
+ Both VTT forms are accepted by `align`. The aligner strips punctuation as a hard constraint;
414
+ ASR punctuation is re-injected by time so the final output has correct spacing and breaks
415
+ without stray marks.
416
+
417
+ ## Testing
418
+
419
+ - Unit tests (models mocked, no network): `make test` (= `uv run pytest tests/`)
420
+ - Lint / format: `make lint`
421
+
422
+ ## Support
423
+
424
+ If VoxWeave saves you time, you can support development here:
425
+
426
+ <a href="https://buymeacoffee.com/hali0515"><img src="https://img.shields.io/badge/Buy_Me_A_Coffee-FFDD00?logo=buymeacoffee&logoColor=black" alt="Buy Me A Coffee"/></a>
427
+
428
+ ## License
429
+
430
+ MIT — see [LICENSE](LICENSE).
431
+
432
+ ## Acknowledgments
433
+
434
+ - [WhisperX](https://github.com/m-bain/whisperX) — the forced-alignment + edit-and-resync
435
+ workflow this project builds on; the Japanese MMS full-file alignment path is a faithful
436
+ port of its `ctc` align backend.
437
+ - [stable-ts](https://github.com/jianfch/stable-ts) — inspiration for timestamp post-processing
438
+ and documentation structure.
439
+ - [Qwen3-ASR / Qwen3-ForcedAligner](https://github.com/QwenLM) (Alibaba) — local ASR + aligner.
440
+ - [MMS-300m](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) (Meta) via
441
+ [ctc-forced-aligner](https://github.com/MahmoudAshraf97/ctc-forced-aligner) — zero-OOV CJK alignment.
442
+ - [Mel-Band Roformer](https://github.com/lucidrains/BS-RoFormer) (lucidrains) +
443
+ [KimberleyJSN](https://huggingface.co/KimberleyJSN/melbandroformer) weights — vocal separation.
444
+ - [BudouX](https://github.com/google/budoux), [jieba](https://github.com/fxsjy/jieba),
445
+ [PySBD](https://github.com/nipunsadvilkar/pySBD) — CJK/sentence line-break.
446
+ - [PANNs](https://github.com/qiuqiangkong/audioset_tagging_cnn) — song/music detection.
447
+ - [Silero VAD](https://github.com/snakers4/silero-vad) — voice activity detection.