whisperdrz 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. whisperdrz-0.1.0/LICENSE +21 -0
  2. whisperdrz-0.1.0/PKG-INFO +142 -0
  3. whisperdrz-0.1.0/README.md +114 -0
  4. whisperdrz-0.1.0/pyproject.toml +38 -0
  5. whisperdrz-0.1.0/setup.cfg +4 -0
  6. whisperdrz-0.1.0/src/whisperdrz/__init__.py +22 -0
  7. whisperdrz-0.1.0/src/whisperdrz/assets/gpt2.tiktoken +50256 -0
  8. whisperdrz-0.1.0/src/whisperdrz/assets/multilingual.tiktoken +50257 -0
  9. whisperdrz-0.1.0/src/whisperdrz/audio.py +157 -0
  10. whisperdrz-0.1.0/src/whisperdrz/clean/__init__.py +49 -0
  11. whisperdrz-0.1.0/src/whisperdrz/clean/patterns.py +377 -0
  12. whisperdrz-0.1.0/src/whisperdrz/clean/speakers.py +62 -0
  13. whisperdrz-0.1.0/src/whisperdrz/clean/tags.py +222 -0
  14. whisperdrz-0.1.0/src/whisperdrz/clean/timings.py +287 -0
  15. whisperdrz-0.1.0/src/whisperdrz/cli.py +57 -0
  16. whisperdrz-0.1.0/src/whisperdrz/config.py +63 -0
  17. whisperdrz-0.1.0/src/whisperdrz/decoding.py +2162 -0
  18. whisperdrz-0.1.0/src/whisperdrz/dtw_align.py +1311 -0
  19. whisperdrz-0.1.0/src/whisperdrz/evals.py +177 -0
  20. whisperdrz-0.1.0/src/whisperdrz/inference_utils.py +1061 -0
  21. whisperdrz-0.1.0/src/whisperdrz/model.py +894 -0
  22. whisperdrz-0.1.0/src/whisperdrz/speaker_clustering.py +159 -0
  23. whisperdrz-0.1.0/src/whisperdrz/tokenizer.py +371 -0
  24. whisperdrz-0.1.0/src/whisperdrz/transcriber.py +1848 -0
  25. whisperdrz-0.1.0/src/whisperdrz/utils.py +62 -0
  26. whisperdrz-0.1.0/src/whisperdrz/vocab_constraints.py +450 -0
  27. whisperdrz-0.1.0/src/whisperdrz.egg-info/PKG-INFO +142 -0
  28. whisperdrz-0.1.0/src/whisperdrz.egg-info/SOURCES.txt +30 -0
  29. whisperdrz-0.1.0/src/whisperdrz.egg-info/dependency_links.txt +1 -0
  30. whisperdrz-0.1.0/src/whisperdrz.egg-info/entry_points.txt +2 -0
  31. whisperdrz-0.1.0/src/whisperdrz.egg-info/requires.txt +21 -0
  32. whisperdrz-0.1.0/src/whisperdrz.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Fluxions
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.4
2
+ Name: whisperdrz
3
+ Version: 0.1.0
4
+ Summary: Speaker-aware ASR with word-level timestamps (inference)
5
+ License: MIT
6
+ Requires-Python: >=3.12
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: torch
10
+ Requires-Dist: torchaudio
11
+ Requires-Dist: torchcodec
12
+ Requires-Dist: numpy
13
+ Requires-Dist: scipy
14
+ Requires-Dist: einops
15
+ Requires-Dist: numba
16
+ Requires-Dist: tiktoken
17
+ Requires-Dist: safetensors
18
+ Requires-Dist: huggingface_hub
19
+ Provides-Extra: flash
20
+ Requires-Dist: flash-attn; extra == "flash"
21
+ Requires-Dist: triton; extra == "flash"
22
+ Provides-Extra: demo
23
+ Requires-Dist: gradio; extra == "demo"
24
+ Provides-Extra: eval
25
+ Requires-Dist: jiwer; extra == "eval"
26
+ Requires-Dist: whisper-normalizer; extra == "eval"
27
+ Dynamic: license-file
28
+
29
+ # WhisperDRZ - Adding Diarization to Whisper
30
+
31
+ WhisperDRZ is a speaker-aware automatic speech recognition model. It transcribes audio
32
+ into text with word-level timestamps, per-line speaker tags, and non-speech event
33
+ tags. It is a Whisper-style encoder-decoder model and handles long audio by
34
+ chunking and stitching internally.
35
+
36
+ This repository is inference only. Weights are distributed on the Hugging Face
37
+ Hub at [`fluxions/whisperdrz`](https://huggingface.co/fluxions/whisperdrz).
38
+
39
+ ## Install
40
+
41
+ ```bash
42
+ pip install -e .
43
+ # optional: Flash Attention for much faster decoding on CUDA
44
+ pip install -e ".[flash]"
45
+ # optional: the gradio demo
46
+ pip install -e ".[demo]"
47
+ ```
48
+
49
+ Requires Python 3.12+. Runs on a CUDA GPU or on CPU. Flash Attention is
50
+ optional: when both it and a CUDA GPU are available, it is used with CUDA graphs for
51
+ fast decoding; otherwise a pure-torch attention path is used automatically.
52
+
53
+ ## Command line
54
+
55
+ ```bash
56
+ whisperdrz audio.wav # defaults: --model whisperdrz-large-v3.safetensors, --lang en
57
+ whisperdrz audio.wav --output_format json > out.json
58
+ whisperdrz audio.wav --model my-checkpoint.pt --lang auto # override the defaults
59
+ ```
60
+
61
+ `--model` defaults to `whisperdrz-large-v3.safetensors` and accepts a local checkpoint,
62
+ a filename hosted in the weights repo, or a Hugging Face repo id. The weights
63
+ download automatically on first use. `--lang` defaults to `en`; use `auto` to detect.
64
+
65
+ ## Python
66
+
67
+ ```python
68
+ import whisperdrz
69
+ from whisperdrz.audio import load_audio, SAMPLE_RATE
70
+
71
+ transcriber = whisperdrz.load_model("whisperdrz-large-v3.safetensors", lang="en")
72
+
73
+ audio, _ = load_audio("audio.wav", sample_rate=SAMPLE_RATE)
74
+ result = transcriber.transcribe(audio.mean(0)) # mono, 16 kHz
75
+
76
+ print(result.text) # speaker-tagged text with timestamps
77
+ print(result.segments) # list of {speaker, start, end, text}
78
+ ```
79
+
80
+ ## Output format
81
+
82
+ Each line begins with a speaker tag. Timed words and tags are wrapped in a
83
+ start/end timestamp pair; not every word is timed, but the first and last word
84
+ of each line always are:
85
+
86
+ ```
87
+ [0] <|0.00|>Hello<|0.45|> there <|0.80|>world.<|1.10|>
88
+ [1] <|1.20|>Hi<|1.40|> <|1.45|>[laugh]<|1.60|> <|1.70|>there.<|1.95|>
89
+ ```
90
+
91
+ - `[0]`, `[1]`, ... are speaker IDs; `[c]` marks crowd/ambient.
92
+ - `<|t|>` are timestamps in seconds (two decimals), always in a pair wrapping a word or tag.
93
+ - The first and last word of every line are always timed; middle words may be bare.
94
+ - `[laugh]`, `[breath]`, and similar are non-speech event tags.
95
+
96
+ `transcribe()` returns a `TranscribeResults` with the raw `text` plus parsed
97
+ `segments`, each a dict of `speaker`, `start`, `end`, and `text`.
98
+
99
+ ## Evaluation
100
+
101
+ WhisperDRZ reports two word-level metrics (the table below also lists the standard diarization error rate, DER, for VoxConverse):
102
+
103
+ - **WER** (word error rate) — Levenshtein word distance after normalization
104
+ (whisper-normalizer), measuring transcription accuracy.
105
+ - **WDER** (word diarization error rate) — of the words that align between
106
+ reference and hypothesis, the fraction assigned to the wrong speaker under
107
+ the best speaker permutation. It isolates diarization quality from WER
108
+ (insertions/deletions don't count toward WDER).
109
+
110
+ Measured on the released checkpoint:
111
+
112
+ | Benchmark | Metric | Score |
113
+ |---|---|---|
114
+ | ESB (English ASR, 1000 utts) | WER | 9.6% macro / 5.9% micro |
115
+ | VoxConverse dev (216 files, overlap-heavy) | DER | 40.7% (63.4% on overlapping speech) |
116
+ | Internal conversational (26 clips) | WER / WDER | 11.1% / 33% |
117
+
118
+ WhisperDRZ is an ASR-first model: transcription is strong, but diarization trails
119
+ purpose-built systems (which reach ~10–25% DER on VoxConverse) and overlap is its
120
+ weak spot. See [the write-up](blog.md) for full results (timing, non-speech
121
+ events, multilingual) and analysis.
122
+
123
+ Reproduce WER/WDER on your own data using the metrics in `whisperdrz.evals` and the `scripts/eval.py` runner:
124
+
125
+ ```bash
126
+ pip install -e ".[eval]"
127
+ # manifest.jsonl: one {"audio": "a.wav", "text": "[0] ref ... [1] ..."} per line
128
+ python scripts/eval.py manifest.jsonl --model whisperdrz-large-v3.safetensors
129
+ ```
130
+
131
+ `text` is the reference transcript; `[N]` speaker tags are optional and WDER is
132
+ only computed for multi-speaker references.
133
+
134
+ ## Demo
135
+
136
+ ```bash
137
+ python demo/app.py --model whisperdrz-large-v3.safetensors
138
+ ```
139
+
140
+ ## License
141
+
142
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,114 @@
1
+ # WhisperDRZ - Adding Diarization to Whisper
2
+
3
+ WhisperDRZ is a speaker-aware automatic speech recognition model. It transcribes audio
4
+ into text with word-level timestamps, per-line speaker tags, and non-speech event
5
+ tags. It is a Whisper-style encoder-decoder model and handles long audio by
6
+ chunking and stitching internally.
7
+
8
+ This repository is inference only. Weights are distributed on the Hugging Face
9
+ Hub at [`fluxions/whisperdrz`](https://huggingface.co/fluxions/whisperdrz).
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install -e .
15
+ # optional: Flash Attention for much faster decoding on CUDA
16
+ pip install -e ".[flash]"
17
+ # optional: the gradio demo
18
+ pip install -e ".[demo]"
19
+ ```
20
+
21
+ Requires Python 3.12+. Runs on a CUDA GPU or on CPU. Flash Attention is
22
+ optional: when both it and a CUDA GPU are available, it is used with CUDA graphs for
23
+ fast decoding; otherwise a pure-torch attention path is used automatically.
24
+
25
+ ## Command line
26
+
27
+ ```bash
28
+ whisperdrz audio.wav # defaults: --model whisperdrz-large-v3.safetensors, --lang en
29
+ whisperdrz audio.wav --output_format json > out.json
30
+ whisperdrz audio.wav --model my-checkpoint.pt --lang auto # override the defaults
31
+ ```
32
+
33
+ `--model` defaults to `whisperdrz-large-v3.safetensors` and accepts a local checkpoint,
34
+ a filename hosted in the weights repo, or a Hugging Face repo id. The weights
35
+ download automatically on first use. `--lang` defaults to `en`; use `auto` to detect.
36
+
37
+ ## Python
38
+
39
+ ```python
40
+ import whisperdrz
41
+ from whisperdrz.audio import load_audio, SAMPLE_RATE
42
+
43
+ transcriber = whisperdrz.load_model("whisperdrz-large-v3.safetensors", lang="en")
44
+
45
+ audio, _ = load_audio("audio.wav", sample_rate=SAMPLE_RATE)
46
+ result = transcriber.transcribe(audio.mean(0)) # mono, 16 kHz
47
+
48
+ print(result.text) # speaker-tagged text with timestamps
49
+ print(result.segments) # list of {speaker, start, end, text}
50
+ ```
51
+
52
+ ## Output format
53
+
54
+ Each line begins with a speaker tag. Timed words and tags are wrapped in a
55
+ start/end timestamp pair; not every word is timed, but the first and last word
56
+ of each line always are:
57
+
58
+ ```
59
+ [0] <|0.00|>Hello<|0.45|> there <|0.80|>world.<|1.10|>
60
+ [1] <|1.20|>Hi<|1.40|> <|1.45|>[laugh]<|1.60|> <|1.70|>there.<|1.95|>
61
+ ```
62
+
63
+ - `[0]`, `[1]`, ... are speaker IDs; `[c]` marks crowd/ambient.
64
+ - `<|t|>` are timestamps in seconds (two decimals), always in a pair wrapping a word or tag.
65
+ - The first and last word of every line are always timed; middle words may be bare.
66
+ - `[laugh]`, `[breath]`, and similar are non-speech event tags.
67
+
68
+ `transcribe()` returns a `TranscribeResults` with the raw `text` plus parsed
69
+ `segments`, each a dict of `speaker`, `start`, `end`, and `text`.
70
+
71
+ ## Evaluation
72
+
73
+ WhisperDRZ reports two word-level metrics (the table below also lists the standard diarization error rate, DER, for VoxConverse):
74
+
75
+ - **WER** (word error rate) — Levenshtein word distance after normalization
76
+ (whisper-normalizer), measuring transcription accuracy.
77
+ - **WDER** (word diarization error rate) — of the words that align between
78
+ reference and hypothesis, the fraction assigned to the wrong speaker under
79
+ the best speaker permutation. It isolates diarization quality from WER
80
+ (insertions/deletions don't count toward WDER).
81
+
82
+ Measured on the released checkpoint:
83
+
84
+ | Benchmark | Metric | Score |
85
+ |---|---|---|
86
+ | ESB (English ASR, 1000 utts) | WER | 9.6% macro / 5.9% micro |
87
+ | VoxConverse dev (216 files, overlap-heavy) | DER | 40.7% (63.4% on overlapping speech) |
88
+ | Internal conversational (26 clips) | WER / WDER | 11.1% / 33% |
89
+
90
+ WhisperDRZ is an ASR-first model: transcription is strong, but diarization trails
91
+ purpose-built systems (which reach ~10–25% DER on VoxConverse) and overlap is its
92
+ weak spot. See [the write-up](blog.md) for full results (timing, non-speech
93
+ events, multilingual) and analysis.
94
+
95
+ Reproduce WER/WDER on your own data using the metrics in `whisperdrz.evals` and the `scripts/eval.py` runner:
96
+
97
+ ```bash
98
+ pip install -e ".[eval]"
99
+ # manifest.jsonl: one {"audio": "a.wav", "text": "[0] ref ... [1] ..."} per line
100
+ python scripts/eval.py manifest.jsonl --model whisperdrz-large-v3.safetensors
101
+ ```
102
+
103
+ `text` is the reference transcript; `[N]` speaker tags are optional and WDER is
104
+ only computed for multi-speaker references.
105
+
106
+ ## Demo
107
+
108
+ ```bash
109
+ python demo/app.py --model whisperdrz-large-v3.safetensors
110
+ ```
111
+
112
+ ## License
113
+
114
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,38 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "whisperdrz"
7
+ version = "0.1.0"
8
+ description = "Speaker-aware ASR with word-level timestamps (inference)"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = { text = "MIT" }
12
+ dependencies = [
13
+ "torch",
14
+ "torchaudio",
15
+ "torchcodec",
16
+ "numpy",
17
+ "scipy",
18
+ "einops",
19
+ "numba",
20
+ "tiktoken",
21
+ "safetensors",
22
+ "huggingface_hub",
23
+ ]
24
+
25
+ [project.optional-dependencies]
26
+ flash = ["flash-attn", "triton"]
27
+ demo = ["gradio"]
28
+ eval = ["jiwer", "whisper-normalizer"]
29
+
30
+ [project.scripts]
31
+ whisperdrz = "whisperdrz.cli:main"
32
+
33
+ [tool.setuptools.packages.find]
34
+ where = ["src"]
35
+ include = ["whisperdrz*"]
36
+
37
+ [tool.setuptools.package-data]
38
+ whisperdrz = ["assets/*.tiktoken"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,22 @@
1
+ from whisperdrz.model import WhisperDRZ
2
+ from whisperdrz.transcriber import Transcriber
3
+
4
+ __version__ = "0.1.0"
5
+
6
+ __all__ = ["WhisperDRZ", "Transcriber", "load_model", "__version__"]
7
+
8
+
9
def load_model(name_or_path: str, lang: str | None = "en", **kwargs) -> Transcriber:
    """Build a :class:`~whisperdrz.transcriber.Transcriber` for a checkpoint.

    Args:
        name_or_path: Where to find the weights: a local ``.safetensors`` or
            ``.pt`` checkpoint path, a filename hosted in the default Hugging
            Face repo, or a Hugging Face repo id.
        lang: Language to force, e.g. ``"en"`` (the default). ``None`` asks
            for auto-detection.
        **kwargs: Extra keyword arguments, passed unchanged to
            :class:`~whisperdrz.transcriber.Transcriber`.

    Returns:
        The constructed :class:`~whisperdrz.transcriber.Transcriber`, on which
        ``transcribe(audio)`` can be called.
    """
    # Thin convenience wrapper: the Transcriber constructor does all the work.
    transcriber = Transcriber(model_path=name_or_path, lang=lang, **kwargs)
    return transcriber