whisperdrz 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whisperdrz-0.1.0/LICENSE +21 -0
- whisperdrz-0.1.0/PKG-INFO +142 -0
- whisperdrz-0.1.0/README.md +114 -0
- whisperdrz-0.1.0/pyproject.toml +38 -0
- whisperdrz-0.1.0/setup.cfg +4 -0
- whisperdrz-0.1.0/src/whisperdrz/__init__.py +22 -0
- whisperdrz-0.1.0/src/whisperdrz/assets/gpt2.tiktoken +50256 -0
- whisperdrz-0.1.0/src/whisperdrz/assets/multilingual.tiktoken +50257 -0
- whisperdrz-0.1.0/src/whisperdrz/audio.py +157 -0
- whisperdrz-0.1.0/src/whisperdrz/clean/__init__.py +49 -0
- whisperdrz-0.1.0/src/whisperdrz/clean/patterns.py +377 -0
- whisperdrz-0.1.0/src/whisperdrz/clean/speakers.py +62 -0
- whisperdrz-0.1.0/src/whisperdrz/clean/tags.py +222 -0
- whisperdrz-0.1.0/src/whisperdrz/clean/timings.py +287 -0
- whisperdrz-0.1.0/src/whisperdrz/cli.py +57 -0
- whisperdrz-0.1.0/src/whisperdrz/config.py +63 -0
- whisperdrz-0.1.0/src/whisperdrz/decoding.py +2162 -0
- whisperdrz-0.1.0/src/whisperdrz/dtw_align.py +1311 -0
- whisperdrz-0.1.0/src/whisperdrz/evals.py +177 -0
- whisperdrz-0.1.0/src/whisperdrz/inference_utils.py +1061 -0
- whisperdrz-0.1.0/src/whisperdrz/model.py +894 -0
- whisperdrz-0.1.0/src/whisperdrz/speaker_clustering.py +159 -0
- whisperdrz-0.1.0/src/whisperdrz/tokenizer.py +371 -0
- whisperdrz-0.1.0/src/whisperdrz/transcriber.py +1848 -0
- whisperdrz-0.1.0/src/whisperdrz/utils.py +62 -0
- whisperdrz-0.1.0/src/whisperdrz/vocab_constraints.py +450 -0
- whisperdrz-0.1.0/src/whisperdrz.egg-info/PKG-INFO +142 -0
- whisperdrz-0.1.0/src/whisperdrz.egg-info/SOURCES.txt +30 -0
- whisperdrz-0.1.0/src/whisperdrz.egg-info/dependency_links.txt +1 -0
- whisperdrz-0.1.0/src/whisperdrz.egg-info/entry_points.txt +2 -0
- whisperdrz-0.1.0/src/whisperdrz.egg-info/requires.txt +21 -0
- whisperdrz-0.1.0/src/whisperdrz.egg-info/top_level.txt +1 -0
whisperdrz-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Fluxions
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
whisperdrz-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: whisperdrz
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Speaker-aware ASR with word-level timestamps (inference)
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: torch
|
|
10
|
+
Requires-Dist: torchaudio
|
|
11
|
+
Requires-Dist: torchcodec
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: scipy
|
|
14
|
+
Requires-Dist: einops
|
|
15
|
+
Requires-Dist: numba
|
|
16
|
+
Requires-Dist: tiktoken
|
|
17
|
+
Requires-Dist: safetensors
|
|
18
|
+
Requires-Dist: huggingface_hub
|
|
19
|
+
Provides-Extra: flash
|
|
20
|
+
Requires-Dist: flash-attn; extra == "flash"
|
|
21
|
+
Requires-Dist: triton; extra == "flash"
|
|
22
|
+
Provides-Extra: demo
|
|
23
|
+
Requires-Dist: gradio; extra == "demo"
|
|
24
|
+
Provides-Extra: eval
|
|
25
|
+
Requires-Dist: jiwer; extra == "eval"
|
|
26
|
+
Requires-Dist: whisper-normalizer; extra == "eval"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# WhisperDRZ - Adding Diarization to Whisper
|
|
30
|
+
|
|
31
|
+
WhisperDRZ is a speaker-aware automatic speech recognition model. It transcribes audio
|
|
32
|
+
into text with word-level timestamps, per-line speaker tags, and non-speech event
|
|
33
|
+
tags. It is a Whisper-style encoder-decoder model and handles long audio by
|
|
34
|
+
chunking and stitching internally.
|
|
35
|
+
|
|
36
|
+
This repository is inference only. Weights are distributed on the Hugging Face
|
|
37
|
+
Hub at [`fluxions/whisperdrz`](https://huggingface.co/fluxions/whisperdrz).
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install -e .
|
|
43
|
+
# optional: Flash Attention for much faster decoding on CUDA
|
|
44
|
+
pip install -e ".[flash]"
|
|
45
|
+
# optional: the gradio demo
|
|
46
|
+
pip install -e ".[demo]"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Requires Python 3.12+. Runs on a CUDA GPU or on CPU. Flash Attention is
|
|
50
|
+
optional: when it (and a CUDA GPU) is available it is used with CUDA graphs for
|
|
51
|
+
fast decoding; otherwise a pure-torch attention path is used automatically.
|
|
52
|
+
|
|
53
|
+
## Command line
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
whisperdrz audio.wav # defaults: --model whisperdrz-large-v3.safetensors, --lang en
|
|
57
|
+
whisperdrz audio.wav --output_format json > out.json
|
|
58
|
+
whisperdrz audio.wav --model my-checkpoint.pt --lang auto # override the defaults
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
`--model` defaults to `whisperdrz-large-v3.safetensors` and accepts a local checkpoint,
|
|
62
|
+
a filename hosted in the weights repo, or a Hugging Face repo id. The weights
|
|
63
|
+
download automatically on first use. `--lang` defaults to `en`; use `auto` to detect the language.
|
|
64
|
+
|
|
65
|
+
## Python
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
import whisperdrz
|
|
69
|
+
from whisperdrz.audio import load_audio, SAMPLE_RATE
|
|
70
|
+
|
|
71
|
+
transcriber = whisperdrz.load_model("whisperdrz-large-v3.safetensors", lang="en")
|
|
72
|
+
|
|
73
|
+
audio, _ = load_audio("audio.wav", sample_rate=SAMPLE_RATE)
|
|
74
|
+
result = transcriber.transcribe(audio.mean(0)) # mono, 16 kHz
|
|
75
|
+
|
|
76
|
+
print(result.text) # speaker-tagged text with timestamps
|
|
77
|
+
print(result.segments) # list of {speaker, start, end, text}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Output format
|
|
81
|
+
|
|
82
|
+
Each line begins with a speaker tag. Timed words and tags are wrapped in a
|
|
83
|
+
start/end timestamp pair; not every word is timed, but the first and last word
|
|
84
|
+
of each line always are:
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
[0] <|0.00|>Hello<|0.45|> there <|0.80|>world.<|1.10|>
|
|
88
|
+
[1] <|1.20|>Hi<|1.40|> <|1.45|>[laugh]<|1.60|> <|1.70|>there.<|1.95|>
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
- `[0]`, `[1]`, ... are speaker IDs; `[c]` marks crowd/ambient.
|
|
92
|
+
- `<|t|>` are timestamps in seconds (two decimals), always in a pair wrapping a word or tag.
|
|
93
|
+
- The first and last word of every line are always timed; middle words may be bare.
|
|
94
|
+
- `[laugh]`, `[breath]`, and similar are non-speech event tags.
|
|
95
|
+
|
|
96
|
+
`transcribe()` returns a `TranscribeResults` with the raw `text` plus parsed
|
|
97
|
+
`segments`, each a dict of `speaker`, `start`, `end`, and `text`.
|
|
98
|
+
|
|
99
|
+
## Evaluation
|
|
100
|
+
|
|
101
|
+
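WhisperDRZ reports two word-level metrics (the VoxConverse row in the table below instead uses DER, the standard diarization error rate):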
|
|
102
|
+
|
|
103
|
+
- **WER** (word error rate) — Levenshtein word distance after normalization
|
|
104
|
+
(whisper-normalizer), measuring transcription accuracy.
|
|
105
|
+
- **WDER** (word diarization error rate) — of the words that align between
|
|
106
|
+
reference and hypothesis, the fraction assigned to the wrong speaker under
|
|
107
|
+
the best speaker permutation. It isolates diarization quality from WER
|
|
108
|
+
(insertions/deletions don't count toward WDER).
|
|
109
|
+
|
|
110
|
+
Measured on the released checkpoint:
|
|
111
|
+
|
|
112
|
+
| Benchmark | Metric | Score |
|
|
113
|
+
|---|---|---|
|
|
114
|
+
| ESB (English ASR, 1000 utts) | WER | 9.6% macro / 5.9% micro |
|
|
115
|
+
| VoxConverse dev (216 files, overlap-heavy) | DER | 40.7% (63.4% on overlapping speech) |
|
|
116
|
+
| Internal conversational (26 clips) | WER / WDER | 11.1% / 33% |
|
|
117
|
+
|
|
118
|
+
WhisperDRZ is an ASR-first model: transcription is strong, but diarization trails
|
|
119
|
+
purpose-built systems (which reach ~10–25% DER on VoxConverse) and overlap is its
|
|
120
|
+
weak spot. See [the write-up](blog.md) for full results (timing, non-speech
|
|
121
|
+
events, multilingual) and analysis.
|
|
122
|
+
|
|
123
|
+
Reproduce WER/WDER on your own data with the metrics in `whisperdrz.evals` and the runner:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pip install -e ".[eval]"
|
|
127
|
+
# manifest.jsonl: one {"audio": "a.wav", "text": "[0] ref ... [1] ..."} per line
|
|
128
|
+
python scripts/eval.py manifest.jsonl --model whisperdrz-large-v3.safetensors
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
`text` is the reference transcript; `[N]` speaker tags are optional and WDER is
|
|
132
|
+
only computed for multi-speaker references.
|
|
133
|
+
|
|
134
|
+
## Demo
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
python demo/app.py --model whisperdrz-large-v3.safetensors
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## License
|
|
141
|
+
|
|
142
|
+
MIT. See [LICENSE](LICENSE).
|
|
whisperdrz-0.1.0/README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# WhisperDRZ - Adding Diarization to Whisper
|
|
2
|
+
|
|
3
|
+
WhisperDRZ is a speaker-aware automatic speech recognition model. It transcribes audio
|
|
4
|
+
into text with word-level timestamps, per-line speaker tags, and non-speech event
|
|
5
|
+
tags. It is a Whisper-style encoder-decoder model and handles long audio by
|
|
6
|
+
chunking and stitching internally.
|
|
7
|
+
|
|
8
|
+
This repository is inference only. Weights are distributed on the Hugging Face
|
|
9
|
+
Hub at [`fluxions/whisperdrz`](https://huggingface.co/fluxions/whisperdrz).
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install -e .
|
|
15
|
+
# optional: Flash Attention for much faster decoding on CUDA
|
|
16
|
+
pip install -e ".[flash]"
|
|
17
|
+
# optional: the gradio demo
|
|
18
|
+
pip install -e ".[demo]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Requires Python 3.12+. Runs on a CUDA GPU or on CPU. Flash Attention is
|
|
22
|
+
optional: when it (and a CUDA GPU) is available it is used with CUDA graphs for
|
|
23
|
+
fast decoding; otherwise a pure-torch attention path is used automatically.
|
|
24
|
+
|
|
25
|
+
## Command line
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
whisperdrz audio.wav # defaults: --model whisperdrz-large-v3.safetensors, --lang en
|
|
29
|
+
whisperdrz audio.wav --output_format json > out.json
|
|
30
|
+
whisperdrz audio.wav --model my-checkpoint.pt --lang auto # override the defaults
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
`--model` defaults to `whisperdrz-large-v3.safetensors` and accepts a local checkpoint,
|
|
34
|
+
a filename hosted in the weights repo, or a Hugging Face repo id. The weights
|
|
35
|
+
download automatically on first use. `--lang` defaults to `en`; use `auto` to detect the language.
|
|
36
|
+
|
|
37
|
+
## Python
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
import whisperdrz
|
|
41
|
+
from whisperdrz.audio import load_audio, SAMPLE_RATE
|
|
42
|
+
|
|
43
|
+
transcriber = whisperdrz.load_model("whisperdrz-large-v3.safetensors", lang="en")
|
|
44
|
+
|
|
45
|
+
audio, _ = load_audio("audio.wav", sample_rate=SAMPLE_RATE)
|
|
46
|
+
result = transcriber.transcribe(audio.mean(0)) # mono, 16 kHz
|
|
47
|
+
|
|
48
|
+
print(result.text) # speaker-tagged text with timestamps
|
|
49
|
+
print(result.segments) # list of {speaker, start, end, text}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Output format
|
|
53
|
+
|
|
54
|
+
Each line begins with a speaker tag. Timed words and tags are wrapped in a
|
|
55
|
+
start/end timestamp pair; not every word is timed, but the first and last word
|
|
56
|
+
of each line always are:
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
[0] <|0.00|>Hello<|0.45|> there <|0.80|>world.<|1.10|>
|
|
60
|
+
[1] <|1.20|>Hi<|1.40|> <|1.45|>[laugh]<|1.60|> <|1.70|>there.<|1.95|>
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
- `[0]`, `[1]`, ... are speaker IDs; `[c]` marks crowd/ambient.
|
|
64
|
+
- `<|t|>` are timestamps in seconds (two decimals), always in a pair wrapping a word or tag.
|
|
65
|
+
- The first and last word of every line are always timed; middle words may be bare.
|
|
66
|
+
- `[laugh]`, `[breath]`, and similar are non-speech event tags.
|
|
67
|
+
|
|
68
|
+
`transcribe()` returns a `TranscribeResults` with the raw `text` plus parsed
|
|
69
|
+
`segments`, each a dict of `speaker`, `start`, `end`, and `text`.
|
|
70
|
+
|
|
71
|
+
## Evaluation
|
|
72
|
+
|
|
73
|
+
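WhisperDRZ reports two word-level metrics (the VoxConverse row in the table below instead uses DER, the standard diarization error rate):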
|
|
74
|
+
|
|
75
|
+
- **WER** (word error rate) — Levenshtein word distance after normalization
|
|
76
|
+
(whisper-normalizer), measuring transcription accuracy.
|
|
77
|
+
- **WDER** (word diarization error rate) — of the words that align between
|
|
78
|
+
reference and hypothesis, the fraction assigned to the wrong speaker under
|
|
79
|
+
the best speaker permutation. It isolates diarization quality from WER
|
|
80
|
+
(insertions/deletions don't count toward WDER).
|
|
81
|
+
|
|
82
|
+
Measured on the released checkpoint:
|
|
83
|
+
|
|
84
|
+
| Benchmark | Metric | Score |
|
|
85
|
+
|---|---|---|
|
|
86
|
+
| ESB (English ASR, 1000 utts) | WER | 9.6% macro / 5.9% micro |
|
|
87
|
+
| VoxConverse dev (216 files, overlap-heavy) | DER | 40.7% (63.4% on overlapping speech) |
|
|
88
|
+
| Internal conversational (26 clips) | WER / WDER | 11.1% / 33% |
|
|
89
|
+
|
|
90
|
+
WhisperDRZ is an ASR-first model: transcription is strong, but diarization trails
|
|
91
|
+
purpose-built systems (which reach ~10–25% DER on VoxConverse) and overlap is its
|
|
92
|
+
weak spot. See [the write-up](blog.md) for full results (timing, non-speech
|
|
93
|
+
events, multilingual) and analysis.
|
|
94
|
+
|
|
95
|
+
Reproduce WER/WDER on your own data with the metrics in `whisperdrz.evals` and the runner:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
pip install -e ".[eval]"
|
|
99
|
+
# manifest.jsonl: one {"audio": "a.wav", "text": "[0] ref ... [1] ..."} per line
|
|
100
|
+
python scripts/eval.py manifest.jsonl --model whisperdrz-large-v3.safetensors
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
`text` is the reference transcript; `[N]` speaker tags are optional and WDER is
|
|
104
|
+
only computed for multi-speaker references.
|
|
105
|
+
|
|
106
|
+
## Demo
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
python demo/app.py --model whisperdrz-large-v3.safetensors
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
MIT. See [LICENSE](LICENSE).
|
|
whisperdrz-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "whisperdrz"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Speaker-aware ASR with word-level timestamps (inference)"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
dependencies = [
|
|
13
|
+
"torch",
|
|
14
|
+
"torchaudio",
|
|
15
|
+
"torchcodec",
|
|
16
|
+
"numpy",
|
|
17
|
+
"scipy",
|
|
18
|
+
"einops",
|
|
19
|
+
"numba",
|
|
20
|
+
"tiktoken",
|
|
21
|
+
"safetensors",
|
|
22
|
+
"huggingface_hub",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
flash = ["flash-attn", "triton"]
|
|
27
|
+
demo = ["gradio"]
|
|
28
|
+
eval = ["jiwer", "whisper-normalizer"]
|
|
29
|
+
|
|
30
|
+
[project.scripts]
|
|
31
|
+
whisperdrz = "whisperdrz.cli:main"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
include = ["whisperdrz*"]
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.package-data]
|
|
38
|
+
whisperdrz = ["assets/*.tiktoken"]
|
|
whisperdrz-0.1.0/src/whisperdrz/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from whisperdrz.model import WhisperDRZ
|
|
2
|
+
from whisperdrz.transcriber import Transcriber
|
|
3
|
+
|
|
4
|
+
__version__ = "0.1.0"
|
|
5
|
+
|
|
6
|
+
__all__ = ["WhisperDRZ", "Transcriber", "load_model", "__version__"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def load_model(name_or_path: str, lang: str | None = "en", **kwargs) -> Transcriber:
    """Build a :class:`~whisperdrz.transcriber.Transcriber` for a WhisperDRZ checkpoint.

    Args:
        name_or_path: Checkpoint to load: a local ``.safetensors`` or ``.pt``
            path, a filename hosted in the default Hugging Face repo, or a
            Hugging Face repo id.
        lang: Language to force (e.g. ``"en"``); ``None`` requests
            auto-detection.
        **kwargs: Extra keyword arguments passed through unchanged to
            :class:`~whisperdrz.transcriber.Transcriber`.

    Returns:
        The constructed :class:`~whisperdrz.transcriber.Transcriber`, on which
        ``transcribe(audio)`` can be called.
    """
    # Thin convenience wrapper: all loading logic lives in Transcriber itself.
    transcriber = Transcriber(model_path=name_or_path, lang=lang, **kwargs)
    return transcriber
|