easytranscriber 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easytranscriber-0.1.0/PKG-INFO +34 -0
- easytranscriber-0.1.0/README.md +14 -0
- easytranscriber-0.1.0/pyproject.toml +57 -0
- easytranscriber-0.1.0/setup.cfg +4 -0
- easytranscriber-0.1.0/src/easytranscriber/asr/ct2.py +245 -0
- easytranscriber-0.1.0/src/easytranscriber/asr/hf.py +106 -0
- easytranscriber-0.1.0/src/easytranscriber/audio.py +148 -0
- easytranscriber-0.1.0/src/easytranscriber/data/__init__.py +3 -0
- easytranscriber-0.1.0/src/easytranscriber/data/collators.py +30 -0
- easytranscriber-0.1.0/src/easytranscriber/data/datamodel.py +214 -0
- easytranscriber-0.1.0/src/easytranscriber/data/dataset.py +248 -0
- easytranscriber-0.1.0/src/easytranscriber/pipelines.py +352 -0
- easytranscriber-0.1.0/src/easytranscriber/text/normalization.py +42 -0
- easytranscriber-0.1.0/src/easytranscriber/utils.py +43 -0
- easytranscriber-0.1.0/src/easytranscriber.egg-info/PKG-INFO +34 -0
- easytranscriber-0.1.0/src/easytranscriber.egg-info/SOURCES.txt +17 -0
- easytranscriber-0.1.0/src/easytranscriber.egg-info/dependency_links.txt +1 -0
- easytranscriber-0.1.0/src/easytranscriber.egg-info/requires.txt +11 -0
- easytranscriber-0.1.0/src/easytranscriber.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: easytranscriber
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Speech recognition with accurate word-level timestamps.
|
|
5
|
+
Author: Faton Rekathati
|
|
6
|
+
Project-URL: Repository, https://github.com/kb-labb/easytranscriber
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: transformers>=4.45.0
|
|
10
|
+
Requires-Dist: torch!=2.9.*,>=2.7.0
|
|
11
|
+
Requires-Dist: torchaudio!=2.9.*,>=2.7.0
|
|
12
|
+
Requires-Dist: tqdm>=4.66.1
|
|
13
|
+
Requires-Dist: soundfile>=0.12.1
|
|
14
|
+
Requires-Dist: nltk>=3.8.2
|
|
15
|
+
Requires-Dist: pyannote-audio>=3.3.1
|
|
16
|
+
Requires-Dist: silero-vad~=6.0
|
|
17
|
+
Requires-Dist: ctranslate2>=4.4.0
|
|
18
|
+
Requires-Dist: msgspec
|
|
19
|
+
Requires-Dist: easyaligner==0.*
|
|
20
|
+
|
|
21
|
+
<div align="center"><img width="1020" height="340" alt="image" src="https://github.com/user-attachments/assets/7f1bdf33-5161-40c1-b6a7-6f1f586e030b" /></div>
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
`easytranscriber` is an automatic speech recognition library built for efficient, large-scale transcription with accurate word-level timestamps. The library is backend-agnostic, featuring modular, parallelizable pipeline components (VAD, transcription, feature/emission extraction, forced alignment), with support for both `ctranslate2` and `Hugging Face` inference backends. Notable features include:
|
|
25
|
+
|
|
26
|
+
* **GPU accelerated forced alignment**, using [PyTorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html). Forced alignment is based on a GPU implementation of the Viterbi algorithm ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
|
|
27
|
+
* **Parallel loading and pre-fetching of audio files** for efficient data loading and batch processing.
|
|
28
|
+
* **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess ASR outputs before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
|
|
29
|
+
* **35% to 102% faster inference compared to [`WhisperX`](https://github.com/m-bain/whisperX)**. See the [benchmarks](#benchmarks) for more details.
|
|
30
|
+
* Batch inference support for both wav2vec2 and Whisper models.
|
|
31
|
+
|
|
32
|
+
### Benchmarks
|
|
33
|
+
|
|
34
|
+

|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
<div align="center"><img width="1020" height="340" alt="image" src="https://github.com/user-attachments/assets/7f1bdf33-5161-40c1-b6a7-6f1f586e030b" /></div>
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
`easytranscriber` is an automatic speech recognition library built for efficient, large-scale transcription with accurate word-level timestamps. The library is backend-agnostic, featuring modular, parallelizable pipeline components (VAD, transcription, feature/emission extraction, forced alignment), with support for both `ctranslate2` and `Hugging Face` inference backends. Notable features include:
|
|
5
|
+
|
|
6
|
+
* **GPU accelerated forced alignment**, using [PyTorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html). Forced alignment is based on a GPU implementation of the Viterbi algorithm ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
|
|
7
|
+
* **Parallel loading and pre-fetching of audio files** for efficient data loading and batch processing.
|
|
8
|
+
* **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess ASR outputs before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
|
|
9
|
+
* **35% to 102% faster inference compared to [`WhisperX`](https://github.com/m-bain/whisperX)**. See the [benchmarks](#benchmarks) for more details.
|
|
10
|
+
* Batch inference support for both wav2vec2 and Whisper models.
|
|
11
|
+
|
|
12
|
+
### Benchmarks
|
|
13
|
+
|
|
14
|
+

|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=67.0.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
version = "0.1.0"
|
|
7
|
+
name = "easytranscriber"
|
|
8
|
+
requires-python = ">= 3.10"
|
|
9
|
+
description = "Speech recognition with accurate word-level timestamps."
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
authors = [{ name = "Faton Rekathati" }]
|
|
12
|
+
|
|
13
|
+
dependencies = [
|
|
14
|
+
"transformers>=4.45.0",
|
|
15
|
+
"torch>=2.7.0,!=2.9.*",
|
|
16
|
+
"torchaudio>=2.7.0,!=2.9.*",
|
|
17
|
+
"tqdm>=4.66.1",
|
|
18
|
+
"soundfile>=0.12.1",
|
|
19
|
+
"nltk>=3.8.2",
|
|
20
|
+
"pyannote-audio>=3.3.1",
|
|
21
|
+
"silero-vad~=6.0",
|
|
22
|
+
"ctranslate2>=4.4.0",
|
|
23
|
+
"msgspec",
|
|
24
|
+
"easyaligner==0.*"
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Repository = "https://github.com/kb-labb/easytranscriber"
|
|
29
|
+
|
|
30
|
+
[tool.uv.sources]
|
|
31
|
+
torch = [
|
|
32
|
+
{ index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
|
|
33
|
+
{ index = "pytorch-cuda", marker = "sys_platform != 'darwin'" },
|
|
34
|
+
]
|
|
35
|
+
torchaudio = [
|
|
36
|
+
{ index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
|
|
37
|
+
{ index = "pytorch-cuda", marker = "sys_platform != 'darwin'" },
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[[tool.uv.index]]
|
|
41
|
+
name = "pytorch-cuda"
|
|
42
|
+
url = "https://download.pytorch.org/whl/cu128"
|
|
43
|
+
explicit = true
|
|
44
|
+
|
|
45
|
+
[[tool.uv.index]]
|
|
46
|
+
name = "pytorch-cpu"
|
|
47
|
+
url = "https://download.pytorch.org/whl/cpu"
|
|
48
|
+
explicit = true
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
line-length = 99
|
|
52
|
+
indent-width = 4
|
|
53
|
+
target-version = "py310"
|
|
54
|
+
|
|
55
|
+
[tool.ruff.lint]
|
|
56
|
+
# Disable fix for unused imports (`F401`).
|
|
57
|
+
unfixable = ["F401"]
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CTranslate2-based Whisper transcription module.
|
|
3
|
+
|
|
4
|
+
This module provides a transcribe function using ctranslate2 for efficient
|
|
5
|
+
Whisper inference, mirroring the HuggingFace implementation in hf.py.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import ctranslate2
|
|
12
|
+
import numpy as np
|
|
13
|
+
import torch
|
|
14
|
+
from easyaligner.utils import save_metadata_json, save_metadata_msgpack
|
|
15
|
+
from easytranscriber.data.collators import transcribe_collate_fn
|
|
16
|
+
from tqdm import tqdm
|
|
17
|
+
from transformers import WhisperProcessor
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def transcribe(
    model: ctranslate2.models.Whisper,
    processor: WhisperProcessor,
    file_dataloader: torch.utils.data.DataLoader,
    language: str | None = None,
    task: str = "transcribe",
    batch_size: int = 8,
    beam_size: int = 5,
    patience: float = 1.0,
    length_penalty: float = 1.0,
    repetition_penalty: float = 1.0,
    no_repeat_ngram_size: int = 0,
    max_length: int = 448,
    suppress_blank: bool = True,
    num_workers: int = 2,
    prefetch_factor: int = 2,
    output_dir: str = "output/transcriptions",
):
    """
    Transcribe audio files using CTranslate2 Whisper model.

    This function processes audio files through a dataloader structure similar
    to the HuggingFace implementation, but uses ctranslate2 for inference.

    Parameters
    ----------
    model : ctranslate2.models.Whisper
        CTranslate2 Whisper model.
    processor : transformers.WhisperProcessor
        WhisperProcessor for tokenization and decoding.
    file_dataloader : torch.utils.data.DataLoader
        DataLoader yielding audio file datasets.
    language : str, optional
        Language code (e.g., 'sv', 'en'). If None, auto-detect.
    task : str, optional
        Task type - 'transcribe' or 'translate'.
    batch_size : int, optional
        Batch size for feature processing.
    beam_size : int, optional
        Beam size for search. Default is 5.
    patience : float, optional
        Beam search patience factor. Default is 1.0.
    length_penalty : float, optional
        Length penalty for beam search. Default is 1.0.
    repetition_penalty : float, optional
        Repetition penalty. Default is 1.0.
    no_repeat_ngram_size : int, optional
        N-gram size for no repeat. Default is 0.
    max_length : int, optional
        Maximum output length. Default is 448.
    suppress_blank : bool, optional
        Whether to suppress blank tokens. Default is True.
    num_workers : int, optional
        Number of workers for feature dataloader (file dataloader is created outside
        of this function).
    prefetch_factor : int, optional
        Prefetch factor for feature dataloader (file dataloader is created outside
        of this function).
    output_dir : str, optional
        Directory to save transcription JSON files. Default is `output/transcriptions`.
    """
    for features in tqdm(file_dataloader, desc="Transcribing audio files"):
        slice_dataset = features[0]["dataset"]
        metadata = slice_dataset.metadata
        transcription_texts = []
        language_detections = []

        feature_dataloader = torch.utils.data.DataLoader(
            slice_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            prefetch_factor=prefetch_factor,
            pin_memory=True,
            collate_fn=transcribe_collate_fn,
        )

        logger.info(f"Transcribing {metadata.audio_path} ...")

        for batch in feature_dataloader:
            # ctranslate2 consumes numpy arrays (via StorageView), not torch tensors.
            batch_features = batch["features"].numpy()
            current_batch_size = batch_features.shape[0]
            features_ct2 = ctranslate2.StorageView.from_array(batch_features)

            if language is not None:
                # A fixed language: the same prompt is used for every chunk.
                prompt_tokens = [
                    "<|startoftranscript|>",
                    f"<|{language}|>",
                    f"<|{task}|>",
                    "<|notimestamps|>",
                ]
                prompt_ids = processor.tokenizer.convert_tokens_to_ids(prompt_tokens)
                prompt_ids = [prompt_ids] * current_batch_size
            else:
                # Auto-detect the language per chunk and build per-chunk prompts.
                languages = detect_language(model, features_ct2)
                language_detections.extend(languages)
                prompt_ids = []
                for curr_lang in languages:
                    prompt_tokens = [
                        "<|startoftranscript|>",
                        curr_lang["language"],
                        f"<|{task}|>",
                        "<|notimestamps|>",
                    ]
                    prompt_ids.append(processor.tokenizer.convert_tokens_to_ids(prompt_tokens))

            # Generate transcriptions
            outputs = model.generate(
                features_ct2,
                prompt_ids,
                beam_size=beam_size,
                patience=patience,
                length_penalty=length_penalty,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                max_length=max_length,
                suppress_blank=suppress_blank,
            )

            # Decode the best hypothesis for each chunk.
            sequences = [result.sequences_ids[0] for result in outputs]
            transcription = processor.batch_decode(sequences, skip_special_tokens=True)

            transcription_texts.extend(transcription)

        # Update metadata with transcriptions. `transcription_texts` and
        # `language_detections` are flat over ALL chunks of the file, so a
        # single running index must be used across speeches. (Indexing with a
        # per-speech chunk index would reuse the first speech's results for
        # every subsequent speech.)
        chunk_idx = 0
        for speech in metadata.speeches:
            for chunk in speech.chunks:
                chunk.text = transcription_texts[chunk_idx].strip()
                if len(language_detections) > 0:
                    chunk.language = language_detections[chunk_idx]["language"]
                    # NOTE(review): `lang_detect_only` writes `language_probability`
                    # instead of `language_prob` — confirm which attribute the
                    # datamodel declares and unify.
                    chunk.language_prob = language_detections[chunk_idx]["probability"]
                chunk_idx += 1

        # Save transcription to file
        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        save_metadata_json(metadata, output_dir=output_dir)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def lang_detect_only(
    model,
    file_dataloader,
    batch_size=8,
    num_workers=2,
    prefetch_factor=2,
    output_dir=None,
):
    """
    Run language detection only.

    Parameters
    ----------
    model : ctranslate2.models.Whisper
        CTranslate2 Whisper model.
    file_dataloader : torch.utils.data.DataLoader
        DataLoader yielding audio file datasets.
    batch_size : int, optional
        Batch size. Default is 8.
    num_workers : int, optional
        Number of workers. Default is 2.
    prefetch_factor : int, optional
        Prefetch factor. Default is 2.
    output_dir : str, optional
        Output directory. Default is None. Must be set to a valid directory
        before results can be saved.
    """
    for features in file_dataloader:
        slice_dataset = features[0]["dataset"]
        metadata = slice_dataset.metadata
        language_detections = []

        feature_dataloader = torch.utils.data.DataLoader(
            slice_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            prefetch_factor=prefetch_factor,
            pin_memory=True,
            collate_fn=transcribe_collate_fn,
        )

        for batch in feature_dataloader:
            features_ct2 = batch["features"].numpy()
            features_ct2 = ctranslate2.StorageView.from_array(features_ct2)
            # `detect_language` returns one dict per chunk in the batch, so the
            # results must be flattened with extend() — append() would nest the
            # per-batch lists and break the flat per-chunk indexing below.
            languages = detect_language(model, features_ct2)
            language_detections.extend(languages)

        # Detections are flat over ALL chunks of the file; use one running
        # index across speeches rather than the per-speech chunk index.
        chunk_idx = 0
        for speech in metadata.speeches:
            for chunk in speech.chunks:
                chunk.language = language_detections[chunk_idx]["language"]
                # NOTE(review): `transcribe()` writes `language_prob` instead of
                # `language_probability` — confirm which attribute the datamodel
                # declares and unify.
                chunk.language_probability = language_detections[chunk_idx]["probability"]
                chunk_idx += 1

        # Save transcription to file
        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        save_metadata_json(metadata, output_dir=output_dir)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def detect_language(model: ctranslate2.models.Whisper, features: ctranslate2.StorageView) -> list:
    """
    Return the highest probability language for each chunk in the features batch.

    Parameters
    ----------
    model : ctranslate2.models.Whisper
        CTranslate2 Whisper model.
    features : ctranslate2.StorageView
        Input features.

    Returns
    -------
    list
        List of dicts containing 'language' and 'probability'.
    """
    # model.detect_language returns, for each chunk, a ranked list of
    # (language_token, probability) tuples; keep only the top entry per chunk.
    ranked_per_chunk = model.detect_language(features)
    return [
        {"language": ranked[0][0], "probability": ranked[0][1]}
        for ranked in ranked_per_chunk
    ]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import torch
|
|
5
|
+
from easyaligner.data.dataset import AudioFileDataset
|
|
6
|
+
from easyaligner.utils import save_metadata_json
|
|
7
|
+
from easytranscriber.data.collators import transcribe_collate_fn
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
from transformers import WhisperForConditionalGeneration, WhisperProcessor
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def transcribe(
    model: WhisperForConditionalGeneration,
    processor: WhisperProcessor,
    file_dataloader: torch.utils.data.DataLoader,
    language: str | None = None,
    task: str = "transcribe",
    batch_size: int = 4,
    beam_size: int = 5,
    length_penalty: float = 1.0,
    repetition_penalty: float = 1.0,
    max_length: int = 250,
    num_workers: int = 2,
    prefetch_factor: int = 2,
    output_dir: str = "output/transcriptions",
    device: str = "cuda",
):
    """
    Transcribe audio files using HuggingFace Whisper model.

    Parameters
    ----------
    model : transformers.WhisperForConditionalGeneration
        HuggingFace Whisper model.
    processor : transformers.WhisperProcessor
        HuggingFace Whisper processor.
    file_dataloader : torch.utils.data.DataLoader
        DataLoader yielding audio file datasets.
    language : str, optional
        Language code (e.g., 'sv', 'en'). Default is `None` (auto-detect).
    task : str, optional
        Task type - 'transcribe' or 'translate'. Default is 'transcribe'.
    batch_size : int, optional
        Batch size for inference.
    beam_size : int, optional
        Number of beams for beam search. Default is 5.
    length_penalty : float, optional
        Length penalty. Default is 1.0.
    repetition_penalty : float, optional
        Repetition penalty. Default is 1.0.
    max_length : int, optional
        Maximum length of generated text. Default is 250.
    num_workers : int, optional
        Number of workers for feature dataloader.
    prefetch_factor : int, optional
        Prefetch factor for feature dataloader.
    output_dir : str, optional
        Directory to save transcription JSON files. Default is `output/transcriptions`.
    device : str, optional
        Device to run inference on. Default is `cuda`.
    """
    for features in tqdm(file_dataloader, desc="Transcribing audio files"):
        slice_dataset = features[0]["dataset"]
        metadata = slice_dataset.metadata
        transcription_texts = []

        feature_dataloader = torch.utils.data.DataLoader(
            slice_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            prefetch_factor=prefetch_factor,
            collate_fn=transcribe_collate_fn,
        )

        logger.info(f"Transcribing {metadata.audio_path} ...")

        for batch in feature_dataloader:
            with torch.inference_mode():
                # NOTE(review): features are cast to fp16 unconditionally;
                # this assumes the model is loaded in half precision — confirm
                # before running on CPU.
                input_features = batch["features"].to(device).half()
                predicted_ids = model.generate(
                    input_features,
                    return_dict_in_generate=True,
                    task=task,
                    language=language,
                    output_scores=False,
                    max_length=max_length,
                    num_beams=beam_size,
                    repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty,
                    early_stopping=True,
                )

            transcription = processor.batch_decode(
                predicted_ids["sequences"], skip_special_tokens=True
            )

            transcription_texts.extend(transcription)

        # `transcription_texts` is flat over ALL chunks of the file, so a
        # single running index is used across speeches. (Indexing with a
        # per-speech chunk index would reuse the first speech's texts for
        # every subsequent speech.)
        chunk_idx = 0
        for speech in metadata.speeches:
            for chunk in speech.chunks:
                chunk.text = transcription_texts[chunk_idx].strip()
                chunk_idx += 1

        # Write final transcription to file with msgspec serialization
        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        save_metadata_json(metadata, output_dir=output_dir)
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import subprocess
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Tuple
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def read_audio_segment(
    audio_path: str | Path,
    start_sec: float,
    duration_sec: float,
    sample_rate: int = 16000,
) -> np.ndarray:
    """
    Read a segment of audio using ffmpeg subprocess with seek.

    Uses ffmpeg's fast seek (-ss before -i) to efficiently read only the
    required segment, with resampling to the target sample rate and mono conversion.

    Parameters
    ----------
    audio_path : str or Path
        Path to the audio file.
    start_sec : float
        Start time in seconds.
    duration_sec : float
        Duration to read in seconds.
    sample_rate : int, optional
        Target sample rate for resampling.

    Returns
    -------
    np.ndarray
        Audio data as float32 numpy array.
    """
    # Placing -ss before -i makes ffmpeg seek at the input level (fast seek).
    cmd = [
        "ffmpeg",
        "-ss", str(start_sec),
        "-i", str(audio_path),
        "-t", str(duration_sec),      # read this many seconds
        "-ar", str(sample_rate),      # resample to the target rate
        "-ac", "1",                   # downmix to mono
        "-f", "f32le",                # raw float32 little-endian
        "-loglevel", "error",
        "pipe:1",                     # stream decoded samples to stdout
    ]

    try:
        completed = subprocess.run(cmd, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"ffmpeg error reading {audio_path}: {e.stderr.decode()}")
        raise
    return np.frombuffer(completed.stdout, dtype=np.float32)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def convert_audio_to_array(input_file: str, sample_rate: int = 16000) -> Tuple[np.ndarray, int]:
    """
    Convert audio to in-memory numpy array.

    Parameters
    ----------
    input_file : str
        Path to the input audio file.
    sample_rate : int, optional
        Target sample rate.

    Returns
    -------
    Tuple[np.ndarray, int]
        Tuple containing the audio array (int16) and the sample rate.

    Raises
    ------
    RuntimeError
        If ffmpeg command fails.
    """
    # fmt: off
    command = [
        "ffmpeg",
        "-i", input_file,
        "-f", "s16le",            # raw PCM 16-bit little endian
        "-acodec", "pcm_s16le",
        "-ac", "1",               # mono
        "-ar", str(sample_rate),  # target sample rate
        "-loglevel", "error",     # suppress output
        "-hide_banner",
        "-nostats",
    ]
    # fmt: on

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {result.stderr.decode()}")

    # Interpret the raw PCM byte stream as 16-bit samples.
    samples = np.frombuffer(result.stdout, dtype=np.int16)
    return samples, sample_rate
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def convert_audio_to_wav(input_file: str, output_file: str) -> None:
    """
    Convert audio file to WAV format with 16kHz sample rate and mono channel.

    Parameters
    ----------
    input_file : str
        Path to the input audio file.
    output_file : str
        Path to the output WAV file.

    Raises
    ------
    RuntimeError
        If ffmpeg command fails.
    """
    # fmt: off
    command = [
        "ffmpeg",
        "-i", input_file,
        "-ar", "16000",          # resample to 16 kHz
        "-ac", "1",              # downmix to mono
        "-c:a", "pcm_s16le",     # 16-bit PCM WAV
        "-loglevel", "warning",
        "-hide_banner",
        "-nostats",
        "-nostdin",
        output_file,
    ]
    # fmt: on

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {result.stderr.decode()}")
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def transcribe_collate_fn(batch: list[dict]) -> dict:
    """
    Collate function for transcription.

    Parameters
    ----------
    batch : list of dict
        List of samples from the dataset.

    Returns
    -------
    dict
        Collated batch with 'features', 'start_times', and 'speech_ids'.
    """
    # Drop samples that failed to load (None entries).
    valid_samples = [sample for sample in batch if sample is not None]

    # Concatenate per-sample feature tensors along the batch dimension.
    features = torch.cat([sample["feature"] for sample in valid_samples], dim=0)

    return {
        "features": features,
        "start_times": [sample["start_time_global"] for sample in valid_samples],
        "speech_ids": [sample["speech_id"] for sample in valid_samples],
    }
|