easytranscriber 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: easytranscriber
3
+ Version: 0.1.0
4
+ Summary: Speech recognition with accurate word-level timestamps.
5
+ Author: Faton Rekathati
6
+ Project-URL: Repository, https://github.com/kb-labb/easytranscriber
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: transformers>=4.45.0
10
+ Requires-Dist: torch!=2.9.*,>=2.7.0
11
+ Requires-Dist: torchaudio!=2.9.*,>=2.7.0
12
+ Requires-Dist: tqdm>=4.66.1
13
+ Requires-Dist: soundfile>=0.12.1
14
+ Requires-Dist: nltk>=3.8.2
15
+ Requires-Dist: pyannote-audio>=3.3.1
16
+ Requires-Dist: silero-vad~=6.0
17
+ Requires-Dist: ctranslate2>=4.4.0
18
+ Requires-Dist: msgspec
19
+ Requires-Dist: easyaligner==0.*
20
+
21
+ <div align="center"><img width="1020" height="340" alt="image" src="https://github.com/user-attachments/assets/7f1bdf33-5161-40c1-b6a7-6f1f586e030b" /></div>
22
+
23
+
24
+ `easytranscriber` is an automatic speech recognition library built for efficient, large-scale transcription with accurate word-level timestamps. The library is backend-agnostic, featuring modular, parallelizable pipeline components (VAD, transcription, feature/emission extraction, forced alignment), with support for both `ctranslate2` and `Hugging Face` inference backends. Notable features include:
25
+
26
+ * **GPU accelerated forced alignment**, using [Pytorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html). Forced alignment is based on a GPU implementation of the Viterbi algorithm ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
27
+ * **Parallel loading and pre-fetching of audio files** for efficient data loading and batch processing.
28
+ * **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess ASR outputs before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
29
+ * **35% to 102% faster inference compared to [`WhisperX`](https://github.com/m-bain/whisperX)**. See the [benchmarks](#benchmarks) for more details.
30
+ * Batch inference support for both wav2vec2 and Whisper models.
31
+
32
+ ### Benchmarks
33
+
34
+ ![Benchmarks](benchmarks/plots/all_speedup.png)
@@ -0,0 +1,14 @@
1
+ <div align="center"><img width="1020" height="340" alt="image" src="https://github.com/user-attachments/assets/7f1bdf33-5161-40c1-b6a7-6f1f586e030b" /></div>
2
+
3
+
4
+ `easytranscriber` is an automatic speech recognition library built for efficient, large-scale transcription with accurate word-level timestamps. The library is backend-agnostic, featuring modular, parallelizable pipeline components (VAD, transcription, feature/emission extraction, forced alignment), with support for both `ctranslate2` and `Hugging Face` inference backends. Notable features include:
5
+
6
+ * **GPU accelerated forced alignment**, using [Pytorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html). Forced alignment is based on a GPU implementation of the Viterbi algorithm ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
7
+ * **Parallel loading and pre-fetching of audio files** for efficient data loading and batch processing.
8
+ * **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess ASR outputs before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
9
+ * **35% to 102% faster inference compared to [`WhisperX`](https://github.com/m-bain/whisperX)**. See the [benchmarks](#benchmarks) for more details.
10
+ * Batch inference support for both wav2vec2 and Whisper models.
11
+
12
+ ### Benchmarks
13
+
14
+ ![Benchmarks](benchmarks/plots/all_speedup.png)
@@ -0,0 +1,57 @@
1
+ [build-system]
2
+ requires = ["setuptools>=67.0.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ version = "0.1.0"
7
+ name = "easytranscriber"
8
+ requires-python = ">= 3.10"
9
+ description = "Speech recognition with accurate word-level timestamps."
10
+ readme = "README.md"
11
+ authors = [{ name = "Faton Rekathati" }]
12
+
13
+ dependencies = [
14
+ "transformers>=4.45.0",
15
+ "torch>=2.7.0,!=2.9.*",
16
+ "torchaudio>=2.7.0,!=2.9.*",
17
+ "tqdm>=4.66.1",
18
+ "soundfile>=0.12.1",
19
+ "nltk>=3.8.2",
20
+ "pyannote-audio>=3.3.1",
21
+ "silero-vad~=6.0",
22
+ "ctranslate2>=4.4.0",
23
+ "msgspec",
24
+ "easyaligner==0.*"
25
+ ]
26
+
27
+ [project.urls]
28
+ Repository = "https://github.com/kb-labb/easytranscriber"
29
+
30
+ [tool.uv.sources]
31
+ torch = [
32
+ { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
33
+ { index = "pytorch-cuda", marker = "sys_platform != 'darwin'" },
34
+ ]
35
+ torchaudio = [
36
+ { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
37
+ { index = "pytorch-cuda", marker = "sys_platform != 'darwin'" },
38
+ ]
39
+
40
+ [[tool.uv.index]]
41
+ name = "pytorch-cuda"
42
+ url = "https://download.pytorch.org/whl/cu128"
43
+ explicit = true
44
+
45
+ [[tool.uv.index]]
46
+ name = "pytorch-cpu"
47
+ url = "https://download.pytorch.org/whl/cpu"
48
+ explicit = true
49
+
50
+ [tool.ruff]
51
+ line-length = 99
52
+ indent-width = 4
53
+ target-version = "py310"
54
+
55
+ [tool.ruff.lint]
56
+ # Disable fix for unused imports (`F401`).
57
+ unfixable = ["F401"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,245 @@
1
+ """
2
+ CTranslate2-based Whisper transcription module.
3
+
4
+ This module provides a transcribe function using ctranslate2 for efficient
5
+ Whisper inference, mirroring the HuggingFace implementation in hf.py.
6
+ """
7
+
8
+ import logging
9
+ from pathlib import Path
10
+
11
+ import ctranslate2
12
+ import numpy as np
13
+ import torch
14
+ from easyaligner.utils import save_metadata_json, save_metadata_msgpack
15
+ from easytranscriber.data.collators import transcribe_collate_fn
16
+ from tqdm import tqdm
17
+ from transformers import WhisperProcessor
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def transcribe(
    model: ctranslate2.models.Whisper,
    processor: WhisperProcessor,
    file_dataloader: torch.utils.data.DataLoader,
    language: str | None = None,
    task: str = "transcribe",
    batch_size: int = 8,
    beam_size: int = 5,
    patience: float = 1.0,
    length_penalty: float = 1.0,
    repetition_penalty: float = 1.0,
    no_repeat_ngram_size: int = 0,
    max_length: int = 448,
    suppress_blank: bool = True,
    num_workers: int = 2,
    prefetch_factor: int = 2,
    output_dir: str = "output/transcriptions",
):
    """
    Transcribe audio files using CTranslate2 Whisper model.

    This function processes audio files through a dataloader structure similar
    to the HuggingFace implementation, but uses ctranslate2 for inference.

    Parameters
    ----------
    model : ctranslate2.models.Whisper
        CTranslate2 Whisper model.
    processor : transformers.WhisperProcessor
        WhisperProcessor for tokenization and decoding.
    file_dataloader : torch.utils.data.DataLoader
        DataLoader yielding audio file datasets.
    language : str, optional
        Language code (e.g., 'sv', 'en'). If None, auto-detect.
    task : str, optional
        Task type - 'transcribe' or 'translate'.
    batch_size : int, optional
        Batch size for feature processing.
    beam_size : int, optional
        Beam size for search. Default is 5.
    patience : float, optional
        Beam search patience factor. Default is 1.0.
    length_penalty : float, optional
        Length penalty for beam search. Default is 1.0.
    repetition_penalty : float, optional
        Repetition penalty. Default is 1.0.
    no_repeat_ngram_size : int, optional
        N-gram size for no repeat. Default is 0.
    max_length : int, optional
        Maximum output length. Default is 448.
    suppress_blank : bool, optional
        Whether to suppress blank tokens. Default is True.
    num_workers : int, optional
        Number of workers for feature dataloader (file dataloader is created outside
        of this function).
    prefetch_factor : int, optional
        Prefetch factor for feature dataloader (file dataloader is created outside
        of this function).
    output_dir : str, optional
        Directory to save transcription JSON files. Default is `output/transcriptions`.
    """
    for features in tqdm(file_dataloader, desc="Transcribing audio files"):
        slice_dataset = features[0]["dataset"]
        metadata = slice_dataset.metadata
        transcription_texts = []
        language_detections = []

        feature_dataloader = torch.utils.data.DataLoader(
            slice_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            prefetch_factor=prefetch_factor,
            pin_memory=True,
            collate_fn=transcribe_collate_fn,
        )

        logger.info(f"Transcribing {metadata.audio_path} ...")

        for batch in feature_dataloader:
            # ctranslate2 consumes numpy arrays rather than torch tensors.
            batch_features = batch["features"].numpy()
            current_batch_size = batch_features.shape[0]

            # Convert to ctranslate2 StorageView
            features_ct2 = ctranslate2.StorageView.from_array(batch_features)

            if language is not None:
                # Build the same forced prompt for every item in the batch.
                prompt_tokens = [
                    "<|startoftranscript|>",
                    f"<|{language}|>",
                    f"<|{task}|>",
                    "<|notimestamps|>",
                ]
                prompt_ids = processor.tokenizer.convert_tokens_to_ids(prompt_tokens)
                prompt_ids = [prompt_ids] * current_batch_size
            else:
                # Auto-detect language per chunk and build per-item prompts.
                languages = detect_language(model, features_ct2)
                language_detections.extend(languages)
                prompt_ids = []
                for curr_lang in languages:
                    prompt_tokens = [
                        "<|startoftranscript|>",
                        curr_lang["language"],
                        f"<|{task}|>",
                        "<|notimestamps|>",
                    ]
                    prompt_ids.append(processor.tokenizer.convert_tokens_to_ids(prompt_tokens))

            # Generate transcriptions
            outputs = model.generate(
                features_ct2,
                prompt_ids,
                beam_size=beam_size,
                patience=patience,
                length_penalty=length_penalty,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                max_length=max_length,
                suppress_blank=suppress_blank,
            )

            # Decode only the top beam of each result.
            sequences = [result.sequences_ids[0] for result in outputs]
            transcription = processor.batch_decode(sequences, skip_special_tokens=True)

            transcription_texts.extend(transcription)

        # BUGFIX: transcription_texts / language_detections are flat lists over
        # every chunk in the file, so index them with a running counter. The
        # previous per-speech index `j` reset for each speech, reusing speech
        # 0's transcriptions for all subsequent speeches.
        chunk_idx = 0
        for speech in metadata.speeches:
            for chunk in speech.chunks:
                chunk.text = transcription_texts[chunk_idx].strip()
                if len(language_detections) > 0:
                    chunk.language = language_detections[chunk_idx]["language"]
                    chunk.language_prob = language_detections[chunk_idx]["probability"]
                chunk_idx += 1

        # save_metadata_json derives the destination from output_dir; ensure
        # the (possibly nested) directory exists first.
        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        save_metadata_json(metadata, output_dir=output_dir)
162
+
163
+
164
def lang_detect_only(
    model,
    file_dataloader,
    batch_size=8,
    num_workers=2,
    prefetch_factor=2,
    output_dir=None,
):
    """
    Run language detection only.

    Parameters
    ----------
    model : ctranslate2.models.Whisper
        CTranslate2 Whisper model.
    file_dataloader : torch.utils.data.DataLoader
        DataLoader yielding audio file datasets.
    batch_size : int, optional
        Batch size. Default is 8.
    num_workers : int, optional
        Number of workers. Default is 2.
    prefetch_factor : int, optional
        Prefetch factor. Default is 2.
    output_dir : str, optional
        Output directory. Default is None.
    """
    for features in file_dataloader:
        slice_dataset = features[0]["dataset"]
        metadata = slice_dataset.metadata
        language_detections = []

        feature_dataloader = torch.utils.data.DataLoader(
            slice_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            prefetch_factor=prefetch_factor,
            pin_memory=True,
            collate_fn=transcribe_collate_fn,
        )

        for batch in feature_dataloader:
            batch_features = batch["features"].numpy()
            features_ct2 = ctranslate2.StorageView.from_array(batch_features)
            # BUGFIX: extend (not append) so language_detections stays a flat
            # list of per-chunk dicts; appending nested each batch's list and
            # broke the `["language"]` lookups below.
            language_detections.extend(detect_language(model, features_ct2))

        # BUGFIX: index with a running counter across all speeches — the flat
        # detection list does not restart at each speech boundary.
        chunk_idx = 0
        for speech in metadata.speeches:
            for chunk in speech.chunks:
                chunk.language = language_detections[chunk_idx]["language"]
                # NOTE(review): transcribe() writes `chunk.language_prob`;
                # confirm which attribute name the metadata schema defines.
                chunk.language_probability = language_detections[chunk_idx]["probability"]
                chunk_idx += 1

        # Save detection results to file; ensure the directory exists first.
        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        save_metadata_json(metadata, output_dir=output_dir)
219
+
220
+
221
def detect_language(model: ctranslate2.models.Whisper, features: ctranslate2.StorageView) -> list:
    """
    Return the highest probability language for each chunk in the features batch.

    Parameters
    ----------
    model : ctranslate2.models.Whisper
        CTranslate2 Whisper model.
    features : ctranslate2.StorageView
        Input features.

    Returns
    -------
    list
        List of dicts containing 'language' and 'probability'.
    """
    # model.detect_language returns, per chunk, a list of (language, probability)
    # tuples sorted by probability; keep only the best candidate for each chunk.
    return [
        {"language": candidates[0][0], "probability": candidates[0][1]}
        for candidates in model.detect_language(features)
    ]
@@ -0,0 +1,106 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import torch
5
+ from easyaligner.data.dataset import AudioFileDataset
6
+ from easyaligner.utils import save_metadata_json
7
+ from easytranscriber.data.collators import transcribe_collate_fn
8
+ from tqdm import tqdm
9
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def transcribe(
    model: WhisperForConditionalGeneration,
    processor: WhisperProcessor,
    file_dataloader: torch.utils.data.DataLoader,
    language: str | None = None,
    task: str = "transcribe",
    batch_size: int = 4,
    beam_size: int = 5,
    length_penalty: float = 1.0,
    repetition_penalty: float = 1.0,
    max_length: int = 250,
    num_workers: int = 2,
    prefetch_factor: int = 2,
    output_dir: str = "output/transcriptions",
    device: str = "cuda",
):
    """
    Transcribe audio files using HuggingFace Whisper model.

    Parameters
    ----------
    model : transformers.WhisperForConditionalGeneration
        HuggingFace Whisper model.
    processor : transformers.WhisperProcessor
        HuggingFace Whisper processor.
    file_dataloader : torch.utils.data.DataLoader
        DataLoader yielding audio file datasets.
    language : str, optional
        Language code (e.g., 'sv', 'en'). Default is `None` (auto-detect).
    task : str, optional
        Task type - 'transcribe' or 'translate'.
    batch_size : int, optional
        Batch size for inference.
    beam_size : int, optional
        Number of beams for beam search. Default is 5.
    length_penalty : float, optional
        Length penalty. Default is 1.0.
    repetition_penalty : float, optional
        Repetition penalty. Default is 1.0.
    max_length : int, optional
        Maximum length of generated text. Default is 250.
    num_workers : int, optional
        Number of workers for feature dataloader.
    prefetch_factor : int, optional
        Prefetch factor for feature dataloader.
    output_dir : str, optional
        Directory to save transcription JSON files. Default is `output/transcriptions`.
    device : str, optional
        Device to run inference on. Default is `cuda`.
    """
    for features in tqdm(file_dataloader, desc="Transcribing audio files"):
        slice_dataset = features[0]["dataset"]
        metadata = slice_dataset.metadata
        transcription_texts = []

        feature_dataloader = torch.utils.data.DataLoader(
            slice_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            prefetch_factor=prefetch_factor,
            collate_fn=transcribe_collate_fn,
        )

        logger.info(f"Transcribing {metadata.audio_path} ...")

        for batch in feature_dataloader:
            with torch.inference_mode():
                # NOTE(review): features are cast to fp16 unconditionally —
                # assumes the model was loaded in half precision; confirm.
                batch = batch["features"].to(device).half()
                predicted_ids = model.generate(
                    batch,
                    return_dict_in_generate=True,
                    task=task,
                    language=language,
                    output_scores=False,
                    max_length=max_length,
                    num_beams=beam_size,
                    repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty,
                    early_stopping=True,
                )

            transcription = processor.batch_decode(
                predicted_ids["sequences"], skip_special_tokens=True
            )

            transcription_texts.extend(transcription)

        # BUGFIX: transcription_texts is a flat list over every chunk in the
        # file, so index it with a running counter. The previous per-speech
        # index `j` reset at each speech, reusing speech 0's transcriptions
        # for all subsequent speeches.
        chunk_idx = 0
        for speech in metadata.speeches:
            for chunk in speech.chunks:
                chunk.text = transcription_texts[chunk_idx].strip()
                chunk_idx += 1

        # Write final transcription to file with msgspec serialization;
        # ensure the (possibly nested) directory exists first.
        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        save_metadata_json(metadata, output_dir=output_dir)
@@ -0,0 +1,148 @@
1
+ import logging
2
+ import subprocess
3
+ from pathlib import Path
4
+ from typing import Tuple
5
+
6
+ import numpy as np
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def read_audio_segment(
    audio_path: str | Path,
    start_sec: float,
    duration_sec: float,
    sample_rate: int = 16000,
) -> np.ndarray:
    """
    Read a segment of audio using ffmpeg subprocess with seek.

    Uses ffmpeg's fast seek (-ss before -i) to efficiently read only the
    required segment, with resampling to the target sample rate and mono conversion.

    Parameters
    ----------
    audio_path : str or Path
        Path to the audio file.
    start_sec : float
        Start time in seconds.
    duration_sec : float
        Duration to read in seconds.
    sample_rate : int, optional
        Target sample rate for resampling.

    Returns
    -------
    np.ndarray
        Audio data as float32 numpy array.
    """
    # Assemble the ffmpeg invocation as option/value pairs. Placing -ss
    # before -i enables demuxer-level (fast) seeking.
    cmd = ["ffmpeg"]
    cmd += ["-ss", str(start_sec)]        # seek to segment start
    cmd += ["-i", str(audio_path)]
    cmd += ["-t", str(duration_sec)]      # limit read length
    cmd += ["-ar", str(sample_rate)]      # resample
    cmd += ["-ac", "1"]                   # downmix to mono
    cmd += ["-f", "f32le"]                # raw float32 little-endian
    cmd += ["-loglevel", "error"]
    cmd.append("pipe:1")                  # stream result to stdout

    try:
        proc = subprocess.run(cmd, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"ffmpeg error reading {audio_path}: {e.stderr.decode()}")
        raise
    return np.frombuffer(proc.stdout, dtype=np.float32)
65
+
66
+
67
def convert_audio_to_array(input_file: str, sample_rate: int = 16000) -> Tuple[np.ndarray, int]:
    """
    Convert audio to in-memory numpy array.

    Parameters
    ----------
    input_file : str
        Path to the input audio file.
    sample_rate : int, optional
        Target sample rate.

    Returns
    -------
    Tuple[np.ndarray, int]
        Tuple containing the audio array (int16) and the sample rate.

    Raises
    ------
    RuntimeError
        If ffmpeg command fails.
    """
    # fmt: off
    command = [
        "ffmpeg",
        "-i", input_file,
        "-f", "s16le",            # raw PCM 16-bit little endian
        "-acodec", "pcm_s16le",
        "-ac", "1",               # mono
        "-ar", str(sample_rate),  # e.g. 16 kHz
        "-loglevel", "error",     # suppress output
        "-hide_banner",
        "-nostats",
        # BUGFIX: ffmpeg requires an explicit output target. Without "pipe:1"
        # the command always fails ("At least one output file must be
        # specified") and no audio is ever captured on stdout.
        "pipe:1",
    ]
    # fmt: on

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = process.communicate()

    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {err.decode()}")

    # Convert byte output to numpy array
    audio_array = np.frombuffer(out, dtype=np.int16)

    return audio_array, sample_rate  # (samples, sample_rate)
112
+
113
+
114
def convert_audio_to_wav(input_file: str, output_file: str) -> None:
    """
    Convert audio file to WAV format with 16kHz sample rate and mono channel.

    Parameters
    ----------
    input_file : str
        Path to the input audio file.
    output_file : str
        Path to the output WAV file.

    Raises
    ------
    RuntimeError
        If ffmpeg command fails.
    """
    # fmt: off
    command = [
        "ffmpeg",
        "-i", input_file,
        "-ar", "16000",          # 16 kHz sample rate
        "-ac", "1",              # single (mono) channel
        "-c:a", "pcm_s16le",     # signed 16-bit PCM codec
        "-loglevel", "warning",
        "-hide_banner",
        "-nostats",
        "-nostdin",
        output_file,
    ]
    # fmt: on
    proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {proc.stderr.decode()}")
@@ -0,0 +1,3 @@
1
+ from easytranscriber.data.dataset import StreamingAudioFileDataset
2
+
3
+ __all__ = ["StreamingAudioFileDataset"]
@@ -0,0 +1,30 @@
1
+ import torch
2
+
3
+
4
def transcribe_collate_fn(batch: list[dict]) -> dict:
    """
    Collate function for transcription.

    Parameters
    ----------
    batch : list of dict
        List of samples from the dataset.

    Returns
    -------
    dict
        Collated batch with 'features', 'start_times', and 'speech_ids'.
    """
    # Drop None entries once, then pull out each field.
    valid_samples = [sample for sample in batch if sample is not None]

    # Features are concatenated along dim 0, preserving the batch dimension.
    stacked_features = torch.cat([sample["feature"] for sample in valid_samples], dim=0)

    return {
        "features": stacked_features,
        "start_times": [sample["start_time_global"] for sample in valid_samples],
        "speech_ids": [sample["speech_id"] for sample in valid_samples],
    }
+ }