easyaligner 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {easyaligner-0.2.1/src/easyaligner.egg-info → easyaligner-0.3.0}/PKG-INFO +33 -16
- {easyaligner-0.2.1 → easyaligner-0.3.0}/README.md +29 -12
- {easyaligner-0.2.1 → easyaligner-0.3.0}/pyproject.toml +4 -4
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/pipelines.py +4 -23
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/vad/pyannote.py +2 -2
- {easyaligner-0.2.1 → easyaligner-0.3.0/src/easyaligner.egg-info}/PKG-INFO +33 -16
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner.egg-info/requires.txt +3 -3
- {easyaligner-0.2.1 → easyaligner-0.3.0}/LICENSE +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/setup.cfg +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/__init__.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/alignment/__init__.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/alignment/pytorch.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/alignment/utils.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/data/__init__.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/data/collators.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/data/datamodel.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/data/dataset.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/data/utils.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/text/__init__.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/text/languages/sv.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/text/match.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/text/normalization.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/text/tokenizer.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/utils.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/vad/__init__.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/vad/silero.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/vad/utils.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner/vad/vad.py +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner.egg-info/SOURCES.txt +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner.egg-info/dependency_links.txt +0 -0
- {easyaligner-0.2.1 → easyaligner-0.3.0}/src/easyaligner.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: easyaligner
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Forced alignment pipeline designed for efficiency and ease of use.
|
|
5
5
|
Author: Faton Rekathati
|
|
6
6
|
Project-URL: Repository, https://github.com/kb-labb/easyaligner
|
|
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: transformers>=4.45.0
|
|
11
|
-
Requires-Dist: torch
|
|
12
|
-
Requires-Dist: torchaudio
|
|
11
|
+
Requires-Dist: torch<2.9,>=2.7.0
|
|
12
|
+
Requires-Dist: torchaudio<2.9,>=2.7.0
|
|
13
13
|
Requires-Dist: tqdm>=4.66.1
|
|
14
14
|
Requires-Dist: soundfile>=0.12.1
|
|
15
15
|
Requires-Dist: nltk>=3.8.2
|
|
16
|
-
Requires-Dist: pyannote-audio>=3.3.1
|
|
16
|
+
Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
|
|
17
17
|
Requires-Dist: silero-vad~=6.0
|
|
18
18
|
Requires-Dist: msgspec
|
|
19
19
|
Requires-Dist: rapidfuzz
|
|
@@ -27,8 +27,9 @@ Dynamic: license-file
|
|
|
27
27
|
|
|
28
28
|
* **GPU accelerated forced alignment**. Uses [Pytorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html) with a GPU based implementation of the Viterbi algorithm. Enables fast and memory-efficient forced alignment of long audio segments ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
|
|
29
29
|
* **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess transcripts before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
|
|
30
|
-
* **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
|
|
31
|
-
|
|
30
|
+
* **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
|
|
31
|
+
|
|
32
|
+
Check out the [documentation](https://kb-labb.github.io/easyaligner/) for more details and tutorials!
|
|
32
33
|
|
|
33
34
|
## Installation
|
|
34
35
|
|
|
@@ -68,13 +69,21 @@ from easyaligner.pipelines import pipeline
|
|
|
68
69
|
from easyaligner.text import text_normalizer
|
|
69
70
|
from easyaligner.vad.pyannote import load_vad_model
|
|
70
71
|
|
|
72
|
+
filepath_pattern = "tale-of-two-cities_align-en/taleoftwocities_01_dickens_64kb_align.mp3"
|
|
73
|
+
|
|
74
|
+
# Download mp3 from Hugging Face Hub
|
|
71
75
|
snapshot_download(
|
|
72
76
|
"Lauler/easytranscriber_tutorials",
|
|
73
77
|
repo_type="dataset",
|
|
74
78
|
local_dir="data/tutorials",
|
|
75
|
-
allow_patterns=
|
|
79
|
+
allow_patterns=filepath_pattern,
|
|
76
80
|
)
|
|
77
81
|
|
|
82
|
+
# File(s) to align
|
|
83
|
+
filepath = Path("data/tutorials") / filepath_pattern
|
|
84
|
+
audio_dir = filepath.parent
|
|
85
|
+
audio_files = [filepath.name]
|
|
86
|
+
|
|
78
87
|
text = """
|
|
79
88
|
It was the best of times, it was the worst of times, it was the age of
|
|
80
89
|
wisdom, it was the age of foolishness, it was the epoch of belief, it
|
|
@@ -90,26 +99,21 @@ evil, in the superlative degree of comparison only.
|
|
|
90
99
|
text = text.strip()
|
|
91
100
|
|
|
92
101
|
# The alignments will be organized according to how the text is tokenized
|
|
93
|
-
tokenizer = load_tokenizer(language="english")
|
|
94
|
-
span_list = list(tokenizer.span_tokenize(text))
|
|
102
|
+
tokenizer = load_tokenizer(language="english") # sentence tokenizer
|
|
103
|
+
span_list = list(tokenizer.span_tokenize(text)) # start, end character indices for each sentence
|
|
95
104
|
speeches = [[SpeechSegment(speech_id=0, text=text, text_spans=span_list, start=None, end=None)]]
|
|
96
105
|
|
|
97
106
|
# Load models and run pipeline
|
|
98
107
|
model_vad = load_vad_model()
|
|
99
|
-
model = (
|
|
100
|
-
AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
|
|
101
|
-
)
|
|
108
|
+
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
|
|
102
109
|
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
|
|
103
110
|
|
|
104
|
-
# File(s) to align
|
|
105
|
-
audio_files = [file.name for file in Path("data/tutorials/tale-of-two-cities_align-en").glob("*")]
|
|
106
|
-
|
|
107
111
|
pipeline(
|
|
108
112
|
vad_model=model_vad,
|
|
109
113
|
emissions_model=model,
|
|
110
114
|
processor=processor,
|
|
111
115
|
audio_paths=audio_files,
|
|
112
|
-
audio_dir=
|
|
116
|
+
audio_dir=audio_dir,
|
|
113
117
|
speeches=speeches,
|
|
114
118
|
alignment_strategy="speech",
|
|
115
119
|
text_normalizer_fn=text_normalizer,
|
|
@@ -148,3 +152,16 @@ The `output/emissions` directory will, in addition to the JSON files, also conta
|
|
|
148
152
|
|
|
149
153
|
All intermediate files can safely be deleted, assuming there is no need to re-run the pipeline from a specific intermediate stage.
|
|
150
154
|
|
|
155
|
+
## Citation
|
|
156
|
+
|
|
157
|
+
If you use `easyaligner` in your research, consider citing the following blog post:
|
|
158
|
+
|
|
159
|
+
```
|
|
160
|
+
@online{rekathati2026,
|
|
161
|
+
author = {Rekathati, Faton},
|
|
162
|
+
title = {Easyaligner: {Forced} Alignment of Text and Audio, Made Easy},
|
|
163
|
+
date = {2026-04-08},
|
|
164
|
+
url = {https://kb-labb.github.io/posts/2026-04-08-easyaligner/},
|
|
165
|
+
langid = {en}
|
|
166
|
+
}
|
|
167
|
+
```
|
|
@@ -6,8 +6,9 @@
|
|
|
6
6
|
|
|
7
7
|
* **GPU accelerated forced alignment**. Uses [Pytorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html) with a GPU based implementation of the Viterbi algorithm. Enables fast and memory-efficient forced alignment of long audio segments ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
|
|
8
8
|
* **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess transcripts before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
|
|
9
|
-
* **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
|
|
10
|
-
|
|
9
|
+
* **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
|
|
10
|
+
|
|
11
|
+
Check out the [documentation](https://kb-labb.github.io/easyaligner/) for more details and tutorials!
|
|
11
12
|
|
|
12
13
|
## Installation
|
|
13
14
|
|
|
@@ -47,13 +48,21 @@ from easyaligner.pipelines import pipeline
|
|
|
47
48
|
from easyaligner.text import text_normalizer
|
|
48
49
|
from easyaligner.vad.pyannote import load_vad_model
|
|
49
50
|
|
|
51
|
+
filepath_pattern = "tale-of-two-cities_align-en/taleoftwocities_01_dickens_64kb_align.mp3"
|
|
52
|
+
|
|
53
|
+
# Download mp3 from Hugging Face Hub
|
|
50
54
|
snapshot_download(
|
|
51
55
|
"Lauler/easytranscriber_tutorials",
|
|
52
56
|
repo_type="dataset",
|
|
53
57
|
local_dir="data/tutorials",
|
|
54
|
-
allow_patterns=
|
|
58
|
+
allow_patterns=filepath_pattern,
|
|
55
59
|
)
|
|
56
60
|
|
|
61
|
+
# File(s) to align
|
|
62
|
+
filepath = Path("data/tutorials") / filepath_pattern
|
|
63
|
+
audio_dir = filepath.parent
|
|
64
|
+
audio_files = [filepath.name]
|
|
65
|
+
|
|
57
66
|
text = """
|
|
58
67
|
It was the best of times, it was the worst of times, it was the age of
|
|
59
68
|
wisdom, it was the age of foolishness, it was the epoch of belief, it
|
|
@@ -69,26 +78,21 @@ evil, in the superlative degree of comparison only.
|
|
|
69
78
|
text = text.strip()
|
|
70
79
|
|
|
71
80
|
# The alignments will be organized according to how the text is tokenized
|
|
72
|
-
tokenizer = load_tokenizer(language="english")
|
|
73
|
-
span_list = list(tokenizer.span_tokenize(text))
|
|
81
|
+
tokenizer = load_tokenizer(language="english") # sentence tokenizer
|
|
82
|
+
span_list = list(tokenizer.span_tokenize(text)) # start, end character indices for each sentence
|
|
74
83
|
speeches = [[SpeechSegment(speech_id=0, text=text, text_spans=span_list, start=None, end=None)]]
|
|
75
84
|
|
|
76
85
|
# Load models and run pipeline
|
|
77
86
|
model_vad = load_vad_model()
|
|
78
|
-
model = (
|
|
79
|
-
AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
|
|
80
|
-
)
|
|
87
|
+
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
|
|
81
88
|
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
|
|
82
89
|
|
|
83
|
-
# File(s) to align
|
|
84
|
-
audio_files = [file.name for file in Path("data/tutorials/tale-of-two-cities_align-en").glob("*")]
|
|
85
|
-
|
|
86
90
|
pipeline(
|
|
87
91
|
vad_model=model_vad,
|
|
88
92
|
emissions_model=model,
|
|
89
93
|
processor=processor,
|
|
90
94
|
audio_paths=audio_files,
|
|
91
|
-
audio_dir=
|
|
95
|
+
audio_dir=audio_dir,
|
|
92
96
|
speeches=speeches,
|
|
93
97
|
alignment_strategy="speech",
|
|
94
98
|
text_normalizer_fn=text_normalizer,
|
|
@@ -127,3 +131,16 @@ The `output/emissions` directory will, in addition to the JSON files, also conta
|
|
|
127
131
|
|
|
128
132
|
All intermediate files can safely be deleted, assuming there is no need to re-run the pipeline from a specific intermediate stage.
|
|
129
133
|
|
|
134
|
+
## Citation
|
|
135
|
+
|
|
136
|
+
If you use `easyaligner` in your research, consider citing the following blog post:
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
@online{rekathati2026,
|
|
140
|
+
author = {Rekathati, Faton},
|
|
141
|
+
title = {Easyaligner: {Forced} Alignment of Text and Audio, Made Easy},
|
|
142
|
+
date = {2026-04-08},
|
|
143
|
+
url = {https://kb-labb.github.io/posts/2026-04-08-easyaligner/},
|
|
144
|
+
langid = {en}
|
|
145
|
+
}
|
|
146
|
+
```
|
|
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
|
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
|
-
version = "0.2.1"
|
|
6
|
+
version = "0.3.0"
|
|
7
7
|
name = "easyaligner"
|
|
8
8
|
requires-python = ">= 3.10"
|
|
9
9
|
description = "Forced alignment pipeline designed for efficiency and ease of use."
|
|
@@ -12,12 +12,12 @@ authors = [{ name = "Faton Rekathati" }]
|
|
|
12
12
|
|
|
13
13
|
dependencies = [
|
|
14
14
|
"transformers>=4.45.0",
|
|
15
|
-
"torch>=2.7.0
|
|
16
|
-
"torchaudio>=2.7.0
|
|
15
|
+
"torch>=2.7.0,<2.9",
|
|
16
|
+
"torchaudio>=2.7.0,<2.9",
|
|
17
17
|
"tqdm>=4.66.1",
|
|
18
18
|
"soundfile>=0.12.1",
|
|
19
19
|
"nltk>=3.8.2",
|
|
20
|
-
"pyannote-audio>=3.3.1",
|
|
20
|
+
"pyannote-audio>=3.3.1,<4.0.4",
|
|
21
21
|
"silero-vad~=6.0",
|
|
22
22
|
"msgspec",
|
|
23
23
|
"rapidfuzz"
|
|
@@ -44,7 +44,6 @@ def vad_pipeline_generator(
|
|
|
44
44
|
chunk_size: int = 30,
|
|
45
45
|
sample_rate: int = 16000,
|
|
46
46
|
metadata: list[dict] | None = None,
|
|
47
|
-
batch_size: int = 1,
|
|
48
47
|
num_workers: int = 1,
|
|
49
48
|
prefetch_factor: int = 2,
|
|
50
49
|
save_json: bool = True,
|
|
@@ -73,8 +72,6 @@ def vad_pipeline_generator(
|
|
|
73
72
|
The sample rate to resample the audio to before running VAD.
|
|
74
73
|
metadata : list[dict] or None, optional
|
|
75
74
|
Optional list of additional file level metadata to include.
|
|
76
|
-
batch_size : int, default 1
|
|
77
|
-
The batch size for the DataLoader.
|
|
78
75
|
num_workers : int, default 1
|
|
79
76
|
The number of workers for the DataLoader.
|
|
80
77
|
prefetch_factor : int, default 2
|
|
@@ -99,7 +96,7 @@ def vad_pipeline_generator(
|
|
|
99
96
|
)
|
|
100
97
|
vad_dataloader = torch.utils.data.DataLoader(
|
|
101
98
|
vad_dataset,
|
|
102
|
-
batch_size=batch_size,
|
|
99
|
+
batch_size=1,
|
|
103
100
|
shuffle=False,
|
|
104
101
|
collate_fn=vad_collate_fn,
|
|
105
102
|
num_workers=num_workers,
|
|
@@ -163,7 +160,6 @@ def vad_pipeline(
|
|
|
163
160
|
chunk_size: int = 30,
|
|
164
161
|
sample_rate: int = 16000,
|
|
165
162
|
metadata: list[dict] | None = None,
|
|
166
|
-
batch_size: int = 1,
|
|
167
163
|
num_workers: int = 1,
|
|
168
164
|
prefetch_factor: int = 2,
|
|
169
165
|
save_json: bool = True,
|
|
@@ -192,8 +188,6 @@ def vad_pipeline(
|
|
|
192
188
|
The sample rate to resample the audio to before running VAD.
|
|
193
189
|
metadata : list[dict] or None, optional
|
|
194
190
|
Optional list of additional file level metadata to include.
|
|
195
|
-
batch_size : int, default 1
|
|
196
|
-
The batch size for the DataLoader.
|
|
197
191
|
num_workers : int, default 1
|
|
198
192
|
The number of workers for the DataLoader.
|
|
199
193
|
prefetch_factor : int, default 2
|
|
@@ -222,7 +216,6 @@ def vad_pipeline(
|
|
|
222
216
|
chunk_size=chunk_size,
|
|
223
217
|
sample_rate=sample_rate,
|
|
224
218
|
metadata=metadata,
|
|
225
|
-
batch_size=batch_size,
|
|
226
219
|
num_workers=num_workers,
|
|
227
220
|
prefetch_factor=prefetch_factor,
|
|
228
221
|
save_json=save_json,
|
|
@@ -249,7 +242,6 @@ def emissions_pipeline_generator(
|
|
|
249
242
|
sample_rate: int = 16000,
|
|
250
243
|
chunk_size: int = 30,
|
|
251
244
|
alignment_strategy: str = "speech",
|
|
252
|
-
batch_size_files: int = 1,
|
|
253
245
|
num_workers_files: int = 1,
|
|
254
246
|
prefetch_factor_files: int = 2,
|
|
255
247
|
batch_size_features: int = 8,
|
|
@@ -287,8 +279,6 @@ def emissions_pipeline_generator(
|
|
|
287
279
|
Strategy for aligning features to text. One of 'speech' or 'chunk'.
|
|
288
280
|
If `speech`, audio is split into `chunk_size` sized chunks based on SpeechSegments.
|
|
289
281
|
If `chunk`, audio is taken from existing VAD chunks.
|
|
290
|
-
batch_size_files : int, default 1
|
|
291
|
-
Batch size for the file DataLoader.
|
|
292
282
|
num_workers_files : int, default 1
|
|
293
283
|
Number of workers for the file DataLoader.
|
|
294
284
|
prefetch_factor_files : int, default 2
|
|
@@ -333,7 +323,7 @@ def emissions_pipeline_generator(
|
|
|
333
323
|
|
|
334
324
|
file_dataloader = torch.utils.data.DataLoader(
|
|
335
325
|
file_dataset,
|
|
336
|
-
batch_size=batch_size_files,
|
|
326
|
+
batch_size=1,
|
|
337
327
|
shuffle=False,
|
|
338
328
|
collate_fn=audiofile_collate_fn,
|
|
339
329
|
num_workers=num_workers_files,
|
|
@@ -372,7 +362,7 @@ def emissions_pipeline_generator(
|
|
|
372
362
|
speech_ids = []
|
|
373
363
|
|
|
374
364
|
for batch in feature_dataloader:
|
|
375
|
-
features = batch["features"].
|
|
365
|
+
features = batch["features"].to(device=device, dtype=model.dtype)
|
|
376
366
|
|
|
377
367
|
with torch.inference_mode():
|
|
378
368
|
logits = model(features).logits
|
|
@@ -420,7 +410,6 @@ def emissions_pipeline(
|
|
|
420
410
|
sample_rate: int = 16000,
|
|
421
411
|
chunk_size: int = 30,
|
|
422
412
|
alignment_strategy: str = "speech",
|
|
423
|
-
batch_size_files: int = 1,
|
|
424
413
|
num_workers_files: int = 1,
|
|
425
414
|
prefetch_factor_files: int = 2,
|
|
426
415
|
batch_size_features: int = 8,
|
|
@@ -455,8 +444,6 @@ def emissions_pipeline(
|
|
|
455
444
|
Strategy for aligning features to text. One of 'speech' or 'chunk'.
|
|
456
445
|
If `speech`, audio is split into `chunk_size` sized chunks based on SpeechSegments.
|
|
457
446
|
If `chunk`, audio is taken from existing VAD chunks.
|
|
458
|
-
batch_size_files : int, default 1
|
|
459
|
-
Batch size for the file DataLoader.
|
|
460
447
|
num_workers_files : int, default 1
|
|
461
448
|
Number of workers for the file DataLoader.
|
|
462
449
|
prefetch_factor_files : int, default 2
|
|
@@ -495,7 +482,6 @@ def emissions_pipeline(
|
|
|
495
482
|
sample_rate=sample_rate,
|
|
496
483
|
chunk_size=chunk_size,
|
|
497
484
|
alignment_strategy=alignment_strategy,
|
|
498
|
-
batch_size_files=batch_size_files,
|
|
499
485
|
num_workers_files=num_workers_files,
|
|
500
486
|
prefetch_factor_files=prefetch_factor_files,
|
|
501
487
|
batch_size_features=batch_size_features,
|
|
@@ -773,7 +759,6 @@ def pipeline(
|
|
|
773
759
|
word_boundary: str = "|",
|
|
774
760
|
indent: int = 2,
|
|
775
761
|
ndigits: int = 5,
|
|
776
|
-
batch_size_files: int = 1,
|
|
777
762
|
num_workers_files: int = 2,
|
|
778
763
|
prefetch_factor_files: int = 1,
|
|
779
764
|
batch_size_features: int = 8,
|
|
@@ -839,8 +824,6 @@ def pipeline(
|
|
|
839
824
|
Indentation level for saved JSON files. `None` to disable pretty formatting.
|
|
840
825
|
ndigits : int, default 5
|
|
841
826
|
Number of decimal digits to round the alignment times and scores to.
|
|
842
|
-
batch_size_files : int, default 1
|
|
843
|
-
Batch size for the file DataLoader.
|
|
844
827
|
num_workers_files : int, default 2
|
|
845
828
|
Number of workers for the file DataLoader.
|
|
846
829
|
prefetch_factor_files : int, default 1
|
|
@@ -887,7 +870,6 @@ def pipeline(
|
|
|
887
870
|
speeches=speeches,
|
|
888
871
|
chunk_size=chunk_size,
|
|
889
872
|
sample_rate=sample_rate,
|
|
890
|
-
batch_size=batch_size_files,
|
|
891
873
|
num_workers=num_workers_files,
|
|
892
874
|
prefetch_factor=prefetch_factor_files,
|
|
893
875
|
save_json=save_json,
|
|
@@ -909,7 +891,6 @@ def pipeline(
|
|
|
909
891
|
sample_rate=sample_rate,
|
|
910
892
|
chunk_size=chunk_size,
|
|
911
893
|
alignment_strategy=alignment_strategy,
|
|
912
|
-
batch_size_files=batch_size_files,
|
|
913
894
|
num_workers_files=num_workers_files,
|
|
914
895
|
prefetch_factor_files=prefetch_factor_files,
|
|
915
896
|
batch_size_features=batch_size_features,
|
|
@@ -929,7 +910,7 @@ def pipeline(
|
|
|
929
910
|
)
|
|
930
911
|
json_dataloader = torch.utils.data.DataLoader(
|
|
931
912
|
json_dataset,
|
|
932
|
-
batch_size=batch_size_files,
|
|
913
|
+
batch_size=1,
|
|
933
914
|
shuffle=False,
|
|
934
915
|
collate_fn=metadata_collate_fn,
|
|
935
916
|
num_workers=num_workers_files,
|
|
@@ -431,7 +431,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
|
|
|
431
431
|
# Run VAD on entire audio
|
|
432
432
|
vad_segments = model(
|
|
433
433
|
{
|
|
434
|
-
"waveform": torch.
|
|
434
|
+
"waveform": torch.as_tensor(audio).unsqueeze(0).to(torch.float32),
|
|
435
435
|
"sample_rate": sample_rate,
|
|
436
436
|
}
|
|
437
437
|
)
|
|
@@ -455,7 +455,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
|
|
|
455
455
|
|
|
456
456
|
vad_segments = model(
|
|
457
457
|
{
|
|
458
|
-
"waveform": torch.
|
|
458
|
+
"waveform": torch.as_tensor(speech_audio).unsqueeze(0).to(torch.float32),
|
|
459
459
|
"sample_rate": sample_rate,
|
|
460
460
|
}
|
|
461
461
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: easyaligner
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Forced alignment pipeline designed for efficiency and ease of use.
|
|
5
5
|
Author: Faton Rekathati
|
|
6
6
|
Project-URL: Repository, https://github.com/kb-labb/easyaligner
|
|
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: transformers>=4.45.0
|
|
11
|
-
Requires-Dist: torch
|
|
12
|
-
Requires-Dist: torchaudio
|
|
11
|
+
Requires-Dist: torch<2.9,>=2.7.0
|
|
12
|
+
Requires-Dist: torchaudio<2.9,>=2.7.0
|
|
13
13
|
Requires-Dist: tqdm>=4.66.1
|
|
14
14
|
Requires-Dist: soundfile>=0.12.1
|
|
15
15
|
Requires-Dist: nltk>=3.8.2
|
|
16
|
-
Requires-Dist: pyannote-audio>=3.3.1
|
|
16
|
+
Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
|
|
17
17
|
Requires-Dist: silero-vad~=6.0
|
|
18
18
|
Requires-Dist: msgspec
|
|
19
19
|
Requires-Dist: rapidfuzz
|
|
@@ -27,8 +27,9 @@ Dynamic: license-file
|
|
|
27
27
|
|
|
28
28
|
* **GPU accelerated forced alignment**. Uses [Pytorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html) with a GPU based implementation of the Viterbi algorithm. Enables fast and memory-efficient forced alignment of long audio segments ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
|
|
29
29
|
* **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess transcripts before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
|
|
30
|
-
* **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
|
|
31
|
-
|
|
30
|
+
* **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
|
|
31
|
+
|
|
32
|
+
Check out the [documentation](https://kb-labb.github.io/easyaligner/) for more details and tutorials!
|
|
32
33
|
|
|
33
34
|
## Installation
|
|
34
35
|
|
|
@@ -68,13 +69,21 @@ from easyaligner.pipelines import pipeline
|
|
|
68
69
|
from easyaligner.text import text_normalizer
|
|
69
70
|
from easyaligner.vad.pyannote import load_vad_model
|
|
70
71
|
|
|
72
|
+
filepath_pattern = "tale-of-two-cities_align-en/taleoftwocities_01_dickens_64kb_align.mp3"
|
|
73
|
+
|
|
74
|
+
# Download mp3 from Hugging Face Hub
|
|
71
75
|
snapshot_download(
|
|
72
76
|
"Lauler/easytranscriber_tutorials",
|
|
73
77
|
repo_type="dataset",
|
|
74
78
|
local_dir="data/tutorials",
|
|
75
|
-
allow_patterns=
|
|
79
|
+
allow_patterns=filepath_pattern,
|
|
76
80
|
)
|
|
77
81
|
|
|
82
|
+
# File(s) to align
|
|
83
|
+
filepath = Path("data/tutorials") / filepath_pattern
|
|
84
|
+
audio_dir = filepath.parent
|
|
85
|
+
audio_files = [filepath.name]
|
|
86
|
+
|
|
78
87
|
text = """
|
|
79
88
|
It was the best of times, it was the worst of times, it was the age of
|
|
80
89
|
wisdom, it was the age of foolishness, it was the epoch of belief, it
|
|
@@ -90,26 +99,21 @@ evil, in the superlative degree of comparison only.
|
|
|
90
99
|
text = text.strip()
|
|
91
100
|
|
|
92
101
|
# The alignments will be organized according to how the text is tokenized
|
|
93
|
-
tokenizer = load_tokenizer(language="english")
|
|
94
|
-
span_list = list(tokenizer.span_tokenize(text))
|
|
102
|
+
tokenizer = load_tokenizer(language="english") # sentence tokenizer
|
|
103
|
+
span_list = list(tokenizer.span_tokenize(text)) # start, end character indices for each sentence
|
|
95
104
|
speeches = [[SpeechSegment(speech_id=0, text=text, text_spans=span_list, start=None, end=None)]]
|
|
96
105
|
|
|
97
106
|
# Load models and run pipeline
|
|
98
107
|
model_vad = load_vad_model()
|
|
99
|
-
model = (
|
|
100
|
-
AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
|
|
101
|
-
)
|
|
108
|
+
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
|
|
102
109
|
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
|
|
103
110
|
|
|
104
|
-
# File(s) to align
|
|
105
|
-
audio_files = [file.name for file in Path("data/tutorials/tale-of-two-cities_align-en").glob("*")]
|
|
106
|
-
|
|
107
111
|
pipeline(
|
|
108
112
|
vad_model=model_vad,
|
|
109
113
|
emissions_model=model,
|
|
110
114
|
processor=processor,
|
|
111
115
|
audio_paths=audio_files,
|
|
112
|
-
audio_dir=
|
|
116
|
+
audio_dir=audio_dir,
|
|
113
117
|
speeches=speeches,
|
|
114
118
|
alignment_strategy="speech",
|
|
115
119
|
text_normalizer_fn=text_normalizer,
|
|
@@ -148,3 +152,16 @@ The `output/emissions` directory will, in addition to the JSON files, also conta
|
|
|
148
152
|
|
|
149
153
|
All intermediate files can safely be deleted, assuming there is no need to re-run the pipeline from a specific intermediate stage.
|
|
150
154
|
|
|
155
|
+
## Citation
|
|
156
|
+
|
|
157
|
+
If you use `easyaligner` in your research, consider citing the following blog post:
|
|
158
|
+
|
|
159
|
+
```
|
|
160
|
+
@online{rekathati2026,
|
|
161
|
+
author = {Rekathati, Faton},
|
|
162
|
+
title = {Easyaligner: {Forced} Alignment of Text and Audio, Made Easy},
|
|
163
|
+
date = {2026-04-08},
|
|
164
|
+
url = {https://kb-labb.github.io/posts/2026-04-08-easyaligner/},
|
|
165
|
+
langid = {en}
|
|
166
|
+
}
|
|
167
|
+
```
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
transformers>=4.45.0
|
|
2
|
-
torch
|
|
3
|
-
torchaudio
|
|
2
|
+
torch<2.9,>=2.7.0
|
|
3
|
+
torchaudio<2.9,>=2.7.0
|
|
4
4
|
tqdm>=4.66.1
|
|
5
5
|
soundfile>=0.12.1
|
|
6
6
|
nltk>=3.8.2
|
|
7
|
-
pyannote-audio>=3.3.1
|
|
7
|
+
pyannote-audio<4.0.4,>=3.3.1
|
|
8
8
|
silero-vad~=6.0
|
|
9
9
|
msgspec
|
|
10
10
|
rapidfuzz
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|