easyaligner 0.2.3__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {easyaligner-0.2.3/src/easyaligner.egg-info → easyaligner-0.3.1}/PKG-INFO +4 -4
- {easyaligner-0.2.3 → easyaligner-0.3.1}/pyproject.toml +4 -4
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/alignment/utils.py +5 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/pyannote.py +12 -9
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/silero.py +25 -10
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/utils.py +29 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1/src/easyaligner.egg-info}/PKG-INFO +4 -4
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/requires.txt +3 -3
- {easyaligner-0.2.3 → easyaligner-0.3.1}/LICENSE +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/README.md +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/setup.cfg +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/__init__.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/alignment/__init__.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/alignment/pytorch.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/__init__.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/collators.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/datamodel.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/dataset.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/utils.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/pipelines.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/__init__.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/languages/sv.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/match.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/normalization.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/tokenizer.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/utils.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/__init__.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/vad.py +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/SOURCES.txt +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/dependency_links.txt +0 -0
- {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: easyaligner
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Forced alignment pipeline designed for efficiency and ease of use.
|
|
5
5
|
Author: Faton Rekathati
|
|
6
6
|
Project-URL: Repository, https://github.com/kb-labb/easyaligner
|
|
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: transformers>=4.45.0
|
|
11
|
-
Requires-Dist: torch>=2.7.0
|
|
12
|
-
Requires-Dist: torchaudio>=2.7.0
|
|
11
|
+
Requires-Dist: torch<2.9,>=2.7.0
|
|
12
|
+
Requires-Dist: torchaudio<2.9,>=2.7.0
|
|
13
13
|
Requires-Dist: tqdm>=4.66.1
|
|
14
14
|
Requires-Dist: soundfile>=0.12.1
|
|
15
15
|
Requires-Dist: nltk>=3.8.2
|
|
16
|
-
Requires-Dist: pyannote-audio>=3.3.1
|
|
16
|
+
Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
|
|
17
17
|
Requires-Dist: silero-vad~=6.0
|
|
18
18
|
Requires-Dist: msgspec
|
|
19
19
|
Requires-Dist: rapidfuzz
|
|
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
|
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
|
-
version = "0.2.3"
|
|
6
|
+
version = "0.3.1"
|
|
7
7
|
name = "easyaligner"
|
|
8
8
|
requires-python = ">= 3.10"
|
|
9
9
|
description = "Forced alignment pipeline designed for efficiency and ease of use."
|
|
@@ -12,12 +12,12 @@ authors = [{ name = "Faton Rekathati" }]
|
|
|
12
12
|
|
|
13
13
|
dependencies = [
|
|
14
14
|
"transformers>=4.45.0",
|
|
15
|
-
"torch>=2.7.0",
|
|
16
|
-
"torchaudio>=2.7.0",
|
|
15
|
+
"torch>=2.7.0,<2.9",
|
|
16
|
+
"torchaudio>=2.7.0,<2.9",
|
|
17
17
|
"tqdm>=4.66.1",
|
|
18
18
|
"soundfile>=0.12.1",
|
|
19
19
|
"nltk>=3.8.2",
|
|
20
|
-
"pyannote-audio>=3.3.1",
|
|
20
|
+
"pyannote-audio>=3.3.1,<4.0.4",
|
|
21
21
|
"silero-vad~=6.0",
|
|
22
22
|
"msgspec",
|
|
23
23
|
"rapidfuzz"
|
|
@@ -231,6 +231,11 @@ def segment_speech_probs(probs_list: list[np.ndarray], speech_ids: list[str] | l
|
|
|
231
231
|
np.ndarray
|
|
232
232
|
Probabilities for the speech segment.
|
|
233
233
|
"""
|
|
234
|
+
# Nothing to segment (e.g. a file where VAD detected no speech). Yield nothing
|
|
235
|
+
# so callers iterate over an empty result instead of hitting np.concatenate([]).
|
|
236
|
+
if not probs_list:
|
|
237
|
+
return
|
|
238
|
+
|
|
234
239
|
# Count the number of chunks per speech id
|
|
235
240
|
speech_chunk_counts = [
|
|
236
241
|
(key, sum(1 for i in group)) for key, group in itertools.groupby(speech_ids)
|
|
@@ -13,7 +13,7 @@ from pyannote.core import Annotation, Segment, SlidingWindowFeature
|
|
|
13
13
|
from tqdm import tqdm
|
|
14
14
|
|
|
15
15
|
from easyaligner.data.datamodel import AudioMetadata, SpeechSegment
|
|
16
|
-
from easyaligner.vad.utils import encode_vad_segments
|
|
16
|
+
from easyaligner.vad.utils import drop_empty_speeches, encode_vad_segments
|
|
17
17
|
|
|
18
18
|
"""
|
|
19
19
|
This file contains modified functions from WhisperX (BSD-4-Clause License).
|
|
@@ -431,7 +431,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
|
|
|
431
431
|
# Run VAD on entire audio
|
|
432
432
|
vad_segments = model(
|
|
433
433
|
{
|
|
434
|
-
"waveform": torch.
|
|
434
|
+
"waveform": torch.as_tensor(audio).unsqueeze(0).to(torch.float32),
|
|
435
435
|
"sample_rate": sample_rate,
|
|
436
436
|
}
|
|
437
437
|
)
|
|
@@ -439,12 +439,15 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
|
|
|
439
439
|
vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
|
|
440
440
|
segments = encode_vad_segments(vad_segments)
|
|
441
441
|
|
|
442
|
+
# Create a single SpeechSegment based on where speech was detected.
|
|
443
|
+
# An empty `speeches` list signals a file with no detected speech.
|
|
442
444
|
metadata.speeches = []
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
445
|
+
if segments:
|
|
446
|
+
metadata.speeches.append(
|
|
447
|
+
SpeechSegment(
|
|
448
|
+
start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
|
|
449
|
+
)
|
|
446
450
|
)
|
|
447
|
-
)
|
|
448
451
|
else:
|
|
449
452
|
# Run VAD on each speech segment
|
|
450
453
|
for speech in tqdm(metadata.speeches, desc="Running VAD on speeches"):
|
|
@@ -455,7 +458,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
|
|
|
455
458
|
|
|
456
459
|
vad_segments = model(
|
|
457
460
|
{
|
|
458
|
-
"waveform": torch.
|
|
461
|
+
"waveform": torch.as_tensor(speech_audio).unsqueeze(0).to(torch.float32),
|
|
459
462
|
"sample_rate": sample_rate,
|
|
460
463
|
}
|
|
461
464
|
)
|
|
@@ -472,11 +475,11 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
|
|
|
472
475
|
]
|
|
473
476
|
segments = encode_vad_segments(vad_segments)
|
|
474
477
|
|
|
475
|
-
if speech.duration is None:
|
|
478
|
+
if speech.duration is None and segments:
|
|
476
479
|
speech.start = segments[0].start
|
|
477
480
|
speech.end = segments[-1].end
|
|
478
481
|
speech.calculate_duration()
|
|
479
482
|
|
|
480
483
|
speech.chunks = segments # In place update of chunks in metadata
|
|
481
484
|
|
|
482
|
-
return metadata
|
|
485
|
+
return drop_empty_speeches(metadata)
|
|
@@ -3,7 +3,7 @@ from silero_vad import get_speech_timestamps, load_silero_vad
|
|
|
3
3
|
from tqdm import tqdm
|
|
4
4
|
|
|
5
5
|
from easyaligner.data.datamodel import AudioMetadata, SpeechSegment
|
|
6
|
-
from easyaligner.vad.utils import encode_vad_segments
|
|
6
|
+
from easyaligner.vad.utils import drop_empty_speeches, encode_vad_segments
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def load_vad_model(onnx=False, opset_version=16):
|
|
@@ -42,6 +42,9 @@ def merge_chunks(segments, chunk_size=30):
|
|
|
42
42
|
List of merged chunks, where each chunk is a dictionary with
|
|
43
43
|
"start", "end", and "segments" keys.
|
|
44
44
|
"""
|
|
45
|
+
if not segments:
|
|
46
|
+
return []
|
|
47
|
+
|
|
45
48
|
current_start = segments[0]["start"]
|
|
46
49
|
current_end = segments[0]["end"]
|
|
47
50
|
merged_segments = []
|
|
@@ -103,17 +106,22 @@ def run_vad_pipeline(
|
|
|
103
106
|
vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
|
|
104
107
|
segments = encode_vad_segments(vad_segments)
|
|
105
108
|
|
|
106
|
-
# Create a single SpeechSegment based on where speech was detected
|
|
109
|
+
# Create a single SpeechSegment based on where speech was detected.
|
|
110
|
+
# An empty `speeches` list signals a file with no detected speech.
|
|
107
111
|
metadata.speeches = []
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
112
|
+
if segments:
|
|
113
|
+
metadata.speeches.append(
|
|
114
|
+
SpeechSegment(
|
|
115
|
+
start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
|
|
116
|
+
)
|
|
111
117
|
)
|
|
112
|
-
)
|
|
113
118
|
else:
|
|
114
119
|
# Run VAD on each speech segment
|
|
115
120
|
for speech in tqdm(metadata.speeches, desc="Running VAD on speeches"):
|
|
116
|
-
|
|
121
|
+
start = int(speech.start * sample_rate) if speech.start is not None else None
|
|
122
|
+
end = int(speech.end * sample_rate) if speech.end is not None else None
|
|
123
|
+
# Note: Using `None` as a slicing parameter is the same as omitting it
|
|
124
|
+
speech_audio = audio[start:end]
|
|
117
125
|
vad_segments = get_speech_timestamps(
|
|
118
126
|
speech_audio,
|
|
119
127
|
model,
|
|
@@ -122,15 +130,22 @@ def run_vad_pipeline(
|
|
|
122
130
|
)
|
|
123
131
|
vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
|
|
124
132
|
# Add speech.start offset to each segment
|
|
133
|
+
offset = speech.start if speech.start is not None else 0
|
|
125
134
|
vad_segments = [
|
|
126
135
|
{
|
|
127
|
-
"start": seg["start"] +
|
|
128
|
-
"end": seg["end"] +
|
|
136
|
+
"start": seg["start"] + offset,
|
|
137
|
+
"end": seg["end"] + offset,
|
|
129
138
|
"segments": seg["segments"],
|
|
130
139
|
}
|
|
131
140
|
for seg in vad_segments
|
|
132
141
|
]
|
|
133
142
|
segments = encode_vad_segments(vad_segments)
|
|
143
|
+
|
|
144
|
+
if speech.duration is None and segments:
|
|
145
|
+
speech.start = segments[0].start
|
|
146
|
+
speech.end = segments[-1].end
|
|
147
|
+
speech.calculate_duration()
|
|
148
|
+
|
|
134
149
|
speech.chunks = segments
|
|
135
150
|
|
|
136
|
-
return metadata
|
|
151
|
+
return drop_empty_speeches(metadata)
|
|
@@ -124,6 +124,35 @@ def seconds_to_frames(seconds, sr=16000):
|
|
|
124
124
|
return int(seconds * sr)
|
|
125
125
|
|
|
126
126
|
|
|
127
|
+
def drop_empty_speeches(metadata: AudioMetadata) -> AudioMetadata:
|
|
128
|
+
"""
|
|
129
|
+
Remove speeches where VAD detected no speech (i.e. speeches without chunks).
|
|
130
|
+
|
|
131
|
+
Downstream pipeline stages (emissions extraction, alignment) assume every
|
|
132
|
+
speech has at least one VAD chunk. An empty `speeches` list signals a file
|
|
133
|
+
with no detected speech.
|
|
134
|
+
|
|
135
|
+
Parameters
|
|
136
|
+
----------
|
|
137
|
+
metadata : AudioMetadata
|
|
138
|
+
The metadata object to filter after running VAD.
|
|
139
|
+
|
|
140
|
+
Returns
|
|
141
|
+
-------
|
|
142
|
+
AudioMetadata
|
|
143
|
+
The metadata object with chunkless speeches removed.
|
|
144
|
+
"""
|
|
145
|
+
speeches = [speech for speech in metadata.speeches if speech.chunks]
|
|
146
|
+
num_dropped = len(metadata.speeches) - len(speeches)
|
|
147
|
+
if num_dropped > 0:
|
|
148
|
+
logger.warning(
|
|
149
|
+
f"VAD detected no speech in {num_dropped} speech segment(s) of "
|
|
150
|
+
f"{metadata.audio_path}. Dropping them from the metadata."
|
|
151
|
+
)
|
|
152
|
+
metadata.speeches = speeches
|
|
153
|
+
return metadata
|
|
154
|
+
|
|
155
|
+
|
|
127
156
|
def encode_vad_segments(vad_segments):
|
|
128
157
|
"""
|
|
129
158
|
Encode VAD segments into a list of AudioChunk objects.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: easyaligner
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Forced alignment pipeline designed for efficiency and ease of use.
|
|
5
5
|
Author: Faton Rekathati
|
|
6
6
|
Project-URL: Repository, https://github.com/kb-labb/easyaligner
|
|
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: transformers>=4.45.0
|
|
11
|
-
Requires-Dist: torch>=2.7.0
|
|
12
|
-
Requires-Dist: torchaudio>=2.7.0
|
|
11
|
+
Requires-Dist: torch<2.9,>=2.7.0
|
|
12
|
+
Requires-Dist: torchaudio<2.9,>=2.7.0
|
|
13
13
|
Requires-Dist: tqdm>=4.66.1
|
|
14
14
|
Requires-Dist: soundfile>=0.12.1
|
|
15
15
|
Requires-Dist: nltk>=3.8.2
|
|
16
|
-
Requires-Dist: pyannote-audio>=3.3.1
|
|
16
|
+
Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
|
|
17
17
|
Requires-Dist: silero-vad~=6.0
|
|
18
18
|
Requires-Dist: msgspec
|
|
19
19
|
Requires-Dist: rapidfuzz
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
transformers>=4.45.0
|
|
2
|
-
torch>=2.7.0
|
|
3
|
-
torchaudio>=2.7.0
|
|
2
|
+
torch<2.9,>=2.7.0
|
|
3
|
+
torchaudio<2.9,>=2.7.0
|
|
4
4
|
tqdm>=4.66.1
|
|
5
5
|
soundfile>=0.12.1
|
|
6
6
|
nltk>=3.8.2
|
|
7
|
-
pyannote-audio>=3.3.1
|
|
7
|
+
pyannote-audio<4.0.4,>=3.3.1
|
|
8
8
|
silero-vad~=6.0
|
|
9
9
|
msgspec
|
|
10
10
|
rapidfuzz
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|