easyaligner 0.2.3__tar.gz → 0.3.1__tar.gz

This diff shows the differences between two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (31)
  1. {easyaligner-0.2.3/src/easyaligner.egg-info → easyaligner-0.3.1}/PKG-INFO +4 -4
  2. {easyaligner-0.2.3 → easyaligner-0.3.1}/pyproject.toml +4 -4
  3. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/alignment/utils.py +5 -0
  4. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/pyannote.py +12 -9
  5. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/silero.py +25 -10
  6. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/utils.py +29 -0
  7. {easyaligner-0.2.3 → easyaligner-0.3.1/src/easyaligner.egg-info}/PKG-INFO +4 -4
  8. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/requires.txt +3 -3
  9. {easyaligner-0.2.3 → easyaligner-0.3.1}/LICENSE +0 -0
  10. {easyaligner-0.2.3 → easyaligner-0.3.1}/README.md +0 -0
  11. {easyaligner-0.2.3 → easyaligner-0.3.1}/setup.cfg +0 -0
  12. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/__init__.py +0 -0
  13. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/alignment/__init__.py +0 -0
  14. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/alignment/pytorch.py +0 -0
  15. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/__init__.py +0 -0
  16. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/collators.py +0 -0
  17. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/datamodel.py +0 -0
  18. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/dataset.py +0 -0
  19. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/data/utils.py +0 -0
  20. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/pipelines.py +0 -0
  21. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/__init__.py +0 -0
  22. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/languages/sv.py +0 -0
  23. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/match.py +0 -0
  24. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/normalization.py +0 -0
  25. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/text/tokenizer.py +0 -0
  26. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/utils.py +0 -0
  27. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/__init__.py +0 -0
  28. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/vad.py +0 -0
  29. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/SOURCES.txt +0 -0
  30. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/dependency_links.txt +0 -0
  31. {easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easyaligner
3
- Version: 0.2.3
3
+ Version: 0.3.1
4
4
  Summary: Forced alignment pipeline designed for efficiency and ease of use.
5
5
  Author: Faton Rekathati
6
6
  Project-URL: Repository, https://github.com/kb-labb/easyaligner
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
8
8
  Description-Content-Type: text/markdown
9
9
  License-File: LICENSE
10
10
  Requires-Dist: transformers>=4.45.0
11
- Requires-Dist: torch!=2.9.*,>=2.7.0
12
- Requires-Dist: torchaudio!=2.9.*,>=2.7.0
11
+ Requires-Dist: torch<2.9,>=2.7.0
12
+ Requires-Dist: torchaudio<2.9,>=2.7.0
13
13
  Requires-Dist: tqdm>=4.66.1
14
14
  Requires-Dist: soundfile>=0.12.1
15
15
  Requires-Dist: nltk>=3.8.2
16
- Requires-Dist: pyannote-audio>=3.3.1
16
+ Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
17
17
  Requires-Dist: silero-vad~=6.0
18
18
  Requires-Dist: msgspec
19
19
  Requires-Dist: rapidfuzz
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
- version = "0.2.3"
6
+ version = "0.3.1"
7
7
  name = "easyaligner"
8
8
  requires-python = ">= 3.10"
9
9
  description = "Forced alignment pipeline designed for efficiency and ease of use."
@@ -12,12 +12,12 @@ authors = [{ name = "Faton Rekathati" }]
12
12
 
13
13
  dependencies = [
14
14
  "transformers>=4.45.0",
15
- "torch>=2.7.0,!=2.9.*",
16
- "torchaudio>=2.7.0,!=2.9.*",
15
+ "torch>=2.7.0,<2.9",
16
+ "torchaudio>=2.7.0,<2.9",
17
17
  "tqdm>=4.66.1",
18
18
  "soundfile>=0.12.1",
19
19
  "nltk>=3.8.2",
20
- "pyannote-audio>=3.3.1",
20
+ "pyannote-audio>=3.3.1,<4.0.4",
21
21
  "silero-vad~=6.0",
22
22
  "msgspec",
23
23
  "rapidfuzz"
@@ -231,6 +231,11 @@ def segment_speech_probs(probs_list: list[np.ndarray], speech_ids: list[str] | l
231
231
  np.ndarray
232
232
  Probabilities for the speech segment.
233
233
  """
234
+ # Nothing to segment (e.g. a file where VAD detected no speech). Yield nothing
235
+ # so callers iterate over an empty result instead of hitting np.concatenate([]).
236
+ if not probs_list:
237
+ return
238
+
234
239
  # Count the number of chunks per speech id
235
240
  speech_chunk_counts = [
236
241
  (key, sum(1 for i in group)) for key, group in itertools.groupby(speech_ids)
@@ -13,7 +13,7 @@ from pyannote.core import Annotation, Segment, SlidingWindowFeature
13
13
  from tqdm import tqdm
14
14
 
15
15
  from easyaligner.data.datamodel import AudioMetadata, SpeechSegment
16
- from easyaligner.vad.utils import encode_vad_segments
16
+ from easyaligner.vad.utils import drop_empty_speeches, encode_vad_segments
17
17
 
18
18
  """
19
19
  This file contains modified functions from WhisperX (BSD-4-Clause License).
@@ -431,7 +431,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
431
431
  # Run VAD on entire audio
432
432
  vad_segments = model(
433
433
  {
434
- "waveform": torch.tensor(audio).unsqueeze(0).to(torch.float32),
434
+ "waveform": torch.as_tensor(audio).unsqueeze(0).to(torch.float32),
435
435
  "sample_rate": sample_rate,
436
436
  }
437
437
  )
@@ -439,12 +439,15 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
439
439
  vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
440
440
  segments = encode_vad_segments(vad_segments)
441
441
 
442
+ # Create a single SpeechSegment based on where speech was detected.
443
+ # An empty `speeches` list signals a file with no detected speech.
442
444
  metadata.speeches = []
443
- metadata.speeches.append(
444
- SpeechSegment(
445
- start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
445
+ if segments:
446
+ metadata.speeches.append(
447
+ SpeechSegment(
448
+ start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
449
+ )
446
450
  )
447
- )
448
451
  else:
449
452
  # Run VAD on each speech segment
450
453
  for speech in tqdm(metadata.speeches, desc="Running VAD on speeches"):
@@ -455,7 +458,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
455
458
 
456
459
  vad_segments = model(
457
460
  {
458
- "waveform": torch.tensor(speech_audio).unsqueeze(0).to(torch.float32),
461
+ "waveform": torch.as_tensor(speech_audio).unsqueeze(0).to(torch.float32),
459
462
  "sample_rate": sample_rate,
460
463
  }
461
464
  )
@@ -472,11 +475,11 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
472
475
  ]
473
476
  segments = encode_vad_segments(vad_segments)
474
477
 
475
- if speech.duration is None:
478
+ if speech.duration is None and segments:
476
479
  speech.start = segments[0].start
477
480
  speech.end = segments[-1].end
478
481
  speech.calculate_duration()
479
482
 
480
483
  speech.chunks = segments # In place update of chunks in metadata
481
484
 
482
- return metadata
485
+ return drop_empty_speeches(metadata)
@@ -3,7 +3,7 @@ from silero_vad import get_speech_timestamps, load_silero_vad
3
3
  from tqdm import tqdm
4
4
 
5
5
  from easyaligner.data.datamodel import AudioMetadata, SpeechSegment
6
- from easyaligner.vad.utils import encode_vad_segments
6
+ from easyaligner.vad.utils import drop_empty_speeches, encode_vad_segments
7
7
 
8
8
 
9
9
  def load_vad_model(onnx=False, opset_version=16):
@@ -42,6 +42,9 @@ def merge_chunks(segments, chunk_size=30):
42
42
  List of merged chunks, where each chunk is a dictionary with
43
43
  "start", "end", and "segments" keys.
44
44
  """
45
+ if not segments:
46
+ return []
47
+
45
48
  current_start = segments[0]["start"]
46
49
  current_end = segments[0]["end"]
47
50
  merged_segments = []
@@ -103,17 +106,22 @@ def run_vad_pipeline(
103
106
  vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
104
107
  segments = encode_vad_segments(vad_segments)
105
108
 
106
- # Create a single SpeechSegment based on where speech was detected
109
+ # Create a single SpeechSegment based on where speech was detected.
110
+ # An empty `speeches` list signals a file with no detected speech.
107
111
  metadata.speeches = []
108
- metadata.speeches.append(
109
- SpeechSegment(
110
- start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
112
+ if segments:
113
+ metadata.speeches.append(
114
+ SpeechSegment(
115
+ start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
116
+ )
111
117
  )
112
- )
113
118
  else:
114
119
  # Run VAD on each speech segment
115
120
  for speech in tqdm(metadata.speeches, desc="Running VAD on speeches"):
116
- speech_audio = audio[int(speech.start * sample_rate) : int(speech.end * sample_rate)]
121
+ start = int(speech.start * sample_rate) if speech.start is not None else None
122
+ end = int(speech.end * sample_rate) if speech.end is not None else None
123
+ # Note: Using `None` as a slicing parameter is the same as omitting it
124
+ speech_audio = audio[start:end]
117
125
  vad_segments = get_speech_timestamps(
118
126
  speech_audio,
119
127
  model,
@@ -122,15 +130,22 @@ def run_vad_pipeline(
122
130
  )
123
131
  vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
124
132
  # Add speech.start offset to each segment
133
+ offset = speech.start if speech.start is not None else 0
125
134
  vad_segments = [
126
135
  {
127
- "start": seg["start"] + speech.start,
128
- "end": seg["end"] + speech.start,
136
+ "start": seg["start"] + offset,
137
+ "end": seg["end"] + offset,
129
138
  "segments": seg["segments"],
130
139
  }
131
140
  for seg in vad_segments
132
141
  ]
133
142
  segments = encode_vad_segments(vad_segments)
143
+
144
+ if speech.duration is None and segments:
145
+ speech.start = segments[0].start
146
+ speech.end = segments[-1].end
147
+ speech.calculate_duration()
148
+
134
149
  speech.chunks = segments
135
150
 
136
- return metadata
151
+ return drop_empty_speeches(metadata)
@@ -124,6 +124,35 @@ def seconds_to_frames(seconds, sr=16000):
124
124
  return int(seconds * sr)
125
125
 
126
126
 
127
+ def drop_empty_speeches(metadata: AudioMetadata) -> AudioMetadata:
128
+ """
129
+ Remove speeches where VAD detected no speech (i.e. speeches without chunks).
130
+
131
+ Downstream pipeline stages (emissions extraction, alignment) assume every
132
+ speech has at least one VAD chunk. An empty `speeches` list signals a file
133
+ with no detected speech.
134
+
135
+ Parameters
136
+ ----------
137
+ metadata : AudioMetadata
138
+ The metadata object to filter after running VAD.
139
+
140
+ Returns
141
+ -------
142
+ AudioMetadata
143
+ The metadata object with chunkless speeches removed.
144
+ """
145
+ speeches = [speech for speech in metadata.speeches if speech.chunks]
146
+ num_dropped = len(metadata.speeches) - len(speeches)
147
+ if num_dropped > 0:
148
+ logger.warning(
149
+ f"VAD detected no speech in {num_dropped} speech segment(s) of "
150
+ f"{metadata.audio_path}. Dropping them from the metadata."
151
+ )
152
+ metadata.speeches = speeches
153
+ return metadata
154
+
155
+
127
156
  def encode_vad_segments(vad_segments):
128
157
  """
129
158
  Encode VAD segments into a list of AudioChunk objects.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easyaligner
3
- Version: 0.2.3
3
+ Version: 0.3.1
4
4
  Summary: Forced alignment pipeline designed for efficiency and ease of use.
5
5
  Author: Faton Rekathati
6
6
  Project-URL: Repository, https://github.com/kb-labb/easyaligner
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
8
8
  Description-Content-Type: text/markdown
9
9
  License-File: LICENSE
10
10
  Requires-Dist: transformers>=4.45.0
11
- Requires-Dist: torch!=2.9.*,>=2.7.0
12
- Requires-Dist: torchaudio!=2.9.*,>=2.7.0
11
+ Requires-Dist: torch<2.9,>=2.7.0
12
+ Requires-Dist: torchaudio<2.9,>=2.7.0
13
13
  Requires-Dist: tqdm>=4.66.1
14
14
  Requires-Dist: soundfile>=0.12.1
15
15
  Requires-Dist: nltk>=3.8.2
16
- Requires-Dist: pyannote-audio>=3.3.1
16
+ Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
17
17
  Requires-Dist: silero-vad~=6.0
18
18
  Requires-Dist: msgspec
19
19
  Requires-Dist: rapidfuzz
@@ -1,10 +1,10 @@
1
1
  transformers>=4.45.0
2
- torch!=2.9.*,>=2.7.0
3
- torchaudio!=2.9.*,>=2.7.0
2
+ torch<2.9,>=2.7.0
3
+ torchaudio<2.9,>=2.7.0
4
4
  tqdm>=4.66.1
5
5
  soundfile>=0.12.1
6
6
  nltk>=3.8.2
7
- pyannote-audio>=3.3.1
7
+ pyannote-audio<4.0.4,>=3.3.1
8
8
  silero-vad~=6.0
9
9
  msgspec
10
10
  rapidfuzz
File without changes
File without changes
File without changes