PyPI - easyaligner - Versions diffs - 0.2.3__tar.gz → 0.3.1__tar.gz - Mend

easyaligner 0.2.3.tar.gz → 0.3.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

{easyaligner-0.2.3/src/easyaligner.egg-info → easyaligner-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easyaligner
-Version: 0.2.3
+Version: 0.3.1
 Summary: Forced alignment pipeline designed for efficiency and ease of use.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easyaligner
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: transformers>=4.45.0
-Requires-Dist: torch!=2.9.*,>=2.7.0
-Requires-Dist: torchaudio!=2.9.*,>=2.7.0
+Requires-Dist: torch<2.9,>=2.7.0
+Requires-Dist: torchaudio<2.9,>=2.7.0
 Requires-Dist: tqdm>=4.66.1
 Requires-Dist: soundfile>=0.12.1
 Requires-Dist: nltk>=3.8.2
-Requires-Dist: pyannote-audio>=3.3.1
+Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
 Requires-Dist: silero-vad~=6.0
 Requires-Dist: msgspec
 Requires-Dist: rapidfuzz

{easyaligner-0.2.3 → easyaligner-0.3.1}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
 build-backend = "setuptools.build_meta"
 [project]
-version = "0.2.3"
+version = "0.3.1"
 name = "easyaligner"
 requires-python = ">= 3.10"
 description = "Forced alignment pipeline designed for efficiency and ease of use."
@@ -12,12 +12,12 @@ authors = [{ name = "Faton Rekathati" }]
 dependencies = [
   "transformers>=4.45.0",
-  "torch>=2.7.0,!=2.9.*",
-  "torchaudio>=2.7.0,!=2.9.*",
+  "torch>=2.7.0,<2.9",
+  "torchaudio>=2.7.0,<2.9",
   "tqdm>=4.66.1",
   "soundfile>=0.12.1",
   "nltk>=3.8.2",
-  "pyannote-audio>=3.3.1",
+  "pyannote-audio>=3.3.1,<4.0.4",
   "silero-vad~=6.0",
   "msgspec",
   "rapidfuzz"

{easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/alignment/utils.py RENAMED Viewed

@@ -231,6 +231,11 @@ def segment_speech_probs(probs_list: list[np.ndarray], speech_ids: list[str] | l
     np.ndarray
         Probabilities for the speech segment.
     """
+    # Nothing to segment (e.g. a file where VAD detected no speech). Yield nothing
+    # so callers iterate over an empty result instead of hitting np.concatenate([]).
+    if not probs_list:
+        return
     # Count the number of chunks per speech id
     speech_chunk_counts = [
         (key, sum(1 for i in group)) for key, group in itertools.groupby(speech_ids)

{easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/pyannote.py RENAMED Viewed

@@ -13,7 +13,7 @@ from pyannote.core import Annotation, Segment, SlidingWindowFeature
 from tqdm import tqdm
 from easyaligner.data.datamodel import AudioMetadata, SpeechSegment
-from easyaligner.vad.utils import encode_vad_segments
+from easyaligner.vad.utils import drop_empty_speeches, encode_vad_segments
 """
 This file contains modified functions from WhisperX (BSD-4-Clause License).
@@ -431,7 +431,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
         # Run VAD on entire audio
         vad_segments = model(
             {
-                "waveform": torch.tensor(audio).unsqueeze(0).to(torch.float32),
+                "waveform": torch.as_tensor(audio).unsqueeze(0).to(torch.float32),
                 "sample_rate": sample_rate,
             }
         )
@@ -439,12 +439,15 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
         vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
         segments = encode_vad_segments(vad_segments)
+        # Create a single SpeechSegment based on where speech was detected.
+        # An empty `speeches` list signals a file with no detected speech.
         metadata.speeches = []
-        metadata.speeches.append(
-            SpeechSegment(
-                start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
+        if segments:
+            metadata.speeches.append(
+                SpeechSegment(
+                    start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
+                )
             )
-        )
     else:
         # Run VAD on each speech segment
         for speech in tqdm(metadata.speeches, desc="Running VAD on speeches"):
@@ -455,7 +458,7 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
             vad_segments = model(
                 {
-                    "waveform": torch.tensor(speech_audio).unsqueeze(0).to(torch.float32),
+                    "waveform": torch.as_tensor(speech_audio).unsqueeze(0).to(torch.float32),
                     "sample_rate": sample_rate,
                 }
             )
@@ -472,11 +475,11 @@ def run_vad_pipeline(metadata: AudioMetadata, model, audio, sample_rate=16000, c
             ]
             segments = encode_vad_segments(vad_segments)
-            if speech.duration is None:
+            if speech.duration is None and segments:
                 speech.start = segments[0].start
                 speech.end = segments[-1].end
                 speech.calculate_duration()
             speech.chunks = segments  # In place update of chunks in metadata
-    return metadata
+    return drop_empty_speeches(metadata)

{easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/silero.py RENAMED Viewed

@@ -3,7 +3,7 @@ from silero_vad import get_speech_timestamps, load_silero_vad
 from tqdm import tqdm
 from easyaligner.data.datamodel import AudioMetadata, SpeechSegment
-from easyaligner.vad.utils import encode_vad_segments
+from easyaligner.vad.utils import drop_empty_speeches, encode_vad_segments
 def load_vad_model(onnx=False, opset_version=16):
@@ -42,6 +42,9 @@ def merge_chunks(segments, chunk_size=30):
         List of merged chunks, where each chunk is a dictionary with
         "start", "end", and "segments" keys.
     """
+    if not segments:
+        return []
     current_start = segments[0]["start"]
     current_end = segments[0]["end"]
     merged_segments = []
@@ -103,17 +106,22 @@ def run_vad_pipeline(
         vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
         segments = encode_vad_segments(vad_segments)
-        # Create a single SpeechSegment based on where speech was detected
+        # Create a single SpeechSegment based on where speech was detected.
+        # An empty `speeches` list signals a file with no detected speech.
         metadata.speeches = []
-        metadata.speeches.append(
-            SpeechSegment(
-                start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
+        if segments:
+            metadata.speeches.append(
+                SpeechSegment(
+                    start=segments[0].start, end=segments[-1].end, text=None, chunks=segments
+                )
             )
-        )
     else:
         # Run VAD on each speech segment
         for speech in tqdm(metadata.speeches, desc="Running VAD on speeches"):
-            speech_audio = audio[int(speech.start * sample_rate) : int(speech.end * sample_rate)]
+            start = int(speech.start * sample_rate) if speech.start is not None else None
+            end = int(speech.end * sample_rate) if speech.end is not None else None
+            # Note: Using `None` as a slicing parameter is the same as omitting it
+            speech_audio = audio[start:end]
             vad_segments = get_speech_timestamps(
                 speech_audio,
                 model,
@@ -122,15 +130,22 @@ def run_vad_pipeline(
             )
             vad_segments = merge_chunks(vad_segments, chunk_size=chunk_size)
             # Add speech.start offset to each segment
+            offset = speech.start if speech.start is not None else 0
             vad_segments = [
                 {
-                    "start": seg["start"] + speech.start,
-                    "end": seg["end"] + speech.start,
+                    "start": seg["start"] + offset,
+                    "end": seg["end"] + offset,
                     "segments": seg["segments"],
                 }
                 for seg in vad_segments
             ]
             segments = encode_vad_segments(vad_segments)
+            if speech.duration is None and segments:
+                speech.start = segments[0].start
+                speech.end = segments[-1].end
+                speech.calculate_duration()
             speech.chunks = segments
-    return metadata
+    return drop_empty_speeches(metadata)

{easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner/vad/utils.py RENAMED Viewed

@@ -124,6 +124,35 @@ def seconds_to_frames(seconds, sr=16000):
     return int(seconds * sr)
+def drop_empty_speeches(metadata: AudioMetadata) -> AudioMetadata:
+    """
+    Remove speeches where VAD detected no speech (i.e. speeches without chunks).
+    Downstream pipeline stages (emissions extraction, alignment) assume every
+    speech has at least one VAD chunk. An empty `speeches` list signals a file
+    with no detected speech.
+    Parameters
+    ----------
+    metadata : AudioMetadata
+        The metadata object to filter after running VAD.
+    Returns
+    -------
+    AudioMetadata
+        The metadata object with chunkless speeches removed.
+    """
+    speeches = [speech for speech in metadata.speeches if speech.chunks]
+    num_dropped = len(metadata.speeches) - len(speeches)
+    if num_dropped > 0:
+        logger.warning(
+            f"VAD detected no speech in {num_dropped} speech segment(s) of "
+            f"{metadata.audio_path}. Dropping them from the metadata."
+        )
+    metadata.speeches = speeches
+    return metadata
 def encode_vad_segments(vad_segments):
     """
     Encode VAD segments into a list of AudioChunk objects.

{easyaligner-0.2.3 → easyaligner-0.3.1/src/easyaligner.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easyaligner
-Version: 0.2.3
+Version: 0.3.1
 Summary: Forced alignment pipeline designed for efficiency and ease of use.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easyaligner
@@ -8,12 +8,12 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: transformers>=4.45.0
-Requires-Dist: torch!=2.9.*,>=2.7.0
-Requires-Dist: torchaudio!=2.9.*,>=2.7.0
+Requires-Dist: torch<2.9,>=2.7.0
+Requires-Dist: torchaudio<2.9,>=2.7.0
 Requires-Dist: tqdm>=4.66.1
 Requires-Dist: soundfile>=0.12.1
 Requires-Dist: nltk>=3.8.2
-Requires-Dist: pyannote-audio>=3.3.1
+Requires-Dist: pyannote-audio<4.0.4,>=3.3.1
 Requires-Dist: silero-vad~=6.0
 Requires-Dist: msgspec
 Requires-Dist: rapidfuzz

{easyaligner-0.2.3 → easyaligner-0.3.1}/src/easyaligner.egg-info/requires.txt RENAMED Viewed

@@ -1,10 +1,10 @@
 transformers>=4.45.0
-torch!=2.9.*,>=2.7.0
-torchaudio!=2.9.*,>=2.7.0
+torch<2.9,>=2.7.0
+torchaudio<2.9,>=2.7.0
 tqdm>=4.66.1
 soundfile>=0.12.1
 nltk>=3.8.2
-pyannote-audio>=3.3.1
+pyannote-audio<4.0.4,>=3.3.1
 silero-vad~=6.0
 msgspec
 rapidfuzz