easyaligner 0.2.0__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. easyaligner-0.2.3/PKG-INFO +167 -0
  2. easyaligner-0.2.3/README.md +146 -0
  3. {easyaligner-0.2.0 → easyaligner-0.2.3}/pyproject.toml +3 -2
  4. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/alignment/pytorch.py +1 -1
  5. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/data/datamodel.py +29 -0
  6. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/pipelines.py +4 -23
  7. easyaligner-0.2.3/src/easyaligner/text/__init__.py +28 -0
  8. easyaligner-0.2.3/src/easyaligner/text/match.py +235 -0
  9. easyaligner-0.2.3/src/easyaligner/text/tokenizer.py +54 -0
  10. easyaligner-0.2.3/src/easyaligner.egg-info/PKG-INFO +167 -0
  11. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner.egg-info/SOURCES.txt +1 -0
  12. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner.egg-info/requires.txt +1 -0
  13. easyaligner-0.2.0/PKG-INFO +0 -160
  14. easyaligner-0.2.0/README.md +0 -140
  15. easyaligner-0.2.0/src/easyaligner/text/__init__.py +0 -15
  16. easyaligner-0.2.0/src/easyaligner/text/tokenizer.py +0 -25
  17. easyaligner-0.2.0/src/easyaligner.egg-info/PKG-INFO +0 -160
  18. {easyaligner-0.2.0 → easyaligner-0.2.3}/LICENSE +0 -0
  19. {easyaligner-0.2.0 → easyaligner-0.2.3}/setup.cfg +0 -0
  20. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/__init__.py +0 -0
  21. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/alignment/__init__.py +0 -0
  22. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/alignment/utils.py +0 -0
  23. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/data/__init__.py +0 -0
  24. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/data/collators.py +0 -0
  25. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/data/dataset.py +0 -0
  26. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/data/utils.py +0 -0
  27. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/text/languages/sv.py +0 -0
  28. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/text/normalization.py +0 -0
  29. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/utils.py +0 -0
  30. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/vad/__init__.py +0 -0
  31. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/vad/pyannote.py +0 -0
  32. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/vad/silero.py +0 -0
  33. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/vad/utils.py +0 -0
  34. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner/vad/vad.py +0 -0
  35. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner.egg-info/dependency_links.txt +0 -0
  36. {easyaligner-0.2.0 → easyaligner-0.2.3}/src/easyaligner.egg-info/top_level.txt +0 -0
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.4
2
+ Name: easyaligner
3
+ Version: 0.2.3
4
+ Summary: Forced alignment pipeline designed for efficiency and ease of use.
5
+ Author: Faton Rekathati
6
+ Project-URL: Repository, https://github.com/kb-labb/easyaligner
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: transformers>=4.45.0
11
+ Requires-Dist: torch!=2.9.*,>=2.7.0
12
+ Requires-Dist: torchaudio!=2.9.*,>=2.7.0
13
+ Requires-Dist: tqdm>=4.66.1
14
+ Requires-Dist: soundfile>=0.12.1
15
+ Requires-Dist: nltk>=3.8.2
16
+ Requires-Dist: pyannote-audio>=3.3.1
17
+ Requires-Dist: silero-vad~=6.0
18
+ Requires-Dist: msgspec
19
+ Requires-Dist: rapidfuzz
20
+ Dynamic: license-file
21
+
22
+ # Easier forced alignment with `easyaligner`
23
+
24
+ <div align="center"><img width="1020" height="340" alt="image" src="https://github.com/user-attachments/assets/a3589539-5c85-4ac1-a4a7-d5e801207faa" /></div>
25
+
26
+ `easyaligner` is a fast and memory-efficient forced alignment pipeline for speech and text. Given a text transcript, `easyaligner` will help identify where each word or phrase was spoken in the audio. The library supports aligning from both ground-truth transcripts and ASR-generated transcripts (`easyaligner` acts as the backend that powers alignment in [`easytranscriber`](https://github.com/kb-labb/easytranscriber)). Some notable features of `easyaligner` include:
27
+
28
+ * **GPU-accelerated forced alignment**. Uses [PyTorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html) with a GPU-based implementation of the Viterbi algorithm. Enables fast and memory-efficient forced alignment of long audio segments ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
29
+ * **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess transcripts before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
30
+ * **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
31
+
32
+ Check out the [documentation](https://kb-labb.github.io/easyaligner/) for more details and tutorials!
33
+
34
+ ## Installation
35
+
36
+ ### With GPU support (recommended)
37
+
38
+ ```bash
39
+ pip install easyaligner --extra-index-url https://download.pytorch.org/whl/cu128
40
+ ```
41
+
42
+ > [!TIP]
43
+ > Remove `--extra-index-url` if you want a CPU-only installation.
44
+
45
+ ### Using uv
46
+
47
+ When you install with [uv](https://docs.astral.sh/uv/), the appropriate PyTorch version is selected automatically (CPU for macOS, CUDA for Linux/Windows/ARM):
48
+
49
+ ```bash
50
+ uv pip install easyaligner
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ The example below downloads a short snippet from a LibriVox audiobook recording of [A Tale of Two Cities](https://librivox.org/a-tale-of-two-cities-by-charles-dickens-2/). The snippet is 57 seconds long, and corresponds to the first paragraph of the first chapter of A Tale of Two Cities. The corresponding text to be used for alignment is directly supplied below and assigned to the `text` variable.
56
+
57
+ ```python
58
+ from pathlib import Path
59
+
60
+ from transformers import (
61
+ AutoModelForCTC,
62
+ Wav2Vec2Processor,
63
+ )
64
+ from huggingface_hub import snapshot_download
65
+
66
+ from easyaligner.text import load_tokenizer
67
+ from easyaligner.data.datamodel import SpeechSegment
68
+ from easyaligner.pipelines import pipeline
69
+ from easyaligner.text import text_normalizer
70
+ from easyaligner.vad.pyannote import load_vad_model
71
+
72
+ filepath_pattern = "tale-of-two-cities_align-en/taleoftwocities_01_dickens_64kb_align.mp3"
73
+
74
+ # Download mp3 from Hugging Face Hub
75
+ snapshot_download(
76
+ "Lauler/easytranscriber_tutorials",
77
+ repo_type="dataset",
78
+ local_dir="data/tutorials",
79
+ allow_patterns=filepath_pattern,
80
+ )
81
+
82
+ # File(s) to align
83
+ filepath = Path("data/tutorials") / filepath_pattern
84
+ audio_dir = filepath.parent
85
+ audio_files = [filepath.name]
86
+
87
+ text = """
88
+ It was the best of times, it was the worst of times, it was the age of
89
+ wisdom, it was the age of foolishness, it was the epoch of belief, it
90
+ was the epoch of incredulity, it was the season of Light, it was the
91
+ season of Darkness, it was the spring of hope, it was the winter of
92
+ despair, we had everything before us, we had nothing before us, we were
93
+ all going direct to Heaven, we were all going direct the other way--in
94
+ short, the period was so far like the present period, that some of its
95
+ noisiest authorities insisted on its being received, for good or for
96
+ evil, in the superlative degree of comparison only.
97
+ """
98
+
99
+ text = text.strip()
100
+
101
+ # The alignments will be organized according to how the text is tokenized
102
+ tokenizer = load_tokenizer(language="english") # sentence tokenizer
103
+ span_list = list(tokenizer.span_tokenize(text)) # start, end character indices for each sentence
104
+ speeches = [[SpeechSegment(speech_id=0, text=text, text_spans=span_list, start=None, end=None)]]
105
+
106
+ # Load models and run pipeline
107
+ model_vad = load_vad_model()
108
+ model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
109
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
110
+
111
+ pipeline(
112
+ vad_model=model_vad,
113
+ emissions_model=model,
114
+ processor=processor,
115
+ audio_paths=audio_files,
116
+ audio_dir=audio_dir,
117
+ speeches=speeches,
118
+ alignment_strategy="speech",
119
+ text_normalizer_fn=text_normalizer,
120
+ tokenizer=tokenizer,
121
+ start_wildcard=True,
122
+ end_wildcard=True,
123
+ blank_id=processor.tokenizer.pad_token_id,
124
+ word_boundary="|",
125
+ )
126
+ ```
127
+
128
+ > [!TIP]
129
+ > `easyaligner` allows organizing the output at any level of granularity the user wishes (sentence, paragraph, or other). In the above example, we use an `nltk.tokenize.punkt.PunktTokenizer` to split our text into sentences. See the [text processing documentation](https://kb-labb.github.io/easyaligner/get-started/text_processing.html) for a more detailed explanation, and a tutorial for implementing custom tokenizers.
130
+
131
+ ## Documentation
132
+
133
+ Check out the documentation tutorials that cover common scenarios for forced alignment, and the API reference:
134
+
135
+ * [https://kb-labb.github.io/easyaligner/](https://kb-labb.github.io/easyaligner/)
136
+ * [Tutorial 1](https://kb-labb.github.io/easyaligner/get-started/tutorial01.html): Align text and audio when the transcript covers all of the spoken content in the audio.
137
+ * [Tutorial 2](https://kb-labb.github.io/easyaligner/get-started/tutorial02.html): Transcript covers only part of the spoken content in the audio, but we know the relevant audio region in advance.
138
+ * [Tutorial 3](https://kb-labb.github.io/easyaligner/get-started/tutorial03.html): Transcript covers only part of the spoken content in the audio, and we don't know the relevant audio region in advance.
139
+
140
+ ## Outputs
141
+
142
+ By default, `easyaligner` saves the outputs of each stage of the pipeline (VAD, emission extraction, forced alignment) as JSON files in separate directories. The final aligned output can be found in `output/alignments`. The directory structure after running the full pipeline will look as follows:
143
+
144
+ ```
145
+ output
146
+ ├── alignments
147
+ ├── emissions
148
+ └── vad
149
+ ```
150
+
151
+ The `output/emissions` directory will, in addition to the JSON files, also contain output emissions for each JSON file in `.npy` format.
152
+
153
+ All intermediate files can safely be deleted, assuming there is no need to re-run the pipeline from a specific intermediate stage.
154
+
155
+ ## Citation
156
+
157
+ If you use `easyaligner` in your research, consider citing the following blog post:
158
+
159
+ ```
160
+ @online{rekathati2026,
161
+ author = {Rekathati, Faton},
162
+ title = {Easyaligner: {Forced} Alignment of Text and Audio, Made Easy},
163
+ date = {2026-04-08},
164
+ url = {https://kb-labb.github.io/posts/2026-04-08-easyaligner/},
165
+ langid = {en}
166
+ }
167
+ ```
@@ -0,0 +1,146 @@
1
+ # Easier forced alignment with `easyaligner`
2
+
3
+ <div align="center"><img width="1020" height="340" alt="image" src="https://github.com/user-attachments/assets/a3589539-5c85-4ac1-a4a7-d5e801207faa" /></div>
4
+
5
+ `easyaligner` is a fast and memory-efficient forced alignment pipeline for speech and text. Given a text transcript, `easyaligner` will help identify where each word or phrase was spoken in the audio. The library supports aligning from both ground-truth transcripts and ASR-generated transcripts (`easyaligner` acts as the backend that powers alignment in [`easytranscriber`](https://github.com/kb-labb/easytranscriber)). Some notable features of `easyaligner` include:
6
+
7
+ * **GPU-accelerated forced alignment**. Uses [PyTorch's forced alignment API](https://docs.pytorch.org/audio/main/tutorials/ctc_forced_alignment_api_tutorial.html) with a GPU-based implementation of the Viterbi algorithm. Enables fast and memory-efficient forced alignment of long audio segments ([Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8)).
8
+ * **Flexible text normalization for improved alignment quality**. Users can supply custom regex-based text normalization functions to preprocess transcripts before alignment. A mapping from the original text to the normalized text is maintained internally. All of the applied normalizations and transformations are consequently **non-destructive and reversible after alignment**.
9
+ * **Batch processing support for emission extraction**. `easyaligner` supports batched inference for wav2vec2-based models, keeping track of non-padded logits when doing alignment.
10
+
11
+ Check out the [documentation](https://kb-labb.github.io/easyaligner/) for more details and tutorials!
12
+
13
+ ## Installation
14
+
15
+ ### With GPU support (recommended)
16
+
17
+ ```bash
18
+ pip install easyaligner --extra-index-url https://download.pytorch.org/whl/cu128
19
+ ```
20
+
21
+ > [!TIP]
22
+ > Remove `--extra-index-url` if you want a CPU-only installation.
23
+
24
+ ### Using uv
25
+
26
+ When you install with [uv](https://docs.astral.sh/uv/), the appropriate PyTorch version is selected automatically (CPU for macOS, CUDA for Linux/Windows/ARM):
27
+
28
+ ```bash
29
+ uv pip install easyaligner
30
+ ```
31
+
32
+ ## Usage
33
+
34
+ The example below downloads a short snippet from a LibriVox audiobook recording of [A Tale of Two Cities](https://librivox.org/a-tale-of-two-cities-by-charles-dickens-2/). The snippet is 57 seconds long, and corresponds to the first paragraph of the first chapter of A Tale of Two Cities. The corresponding text to be used for alignment is directly supplied below and assigned to the `text` variable.
35
+
36
+ ```python
37
+ from pathlib import Path
38
+
39
+ from transformers import (
40
+ AutoModelForCTC,
41
+ Wav2Vec2Processor,
42
+ )
43
+ from huggingface_hub import snapshot_download
44
+
45
+ from easyaligner.text import load_tokenizer
46
+ from easyaligner.data.datamodel import SpeechSegment
47
+ from easyaligner.pipelines import pipeline
48
+ from easyaligner.text import text_normalizer
49
+ from easyaligner.vad.pyannote import load_vad_model
50
+
51
+ filepath_pattern = "tale-of-two-cities_align-en/taleoftwocities_01_dickens_64kb_align.mp3"
52
+
53
+ # Download mp3 from Hugging Face Hub
54
+ snapshot_download(
55
+ "Lauler/easytranscriber_tutorials",
56
+ repo_type="dataset",
57
+ local_dir="data/tutorials",
58
+ allow_patterns=filepath_pattern,
59
+ )
60
+
61
+ # File(s) to align
62
+ filepath = Path("data/tutorials") / filepath_pattern
63
+ audio_dir = filepath.parent
64
+ audio_files = [filepath.name]
65
+
66
+ text = """
67
+ It was the best of times, it was the worst of times, it was the age of
68
+ wisdom, it was the age of foolishness, it was the epoch of belief, it
69
+ was the epoch of incredulity, it was the season of Light, it was the
70
+ season of Darkness, it was the spring of hope, it was the winter of
71
+ despair, we had everything before us, we had nothing before us, we were
72
+ all going direct to Heaven, we were all going direct the other way--in
73
+ short, the period was so far like the present period, that some of its
74
+ noisiest authorities insisted on its being received, for good or for
75
+ evil, in the superlative degree of comparison only.
76
+ """
77
+
78
+ text = text.strip()
79
+
80
+ # The alignments will be organized according to how the text is tokenized
81
+ tokenizer = load_tokenizer(language="english") # sentence tokenizer
82
+ span_list = list(tokenizer.span_tokenize(text)) # start, end character indices for each sentence
83
+ speeches = [[SpeechSegment(speech_id=0, text=text, text_spans=span_list, start=None, end=None)]]
84
+
85
+ # Load models and run pipeline
86
+ model_vad = load_vad_model()
87
+ model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda").half()
88
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
89
+
90
+ pipeline(
91
+ vad_model=model_vad,
92
+ emissions_model=model,
93
+ processor=processor,
94
+ audio_paths=audio_files,
95
+ audio_dir=audio_dir,
96
+ speeches=speeches,
97
+ alignment_strategy="speech",
98
+ text_normalizer_fn=text_normalizer,
99
+ tokenizer=tokenizer,
100
+ start_wildcard=True,
101
+ end_wildcard=True,
102
+ blank_id=processor.tokenizer.pad_token_id,
103
+ word_boundary="|",
104
+ )
105
+ ```
106
+
107
+ > [!TIP]
108
+ > `easyaligner` allows organizing the output at any level of granularity the user wishes (sentence, paragraph, or other). In the above example, we use an `nltk.tokenize.punkt.PunktTokenizer` to split our text into sentences. See the [text processing documentation](https://kb-labb.github.io/easyaligner/get-started/text_processing.html) for a more detailed explanation, and a tutorial for implementing custom tokenizers.
109
+
110
+ ## Documentation
111
+
112
+ Check out the documentation tutorials that cover common scenarios for forced alignment, and the API reference:
113
+
114
+ * [https://kb-labb.github.io/easyaligner/](https://kb-labb.github.io/easyaligner/)
115
+ * [Tutorial 1](https://kb-labb.github.io/easyaligner/get-started/tutorial01.html): Align text and audio when the transcript covers all of the spoken content in the audio.
116
+ * [Tutorial 2](https://kb-labb.github.io/easyaligner/get-started/tutorial02.html): Transcript covers only part of the spoken content in the audio, but we know the relevant audio region in advance.
117
+ * [Tutorial 3](https://kb-labb.github.io/easyaligner/get-started/tutorial03.html): Transcript covers only part of the spoken content in the audio, and we don't know the relevant audio region in advance.
118
+
119
+ ## Outputs
120
+
121
+ By default, `easyaligner` saves the outputs of each stage of the pipeline (VAD, emission extraction, forced alignment) as JSON files in separate directories. The final aligned output can be found in `output/alignments`. The directory structure after running the full pipeline will look as follows:
122
+
123
+ ```
124
+ output
125
+ ├── alignments
126
+ ├── emissions
127
+ └── vad
128
+ ```
129
+
130
+ The `output/emissions` directory will, in addition to the JSON files, also contain output emissions for each JSON file in `.npy` format.
131
+
132
+ All intermediate files can safely be deleted, assuming there is no need to re-run the pipeline from a specific intermediate stage.
133
+
134
+ ## Citation
135
+
136
+ If you use `easyaligner` in your research, consider citing the following blog post:
137
+
138
+ ```
139
+ @online{rekathati2026,
140
+ author = {Rekathati, Faton},
141
+ title = {Easyaligner: {Forced} Alignment of Text and Audio, Made Easy},
142
+ date = {2026-04-08},
143
+ url = {https://kb-labb.github.io/posts/2026-04-08-easyaligner/},
144
+ langid = {en}
145
+ }
146
+ ```
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
- version = "0.2.0"
6
+ version = "0.2.3"
7
7
  name = "easyaligner"
8
8
  requires-python = ">= 3.10"
9
9
  description = "Forced alignment pipeline designed for efficiency and ease of use."
@@ -19,7 +19,8 @@ dependencies = [
19
19
  "nltk>=3.8.2",
20
20
  "pyannote-audio>=3.3.1",
21
21
  "silero-vad~=6.0",
22
- "msgspec"
22
+ "msgspec",
23
+ "rapidfuzz"
23
24
  ]
24
25
 
25
26
  [project.urls]
@@ -844,7 +844,7 @@ def get_segment_alignment(
844
844
  if token_cursor >= len(mapping):
845
845
  break # No more tokens to process
846
846
 
847
- if start_idx < mapping[token_cursor]["start_char"]:
847
+ if token_cursor == 0 and start_idx < mapping[token_cursor]["start_char"]:
848
848
  logger.warning(
849
849
  "Segment indices start before the first token index. This may be due to "
850
850
  "leading whitespace in the original text. Consider stripping leading/trailing "
@@ -218,3 +218,32 @@ class AudioMetadata(msgspec.Struct):
218
218
 
219
219
  def to_dict(self):
220
220
  return {f: getattr(self, f) for f in self.__struct_fields__}
221
+
222
+
223
+ class FuzzyMatch(msgspec.Struct):
224
+ """
225
+ Result of a fuzzy text match.
226
+
227
+ A `FuzzyMatch` contains the word indices, timestamps, and confidence score
228
+ of the best match found between a needle (ground truth text) and a haystack
229
+ (concatenated word texts from ASR output).
230
+
231
+ Attributes
232
+ ----------
233
+ start_index : int
234
+ Start matching word index in the haystack word list.
235
+ end_index : int
236
+ End matching word index in the haystack word list (inclusive).
237
+ score : float
238
+ Fuzzy match score on a 0-100 scale, as returned by rapidfuzz.
239
+ start : float or None, optional
240
+ Start time of the match in seconds.
241
+ end : float or None, optional
242
+ End time of the match in seconds.
243
+ """
244
+
245
+ start_index: int
246
+ end_index: int
247
+ score: float
248
+ start: float | None = None
249
+ end: float | None = None
@@ -44,7 +44,6 @@ def vad_pipeline_generator(
44
44
  chunk_size: int = 30,
45
45
  sample_rate: int = 16000,
46
46
  metadata: list[dict] | None = None,
47
- batch_size: int = 1,
48
47
  num_workers: int = 1,
49
48
  prefetch_factor: int = 2,
50
49
  save_json: bool = True,
@@ -73,8 +72,6 @@ def vad_pipeline_generator(
73
72
  The sample rate to resample the audio to before running VAD.
74
73
  metadata : list[dict] or None, optional
75
74
  Optional list of additional file level metadata to include.
76
- batch_size : int, default 1
77
- The batch size for the DataLoader.
78
75
  num_workers : int, default 1
79
76
  The number of workers for the DataLoader.
80
77
  prefetch_factor : int, default 2
@@ -99,7 +96,7 @@ def vad_pipeline_generator(
99
96
  )
100
97
  vad_dataloader = torch.utils.data.DataLoader(
101
98
  vad_dataset,
102
- batch_size=batch_size,
99
+ batch_size=1,
103
100
  shuffle=False,
104
101
  collate_fn=vad_collate_fn,
105
102
  num_workers=num_workers,
@@ -163,7 +160,6 @@ def vad_pipeline(
163
160
  chunk_size: int = 30,
164
161
  sample_rate: int = 16000,
165
162
  metadata: list[dict] | None = None,
166
- batch_size: int = 1,
167
163
  num_workers: int = 1,
168
164
  prefetch_factor: int = 2,
169
165
  save_json: bool = True,
@@ -192,8 +188,6 @@ def vad_pipeline(
192
188
  The sample rate to resample the audio to before running VAD.
193
189
  metadata : list[dict] or None, optional
194
190
  Optional list of additional file level metadata to include.
195
- batch_size : int, default 1
196
- The batch size for the DataLoader.
197
191
  num_workers : int, default 1
198
192
  The number of workers for the DataLoader.
199
193
  prefetch_factor : int, default 2
@@ -222,7 +216,6 @@ def vad_pipeline(
222
216
  chunk_size=chunk_size,
223
217
  sample_rate=sample_rate,
224
218
  metadata=metadata,
225
- batch_size=batch_size,
226
219
  num_workers=num_workers,
227
220
  prefetch_factor=prefetch_factor,
228
221
  save_json=save_json,
@@ -249,7 +242,6 @@ def emissions_pipeline_generator(
249
242
  sample_rate: int = 16000,
250
243
  chunk_size: int = 30,
251
244
  alignment_strategy: str = "speech",
252
- batch_size_files: int = 1,
253
245
  num_workers_files: int = 1,
254
246
  prefetch_factor_files: int = 2,
255
247
  batch_size_features: int = 8,
@@ -287,8 +279,6 @@ def emissions_pipeline_generator(
287
279
  Strategy for aligning features to text. One of 'speech' or 'chunk'.
288
280
  If `speech`, audio is split into `chunk_size` sized chunks based on SpeechSegments.
289
281
  If `chunk`, audio is taken from existing VAD chunks.
290
- batch_size_files : int, default 1
291
- Batch size for the file DataLoader.
292
282
  num_workers_files : int, default 1
293
283
  Number of workers for the file DataLoader.
294
284
  prefetch_factor_files : int, default 2
@@ -333,7 +323,7 @@ def emissions_pipeline_generator(
333
323
 
334
324
  file_dataloader = torch.utils.data.DataLoader(
335
325
  file_dataset,
336
- batch_size=batch_size_files,
326
+ batch_size=1,
337
327
  shuffle=False,
338
328
  collate_fn=audiofile_collate_fn,
339
329
  num_workers=num_workers_files,
@@ -372,7 +362,7 @@ def emissions_pipeline_generator(
372
362
  speech_ids = []
373
363
 
374
364
  for batch in feature_dataloader:
375
- features = batch["features"].half().to(device)
365
+ features = batch["features"].to(device=device, dtype=model.dtype)
376
366
 
377
367
  with torch.inference_mode():
378
368
  logits = model(features).logits
@@ -420,7 +410,6 @@ def emissions_pipeline(
420
410
  sample_rate: int = 16000,
421
411
  chunk_size: int = 30,
422
412
  alignment_strategy: str = "speech",
423
- batch_size_files: int = 1,
424
413
  num_workers_files: int = 1,
425
414
  prefetch_factor_files: int = 2,
426
415
  batch_size_features: int = 8,
@@ -455,8 +444,6 @@ def emissions_pipeline(
455
444
  Strategy for aligning features to text. One of 'speech' or 'chunk'.
456
445
  If `speech`, audio is split into `chunk_size` sized chunks based on SpeechSegments.
457
446
  If `chunk`, audio is taken from existing VAD chunks.
458
- batch_size_files : int, default 1
459
- Batch size for the file DataLoader.
460
447
  num_workers_files : int, default 1
461
448
  Number of workers for the file DataLoader.
462
449
  prefetch_factor_files : int, default 2
@@ -495,7 +482,6 @@ def emissions_pipeline(
495
482
  sample_rate=sample_rate,
496
483
  chunk_size=chunk_size,
497
484
  alignment_strategy=alignment_strategy,
498
- batch_size_files=batch_size_files,
499
485
  num_workers_files=num_workers_files,
500
486
  prefetch_factor_files=prefetch_factor_files,
501
487
  batch_size_features=batch_size_features,
@@ -773,7 +759,6 @@ def pipeline(
773
759
  word_boundary: str = "|",
774
760
  indent: int = 2,
775
761
  ndigits: int = 5,
776
- batch_size_files: int = 1,
777
762
  num_workers_files: int = 2,
778
763
  prefetch_factor_files: int = 1,
779
764
  batch_size_features: int = 8,
@@ -839,8 +824,6 @@ def pipeline(
839
824
  Indentation level for saved JSON files. `None` to disable pretty formatting.
840
825
  ndigits : int, default 5
841
826
  Number of decimal digits to round the alignment times and scores to.
842
- batch_size_files : int, default 1
843
- Batch size for the file DataLoader.
844
827
  num_workers_files : int, default 2
845
828
  Number of workers for the file DataLoader.
846
829
  prefetch_factor_files : int, default 1
@@ -887,7 +870,6 @@ def pipeline(
887
870
  speeches=speeches,
888
871
  chunk_size=chunk_size,
889
872
  sample_rate=sample_rate,
890
- batch_size=batch_size_files,
891
873
  num_workers=num_workers_files,
892
874
  prefetch_factor=prefetch_factor_files,
893
875
  save_json=save_json,
@@ -909,7 +891,6 @@ def pipeline(
909
891
  sample_rate=sample_rate,
910
892
  chunk_size=chunk_size,
911
893
  alignment_strategy=alignment_strategy,
912
- batch_size_files=batch_size_files,
913
894
  num_workers_files=num_workers_files,
914
895
  prefetch_factor_files=prefetch_factor_files,
915
896
  batch_size_features=batch_size_features,
@@ -929,7 +910,7 @@ def pipeline(
929
910
  )
930
911
  json_dataloader = torch.utils.data.DataLoader(
931
912
  json_dataset,
932
- batch_size=batch_size_files,
913
+ batch_size=1,
933
914
  shuffle=False,
934
915
  collate_fn=metadata_collate_fn,
935
916
  num_workers=num_workers_files,
@@ -0,0 +1,28 @@
1
+ from easyaligner.text.match import (
2
+ FuzzyMatch,
3
+ build_haystack,
4
+ flatten_words,
5
+ fuzzy_match,
6
+ resolve_char_to_word,
7
+ )
8
+ from easyaligner.text.normalization import (
9
+ SpanMapNormalizer,
10
+ add_deletions_to_mapping,
11
+ merge_multitoken_expressions,
12
+ text_normalizer,
13
+ )
14
+ from easyaligner.text.tokenizer import load_tokenizer, paragraph_tokenizer
15
+
16
+ __all__ = [
17
+ "FuzzyMatch",
18
+ "SpanMapNormalizer",
19
+ "add_deletions_to_mapping",
20
+ "build_haystack",
21
+ "flatten_words",
22
+ "fuzzy_match",
23
+ "load_tokenizer",
24
+ "paragraph_tokenizer",
25
+ "merge_multitoken_expressions",
26
+ "resolve_char_to_word",
27
+ "text_normalizer",
28
+ ]