sk-align 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sk_align-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Christoph Minixhofer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: sk-align
3
+ Version: 0.1.0
4
+ Summary: Standalone forced alignment for Scottish Gaelic — no Kaldi/PyKaldi dependency
5
+ Author-email: Christoph Minixhofer <christoph.minixhofer@gmail.com>
6
+ License: MIT
7
+ Project-URL: Model, https://huggingface.co/eist-edinburgh/nnet3_alignment_model
8
+ Keywords: forced-alignment,speech,scottish-gaelic,kaldi,asr
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: numpy>=1.24
23
+ Requires-Dist: scipy>=1.10
24
+ Requires-Dist: torch>=2.0
25
+ Requires-Dist: k2>=1.24
26
+ Provides-Extra: hub
27
+ Requires-Dist: huggingface_hub>=0.20; extra == "hub"
28
+ Provides-Extra: all
29
+ Requires-Dist: huggingface_hub>=0.20; extra == "all"
30
+ Provides-Extra: test
31
+ Requires-Dist: pytest>=7.0; extra == "test"
32
+ Requires-Dist: huggingface_hub>=0.20; extra == "test"
33
+ Provides-Extra: dev
34
+ Requires-Dist: sk-align[test]; extra == "dev"
35
+ Requires-Dist: ruff; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # sk-align
39
+
40
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
41
+ [![PyPI](https://img.shields.io/pypi/v/sk-align.svg)](https://pypi.org/project/sk-align/)
42
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
43
+ [![Hugging Face Model](https://img.shields.io/badge/%F0%9F%A4%97-Model_on_Hub-yellow.svg)](https://huggingface.co/eist-edinburgh/nnet3_alignment_model)
44
+ [![Tests](https://img.shields.io/badge/tests-49%20passing-brightgreen.svg)](#testing)
45
+
46
+ **Standalone forced alignment for Scottish Gaelic** — no Kaldi or PyKaldi dependency.
47
+
48
+ sk-align reimplements Kaldi's nnet3 forced-alignment pipeline entirely in
49
+ Python/NumPy/PyTorch, reading Kaldi model files directly. It produces
50
+ word-level timestamps at parity with PyKaldi while being easier to install
51
+ and deploy.
52
+
53
+ ---
54
+
55
+ ## Features
56
+
57
+ - **Zero Kaldi dependency** — pure Python reads Kaldi binary formats (`final.mdl`, `tree`, `L.fst`, etc.)
58
+ - **`from_pretrained()`** — one-line model download from Hugging Face Hub
59
+ - **MFCC extraction** — vectorised NumPy implementation matching Kaldi output
60
+ - **TDNN-F nnet3 inference** — full PyTorch reimplementation of the forward pass
61
+ - **k2 Viterbi decoder** — fast FSA-based decoding via `intersect_dense` + `shortest_path`
62
+ - **Word-level timestamps** — `[{"word": "hello", "start": 0.12, "end": 0.45}, ...]`
63
+ - **Parity-tested** — 49 tests verify numerical match against PyKaldi reference
64
+
65
+ ## Installation
66
+
67
+ ```bash
68
+ pip install sk-align # core (numpy + scipy + torch + k2)
69
+ pip install "sk-align[all]"    # + huggingface_hub for from_pretrained()
70
+ ```
71
+
72
+ Or install from source:
73
+
74
+ ```bash
75
+ git clone https://github.com/your-org/sk-align.git
76
+ cd sk-align/sk-align
77
+ pip install -e ".[all]" # editable with all extras
78
+ ```
79
+
80
+ ### Optional extras
81
+
82
+ | Extra | Installs | Needed for |
83
+ | --------- | -------------------------------- | ------------------------------------- |
84
+ | `hub` | `huggingface_hub>=0.20` | `Aligner.from_pretrained()` |
85
+ | `all` | `huggingface_hub` | Full end-to-end pipeline |
86
+ | `test` | `pytest` + `huggingface_hub` | Running the test suite |
87
+ | `dev` | `test` extras + `ruff` | Development |
88
+
89
+ ## Quick start
90
+
91
+ ```python
92
+ from sk_align import Aligner
93
+
94
+ # Download model from Hugging Face and load (cached after first call)
95
+ aligner = Aligner.from_pretrained()
96
+
97
+ # audio: float32 numpy array, 16 kHz, mono
98
+ timestamps = aligner.align(audio, ["cumaidh", "sinn", "a'", "dol"])
99
+ # [{"word": "cumaidh", "start": 0.33, "end": 0.72},
100
+ # {"word": "sinn", "start": 0.72, "end": 0.99},
101
+ # ...]
102
+ ```
103
+
104
+ ### Loading a local model
105
+
106
+ ```python
107
+ from sk_align import Aligner
108
+ from sk_align.nnet3_torch import TorchNnetScorer
109
+
110
+ scorer = TorchNnetScorer.from_model_file("/path/to/model/final.mdl")
111
+ aligner = Aligner.from_model_dir("/path/to/model", nnet_scorer=scorer)
112
+
113
+ timestamps = aligner.align(audio, words)
114
+ ```
115
+
116
+ ### Using pre-computed log-likelihoods
117
+
118
+ ```python
119
+ import numpy as np
120
+ from sk_align import Aligner
121
+
122
+ aligner = Aligner.from_model_dir("/path/to/model") # no scorer needed
123
+ loglikes = np.load("loglikes.npy") # (num_frames, num_pdfs)
124
+
125
+ timestamps = aligner.align_with_loglikes(loglikes, words)
126
+ ```
127
+
128
+ ## Architecture
129
+
130
+ The alignment pipeline reimplements each stage of Kaldi's forced alignment
131
+ in pure Python:
132
+
133
+ ```
134
+ Audio (float32, 16 kHz)
135
+
136
+
137
+ ┌─────────────────────┐
138
+ │ MFCC Extraction │ sk_align.mfcc (NumPy, batch-vectorised)
139
+ └─────────┬───────────┘
140
+
141
+ ┌─────────────────────┐
142
+ │ Nnet3 Forward Pass │ sk_align.nnet3_torch (PyTorch TDNN-F)
143
+ └─────────┬───────────┘
144
+
145
+ ┌─────────────────────┐
146
+ │ Graph Compilation │ sk_align.graph (L ∘ G, context expansion)
147
+ └─────────┬───────────┘
148
+
149
+ ┌─────────────────────┐
150
+ │ Viterbi Decoding │ sk_align.k2_decoder (k2 FSA intersection)
151
+ └─────────┬───────────┘
152
+
153
+ ┌─────────────────────┐
154
+ │ Word Alignment │ sk_align.word_align (boundary extraction)
155
+ └─────────────────────┘
156
+
157
+
158
+ [{"word": "...", "start": 0.12, "end": 0.45}, ...]
159
+ ```
160
+
161
+ ### Modules
162
+
163
+ | Module | Description |
164
+ | ----------------------- | ------------------------------------------------------------------ |
165
+ | `sk_align.aligner` | High-level `Aligner` class — main entry point |
166
+ | `sk_align.mfcc` | MFCC feature extraction (batch NumPy, Kaldi-compatible) |
167
+ | `sk_align.nnet3_model` | Kaldi nnet3 binary parser |
168
+ | `sk_align.nnet3_torch` | PyTorch reimplementation of TDNN-F forward pass |
169
+ | `sk_align.fst` | OpenFst binary format reader + FST representation |
170
+ | `sk_align.graph` | Per-utterance decoding graph compiler (L ∘ G + context expansion) |
171
+ | `sk_align.tree` | Kaldi `ContextDependency` tree reader |
172
+ | `sk_align.transition_model` | Kaldi `TransitionModel` reader |
173
+ | `sk_align.k2_decoder` | k2-based Viterbi decoder |
174
+ | `sk_align.word_align` | Word boundary extraction + timestamp conversion |
175
+ | `sk_align.kaldi_io` | Low-level Kaldi binary I/O helpers |
176
+
177
+ ## Model
178
+
179
+ The default model is hosted at
180
+ [`eist-edinburgh/nnet3_alignment_model`](https://huggingface.co/eist-edinburgh/nnet3_alignment_model)
181
+ on Hugging Face Hub. It is a TDNN-F nnet3 alignment model (3456 PDFs) trained
182
+ for Scottish Gaelic.
183
+
184
+ **Expected model files:**
185
+
186
+ ```
187
+ final.mdl TransitionModel + nnet3 weights
188
+ tree ContextDependency tree
189
+ L.fst Lexicon FST (OpenFst binary)
190
+ words.txt Word symbol table
191
+ disambig.int Disambiguation symbol IDs
192
+ word_boundary.int Phone word-boundary types
193
+ ```
194
+
195
+ ## Testing
196
+
197
+ The test suite verifies numerical parity with PyKaldi at every stage.
198
+
199
+ ```bash
200
+ pip install -e ".[test]"
201
+ pytest # 49 tests — MFCC, I/O, graph, decoder, end-to-end parity
202
+ ```
203
+
204
+ Tests include:
205
+ - **MFCC parity** — feature output matches Kaldi within floating-point tolerance
206
+ - **I/O round-trip** — all Kaldi binary readers produce correct data structures
207
+ - **Graph compilation** — decoding graphs match expected state/arc counts
208
+ - **Decoder parity** — k2 decoder alignment matches reference Viterbi output
209
+ - **End-to-end parity** — word timestamps match PyKaldi within 30ms
210
+
211
+ ## Performance
212
+
213
+ Benchmark on a 5-second Scottish Gaelic utterance (25 words), CPU:
214
+
215
+ | Stage | Time | % of total |
216
+ | ------------- | -------- | ---------- |
217
+ | MFCC | 25 ms | 4% |
218
+ | Nnet3 forward | 434 ms | 75% |
219
+ | Graph compile | 46 ms | 8% |
220
+ | k2 decode | 72 ms | 13% |
221
+ | Word align | <1 ms | <1% |
222
+ | **Total** | **578 ms** | — |
223
+
224
+ End-to-end latency is on par with PyKaldi (~560 ms per utterance).
225
+
226
+ ## License
227
+
228
+ MIT
@@ -0,0 +1,191 @@
1
+ # sk-align
2
+
3
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
4
+ [![PyPI](https://img.shields.io/pypi/v/sk-align.svg)](https://pypi.org/project/sk-align/)
5
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
6
+ [![Hugging Face Model](https://img.shields.io/badge/%F0%9F%A4%97-Model_on_Hub-yellow.svg)](https://huggingface.co/eist-edinburgh/nnet3_alignment_model)
7
+ [![Tests](https://img.shields.io/badge/tests-49%20passing-brightgreen.svg)](#testing)
8
+
9
+ **Standalone forced alignment for Scottish Gaelic** — no Kaldi or PyKaldi dependency.
10
+
11
+ sk-align reimplements Kaldi's nnet3 forced-alignment pipeline entirely in
12
+ Python/NumPy/PyTorch, reading Kaldi model files directly. It produces
13
+ word-level timestamps at parity with PyKaldi while being easier to install
14
+ and deploy.
15
+
16
+ ---
17
+
18
+ ## Features
19
+
20
+ - **Zero Kaldi dependency** — pure Python reads Kaldi binary formats (`final.mdl`, `tree`, `L.fst`, etc.)
21
+ - **`from_pretrained()`** — one-line model download from Hugging Face Hub
22
+ - **MFCC extraction** — vectorised NumPy implementation matching Kaldi output
23
+ - **TDNN-F nnet3 inference** — full PyTorch reimplementation of the forward pass
24
+ - **k2 Viterbi decoder** — fast FSA-based decoding via `intersect_dense` + `shortest_path`
25
+ - **Word-level timestamps** — `[{"word": "hello", "start": 0.12, "end": 0.45}, ...]`
26
+ - **Parity-tested** — 49 tests verify numerical match against PyKaldi reference
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install sk-align # core (numpy + scipy + torch + k2)
32
+ pip install "sk-align[all]"    # + huggingface_hub for from_pretrained()
33
+ ```
34
+
35
+ Or install from source:
36
+
37
+ ```bash
38
+ git clone https://github.com/your-org/sk-align.git
39
+ cd sk-align/sk-align
40
+ pip install -e ".[all]" # editable with all extras
41
+ ```
42
+
43
+ ### Optional extras
44
+
45
+ | Extra | Installs | Needed for |
46
+ | --------- | -------------------------------- | ------------------------------------- |
47
+ | `hub` | `huggingface_hub>=0.20` | `Aligner.from_pretrained()` |
48
+ | `all` | `huggingface_hub` | Full end-to-end pipeline |
49
+ | `test` | `pytest` + `huggingface_hub` | Running the test suite |
50
+ | `dev` | `test` extras + `ruff` | Development |
51
+
52
+ ## Quick start
53
+
54
+ ```python
55
+ from sk_align import Aligner
56
+
57
+ # Download model from Hugging Face and load (cached after first call)
58
+ aligner = Aligner.from_pretrained()
59
+
60
+ # audio: float32 numpy array, 16 kHz, mono
61
+ timestamps = aligner.align(audio, ["cumaidh", "sinn", "a'", "dol"])
62
+ # [{"word": "cumaidh", "start": 0.33, "end": 0.72},
63
+ # {"word": "sinn", "start": 0.72, "end": 0.99},
64
+ # ...]
65
+ ```
66
+
67
+ ### Loading a local model
68
+
69
+ ```python
70
+ from sk_align import Aligner
71
+ from sk_align.nnet3_torch import TorchNnetScorer
72
+
73
+ scorer = TorchNnetScorer.from_model_file("/path/to/model/final.mdl")
74
+ aligner = Aligner.from_model_dir("/path/to/model", nnet_scorer=scorer)
75
+
76
+ timestamps = aligner.align(audio, words)
77
+ ```
78
+
79
+ ### Using pre-computed log-likelihoods
80
+
81
+ ```python
82
+ import numpy as np
83
+ from sk_align import Aligner
84
+
85
+ aligner = Aligner.from_model_dir("/path/to/model") # no scorer needed
86
+ loglikes = np.load("loglikes.npy") # (num_frames, num_pdfs)
87
+
88
+ timestamps = aligner.align_with_loglikes(loglikes, words)
89
+ ```
90
+
91
+ ## Architecture
92
+
93
+ The alignment pipeline reimplements each stage of Kaldi's forced alignment
94
+ in pure Python:
95
+
96
+ ```
97
+ Audio (float32, 16 kHz)
98
+
99
+
100
+ ┌─────────────────────┐
101
+ │ MFCC Extraction │ sk_align.mfcc (NumPy, batch-vectorised)
102
+ └─────────┬───────────┘
103
+
104
+ ┌─────────────────────┐
105
+ │ Nnet3 Forward Pass │ sk_align.nnet3_torch (PyTorch TDNN-F)
106
+ └─────────┬───────────┘
107
+
108
+ ┌─────────────────────┐
109
+ │ Graph Compilation │ sk_align.graph (L ∘ G, context expansion)
110
+ └─────────┬───────────┘
111
+
112
+ ┌─────────────────────┐
113
+ │ Viterbi Decoding │ sk_align.k2_decoder (k2 FSA intersection)
114
+ └─────────┬───────────┘
115
+
116
+ ┌─────────────────────┐
117
+ │ Word Alignment │ sk_align.word_align (boundary extraction)
118
+ └─────────────────────┘
119
+
120
+
121
+ [{"word": "...", "start": 0.12, "end": 0.45}, ...]
122
+ ```
123
+
124
+ ### Modules
125
+
126
+ | Module | Description |
127
+ | ----------------------- | ------------------------------------------------------------------ |
128
+ | `sk_align.aligner` | High-level `Aligner` class — main entry point |
129
+ | `sk_align.mfcc` | MFCC feature extraction (batch NumPy, Kaldi-compatible) |
130
+ | `sk_align.nnet3_model` | Kaldi nnet3 binary parser |
131
+ | `sk_align.nnet3_torch` | PyTorch reimplementation of TDNN-F forward pass |
132
+ | `sk_align.fst` | OpenFst binary format reader + FST representation |
133
+ | `sk_align.graph` | Per-utterance decoding graph compiler (L ∘ G + context expansion) |
134
+ | `sk_align.tree` | Kaldi `ContextDependency` tree reader |
135
+ | `sk_align.transition_model` | Kaldi `TransitionModel` reader |
136
+ | `sk_align.k2_decoder` | k2-based Viterbi decoder |
137
+ | `sk_align.word_align` | Word boundary extraction + timestamp conversion |
138
+ | `sk_align.kaldi_io` | Low-level Kaldi binary I/O helpers |
139
+
140
+ ## Model
141
+
142
+ The default model is hosted at
143
+ [`eist-edinburgh/nnet3_alignment_model`](https://huggingface.co/eist-edinburgh/nnet3_alignment_model)
144
+ on Hugging Face Hub. It is a TDNN-F nnet3 alignment model (3456 PDFs) trained
145
+ for Scottish Gaelic.
146
+
147
+ **Expected model files:**
148
+
149
+ ```
150
+ final.mdl TransitionModel + nnet3 weights
151
+ tree ContextDependency tree
152
+ L.fst Lexicon FST (OpenFst binary)
153
+ words.txt Word symbol table
154
+ disambig.int Disambiguation symbol IDs
155
+ word_boundary.int Phone word-boundary types
156
+ ```
157
+
158
+ ## Testing
159
+
160
+ The test suite verifies numerical parity with PyKaldi at every stage.
161
+
162
+ ```bash
163
+ pip install -e ".[test]"
164
+ pytest # 49 tests — MFCC, I/O, graph, decoder, end-to-end parity
165
+ ```
166
+
167
+ Tests include:
168
+ - **MFCC parity** — feature output matches Kaldi within floating-point tolerance
169
+ - **I/O round-trip** — all Kaldi binary readers produce correct data structures
170
+ - **Graph compilation** — decoding graphs match expected state/arc counts
171
+ - **Decoder parity** — k2 decoder alignment matches reference Viterbi output
172
+ - **End-to-end parity** — word timestamps match PyKaldi within 30ms
173
+
174
+ ## Performance
175
+
176
+ Benchmark on a 5-second Scottish Gaelic utterance (25 words), CPU:
177
+
178
+ | Stage | Time | % of total |
179
+ | ------------- | -------- | ---------- |
180
+ | MFCC | 25 ms | 4% |
181
+ | Nnet3 forward | 434 ms | 75% |
182
+ | Graph compile | 46 ms | 8% |
183
+ | k2 decode | 72 ms | 13% |
184
+ | Word align | <1 ms | <1% |
185
+ | **Total** | **578 ms** | — |
186
+
187
+ End-to-end latency is on par with PyKaldi (~560 ms per utterance).
188
+
189
+ ## License
190
+
191
+ MIT
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sk-align"
7
+ version = "0.1.0"
8
+ description = "Standalone forced alignment for Scottish Gaelic — no Kaldi/PyKaldi dependency"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Christoph Minixhofer", email = "christoph.minixhofer@gmail.com"},
14
+ ]
15
+ keywords = ["forced-alignment", "speech", "scottish-gaelic", "kaldi", "asr"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
27
+ ]
28
+ dependencies = [
29
+ "numpy>=1.24",
30
+ "scipy>=1.10",
31
+ "torch>=2.0",
32
+ "k2>=1.24",
33
+ ]
34
+
35
+ [project.urls]
36
+ Model = "https://huggingface.co/eist-edinburgh/nnet3_alignment_model"
37
+
38
+ [project.optional-dependencies]
39
+ hub = ["huggingface_hub>=0.20"]
40
+ all = [
41
+ "huggingface_hub>=0.20",
42
+ ]
43
+ test = [
44
+ "pytest>=7.0",
45
+ "huggingface_hub>=0.20",
46
+ ]
47
+ dev = [
48
+ "sk-align[test]",
49
+ "ruff",
50
+ ]
51
+
52
+ [tool.setuptools.packages.find]
53
+ where = ["src"]
54
+
55
+ [tool.pytest.ini_options]
56
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,14 @@
1
+ """sk-align — standalone forced alignment (no Kaldi/PyKaldi dependency)."""
2
+
3
+ from sk_align.aligner import Aligner
4
+
5
+ __all__ = ["Aligner"]
6
+ __version__ = "0.1.0"
7
+
8
+
9
+ def _torch_available() -> bool:  # Report whether the `torch` package can be imported.
10
+ try:
11
+ import torch # noqa: F401
12
+ return True
13
+ except ImportError:  # the import failed, so report torch as unavailable
14
+ return False