tilavet-aligner 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tilavet_aligner-0.1.0/LICENSE +21 -0
- tilavet_aligner-0.1.0/PKG-INFO +143 -0
- tilavet_aligner-0.1.0/README.md +102 -0
- tilavet_aligner-0.1.0/pyproject.toml +75 -0
- tilavet_aligner-0.1.0/setup.cfg +4 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner/__init__.py +43 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner/cli.py +92 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner/contract.py +155 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner/ctc.py +51 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner/posterior.py +160 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner/viterbi.py +308 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner/word_spans.py +205 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner.egg-info/PKG-INFO +143 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner.egg-info/SOURCES.txt +23 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner.egg-info/dependency_links.txt +1 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner.egg-info/entry_points.txt +2 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner.egg-info/requires.txt +13 -0
- tilavet_aligner-0.1.0/src/tilavet_aligner.egg-info/top_level.txt +1 -0
- tilavet_aligner-0.1.0/tests/test_cli.py +133 -0
- tilavet_aligner-0.1.0/tests/test_contract.py +87 -0
- tilavet_aligner-0.1.0/tests/test_ctc.py +20 -0
- tilavet_aligner-0.1.0/tests/test_posterior.py +82 -0
- tilavet_aligner-0.1.0/tests/test_synthetic_e2e.py +41 -0
- tilavet_aligner-0.1.0/tests/test_viterbi_toy.py +128 -0
- tilavet_aligner-0.1.0/tests/test_word_spans.py +85 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tarık İsmet ALKAN — Tilavet
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tilavet-aligner
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Monotonic / CTC forced-alignment helpers for the Tilavet Quran teleprompter pipeline.
|
|
5
|
+
Author-email: Tilavet <web3alkan@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/tialkan/tilavet-aligner
|
|
8
|
+
Project-URL: Documentation, https://github.com/tialkan/tilavet-aligner#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/tialkan/tilavet-aligner
|
|
10
|
+
Project-URL: Issues, https://github.com/tialkan/tilavet-aligner/issues
|
|
11
|
+
Project-URL: Companion, https://github.com/tialkan/tilavet-phonemizer
|
|
12
|
+
Keywords: quran,alignment,forced-alignment,ctc,viterbi,phoneme,tajwid,arabic,speech-recognition
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
26
|
+
Classifier: Natural Language :: Arabic
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Provides-Extra: posterior
|
|
31
|
+
Requires-Dist: numpy>=1.24; extra == "posterior"
|
|
32
|
+
Provides-Extra: phonemizer
|
|
33
|
+
Requires-Dist: tilavet-phonemizer>=0.2.0; extra == "phonemizer"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: numpy>=1.24; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
38
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
39
|
+
Requires-Dist: tilavet-phonemizer>=0.2.0; extra == "dev"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
# Tilavet Aligner
|
|
43
|
+
|
|
44
|
+
Small MVP package for monotonic Quran phoneme alignment.
|
|
45
|
+
|
|
46
|
+
It consumes `tilavet-phonemizer` alignment targets:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
target = Phonemizer().phonemize("رَيْبَ ۛ فِيهِ").to_alignment_dict()
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
and aligns frame-level phoneme log probabilities to that known target sequence.
|
|
53
|
+
This package does not perform free ASR decoding.
|
|
54
|
+
|
|
55
|
+
## MVP Modules
|
|
56
|
+
|
|
57
|
+
- `contract.py`: validates the phonemizer alignment target JSON.
|
|
58
|
+
- `posterior.py`: adapts sparse JSON, dense JSON, dense logits, and optional
|
|
59
|
+
`.npy` matrices to frame log-prob dictionaries.
|
|
60
|
+
- `viterbi.py`: aligns frame log-probabilities to target phoneme symbols,
|
|
61
|
+
including optional blank-aware CTC mode.
|
|
62
|
+
- `word_spans.py`: converts symbol frame spans to word-level timestamps.
|
|
63
|
+
- `ctc.py`: vocabulary and frame normalization helpers.
|
|
64
|
+
|
|
65
|
+
## Posterior Inputs
|
|
66
|
+
|
|
67
|
+
Sparse frame dictionaries:
|
|
68
|
+
|
|
69
|
+
```json
|
|
70
|
+
[
|
|
71
|
+
{"a": -0.1, "b": -5.0},
|
|
72
|
+
{"a": -4.0, "b": -0.1}
|
|
73
|
+
]
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Dense matrix JSON or `.npy` input requires a vocabulary file whose order matches
|
|
77
|
+
the posterior columns:
|
|
78
|
+
|
|
79
|
+
```json
|
|
80
|
+
["<blank>", "a", "b"]
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
CLI examples:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
tilavet-align target.json sparse-frames.json
|
|
87
|
+
tilavet-align target.json dense-logprobs.json --vocab-json vocab.json
|
|
88
|
+
tilavet-align target.json dense-logits.npy --vocab-json vocab.json --from-logits
|
|
89
|
+
tilavet-align target.json ctc-logprobs.json --ctc --blank-symbol "<blank>"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
`--ctc` uses expanded states `[blank, s0, blank, s1, ...]`; blank frames do
|
|
93
|
+
not expand word-level symbol spans.
|
|
94
|
+
|
|
95
|
+
## Output Confidence
|
|
96
|
+
|
|
97
|
+
Alignment output keeps the raw Viterbi `score` and adds normalized confidence
|
|
98
|
+
fields:
|
|
99
|
+
|
|
100
|
+
```json
|
|
101
|
+
{
|
|
102
|
+
"score": -0.6,
|
|
103
|
+
"frame_count": 5,
|
|
104
|
+
"mean_log_prob": -0.12,
|
|
105
|
+
"confidence": 0.8869,
|
|
106
|
+
"words": [
|
|
107
|
+
{
|
|
108
|
+
"token": "ab",
|
|
109
|
+
"score": -0.2,
|
|
110
|
+
"scored_frames": 2,
|
|
111
|
+
"mean_log_prob": -0.1,
|
|
112
|
+
"confidence": 0.9048
|
|
113
|
+
}
|
|
114
|
+
]
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
In CTC mode, `frame_count` may include blank frames inside the timestamp span;
|
|
119
|
+
`scored_frames` counts only frames assigned to the word's target symbols.
|
|
120
|
+
|
|
121
|
+
## Synthetic End-to-End Demo
|
|
122
|
+
|
|
123
|
+
With the sibling phonemizer source tree present:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
PYTHONPATH=src python3 examples/synthetic_alignment.py \
|
|
127
|
+
--phonemizer-src "../Tilavet Phonemizer/src"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
This runs:
|
|
131
|
+
|
|
132
|
+
```text
|
|
133
|
+
Arabic text -> phonemizer.to_alignment_dict() -> synthetic CTC posterior -> word timestamps
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
It is not an acoustic-model test; it is a package-boundary smoke test.
|
|
137
|
+
|
|
138
|
+
## Development
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
python3 -m pytest -q
|
|
142
|
+
python3 -m ruff check .
|
|
143
|
+
```
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Tilavet Aligner
|
|
2
|
+
|
|
3
|
+
Small MVP package for monotonic Quran phoneme alignment.
|
|
4
|
+
|
|
5
|
+
It consumes `tilavet-phonemizer` alignment targets:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
target = Phonemizer().phonemize("رَيْبَ ۛ فِيهِ").to_alignment_dict()
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
and aligns frame-level phoneme log probabilities to that known target sequence.
|
|
12
|
+
This package does not perform free ASR decoding.
|
|
13
|
+
|
|
14
|
+
## MVP Modules
|
|
15
|
+
|
|
16
|
+
- `contract.py`: validates the phonemizer alignment target JSON.
|
|
17
|
+
- `posterior.py`: adapts sparse JSON, dense JSON, dense logits, and optional
|
|
18
|
+
`.npy` matrices to frame log-prob dictionaries.
|
|
19
|
+
- `viterbi.py`: aligns frame log-probabilities to target phoneme symbols,
|
|
20
|
+
including optional blank-aware CTC mode.
|
|
21
|
+
- `word_spans.py`: converts symbol frame spans to word-level timestamps.
|
|
22
|
+
- `ctc.py`: vocabulary and frame normalization helpers.
|
|
23
|
+
|
|
24
|
+
## Posterior Inputs
|
|
25
|
+
|
|
26
|
+
Sparse frame dictionaries:
|
|
27
|
+
|
|
28
|
+
```json
|
|
29
|
+
[
|
|
30
|
+
{"a": -0.1, "b": -5.0},
|
|
31
|
+
{"a": -4.0, "b": -0.1}
|
|
32
|
+
]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Dense matrix JSON or `.npy` input requires a vocabulary file whose order matches
|
|
36
|
+
the posterior columns:
|
|
37
|
+
|
|
38
|
+
```json
|
|
39
|
+
["<blank>", "a", "b"]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
CLI examples:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
tilavet-align target.json sparse-frames.json
|
|
46
|
+
tilavet-align target.json dense-logprobs.json --vocab-json vocab.json
|
|
47
|
+
tilavet-align target.json dense-logits.npy --vocab-json vocab.json --from-logits
|
|
48
|
+
tilavet-align target.json ctc-logprobs.json --ctc --blank-symbol "<blank>"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
`--ctc` uses expanded states `[blank, s0, blank, s1, ...]`; blank frames do
|
|
52
|
+
not expand word-level symbol spans.
|
|
53
|
+
|
|
54
|
+
## Output Confidence
|
|
55
|
+
|
|
56
|
+
Alignment output keeps the raw Viterbi `score` and adds normalized confidence
|
|
57
|
+
fields:
|
|
58
|
+
|
|
59
|
+
```json
|
|
60
|
+
{
|
|
61
|
+
"score": -0.6,
|
|
62
|
+
"frame_count": 5,
|
|
63
|
+
"mean_log_prob": -0.12,
|
|
64
|
+
"confidence": 0.8869,
|
|
65
|
+
"words": [
|
|
66
|
+
{
|
|
67
|
+
"token": "ab",
|
|
68
|
+
"score": -0.2,
|
|
69
|
+
"scored_frames": 2,
|
|
70
|
+
"mean_log_prob": -0.1,
|
|
71
|
+
"confidence": 0.9048
|
|
72
|
+
}
|
|
73
|
+
]
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
In CTC mode, `frame_count` may include blank frames inside the timestamp span;
|
|
78
|
+
`scored_frames` counts only frames assigned to the word's target symbols.
|
|
79
|
+
|
|
80
|
+
## Synthetic End-to-End Demo
|
|
81
|
+
|
|
82
|
+
With the sibling phonemizer source tree present:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
PYTHONPATH=src python3 examples/synthetic_alignment.py \
|
|
86
|
+
--phonemizer-src "../Tilavet Phonemizer/src"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
This runs:
|
|
90
|
+
|
|
91
|
+
```text
|
|
92
|
+
Arabic text -> phonemizer.to_alignment_dict() -> synthetic CTC posterior -> word timestamps
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
It is not an acoustic-model test; it is a package-boundary smoke test.
|
|
96
|
+
|
|
97
|
+
## Development
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
python3 -m pytest -q
|
|
101
|
+
python3 -m ruff check .
|
|
102
|
+
```
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tilavet-aligner"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Monotonic / CTC forced-alignment helpers for the Tilavet Quran teleprompter pipeline."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
authors = [{ name = "Tilavet", email = "web3alkan@gmail.com" }]
|
|
12
|
+
license = { text = "MIT" }
|
|
13
|
+
keywords = [
|
|
14
|
+
"quran",
|
|
15
|
+
"alignment",
|
|
16
|
+
"forced-alignment",
|
|
17
|
+
"ctc",
|
|
18
|
+
"viterbi",
|
|
19
|
+
"phoneme",
|
|
20
|
+
"tajwid",
|
|
21
|
+
"arabic",
|
|
22
|
+
"speech-recognition",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Developers",
|
|
27
|
+
"Intended Audience :: Science/Research",
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3.9",
|
|
32
|
+
"Programming Language :: Python :: 3.10",
|
|
33
|
+
"Programming Language :: Python :: 3.11",
|
|
34
|
+
"Programming Language :: Python :: 3.12",
|
|
35
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
36
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
37
|
+
"Topic :: Text Processing :: Linguistic",
|
|
38
|
+
"Natural Language :: Arabic",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
dependencies = []
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/tialkan/tilavet-aligner"
|
|
45
|
+
Documentation = "https://github.com/tialkan/tilavet-aligner#readme"
|
|
46
|
+
Repository = "https://github.com/tialkan/tilavet-aligner"
|
|
47
|
+
Issues = "https://github.com/tialkan/tilavet-aligner/issues"
|
|
48
|
+
Companion = "https://github.com/tialkan/tilavet-phonemizer"
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
tilavet-align = "tilavet_aligner.cli:main"
|
|
52
|
+
|
|
53
|
+
[project.optional-dependencies]
|
|
54
|
+
posterior = [
|
|
55
|
+
"numpy>=1.24",
|
|
56
|
+
]
|
|
57
|
+
phonemizer = [
|
|
58
|
+
"tilavet-phonemizer>=0.2.0",
|
|
59
|
+
]
|
|
60
|
+
dev = [
|
|
61
|
+
"numpy>=1.24",
|
|
62
|
+
"pytest>=7.0",
|
|
63
|
+
"pytest-cov>=4.0",
|
|
64
|
+
"ruff>=0.1.0",
|
|
65
|
+
"tilavet-phonemizer>=0.2.0",
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
[tool.setuptools.packages.find]
|
|
69
|
+
where = ["src"]
|
|
70
|
+
|
|
71
|
+
[tool.ruff]
|
|
72
|
+
line-length = 100
|
|
73
|
+
|
|
74
|
+
[tool.ruff.lint]
|
|
75
|
+
select = ["E", "F", "I", "N", "W"]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Tilavet Quran phoneme alignment helpers."""
|
|
2
|
+
|
|
3
|
+
from .contract import AlignmentTarget, ContractError, PauseMarker, WordTarget, parse_target
|
|
4
|
+
from .ctc import build_vocabulary, normalize_frames, symbol_to_index
|
|
5
|
+
from .posterior import (
|
|
6
|
+
PosteriorError,
|
|
7
|
+
dense_to_frames,
|
|
8
|
+
frames_from_payload,
|
|
9
|
+
load_frames_json,
|
|
10
|
+
load_npy_frames,
|
|
11
|
+
load_vocabulary_json,
|
|
12
|
+
)
|
|
13
|
+
from .viterbi import SymbolFrame, ViterbiResult, align_ctc_frames, align_frames
|
|
14
|
+
from .word_spans import AlignmentResult, PauseHint, WordAlignment, pause_hints, word_alignments
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"AlignmentResult",
|
|
20
|
+
"AlignmentTarget",
|
|
21
|
+
"ContractError",
|
|
22
|
+
"PauseHint",
|
|
23
|
+
"PauseMarker",
|
|
24
|
+
"PosteriorError",
|
|
25
|
+
"SymbolFrame",
|
|
26
|
+
"ViterbiResult",
|
|
27
|
+
"WordAlignment",
|
|
28
|
+
"WordTarget",
|
|
29
|
+
"__version__",
|
|
30
|
+
"align_ctc_frames",
|
|
31
|
+
"align_frames",
|
|
32
|
+
"build_vocabulary",
|
|
33
|
+
"dense_to_frames",
|
|
34
|
+
"frames_from_payload",
|
|
35
|
+
"load_frames_json",
|
|
36
|
+
"load_npy_frames",
|
|
37
|
+
"load_vocabulary_json",
|
|
38
|
+
"normalize_frames",
|
|
39
|
+
"parse_target",
|
|
40
|
+
"pause_hints",
|
|
41
|
+
"symbol_to_index",
|
|
42
|
+
"word_alignments",
|
|
43
|
+
]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""CLI for aligning a target JSON file to frame log-probabilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
from collections.abc import Mapping
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
from .contract import parse_target
|
|
11
|
+
from .posterior import load_frames_json, load_npy_frames, load_vocabulary_json
|
|
12
|
+
from .viterbi import align_ctc_frames, align_frames
|
|
13
|
+
from .word_spans import alignment_result
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _build_parser() -> argparse.ArgumentParser:
|
|
17
|
+
parser = argparse.ArgumentParser(
|
|
18
|
+
prog="tilavet-align",
|
|
19
|
+
description="Align frame log-probabilities to a known Tilavet phoneme target.",
|
|
20
|
+
)
|
|
21
|
+
parser.add_argument("target_json", help="Path to a to_alignment_dict() JSON payload.")
|
|
22
|
+
parser.add_argument(
|
|
23
|
+
"posterior",
|
|
24
|
+
help="Path to sparse frame JSON, dense matrix JSON, or dense .npy posterior.",
|
|
25
|
+
)
|
|
26
|
+
parser.add_argument(
|
|
27
|
+
"--vocab-json",
|
|
28
|
+
help="Vocabulary JSON for dense matrix / .npy input.",
|
|
29
|
+
)
|
|
30
|
+
parser.add_argument(
|
|
31
|
+
"--from-logits",
|
|
32
|
+
action="store_true",
|
|
33
|
+
help="Treat dense matrix rows as logits and apply log-softmax.",
|
|
34
|
+
)
|
|
35
|
+
parser.add_argument(
|
|
36
|
+
"--ctc",
|
|
37
|
+
action="store_true",
|
|
38
|
+
help="Use blank-aware CTC alignment instead of simple monotonic Viterbi.",
|
|
39
|
+
)
|
|
40
|
+
parser.add_argument(
|
|
41
|
+
"--blank-symbol",
|
|
42
|
+
default="<blank>",
|
|
43
|
+
help="CTC blank symbol name. Default: <blank>.",
|
|
44
|
+
)
|
|
45
|
+
parser.add_argument(
|
|
46
|
+
"--hop-seconds",
|
|
47
|
+
type=float,
|
|
48
|
+
default=0.04,
|
|
49
|
+
help="Frame hop duration in seconds. Default: 0.04.",
|
|
50
|
+
)
|
|
51
|
+
return parser
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def main(argv: Optional[list[str]] = None) -> None:
    """CLI entry point: load target and posterior, align, print JSON to stdout.

    Exits via ``SystemExit`` when the target payload is not a JSON object or
    when a ``.npy`` posterior is given without ``--vocab-json``.
    """
    args = _build_parser().parse_args(argv)

    # The target contract comes straight from a phonemizer JSON dump.
    with open(args.target_json, "r", encoding="utf-8") as handle:
        target_payload = json.load(handle)
    target = parse_target(_require_mapping(target_payload, "target_json"))

    # A vocabulary is optional for sparse JSON but mandatory for .npy input.
    vocabulary = load_vocabulary_json(args.vocab_json) if args.vocab_json else None
    is_npy = args.posterior.endswith(".npy")
    if is_npy:
        if vocabulary is None:
            raise SystemExit("--vocab-json is required for .npy posterior input")
        frames = load_npy_frames(
            args.posterior,
            vocabulary=vocabulary,
            from_logits=args.from_logits,
        )
    else:
        frames = load_frames_json(
            args.posterior,
            vocabulary=vocabulary,
            from_logits=args.from_logits,
        )

    # Pick the decoding strategy, then convert symbol spans to word timings.
    if args.ctc:
        viterbi = align_ctc_frames(frames, target.symbols, blank=args.blank_symbol)
    else:
        viterbi = align_frames(frames, target.symbols)
    result = alignment_result(target, viterbi, hop_seconds=args.hop_seconds)
    print(json.dumps(result.to_dict(), ensure_ascii=False))
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _require_mapping(value: Any, label: str) -> Mapping[str, Any]:
|
|
86
|
+
if not isinstance(value, Mapping):
|
|
87
|
+
raise SystemExit(f"{label} must be a JSON object")
|
|
88
|
+
return value
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Target contract consumed from tilavet-phonemizer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ContractError(ValueError):
    """Signals that an alignment-target payload violates the expected contract."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class WordTarget:
    """Span of the cleaned target symbol sequence covered by one source token."""

    token: str   # source token text
    start: int   # inclusive index into the symbol sequence
    end: int     # exclusive index into the symbol sequence

    @classmethod
    def from_mapping(cls, data: Mapping[str, Any], symbol_count: int) -> "WordTarget":
        """Validate a raw mapping and build a span bounded by *symbol_count*."""
        token = _require_str(data, "token")
        start = _require_int(data, "start")
        end = _require_int(data, "end")
        # A valid span is non-empty and lies fully inside the symbol sequence.
        span_is_valid = 0 <= start and start < end and end <= symbol_count
        if not span_is_valid:
            raise ContractError(
                f"invalid word span for {token!r}: {start}..{end} "
                f"outside 0..{symbol_count}"
            )
        return cls(token=token, start=start, end=end)

    def to_dict(self) -> dict[str, Any]:
        """Serialize back to the contract's JSON shape."""
        return {"token": self.token, "start": self.start, "end": self.end}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class PauseMarker:
    """A pause insertion point relative to the cleaned target symbol sequence."""

    raw_index: int      # position in the raw (pre-clean) stream; must be >= 0
    target_index: int   # insertion index into the cleaned symbols, 0..len inclusive
    symbol: str = "PAUSE"

    @classmethod
    def from_mapping(cls, data: Mapping[str, Any], symbol_count: int) -> "PauseMarker":
        """Validate a raw mapping and build a marker bounded by *symbol_count*."""
        raw_index = _require_int(data, "raw_index")
        target_index = _require_int(data, "target_index")
        symbol = str(data.get("symbol", "PAUSE"))
        # target_index may equal symbol_count: a pause can follow the last symbol.
        if target_index < 0 or target_index > symbol_count:
            raise ContractError(
                f"pause target_index {target_index} outside 0..{symbol_count}"
            )
        if raw_index < 0:
            raise ContractError(f"pause raw_index must be non-negative: {raw_index}")
        return cls(raw_index=raw_index, target_index=target_index, symbol=symbol)

    def to_dict(self) -> dict[str, Any]:
        """Serialize back to the contract's JSON shape."""
        return {
            "raw_index": self.raw_index,
            "target_index": self.target_index,
            "symbol": self.symbol,
        }
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass(frozen=True)
class AlignmentTarget:
    """Clean phoneme target plus word spans and pause metadata."""

    symbols: tuple[str, ...]         # cleaned phoneme symbol sequence
    words: tuple[WordTarget, ...]    # token spans over `symbols`
    pauses: tuple[PauseMarker, ...]  # pause insertion points (metadata only)
    text: str                        # space-joined rendering of `symbols`

    def to_dict(self) -> dict[str, Any]:
        """Serialize back to the contract's JSON shape."""
        payload: dict[str, Any] = {
            "symbols": list(self.symbols),
            "text": self.text,
        }
        payload["words"] = [word.to_dict() for word in self.words]
        payload["pauses"] = [pause.to_dict() for pause in self.pauses]
        return payload
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def parse_target(data: Mapping[str, Any], allow_pause_symbol: bool = False) -> AlignmentTarget:
    """Validate and normalize a `PhonemizationResult.to_alignment_dict()` payload.

    Raises ``ContractError`` when symbols, text, words, or pauses do not
    match the expected contract shape.
    """
    symbols = _require_symbol_list(data.get("symbols"), allow_pause_symbol)
    symbol_count = len(symbols)

    # `text`, when present, must round-trip to the space-joined symbols.
    expected_text = " ".join(symbols)
    text = str(data.get("text", expected_text))
    if text != expected_text:
        raise ContractError(f"text mismatch: expected {expected_text!r}, got {text!r}")

    raw_words = data.get("words", [])
    if not _is_sequence(raw_words):
        raise ContractError("words must be a list")
    parsed_words = [
        WordTarget.from_mapping(_require_mapping(entry, "word"), symbol_count)
        for entry in raw_words
    ]

    raw_pauses = data.get("pauses", [])
    if not _is_sequence(raw_pauses):
        raise ContractError("pauses must be a list")
    parsed_pauses = [
        PauseMarker.from_mapping(_require_mapping(entry, "pause"), symbol_count)
        for entry in raw_pauses
    ]

    return AlignmentTarget(
        symbols=tuple(symbols),
        text=text,
        words=tuple(parsed_words),
        pauses=tuple(parsed_pauses),
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _require_symbol_list(value: Any, allow_pause_symbol: bool) -> list[str]:
    """Validate that *value* is a list of non-empty symbol strings.

    ``PAUSE`` is rejected as a target symbol unless *allow_pause_symbol* is
    set — pauses belong in the metadata, not the decodable sequence.
    """
    if not _is_sequence(value):
        raise ContractError("symbols must be a list of strings")

    validated: list[str] = []
    for index, entry in enumerate(value):
        if not isinstance(entry, str):
            raise ContractError(f"symbols[{index}] must be a string")
        if entry == "":
            raise ContractError(f"symbols[{index}] must not be empty")
        if entry == "PAUSE" and not allow_pause_symbol:
            raise ContractError("PAUSE must be metadata, not a target symbol")
        validated.append(entry)
    return validated
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _require_str(data: Mapping[str, Any], key: str) -> str:
|
|
135
|
+
value = data.get(key)
|
|
136
|
+
if not isinstance(value, str):
|
|
137
|
+
raise ContractError(f"{key} must be a string")
|
|
138
|
+
return value
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _require_int(data: Mapping[str, Any], key: str) -> int:
|
|
142
|
+
value = data.get(key)
|
|
143
|
+
if not isinstance(value, int) or isinstance(value, bool):
|
|
144
|
+
raise ContractError(f"{key} must be an integer")
|
|
145
|
+
return value
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _require_mapping(value: Any, label: str) -> Mapping[str, Any]:
|
|
149
|
+
if not isinstance(value, Mapping):
|
|
150
|
+
raise ContractError(f"{label} must be an object")
|
|
151
|
+
return value
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _is_sequence(value: Any) -> bool:
|
|
155
|
+
return isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray))
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Small CTC vocabulary helpers for the MVP aligner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_vocabulary(symbols: Sequence[str], blank: str = "<blank>") -> list[str]:
    """Return `[blank] + unique symbols`, preserving first-occurrence order.

    Raises ``ValueError`` for an empty blank or any empty symbol.
    """
    if not blank:
        raise ValueError("blank symbol must be non-empty")

    ordered = [blank]
    known = {blank}
    for candidate in symbols:
        if not candidate:
            raise ValueError("symbols must be non-empty")
        if candidate in known:
            continue
        known.add(candidate)
        ordered.append(candidate)
    return ordered
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def symbol_to_index(vocabulary: Sequence[str]) -> dict[str, int]:
    """Map each vocabulary symbol to its position, rejecting duplicates."""
    table: dict[str, int] = {}
    for position, entry in enumerate(vocabulary):
        if entry in table:
            raise ValueError(f"duplicate vocabulary symbol: {entry!r}")
        table[entry] = position
    return table
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def normalize_frames(
    frames: Sequence[Mapping[str, float]],
    vocabulary: Sequence[str],
    missing_log_prob: float = -1.0e9,
) -> list[dict[str, float]]:
    """Project sparse frame log-prob dictionaries onto a fixed vocabulary.

    Symbols absent from a frame receive *missing_log_prob*; all values are
    coerced to ``float``.
    """
    return [
        {symbol: float(frame.get(symbol, missing_log_prob)) for symbol in vocabulary}
        for frame in frames
    ]
|