tilavet-aligner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tarık İsmet ALKAN — Tilavet
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: tilavet-aligner
3
+ Version: 0.1.0
4
+ Summary: Monotonic / CTC forced-alignment helpers for the Tilavet Quran teleprompter pipeline.
5
+ Author-email: Tilavet <web3alkan@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/tialkan/tilavet-aligner
8
+ Project-URL: Documentation, https://github.com/tialkan/tilavet-aligner#readme
9
+ Project-URL: Repository, https://github.com/tialkan/tilavet-aligner
10
+ Project-URL: Issues, https://github.com/tialkan/tilavet-aligner/issues
11
+ Project-URL: Companion, https://github.com/tialkan/tilavet-phonemizer
12
+ Keywords: quran,alignment,forced-alignment,ctc,viterbi,phoneme,tajwid,arabic,speech-recognition
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
24
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Text Processing :: Linguistic
26
+ Classifier: Natural Language :: Arabic
27
+ Requires-Python: >=3.9
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Provides-Extra: posterior
31
+ Requires-Dist: numpy>=1.24; extra == "posterior"
32
+ Provides-Extra: phonemizer
33
+ Requires-Dist: tilavet-phonemizer>=0.2.0; extra == "phonemizer"
34
+ Provides-Extra: dev
35
+ Requires-Dist: numpy>=1.24; extra == "dev"
36
+ Requires-Dist: pytest>=7.0; extra == "dev"
37
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
38
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
39
+ Requires-Dist: tilavet-phonemizer>=0.2.0; extra == "dev"
40
+ Dynamic: license-file
41
+
42
+ # Tilavet Aligner
43
+
44
+ Small MVP package for monotonic Quran phoneme alignment.
45
+
46
+ It consumes `tilavet-phonemizer` alignment targets:
47
+
48
+ ```python
49
+ target = Phonemizer().phonemize("رَيْبَ ۛ فِيهِ").to_alignment_dict()
50
+ ```
51
+
52
+ and aligns frame-level phoneme log probabilities to that known target sequence.
53
+ This package does not perform free ASR decoding.
54
+
55
+ ## MVP Modules
56
+
57
+ - `contract.py`: validates the phonemizer alignment target JSON.
58
+ - `posterior.py`: adapts sparse JSON, dense JSON, dense logits, and optional
59
+ `.npy` matrices to frame log-prob dictionaries.
60
+ - `viterbi.py`: aligns frame log-probabilities to target phoneme symbols,
61
+ including optional blank-aware CTC mode.
62
+ - `word_spans.py`: converts symbol frame spans to word-level timestamps.
63
+ - `ctc.py`: vocabulary and frame normalization helpers.
64
+
65
+ ## Posterior Inputs
66
+
67
+ Sparse frame dictionaries:
68
+
69
+ ```json
70
+ [
71
+ {"a": -0.1, "b": -5.0},
72
+ {"a": -4.0, "b": -0.1}
73
+ ]
74
+ ```
75
+
76
+ Dense matrix JSON or `.npy` input requires a vocabulary file whose order matches
77
+ the posterior columns:
78
+
79
+ ```json
80
+ ["<blank>", "a", "b"]
81
+ ```
82
+
83
+ CLI examples:
84
+
85
+ ```bash
86
+ tilavet-align target.json sparse-frames.json
87
+ tilavet-align target.json dense-logprobs.json --vocab-json vocab.json
88
+ tilavet-align target.json dense-logits.npy --vocab-json vocab.json --from-logits
89
+ tilavet-align target.json ctc-logprobs.json --ctc --blank-symbol "<blank>"
90
+ ```
91
+
92
+ `--ctc` uses expanded states `[blank, s0, blank, s1, ...]`; blank frames do
93
+ not expand word-level symbol spans.
94
+
95
+ ## Output Confidence
96
+
97
+ Alignment output keeps the raw Viterbi `score` and adds normalized confidence
98
+ fields:
99
+
100
+ ```json
101
+ {
102
+ "score": -0.6,
103
+ "frame_count": 5,
104
+ "mean_log_prob": -0.12,
105
+ "confidence": 0.8869,
106
+ "words": [
107
+ {
108
+ "token": "ab",
109
+ "score": -0.2,
110
+ "scored_frames": 2,
111
+ "mean_log_prob": -0.1,
112
+ "confidence": 0.9048
113
+ }
114
+ ]
115
+ }
116
+ ```
117
+
118
+ In CTC mode, `frame_count` may include blank frames inside the timestamp span;
119
+ `scored_frames` counts only frames assigned to the word's target symbols.
120
+
121
+ ## Synthetic End-to-End Demo
122
+
123
+ With the sibling phonemizer source tree present:
124
+
125
+ ```bash
126
+ PYTHONPATH=src python3 examples/synthetic_alignment.py \
127
+ --phonemizer-src "../Tilavet Phonemizer/src"
128
+ ```
129
+
130
+ This runs:
131
+
132
+ ```text
133
+ Arabic text -> phonemizer.to_alignment_dict() -> synthetic CTC posterior -> word timestamps
134
+ ```
135
+
136
+ It is not an acoustic-model test; it is a package-boundary smoke test.
137
+
138
+ ## Development
139
+
140
+ ```bash
141
+ python3 -m pytest -q
142
+ python3 -m ruff check .
143
+ ```
@@ -0,0 +1,102 @@
1
+ # Tilavet Aligner
2
+
3
+ Small MVP package for monotonic Quran phoneme alignment.
4
+
5
+ It consumes `tilavet-phonemizer` alignment targets:
6
+
7
+ ```python
8
+ target = Phonemizer().phonemize("رَيْبَ ۛ فِيهِ").to_alignment_dict()
9
+ ```
10
+
11
+ and aligns frame-level phoneme log probabilities to that known target sequence.
12
+ This package does not perform free ASR decoding.
13
+
14
+ ## MVP Modules
15
+
16
+ - `contract.py`: validates the phonemizer alignment target JSON.
17
+ - `posterior.py`: adapts sparse JSON, dense JSON, dense logits, and optional
18
+ `.npy` matrices to frame log-prob dictionaries.
19
+ - `viterbi.py`: aligns frame log-probabilities to target phoneme symbols,
20
+ including optional blank-aware CTC mode.
21
+ - `word_spans.py`: converts symbol frame spans to word-level timestamps.
22
+ - `ctc.py`: vocabulary and frame normalization helpers.
23
+
24
+ ## Posterior Inputs
25
+
26
+ Sparse frame dictionaries:
27
+
28
+ ```json
29
+ [
30
+ {"a": -0.1, "b": -5.0},
31
+ {"a": -4.0, "b": -0.1}
32
+ ]
33
+ ```
34
+
35
+ Dense matrix JSON or `.npy` input requires a vocabulary file whose order matches
36
+ the posterior columns:
37
+
38
+ ```json
39
+ ["<blank>", "a", "b"]
40
+ ```
41
+
42
+ CLI examples:
43
+
44
+ ```bash
45
+ tilavet-align target.json sparse-frames.json
46
+ tilavet-align target.json dense-logprobs.json --vocab-json vocab.json
47
+ tilavet-align target.json dense-logits.npy --vocab-json vocab.json --from-logits
48
+ tilavet-align target.json ctc-logprobs.json --ctc --blank-symbol "<blank>"
49
+ ```
50
+
51
+ `--ctc` uses expanded states `[blank, s0, blank, s1, ...]`; blank frames do
52
+ not expand word-level symbol spans.
53
+
54
+ ## Output Confidence
55
+
56
+ Alignment output keeps the raw Viterbi `score` and adds normalized confidence
57
+ fields:
58
+
59
+ ```json
60
+ {
61
+ "score": -0.6,
62
+ "frame_count": 5,
63
+ "mean_log_prob": -0.12,
64
+ "confidence": 0.8869,
65
+ "words": [
66
+ {
67
+ "token": "ab",
68
+ "score": -0.2,
69
+ "scored_frames": 2,
70
+ "mean_log_prob": -0.1,
71
+ "confidence": 0.9048
72
+ }
73
+ ]
74
+ }
75
+ ```
76
+
77
+ In CTC mode, `frame_count` may include blank frames inside the timestamp span;
78
+ `scored_frames` counts only frames assigned to the word's target symbols.
79
+
80
+ ## Synthetic End-to-End Demo
81
+
82
+ With the sibling phonemizer source tree present:
83
+
84
+ ```bash
85
+ PYTHONPATH=src python3 examples/synthetic_alignment.py \
86
+ --phonemizer-src "../Tilavet Phonemizer/src"
87
+ ```
88
+
89
+ This runs:
90
+
91
+ ```text
92
+ Arabic text -> phonemizer.to_alignment_dict() -> synthetic CTC posterior -> word timestamps
93
+ ```
94
+
95
+ It is not an acoustic-model test; it is a package-boundary smoke test.
96
+
97
+ ## Development
98
+
99
+ ```bash
100
+ python3 -m pytest -q
101
+ python3 -m ruff check .
102
+ ```
@@ -0,0 +1,75 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tilavet-aligner"
7
+ version = "0.1.0"
8
+ description = "Monotonic / CTC forced-alignment helpers for the Tilavet Quran teleprompter pipeline."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ authors = [{ name = "Tilavet", email = "web3alkan@gmail.com" }]
12
+ license = { text = "MIT" }
13
+ keywords = [
14
+ "quran",
15
+ "alignment",
16
+ "forced-alignment",
17
+ "ctc",
18
+ "viterbi",
19
+ "phoneme",
20
+ "tajwid",
21
+ "arabic",
22
+ "speech-recognition",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 3 - Alpha",
26
+ "Intended Audience :: Developers",
27
+ "Intended Audience :: Science/Research",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Operating System :: OS Independent",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.9",
32
+ "Programming Language :: Python :: 3.10",
33
+ "Programming Language :: Python :: 3.11",
34
+ "Programming Language :: Python :: 3.12",
35
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
36
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
37
+ "Topic :: Text Processing :: Linguistic",
38
+ "Natural Language :: Arabic",
39
+ ]
40
+
41
+ dependencies = []
42
+
43
+ [project.urls]
44
+ Homepage = "https://github.com/tialkan/tilavet-aligner"
45
+ Documentation = "https://github.com/tialkan/tilavet-aligner#readme"
46
+ Repository = "https://github.com/tialkan/tilavet-aligner"
47
+ Issues = "https://github.com/tialkan/tilavet-aligner/issues"
48
+ Companion = "https://github.com/tialkan/tilavet-phonemizer"
49
+
50
+ [project.scripts]
51
+ tilavet-align = "tilavet_aligner.cli:main"
52
+
53
+ [project.optional-dependencies]
54
+ posterior = [
55
+ "numpy>=1.24",
56
+ ]
57
+ phonemizer = [
58
+ "tilavet-phonemizer>=0.2.0",
59
+ ]
60
+ dev = [
61
+ "numpy>=1.24",
62
+ "pytest>=7.0",
63
+ "pytest-cov>=4.0",
64
+ "ruff>=0.1.0",
65
+ "tilavet-phonemizer>=0.2.0",
66
+ ]
67
+
68
+ [tool.setuptools.packages.find]
69
+ where = ["src"]
70
+
71
+ [tool.ruff]
72
+ line-length = 100
73
+
74
+ [tool.ruff.lint]
75
+ select = ["E", "F", "I", "N", "W"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,43 @@
1
+ """Tilavet Quran phoneme alignment helpers."""
2
+
3
+ from .contract import AlignmentTarget, ContractError, PauseMarker, WordTarget, parse_target
4
+ from .ctc import build_vocabulary, normalize_frames, symbol_to_index
5
+ from .posterior import (
6
+ PosteriorError,
7
+ dense_to_frames,
8
+ frames_from_payload,
9
+ load_frames_json,
10
+ load_npy_frames,
11
+ load_vocabulary_json,
12
+ )
13
+ from .viterbi import SymbolFrame, ViterbiResult, align_ctc_frames, align_frames
14
+ from .word_spans import AlignmentResult, PauseHint, WordAlignment, pause_hints, word_alignments
15
+
16
+ __version__ = "0.1.0"
17
+
18
+ __all__ = [
19
+ "AlignmentResult",
20
+ "AlignmentTarget",
21
+ "ContractError",
22
+ "PauseHint",
23
+ "PauseMarker",
24
+ "PosteriorError",
25
+ "SymbolFrame",
26
+ "ViterbiResult",
27
+ "WordAlignment",
28
+ "WordTarget",
29
+ "__version__",
30
+ "align_ctc_frames",
31
+ "align_frames",
32
+ "build_vocabulary",
33
+ "dense_to_frames",
34
+ "frames_from_payload",
35
+ "load_frames_json",
36
+ "load_npy_frames",
37
+ "load_vocabulary_json",
38
+ "normalize_frames",
39
+ "parse_target",
40
+ "pause_hints",
41
+ "symbol_to_index",
42
+ "word_alignments",
43
+ ]
@@ -0,0 +1,92 @@
1
+ """CLI for aligning a target JSON file to frame log-probabilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from collections.abc import Mapping
8
+ from typing import Any, Optional
9
+
10
+ from .contract import parse_target
11
+ from .posterior import load_frames_json, load_npy_frames, load_vocabulary_json
12
+ from .viterbi import align_ctc_frames, align_frames
13
+ from .word_spans import alignment_result
14
+
15
+
16
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``tilavet-align`` argument parser.

    Positional arguments name the target payload and the posterior input;
    the remaining flags tune how the posterior is decoded and aligned.
    """
    p = argparse.ArgumentParser(
        prog="tilavet-align",
        description="Align frame log-probabilities to a known Tilavet phoneme target.",
    )
    # Required inputs.
    p.add_argument("target_json", help="Path to a to_alignment_dict() JSON payload.")
    p.add_argument(
        "posterior",
        help="Path to sparse frame JSON, dense matrix JSON, or dense .npy posterior.",
    )
    # Posterior decoding options.
    p.add_argument(
        "--vocab-json",
        help="Vocabulary JSON for dense matrix / .npy input.",
    )
    p.add_argument(
        "--from-logits",
        action="store_true",
        help="Treat dense matrix rows as logits and apply log-softmax.",
    )
    # Alignment options.
    p.add_argument(
        "--ctc",
        action="store_true",
        help="Use blank-aware CTC alignment instead of simple monotonic Viterbi.",
    )
    p.add_argument(
        "--blank-symbol",
        default="<blank>",
        help="CTC blank symbol name. Default: <blank>.",
    )
    p.add_argument(
        "--hop-seconds",
        type=float,
        default=0.04,
        help="Frame hop duration in seconds. Default: 0.04.",
    )
    return p
52
+
53
+
54
def main(argv: Optional[list[str]] = None) -> None:
    """CLI entry point: load target + posterior, align, print JSON result.

    Raises SystemExit when the target payload is not a JSON object or when
    a dense `.npy` posterior is given without ``--vocab-json``.
    """
    args = _build_parser().parse_args(argv)

    # Load and validate the phonemizer alignment target.
    with open(args.target_json, "r", encoding="utf-8") as handle:
        raw_target = json.load(handle)
    target = parse_target(_require_mapping(raw_target, "target_json"))

    vocabulary = None
    if args.vocab_json:
        vocabulary = load_vocabulary_json(args.vocab_json)

    # `.npy` input always needs an explicit vocabulary; JSON input may be
    # sparse (no vocabulary) or dense (vocabulary required by the loader).
    if args.posterior.endswith(".npy"):
        if vocabulary is None:
            raise SystemExit("--vocab-json is required for .npy posterior input")
        loader = load_npy_frames
    else:
        loader = load_frames_json
    frames = loader(
        args.posterior,
        vocabulary=vocabulary,
        from_logits=args.from_logits,
    )

    if args.ctc:
        viterbi = align_ctc_frames(frames, target.symbols, blank=args.blank_symbol)
    else:
        viterbi = align_frames(frames, target.symbols)

    result = alignment_result(target, viterbi, hop_seconds=args.hop_seconds)
    print(json.dumps(result.to_dict(), ensure_ascii=False))
83
+
84
+
85
def _require_mapping(value: Any, label: str) -> Mapping[str, Any]:
    """Return *value* when it is a mapping; exit with an error otherwise."""
    if isinstance(value, Mapping):
        return value
    raise SystemExit(f"{label} must be a JSON object")
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()
@@ -0,0 +1,155 @@
1
+ """Target contract consumed from tilavet-phonemizer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Mapping, Sequence
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+
9
+
10
class ContractError(ValueError):
    """Raised when an alignment target does not match the expected contract.

    Subclasses ``ValueError`` so callers may catch either the specific or
    the generic exception type.
    """
12
+
13
+
14
@dataclass(frozen=True)
class WordTarget:
    """A source token span over the cleaned target symbol sequence.

    ``start``/``end`` form a half-open index range into the symbol list.
    """

    token: str
    start: int
    end: int

    @classmethod
    def from_mapping(cls, data: Mapping[str, Any], symbol_count: int) -> "WordTarget":
        """Validate and build a WordTarget from a raw JSON object."""
        word_token = _require_str(data, "token")
        span_start = _require_int(data, "start")
        span_end = _require_int(data, "end")
        # The span must be non-empty and contained in 0..symbol_count.
        if span_start < 0 or span_end <= span_start or span_end > symbol_count:
            raise ContractError(
                f"invalid word span for {word_token!r}: {span_start}..{span_end} "
                f"outside 0..{symbol_count}"
            )
        return cls(token=word_token, start=span_start, end=span_end)

    def to_dict(self) -> dict[str, Any]:
        """Serialize back to the JSON contract shape."""
        return dict(token=self.token, start=self.start, end=self.end)
36
+
37
+
38
@dataclass(frozen=True)
class PauseMarker:
    """A pause insertion point over the cleaned target symbol sequence."""

    raw_index: int
    target_index: int
    symbol: str = "PAUSE"

    @classmethod
    def from_mapping(cls, data: Mapping[str, Any], symbol_count: int) -> "PauseMarker":
        """Validate and build a PauseMarker from a raw JSON object."""
        raw_idx = _require_int(data, "raw_index")
        tgt_idx = _require_int(data, "target_index")
        marker_symbol = str(data.get("symbol", "PAUSE"))
        # target_index may equal symbol_count (pause after the last symbol).
        if tgt_idx < 0 or tgt_idx > symbol_count:
            raise ContractError(
                f"pause target_index {tgt_idx} outside 0..{symbol_count}"
            )
        if raw_idx < 0:
            raise ContractError(f"pause raw_index must be non-negative: {raw_idx}")
        return cls(raw_index=raw_idx, target_index=tgt_idx, symbol=marker_symbol)

    def to_dict(self) -> dict[str, Any]:
        """Serialize back to the JSON contract shape."""
        return dict(
            raw_index=self.raw_index,
            target_index=self.target_index,
            symbol=self.symbol,
        )
65
+
66
+
67
@dataclass(frozen=True)
class AlignmentTarget:
    """Clean phoneme target plus word spans and pause metadata."""

    symbols: tuple[str, ...]
    words: tuple[WordTarget, ...]
    pauses: tuple[PauseMarker, ...]
    text: str

    def to_dict(self) -> dict[str, Any]:
        """Serialize back to the JSON contract shape."""
        payload: dict[str, Any] = {
            "symbols": list(self.symbols),
            "text": self.text,
        }
        payload["words"] = [entry.to_dict() for entry in self.words]
        payload["pauses"] = [entry.to_dict() for entry in self.pauses]
        return payload
83
+
84
+
85
def parse_target(data: Mapping[str, Any], allow_pause_symbol: bool = False) -> AlignmentTarget:
    """Validate and normalize a `PhonemizationResult.to_alignment_dict()` payload.

    Raises ContractError when symbols, text, word spans, or pause markers
    violate the contract.
    """
    symbols = _require_symbol_list(data.get("symbols"), allow_pause_symbol)

    # `text` must round-trip exactly as the space-joined symbol sequence.
    expected_text = " ".join(symbols)
    text = str(data.get("text", expected_text))
    if text != expected_text:
        raise ContractError(f"text mismatch: expected {expected_text!r}, got {text!r}")

    raw_words = data.get("words", [])
    if not _is_sequence(raw_words):
        raise ContractError("words must be a list")
    parsed_words: list[WordTarget] = []
    for entry in raw_words:
        parsed_words.append(
            WordTarget.from_mapping(_require_mapping(entry, "word"), len(symbols))
        )

    raw_pauses = data.get("pauses", [])
    if not _is_sequence(raw_pauses):
        raise ContractError("pauses must be a list")
    parsed_pauses: list[PauseMarker] = []
    for entry in raw_pauses:
        parsed_pauses.append(
            PauseMarker.from_mapping(_require_mapping(entry, "pause"), len(symbols))
        )

    return AlignmentTarget(
        symbols=tuple(symbols),
        text=text,
        words=tuple(parsed_words),
        pauses=tuple(parsed_pauses),
    )
116
+
117
+
118
def _require_symbol_list(value: Any, allow_pause_symbol: bool) -> list[str]:
    """Validate that *value* is a list of non-empty symbol strings.

    "PAUSE" is rejected unless *allow_pause_symbol* is set, because pauses
    travel as metadata rather than as target symbols.
    """
    if not _is_sequence(value):
        raise ContractError("symbols must be a list of strings")

    result: list[str] = []
    for position, entry in enumerate(value):
        if not isinstance(entry, str):
            raise ContractError(f"symbols[{position}] must be a string")
        if entry == "":
            raise ContractError(f"symbols[{position}] must not be empty")
        if entry == "PAUSE" and not allow_pause_symbol:
            raise ContractError("PAUSE must be metadata, not a target symbol")
        result.append(entry)
    return result
132
+
133
+
134
def _require_str(data: Mapping[str, Any], key: str) -> str:
    """Return ``data[key]`` when it is a string; raise ContractError otherwise."""
    value = data.get(key)
    if not isinstance(value, str):
        raise ContractError(f"{key} must be a string")
    return value
139
+
140
+
141
def _require_int(data: Mapping[str, Any], key: str) -> int:
    """Return ``data[key]`` when it is an int; raise ContractError otherwise."""
    value = data.get(key)
    # bool is a subclass of int in Python, so it must be rejected explicitly.
    if not isinstance(value, int) or isinstance(value, bool):
        raise ContractError(f"{key} must be an integer")
    return value
146
+
147
+
148
def _require_mapping(value: Any, label: str) -> Mapping[str, Any]:
    """Return *value* when it is a mapping; raise ContractError naming *label*."""
    if not isinstance(value, Mapping):
        raise ContractError(f"{label} must be an object")
    return value
152
+
153
+
154
def _is_sequence(value: Any) -> bool:
    """Return True for list-like sequences, excluding text and byte types."""
    if isinstance(value, (str, bytes, bytearray)):
        return False
    return isinstance(value, Sequence)
@@ -0,0 +1,51 @@
1
+ """Small CTC vocabulary helpers for the MVP aligner."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Mapping, Sequence
6
+
7
+
8
def build_vocabulary(symbols: Sequence[str], blank: str = "<blank>") -> list[str]:
    """Return `[blank] + unique symbols` while preserving first occurrence order.

    Raises ValueError when the blank symbol or any target symbol is empty.
    """
    if not blank:
        raise ValueError("blank symbol must be non-empty")

    # dict preserves insertion order, giving first-occurrence dedup for free.
    ordered: dict[str, None] = {blank: None}
    for entry in symbols:
        if not entry:
            raise ValueError("symbols must be non-empty")
        ordered.setdefault(entry, None)
    return list(ordered)
23
+
24
+
25
def symbol_to_index(vocabulary: Sequence[str]) -> dict[str, int]:
    """Build a stable symbol -> index map and reject duplicates.

    Raises ValueError when the vocabulary contains a repeated symbol.
    """
    table: dict[str, int] = {}
    for entry in vocabulary:
        if entry in table:
            raise ValueError(f"duplicate vocabulary symbol: {entry!r}")
        # Each new entry gets the next free index, matching enumerate order.
        table[entry] = len(table)
    return table
34
+
35
+
36
def normalize_frames(
    frames: Sequence[Mapping[str, float]],
    vocabulary: Sequence[str],
    missing_log_prob: float = -1.0e9,
) -> list[dict[str, float]]:
    """Project sparse frame log-prob dictionaries onto a fixed vocabulary.

    Symbols absent from a frame receive *missing_log_prob*; every output
    row contains exactly the vocabulary's symbols, in vocabulary order.
    """

    def project(frame: Mapping[str, float]) -> dict[str, float]:
        # One dense row per frame, keyed in vocabulary order.
        row: dict[str, float] = {}
        for symbol in vocabulary:
            row[symbol] = float(frame.get(symbol, missing_log_prob))
        return row

    return [project(frame) for frame in frames]