kala-tts 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. kala_tts-0.1.0/LICENSE +21 -0
  2. kala_tts-0.1.0/PKG-INFO +112 -0
  3. kala_tts-0.1.0/README.md +81 -0
  4. kala_tts-0.1.0/kala_tts/__init__.py +17 -0
  5. kala_tts-0.1.0/kala_tts/__main__.py +74 -0
  6. kala_tts-0.1.0/kala_tts/_api.py +63 -0
  7. kala_tts-0.1.0/kala_tts/_download.py +29 -0
  8. kala_tts-0.1.0/kala_tts/_infer.py +175 -0
  9. kala_tts-0.1.0/kala_tts.egg-info/PKG-INFO +112 -0
  10. kala_tts-0.1.0/kala_tts.egg-info/SOURCES.txt +44 -0
  11. kala_tts-0.1.0/kala_tts.egg-info/dependency_links.txt +1 -0
  12. kala_tts-0.1.0/kala_tts.egg-info/entry_points.txt +2 -0
  13. kala_tts-0.1.0/kala_tts.egg-info/requires.txt +6 -0
  14. kala_tts-0.1.0/kala_tts.egg-info/top_level.txt +3 -0
  15. kala_tts-0.1.0/nepali_frontend/__init__.py +7 -0
  16. kala_tts-0.1.0/nepali_frontend/_data/candidates_lexicon.tsv +48586 -0
  17. kala_tts-0.1.0/nepali_frontend/_data/gold_lexicon.tsv +5 -0
  18. kala_tts-0.1.0/nepali_frontend/_data/ipa_map.tsv +64 -0
  19. kala_tts-0.1.0/nepali_frontend/_data/loanwords_latin.tsv +5594 -0
  20. kala_tts-0.1.0/nepali_frontend/_data/phones.tsv +64 -0
  21. kala_tts-0.1.0/nepali_frontend/data.py +89 -0
  22. kala_tts-0.1.0/nepali_frontend/g2p/__init__.py +1 -0
  23. kala_tts-0.1.0/nepali_frontend/g2p/akshara.py +264 -0
  24. kala_tts-0.1.0/nepali_frontend/g2p/base_map.py +174 -0
  25. kala_tts-0.1.0/nepali_frontend/g2p/phonemizer.py +153 -0
  26. kala_tts-0.1.0/nepali_frontend/g2p/post_rules.py +471 -0
  27. kala_tts-0.1.0/nepali_frontend/g2p/reverse.py +199 -0
  28. kala_tts-0.1.0/nepali_frontend/g2p/reverse_model.py +287 -0
  29. kala_tts-0.1.0/nepali_frontend/normalize/__init__.py +33 -0
  30. kala_tts-0.1.0/nepali_frontend/normalize/numbers.py +250 -0
  31. kala_tts-0.1.0/nepali_frontend/normalize/phones.py +107 -0
  32. kala_tts-0.1.0/nepali_frontend/normalize/text.py +43 -0
  33. kala_tts-0.1.0/nepali_frontend/prosody/__init__.py +5 -0
  34. kala_tts-0.1.0/nepali_frontend/prosody/chunker.py +115 -0
  35. kala_tts-0.1.0/nepali_frontend/tokenize/__init__.py +7 -0
  36. kala_tts-0.1.0/nepali_frontend/tokenize/script.py +95 -0
  37. kala_tts-0.1.0/nepali_frontend/trace.py +143 -0
  38. kala_tts-0.1.0/nepali_frontend/transliterate.py +218 -0
  39. kala_tts-0.1.0/pyproject.toml +54 -0
  40. kala_tts-0.1.0/real_nepali/__init__.py +8 -0
  41. kala_tts-0.1.0/real_nepali/g2p.py +289 -0
  42. kala_tts-0.1.0/real_nepali/kokoro.py +262 -0
  43. kala_tts-0.1.0/real_nepali/manifest.py +79 -0
  44. kala_tts-0.1.0/real_nepali/profiles.py +63 -0
  45. kala_tts-0.1.0/real_nepali/prosody.py +161 -0
  46. kala_tts-0.1.0/setup.cfg +4 -0
kala_tts-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ampixa Labs
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,112 @@
1
+ Metadata-Version: 2.4
2
+ Name: kala-tts
3
+ Version: 0.1.0
4
+ Summary: CPU-native Nepali TTS — real_nepali G2P + VITS ONNX inference
5
+ Author-email: Ampixa <hello@ampixa.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://tts.ampixa.com/kala/
8
+ Project-URL: Repository, https://github.com/voidash/kala-tts
9
+ Project-URL: Hugging Face, https://huggingface.co/ampixa/real-nepali-v0.2-kala
10
+ Project-URL: Bug Tracker, https://github.com/voidash/kala-tts/issues
11
+ Keywords: nepali,tts,text-to-speech,speech-synthesis,g2p,onnx
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: numpy>=1.24
26
+ Requires-Dist: onnxruntime>=1.17
27
+ Requires-Dist: huggingface_hub>=0.23
28
+ Provides-Extra: test
29
+ Requires-Dist: pytest>=7; extra == "test"
30
+ Dynamic: license-file
31
+
32
+ # kala-tts
33
+
34
+ CPU-native Nepali text-to-speech. The first open-source Nepali TTS built without eSpeak.
35
+
36
+ ```bash
37
+ pip install kala-tts
38
+ ```
39
+
40
+ ```python
41
+ import kala_tts
42
+
43
+ kala_tts.synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
44
+ ```
45
+
46
+ ## What makes it different
47
+
48
+ eSpeak-ng maps Nepali affricates to alveolar labels (`ts`, `tsh`) that do not match
49
+ how Kathmandu speakers actually produce **च** and **छ**. `kala-tts` uses the
50
+ `real_nepali` G2P — built from Khatiwada (2009) and a 48 000-entry curated lexicon —
51
+ which outputs palatal `ch`/`chh` as the acoustic target.
52
+
53
+ | Feature | eSpeak `ne` | kala-tts |
54
+ |---|---|---|
55
+ | च / छ | ts / tsh (alveolar) | ch / chh (palatal) |
56
+ | Gemination | often lost | explicit ː tokens |
57
+ | Schwa deletion | heuristic | rule-based, audited |
58
+ | Latin code-switch | undefined | letter-by-letter + lexicon |
59
+ | Lexicon | none | 48 000 entries |
60
+
61
+ ## API
62
+
63
+ ```python
64
+ import kala_tts
65
+
66
+ # Returns WAV bytes (16-bit PCM mono, 22050 Hz)
67
+ wav: bytes = kala_tts.synthesize("नेपाल सुन्दर देश हो।", speaker="kala")
68
+
69
+ # Write directly to file
70
+ kala_tts.synthesize_to_file("नमस्कार", "output.wav", speaker="barsha")
71
+
72
+ # Speed control
73
+ wav = kala_tts.synthesize("राम्रो दिन!", speed=0.85) # slower
74
+
75
+ # List speakers
76
+ print(kala_tts.list_speakers())
77
+ # ('kala', 'barsha', 'slr143_F', 'slr43_0546', 'slr43_2099')
78
+ ```
79
+
80
+ ## CLI
81
+
82
+ ```bash
83
+ kala-tts "नमस्कार, कसरी हुनुहुन्छ?" --speaker kala -o out.wav
84
+ kala-tts --list-speakers
85
+ echo "नेपाल सुन्दर देश हो।" | kala-tts -o out.wav
86
+ ```
87
+
88
+ ## Speakers
89
+
90
+ | Name | Data | Notes |
91
+ |---|---|---|
92
+ | `kala` | human studio | recommended |
93
+ | `barsha` | human recording | most data |
94
+ | `slr143_F` | OpenSLR-143 | neutral prosody |
95
+ | `slr43_0546` | OpenSLR-43 | read speech |
96
+ | `slr43_2099` | OpenSLR-43 | read speech |
97
+
98
+ ## Model
99
+
100
+ The first call auto-downloads the ONNX model (~60 MB) from
101
+ [ampixa/real-nepali-v0.2-kala](https://huggingface.co/ampixa/real-nepali-v0.2-kala)
102
+ via `huggingface_hub` and caches it locally. No internet needed after first run.
103
+ No GPU required.
104
+
105
+ - Architecture: VITS (ONNX FP32)
106
+ - Sample rate: 22050 Hz
107
+ - CPU RTF: ~0.020 (50× real-time on a laptop)
108
+
109
+ ## License
110
+
111
+ MIT (code) — CC-BY-SA 4.0 (model weights and training data).
112
+ See [LICENSES.md](LICENSES.md) for full attribution.
@@ -0,0 +1,81 @@
1
+ # kala-tts
2
+
3
+ CPU-native Nepali text-to-speech. The first open-source Nepali TTS built without eSpeak.
4
+
5
+ ```bash
6
+ pip install kala-tts
7
+ ```
8
+
9
+ ```python
10
+ import kala_tts
11
+
12
+ kala_tts.synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
13
+ ```
14
+
15
+ ## What makes it different
16
+
17
+ eSpeak-ng maps Nepali affricates to alveolar labels (`ts`, `tsh`) that do not match
18
+ how Kathmandu speakers actually produce **च** and **छ**. `kala-tts` uses the
19
+ `real_nepali` G2P — built from Khatiwada (2009) and a 48 000-entry curated lexicon —
20
+ which outputs palatal `ch`/`chh` as the acoustic target.
21
+
22
+ | Feature | eSpeak `ne` | kala-tts |
23
+ |---|---|---|
24
+ | च / छ | ts / tsh (alveolar) | ch / chh (palatal) |
25
+ | Gemination | often lost | explicit ː tokens |
26
+ | Schwa deletion | heuristic | rule-based, audited |
27
+ | Latin code-switch | undefined | letter-by-letter + lexicon |
28
+ | Lexicon | none | 48 000 entries |
29
+
30
+ ## API
31
+
32
+ ```python
33
+ import kala_tts
34
+
35
+ # Returns WAV bytes (16-bit PCM mono, 22050 Hz)
36
+ wav: bytes = kala_tts.synthesize("नेपाल सुन्दर देश हो।", speaker="kala")
37
+
38
+ # Write directly to file
39
+ kala_tts.synthesize_to_file("नमस्कार", "output.wav", speaker="barsha")
40
+
41
+ # Speed control
42
+ wav = kala_tts.synthesize("राम्रो दिन!", speed=0.85) # slower
43
+
44
+ # List speakers
45
+ print(kala_tts.list_speakers())
46
+ # ('kala', 'barsha', 'slr143_F', 'slr43_0546', 'slr43_2099')
47
+ ```
48
+
49
+ ## CLI
50
+
51
+ ```bash
52
+ kala-tts "नमस्कार, कसरी हुनुहुन्छ?" --speaker kala -o out.wav
53
+ kala-tts --list-speakers
54
+ echo "नेपाल सुन्दर देश हो।" | kala-tts -o out.wav
55
+ ```
56
+
57
+ ## Speakers
58
+
59
+ | Name | Data | Notes |
60
+ |---|---|---|
61
+ | `kala` | human studio | recommended |
62
+ | `barsha` | human recording | most data |
63
+ | `slr143_F` | OpenSLR-143 | neutral prosody |
64
+ | `slr43_0546` | OpenSLR-43 | read speech |
65
+ | `slr43_2099` | OpenSLR-43 | read speech |
66
+
67
+ ## Model
68
+
69
+ The first call auto-downloads the ONNX model (~60 MB) from
70
+ [ampixa/real-nepali-v0.2-kala](https://huggingface.co/ampixa/real-nepali-v0.2-kala)
71
+ via `huggingface_hub` and caches it locally. No internet needed after first run.
72
+ No GPU required.
73
+
74
+ - Architecture: VITS (ONNX FP32)
75
+ - Sample rate: 22050 Hz
76
+ - CPU RTF: ~0.020 (50× real-time on a laptop)
77
+
78
+ ## License
79
+
80
+ MIT (code) — CC-BY-SA 4.0 (model weights and training data).
81
+ See [LICENSES.md](LICENSES.md) for full attribution.
@@ -0,0 +1,17 @@
1
+ """kala-tts: CPU-native Nepali text-to-speech using the real_nepali G2P.
2
+
3
+ Quick start::
4
+
5
+ from kala_tts import synthesize_to_file
6
+ synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
7
+
8
+ The first call downloads the ONNX model (~60 MB) from HuggingFace Hub
9
+ and caches it in the HF Hub cache directory.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from kala_tts._api import list_speakers, synthesize, synthesize_to_file
15
+
16
+ __all__ = ["synthesize", "synthesize_to_file", "list_speakers"]
17
+ __version__ = "0.1.0"
@@ -0,0 +1,74 @@
1
+ """CLI entry point: python -m kala_tts "..." or kala-tts "..." """
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from kala_tts._api import DEFAULT_SPEAKER, list_speakers, synthesize_to_file
10
+
11
+
12
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line of the ``kala-tts`` CLI.

    Args:
        argv: Arguments to parse. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``; passing a list allows the CLI to be driven
            programmatically.

    Returns:
        Namespace with ``text``, ``out`` (a ``Path``), ``speaker``, ``speed``
        and ``list_speakers``.
    """
    parser = argparse.ArgumentParser(
        prog="kala-tts",
        description="Synthesize Nepali text with the kala TTS model.",
    )
    parser.add_argument(
        "text",
        nargs="?",
        help="Nepali text to synthesize (Devanagari). Read from stdin if omitted.",
    )
    parser.add_argument(
        "-o", "--out",
        default="out.wav",
        type=Path,
        metavar="FILE",
        help="Output WAV file path (default: out.wav).",
    )
    parser.add_argument(
        "--speaker",
        default=DEFAULT_SPEAKER,
        metavar="NAME",
        help=f"Speaker name (default: {DEFAULT_SPEAKER}). Use --list-speakers to see choices.",
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        metavar="X",
        help="Speaking speed multiplier: 0.8=slower, 1.0=natural, 1.3=faster (default: 1.0).",
    )
    parser.add_argument(
        "--list-speakers",
        action="store_true",
        help="Print available speaker names and exit.",
    )
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return a process exit code.

    Args:
        argv: Optional argument list forwarded to ``_parse_args``; ``None``
            means the process arguments are used.

    Returns:
        ``0`` on success, ``1`` when no text was supplied or synthesis
        rejected the request with a ``ValueError``.
    """
    args = _parse_args(argv)

    if args.list_speakers:
        for name in list_speakers():
            print(name)
        return 0

    # Fall back to stdin only when the positional argument was omitted.
    text = sys.stdin.read() if args.text is None else args.text
    text = text.strip()
    if not text:
        print("error: no text provided", file=sys.stderr)
        return 1

    out_path: Path = args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        synthesize_to_file(text, out_path, speaker=args.speaker, speed=args.speed)
    except ValueError as exc:
        # Report request errors (e.g. an unknown speaker name) as a one-line
        # CLI message instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1
    print(f"wrote {out_path}")
    return 0
71
+
72
+
73
+ if __name__ == "__main__":
74
+ raise SystemExit(main())
@@ -0,0 +1,63 @@
1
+ """High-level public API for kala-tts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import threading
6
+ from pathlib import Path
7
+
8
+ from kala_tts._download import config_path, model_path, speakers_path
9
+ from kala_tts._infer import DEFAULT_SPEAKER, KalaEngine
10
+
11
+
12
+ _engine: KalaEngine | None = None
13
+ _lock = threading.Lock()
14
+
15
+
16
def _get_engine() -> KalaEngine:
    """Return the module-wide engine, building it on first use.

    The engine is stored in the module global ``_engine``. Construction is
    done while holding ``_lock`` and the global is re-checked inside the lock
    so that only one engine is built.
    """
    global _engine
    if _engine is not None:
        return _engine
    with _lock:
        # Another caller may have finished construction while we waited.
        if _engine is None:
            _engine = KalaEngine(
                model_path=model_path(),
                config_path=config_path(),
                speakers_path=speakers_path(),
            )
    return _engine
27
+
28
+
29
def synthesize(
    text: str,
    *,
    speaker: str = DEFAULT_SPEAKER,
    speed: float = 1.0,
) -> bytes:
    """Render ``text`` as speech and return the result as WAV bytes.

    Args:
        text: Text to speak; forwarded unchanged to the shared engine.
        speaker: Speaker name forwarded to the engine.
        speed: Speaking speed multiplier. Values below 0.1 are treated as
            0.1; the reciprocal is passed to the engine as ``length_scale``.

    Returns:
        The bytes produced by the engine's ``synthesize`` method.
    """
    # Clamp before inverting so a zero or negative speed cannot divide by zero.
    effective_speed = max(speed, 0.1)
    stretch = 1.0 / effective_speed
    engine = _get_engine()
    return engine.synthesize(text, speaker=speaker, length_scale=stretch)
47
+
48
+
49
def synthesize_to_file(
    text: str,
    output_path: str | Path,
    *,
    speaker: str = DEFAULT_SPEAKER,
    speed: float = 1.0,
) -> None:
    """Synthesize ``text`` and store the resulting WAV bytes at ``output_path``.

    Synthesis runs first; the file is only written once the audio bytes are
    available. ``speaker`` and ``speed`` are forwarded to ``synthesize``.
    """
    audio_bytes = synthesize(text, speaker=speaker, speed=speed)
    destination = Path(output_path)
    destination.write_bytes(audio_bytes)
59
+
60
+
61
def list_speakers() -> tuple[str, ...]:
    """Return the names of the available speakers.

    If the shared engine has already been created, its ``speakers`` tuple is
    returned. Otherwise only the speaker map file is fetched and read, so
    listing speakers does not require downloading the ONNX model or creating
    an inference session.
    """
    import json

    from kala_tts._infer import PUBLIC_SPEAKERS

    engine = _engine
    if engine is not None:
        return engine.speakers

    speaker_map = json.loads(speakers_path().read_text(encoding="utf-8"))
    # Same filter and ordering as KalaEngine.speakers: public names that are
    # present in the model's speaker map.
    return tuple(name for name in PUBLIC_SPEAKERS if name in speaker_map)
@@ -0,0 +1,29 @@
1
+ """HuggingFace Hub download helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from huggingface_hub import hf_hub_download
8
+
9
+
10
+ HF_REPO_ID = "ampixa/real-nepali-v0.2-kala"
11
+ ONNX_FILENAME = "real_nepali_v02_kala.fp32.onnx"
12
+ CONFIG_FILENAME = "config.json"
13
+ SPEAKERS_FILENAME = "speaker_id_map.json"
14
+
15
+
16
def _download(filename: str, repo_id: str = HF_REPO_ID) -> Path:
    """Resolve ``filename`` in ``repo_id`` via ``hf_hub_download`` and return it as a ``Path``."""
    local_file = hf_hub_download(repo_id=repo_id, filename=filename)
    return Path(local_file)
18
+
19
+
20
def model_path() -> Path:
    """Return the local path of the ONNX model file (``ONNX_FILENAME``)."""
    onnx_file = _download(ONNX_FILENAME)
    return onnx_file
22
+
23
+
24
def config_path() -> Path:
    """Return the local path of the model configuration file (``CONFIG_FILENAME``)."""
    config_file = _download(CONFIG_FILENAME)
    return config_file
26
+
27
+
28
def speakers_path() -> Path:
    """Return the local path of the speaker map file (``SPEAKERS_FILENAME``)."""
    speakers_file = _download(SPEAKERS_FILENAME)
    return speakers_file
@@ -0,0 +1,175 @@
1
+ """ONNX inference core for the real_nepali_v02_kala model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import json
7
+ import re
8
+ import time
9
+ import wave
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import onnxruntime as ort
16
+
17
+ from real_nepali import g2p as _g2p
18
+
19
+
20
+ PUNCTUATION_PATTERN = re.compile(r"([^।॥?!.,،;:]+)([।॥?!.,،;:]*)")
21
+
22
+ PAUSE_SECONDS: dict[str, float] = {
23
+ "period": 0.18,
24
+ "question": 0.22,
25
+ "exclaim": 0.18,
26
+ "comma": 0.10,
27
+ }
28
+
29
+ PUBLIC_SPEAKERS = ("kala", "barsha", "slr143_F", "slr43_0546", "slr43_2099")
30
+ DEFAULT_SPEAKER = "kala"
31
+ SAMPLE_RATE = 22050
32
+ G2P_PROFILE = "real_nepali_v0.2"
33
+
34
+
35
+ @dataclass(frozen=True)
36
+ class _Chunk:
37
+ text: str
38
+ punctuation: str
39
+ pause_s: float
40
+
41
+
42
def _pause_seconds(punctuation: str) -> float:
    """Return the pause, in seconds, for a run of punctuation characters.

    Each recognised character maps to an entry of ``PAUSE_SECONDS``; the
    longest pause among them is returned. Unrecognised characters are
    ignored, and ``0.0`` is returned when nothing is recognised.
    """
    kind_by_mark = {
        "?": "question",
        "!": "exclaim",
        ",": "comma",
        "،": "comma",
        ";": "comma",
        ":": "comma",
        ".": "period",
        "।": "period",
        "॥": "period",
    }
    durations = [
        PAUSE_SECONDS[kind_by_mark[mark]]
        for mark in punctuation
        if mark in kind_by_mark
    ]
    return max(durations, default=0.0)
54
+
55
+
56
def _split_chunks(text: str) -> list[_Chunk]:
    """Split ``text`` into chunks at the punctuation matched by ``PUNCTUATION_PATTERN``.

    Each match contributes one chunk: its text with whitespace collapsed, the
    trailing punctuation run, and the pause derived from that punctuation.
    Matches whose text is only whitespace are dropped. If no chunk results
    but ``text`` is not blank, the whole text becomes a single chunk with no
    pause.
    """
    pieces: list[_Chunk] = []
    for match in PUNCTUATION_PATTERN.finditer(text):
        words = match.group(1).split()
        if words:
            marks = match.group(2) or ""
            pieces.append(
                _Chunk(
                    text=" ".join(words),
                    punctuation=marks,
                    pause_s=_pause_seconds(marks),
                )
            )
    if pieces:
        return pieces
    if text.strip():
        # Nothing matched (e.g. punctuation-only input): keep the text whole.
        collapsed = " ".join(text.split())
        pieces.append(_Chunk(text=collapsed, punctuation="", pause_s=0.0))
    return pieces
67
+
68
+
69
+ def _phones_to_ids(phones: list[str], id_map: dict[str, list[int]]) -> list[int]:
70
+ ids = [id_map["^"][0]]
71
+ for phone in phones:
72
+ if phone in {".", "|"}:
73
+ continue
74
+ if phone not in id_map:
75
+ raise ValueError(f"phone not in model vocabulary: {phone!r}")
76
+ ids.extend(id_map[phone])
77
+ ids.append(id_map["$"][0])
78
+ return ids
79
+
80
+
81
def _phonemize_chunk(text: str, id_map: dict[str, list[int]]) -> list[int]:
    """Run the G2P on ``text`` and convert the resulting phones to model ids.

    Words for which the G2P yields no phones are skipped. A ``"|"`` marker is
    placed between the phones of consecutive words before the sequence is
    handed to ``_phones_to_ids``.
    """
    per_word = [
        word.phones
        for word in _g2p.phonemize_text(text, profile=G2P_PROFILE)
        if word.phones
    ]
    sequence: list[str] = []
    for position, word_phones in enumerate(per_word):
        if position:
            sequence.append("|")
        sequence.extend(word_phones)
    return _phones_to_ids(sequence, id_map)
91
+
92
+
93
def _silence(seconds: float) -> np.ndarray:
    """Return a float32 array of zeros lasting ``seconds`` at ``SAMPLE_RATE``.

    The sample count is rounded to the nearest integer; negative durations
    yield an empty array.
    """
    sample_count = int(round(seconds * SAMPLE_RATE))
    if sample_count < 0:
        sample_count = 0
    return np.zeros(sample_count, dtype=np.float32)
96
+
97
+
98
def _audio_to_wav_bytes(audio: np.ndarray) -> bytes:
    """Encode ``audio`` as a mono 16-bit PCM WAV file at ``SAMPLE_RATE``.

    The input is flattened, converted to float32 and clipped to [-1.0, 1.0]
    before being scaled by 32767 and cast to int16.
    """
    flat = audio.reshape(-1).astype(np.float32)
    clipped = np.clip(flat, -1.0, 1.0)
    frames = (clipped * 32767.0).astype(np.int16).tobytes()

    sink = io.BytesIO()
    with wave.open(sink, "wb") as writer:
        writer.setnchannels(1)
        writer.setsampwidth(2)  # two bytes per sample = 16-bit
        writer.setframerate(SAMPLE_RATE)
        writer.writeframes(frames)
    return sink.getvalue()
108
+
109
+
110
class KalaEngine:
    """ONNX inference engine; create it once and reuse it for every request."""

    def __init__(self, model_path: Path, config_path: Path, speakers_path: Path) -> None:
        """Load the phoneme id map, the speaker map and the ONNX session.

        Args:
            model_path: Path to the ONNX model file.
            config_path: JSON file whose ``phoneme_id_map`` entry maps each
                symbol to a list of ids.
            speakers_path: JSON file mapping speaker names to integer ids.
        """
        raw_config = json.loads(config_path.read_text(encoding="utf-8"))
        id_map: dict[str, list[int]] = {}
        for symbol, values in raw_config["phoneme_id_map"].items():
            id_map[str(symbol)] = [int(value) for value in values]
        self._id_map: dict[str, list[int]] = id_map

        raw_speakers = json.loads(speakers_path.read_text(encoding="utf-8"))
        self._speaker_map: dict[str, int] = {
            str(name): int(index) for name, index in raw_speakers.items()
        }

        # Inference is pinned to the CPU execution provider.
        self._session = ort.InferenceSession(
            str(model_path),
            providers=["CPUExecutionProvider"],
        )
        self._input_names = {node.name for node in self._session.get_inputs()}

    @property
    def speakers(self) -> tuple[str, ...]:
        """Names from ``PUBLIC_SPEAKERS`` that exist in the loaded speaker map."""
        available = [name for name in PUBLIC_SPEAKERS if name in self._speaker_map]
        return tuple(available)

    def synthesize(
        self,
        text: str,
        *,
        speaker: str = DEFAULT_SPEAKER,
        length_scale: float = 1.0,
        noise_scale: float = 0.667,
        noise_scale_w: float = 0.8,
    ) -> bytes:
        """Synthesize ``text`` chunk by chunk and return the audio as WAV bytes.

        Args:
            text: Input text; whitespace runs are collapsed before chunking.
            speaker: Key of the speaker map selecting the voice.
            length_scale: Second entry of the ``scales`` model input.
            noise_scale: First entry of the ``scales`` model input.
            noise_scale_w: Third entry of the ``scales`` model input.

        Returns:
            WAV bytes holding the concatenated chunk audio, with silence
            inserted after chunks that carry a pause.

        Raises:
            ValueError: If ``speaker`` is not in the speaker map.
        """
        speaker_index = self._speaker_map.get(speaker)
        if speaker_index is None:
            valid = ", ".join(sorted(self._speaker_map))
            raise ValueError(f"unknown speaker {speaker!r}; valid: {valid}")

        sid = np.array([speaker_index], dtype=np.int64)
        scales = np.array([noise_scale, length_scale, noise_scale_w], dtype=np.float32)
        # Only models that declare a "sid" input receive the speaker id.
        wants_sid = "sid" in self._input_names

        segments: list[np.ndarray] = []
        normalized = " ".join(text.split())
        for chunk in _split_chunks(normalized):
            ids = _phonemize_chunk(chunk.text, self._id_map)
            inputs: dict[str, Any] = {
                "input": np.array([ids], dtype=np.int64),
                "input_lengths": np.array([len(ids)], dtype=np.int64),
                "scales": scales,
            }
            if wants_sid:
                inputs["sid"] = sid
            waveform = self._session.run(None, inputs)[0]
            segments.append(np.asarray(waveform).reshape(-1).astype(np.float32))
            if chunk.pause_s > 0:
                segments.append(_silence(chunk.pause_s))

        if segments:
            audio = np.concatenate(segments)
        else:
            audio = np.array([], dtype=np.float32)
        return _audio_to_wav_bytes(audio)

    def rtf(self, text: str, speaker: str = DEFAULT_SPEAKER) -> float:
        """Return the real-time factor (synthesis time / audio duration) for ``text``.

        Returns ``inf`` when the synthesized audio has zero duration.
        """
        started = time.perf_counter()
        wav = self.synthesize(text, speaker=speaker)
        elapsed = time.perf_counter() - started

        with wave.open(io.BytesIO(wav)) as reader:
            duration = reader.getnframes() / reader.getframerate()
        if duration > 0:
            return elapsed / duration
        return float("inf")
@@ -0,0 +1,112 @@
1
+ Metadata-Version: 2.4
2
+ Name: kala-tts
3
+ Version: 0.1.0
4
+ Summary: CPU-native Nepali TTS — real_nepali G2P + VITS ONNX inference
5
+ Author-email: Ampixa <hello@ampixa.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://tts.ampixa.com/kala/
8
+ Project-URL: Repository, https://github.com/voidash/kala-tts
9
+ Project-URL: Hugging Face, https://huggingface.co/ampixa/real-nepali-v0.2-kala
10
+ Project-URL: Bug Tracker, https://github.com/voidash/kala-tts/issues
11
+ Keywords: nepali,tts,text-to-speech,speech-synthesis,g2p,onnx
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: numpy>=1.24
26
+ Requires-Dist: onnxruntime>=1.17
27
+ Requires-Dist: huggingface_hub>=0.23
28
+ Provides-Extra: test
29
+ Requires-Dist: pytest>=7; extra == "test"
30
+ Dynamic: license-file
31
+
32
+ # kala-tts
33
+
34
+ CPU-native Nepali text-to-speech. The first open-source Nepali TTS built without eSpeak.
35
+
36
+ ```bash
37
+ pip install kala-tts
38
+ ```
39
+
40
+ ```python
41
+ import kala_tts
42
+
43
+ kala_tts.synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
44
+ ```
45
+
46
+ ## What makes it different
47
+
48
+ eSpeak-ng maps Nepali affricates to alveolar labels (`ts`, `tsh`) that do not match
49
+ how Kathmandu speakers actually produce **च** and **छ**. `kala-tts` uses the
50
+ `real_nepali` G2P — built from Khatiwada (2009) and a 48 000-entry curated lexicon —
51
+ which outputs palatal `ch`/`chh` as the acoustic target.
52
+
53
+ | Feature | eSpeak `ne` | kala-tts |
54
+ |---|---|---|
55
+ | च / छ | ts / tsh (alveolar) | ch / chh (palatal) |
56
+ | Gemination | often lost | explicit ː tokens |
57
+ | Schwa deletion | heuristic | rule-based, audited |
58
+ | Latin code-switch | undefined | letter-by-letter + lexicon |
59
+ | Lexicon | none | 48 000 entries |
60
+
61
+ ## API
62
+
63
+ ```python
64
+ import kala_tts
65
+
66
+ # Returns WAV bytes (16-bit PCM mono, 22050 Hz)
67
+ wav: bytes = kala_tts.synthesize("नेपाल सुन्दर देश हो।", speaker="kala")
68
+
69
+ # Write directly to file
70
+ kala_tts.synthesize_to_file("नमस्कार", "output.wav", speaker="barsha")
71
+
72
+ # Speed control
73
+ wav = kala_tts.synthesize("राम्रो दिन!", speed=0.85) # slower
74
+
75
+ # List speakers
76
+ print(kala_tts.list_speakers())
77
+ # ('kala', 'barsha', 'slr143_F', 'slr43_0546', 'slr43_2099')
78
+ ```
79
+
80
+ ## CLI
81
+
82
+ ```bash
83
+ kala-tts "नमस्कार, कसरी हुनुहुन्छ?" --speaker kala -o out.wav
84
+ kala-tts --list-speakers
85
+ echo "नेपाल सुन्दर देश हो।" | kala-tts -o out.wav
86
+ ```
87
+
88
+ ## Speakers
89
+
90
+ | Name | Data | Notes |
91
+ |---|---|---|
92
+ | `kala` | human studio | recommended |
93
+ | `barsha` | human recording | most data |
94
+ | `slr143_F` | OpenSLR-143 | neutral prosody |
95
+ | `slr43_0546` | OpenSLR-43 | read speech |
96
+ | `slr43_2099` | OpenSLR-43 | read speech |
97
+
98
+ ## Model
99
+
100
+ The first call auto-downloads the ONNX model (~60 MB) from
101
+ [ampixa/real-nepali-v0.2-kala](https://huggingface.co/ampixa/real-nepali-v0.2-kala)
102
+ via `huggingface_hub` and caches it locally. No internet needed after first run.
103
+ No GPU required.
104
+
105
+ - Architecture: VITS (ONNX FP32)
106
+ - Sample rate: 22050 Hz
107
+ - CPU RTF: ~0.020 (50× real-time on a laptop)
108
+
109
+ ## License
110
+
111
+ MIT (code) — CC-BY-SA 4.0 (model weights and training data).
112
+ See [LICENSES.md](LICENSES.md) for full attribution.