kala-tts 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kala_tts-0.1.0/LICENSE +21 -0
- kala_tts-0.1.0/PKG-INFO +112 -0
- kala_tts-0.1.0/README.md +81 -0
- kala_tts-0.1.0/kala_tts/__init__.py +17 -0
- kala_tts-0.1.0/kala_tts/__main__.py +74 -0
- kala_tts-0.1.0/kala_tts/_api.py +63 -0
- kala_tts-0.1.0/kala_tts/_download.py +29 -0
- kala_tts-0.1.0/kala_tts/_infer.py +175 -0
- kala_tts-0.1.0/kala_tts.egg-info/PKG-INFO +112 -0
- kala_tts-0.1.0/kala_tts.egg-info/SOURCES.txt +44 -0
- kala_tts-0.1.0/kala_tts.egg-info/dependency_links.txt +1 -0
- kala_tts-0.1.0/kala_tts.egg-info/entry_points.txt +2 -0
- kala_tts-0.1.0/kala_tts.egg-info/requires.txt +6 -0
- kala_tts-0.1.0/kala_tts.egg-info/top_level.txt +3 -0
- kala_tts-0.1.0/nepali_frontend/__init__.py +7 -0
- kala_tts-0.1.0/nepali_frontend/_data/candidates_lexicon.tsv +48586 -0
- kala_tts-0.1.0/nepali_frontend/_data/gold_lexicon.tsv +5 -0
- kala_tts-0.1.0/nepali_frontend/_data/ipa_map.tsv +64 -0
- kala_tts-0.1.0/nepali_frontend/_data/loanwords_latin.tsv +5594 -0
- kala_tts-0.1.0/nepali_frontend/_data/phones.tsv +64 -0
- kala_tts-0.1.0/nepali_frontend/data.py +89 -0
- kala_tts-0.1.0/nepali_frontend/g2p/__init__.py +1 -0
- kala_tts-0.1.0/nepali_frontend/g2p/akshara.py +264 -0
- kala_tts-0.1.0/nepali_frontend/g2p/base_map.py +174 -0
- kala_tts-0.1.0/nepali_frontend/g2p/phonemizer.py +153 -0
- kala_tts-0.1.0/nepali_frontend/g2p/post_rules.py +471 -0
- kala_tts-0.1.0/nepali_frontend/g2p/reverse.py +199 -0
- kala_tts-0.1.0/nepali_frontend/g2p/reverse_model.py +287 -0
- kala_tts-0.1.0/nepali_frontend/normalize/__init__.py +33 -0
- kala_tts-0.1.0/nepali_frontend/normalize/numbers.py +250 -0
- kala_tts-0.1.0/nepali_frontend/normalize/phones.py +107 -0
- kala_tts-0.1.0/nepali_frontend/normalize/text.py +43 -0
- kala_tts-0.1.0/nepali_frontend/prosody/__init__.py +5 -0
- kala_tts-0.1.0/nepali_frontend/prosody/chunker.py +115 -0
- kala_tts-0.1.0/nepali_frontend/tokenize/__init__.py +7 -0
- kala_tts-0.1.0/nepali_frontend/tokenize/script.py +95 -0
- kala_tts-0.1.0/nepali_frontend/trace.py +143 -0
- kala_tts-0.1.0/nepali_frontend/transliterate.py +218 -0
- kala_tts-0.1.0/pyproject.toml +54 -0
- kala_tts-0.1.0/real_nepali/__init__.py +8 -0
- kala_tts-0.1.0/real_nepali/g2p.py +289 -0
- kala_tts-0.1.0/real_nepali/kokoro.py +262 -0
- kala_tts-0.1.0/real_nepali/manifest.py +79 -0
- kala_tts-0.1.0/real_nepali/profiles.py +63 -0
- kala_tts-0.1.0/real_nepali/prosody.py +161 -0
- kala_tts-0.1.0/setup.cfg +4 -0
kala_tts-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ampixa Labs
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
kala_tts-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kala-tts
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CPU-native Nepali TTS — real_nepali G2P + VITS ONNX inference
|
|
5
|
+
Author-email: Ampixa <hello@ampixa.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://tts.ampixa.com/kala/
|
|
8
|
+
Project-URL: Repository, https://github.com/voidash/kala-tts
|
|
9
|
+
Project-URL: Hugging Face, https://huggingface.co/ampixa/real-nepali-v0.2-kala
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/voidash/kala-tts/issues
|
|
11
|
+
Keywords: nepali,tts,text-to-speech,speech-synthesis,g2p,onnx
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: numpy>=1.24
|
|
26
|
+
Requires-Dist: onnxruntime>=1.17
|
|
27
|
+
Requires-Dist: huggingface_hub>=0.23
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# kala-tts
|
|
33
|
+
|
|
34
|
+
CPU-native Nepali text-to-speech. The first open-source Nepali TTS built without eSpeak.
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install kala-tts
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import kala_tts
|
|
42
|
+
|
|
43
|
+
kala_tts.synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## What makes it different
|
|
47
|
+
|
|
48
|
+
eSpeak-ng maps Nepali affricates to alveolar labels (`ts`, `tsh`) that do not match
|
|
49
|
+
how Kathmandu speakers actually produce **च** and **छ**. `kala-tts` uses the
|
|
50
|
+
`real_nepali` G2P — built from Khatiwada (2009) and a 48 000-entry curated lexicon —
|
|
51
|
+
which outputs palatal `ch`/`chh` as the acoustic target.
|
|
52
|
+
|
|
53
|
+
| Feature | eSpeak `ne` | kala-tts |
|
|
54
|
+
|---|---|---|
|
|
55
|
+
| च / छ | ts / tsh (alveolar) | ch / chh (palatal) |
|
|
56
|
+
| Gemination | often lost | explicit ː tokens |
|
|
57
|
+
| Schwa deletion | heuristic | rule-based, audited |
|
|
58
|
+
| Latin code-switch | undefined | letter-by-letter + lexicon |
|
|
59
|
+
| Lexicon | none | 48 000 entries |
|
|
60
|
+
|
|
61
|
+
## API
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import kala_tts
|
|
65
|
+
|
|
66
|
+
# Returns WAV bytes (16-bit PCM mono, 22050 Hz)
|
|
67
|
+
wav: bytes = kala_tts.synthesize("नेपाल सुन्दर देश हो।", speaker="kala")
|
|
68
|
+
|
|
69
|
+
# Write directly to file
|
|
70
|
+
kala_tts.synthesize_to_file("नमस्कार", "output.wav", speaker="barsha")
|
|
71
|
+
|
|
72
|
+
# Speed control
|
|
73
|
+
wav = kala_tts.synthesize("राम्रो दिन!", speed=0.85) # slower
|
|
74
|
+
|
|
75
|
+
# List speakers
|
|
76
|
+
print(kala_tts.list_speakers())
|
|
77
|
+
# ('kala', 'barsha', 'slr143_F', 'slr43_0546', 'slr43_2099')
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## CLI
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
kala-tts "नमस्कार, कसरी हुनुहुन्छ?" --speaker kala -o out.wav
|
|
84
|
+
kala-tts --list-speakers
|
|
85
|
+
echo "नेपाल सुन्दर देश हो।" | kala-tts -o out.wav
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Speakers
|
|
89
|
+
|
|
90
|
+
| Name | Data | Notes |
|
|
91
|
+
|---|---|---|
|
|
92
|
+
| `kala` | human studio | recommended |
|
|
93
|
+
| `barsha` | human recording | most data |
|
|
94
|
+
| `slr143_F` | OpenSLR-143 | neutral prosody |
|
|
95
|
+
| `slr43_0546` | OpenSLR-43 | read speech |
|
|
96
|
+
| `slr43_2099` | OpenSLR-43 | read speech |
|
|
97
|
+
|
|
98
|
+
## Model
|
|
99
|
+
|
|
100
|
+
The first call auto-downloads the ONNX model (~60 MB) from
|
|
101
|
+
[ampixa/real-nepali-v0.2-kala](https://huggingface.co/ampixa/real-nepali-v0.2-kala)
|
|
102
|
+
via `huggingface_hub` and caches it locally. No internet needed after first run.
|
|
103
|
+
No GPU required.
|
|
104
|
+
|
|
105
|
+
- Architecture: VITS (ONNX FP32)
|
|
106
|
+
- Sample rate: 22050 Hz
|
|
107
|
+
- CPU RTF: ~0.020 (50× real-time on a laptop)
|
|
108
|
+
|
|
109
|
+
## License
|
|
110
|
+
|
|
111
|
+
MIT (code) — CC-BY-SA 4.0 (model weights and training data).
|
|
112
|
+
See [LICENSES.md](LICENSES.md) for full attribution.
|
kala_tts-0.1.0/README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# kala-tts
|
|
2
|
+
|
|
3
|
+
CPU-native Nepali text-to-speech. The first open-source Nepali TTS built without eSpeak.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install kala-tts
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
import kala_tts
|
|
11
|
+
|
|
12
|
+
kala_tts.synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## What makes it different
|
|
16
|
+
|
|
17
|
+
eSpeak-ng maps Nepali affricates to alveolar labels (`ts`, `tsh`) that do not match
|
|
18
|
+
how Kathmandu speakers actually produce **च** and **छ**. `kala-tts` uses the
|
|
19
|
+
`real_nepali` G2P — built from Khatiwada (2009) and a 48 000-entry curated lexicon —
|
|
20
|
+
which outputs palatal `ch`/`chh` as the acoustic target.
|
|
21
|
+
|
|
22
|
+
| Feature | eSpeak `ne` | kala-tts |
|
|
23
|
+
|---|---|---|
|
|
24
|
+
| च / छ | ts / tsh (alveolar) | ch / chh (palatal) |
|
|
25
|
+
| Gemination | often lost | explicit ː tokens |
|
|
26
|
+
| Schwa deletion | heuristic | rule-based, audited |
|
|
27
|
+
| Latin code-switch | undefined | letter-by-letter + lexicon |
|
|
28
|
+
| Lexicon | none | 48 000 entries |
|
|
29
|
+
|
|
30
|
+
## API
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import kala_tts
|
|
34
|
+
|
|
35
|
+
# Returns WAV bytes (16-bit PCM mono, 22050 Hz)
|
|
36
|
+
wav: bytes = kala_tts.synthesize("नेपाल सुन्दर देश हो।", speaker="kala")
|
|
37
|
+
|
|
38
|
+
# Write directly to file
|
|
39
|
+
kala_tts.synthesize_to_file("नमस्कार", "output.wav", speaker="barsha")
|
|
40
|
+
|
|
41
|
+
# Speed control
|
|
42
|
+
wav = kala_tts.synthesize("राम्रो दिन!", speed=0.85) # slower
|
|
43
|
+
|
|
44
|
+
# List speakers
|
|
45
|
+
print(kala_tts.list_speakers())
|
|
46
|
+
# ('kala', 'barsha', 'slr143_F', 'slr43_0546', 'slr43_2099')
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## CLI
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
kala-tts "नमस्कार, कसरी हुनुहुन्छ?" --speaker kala -o out.wav
|
|
53
|
+
kala-tts --list-speakers
|
|
54
|
+
echo "नेपाल सुन्दर देश हो।" | kala-tts -o out.wav
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Speakers
|
|
58
|
+
|
|
59
|
+
| Name | Data | Notes |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| `kala` | human studio | recommended |
|
|
62
|
+
| `barsha` | human recording | most data |
|
|
63
|
+
| `slr143_F` | OpenSLR-143 | neutral prosody |
|
|
64
|
+
| `slr43_0546` | OpenSLR-43 | read speech |
|
|
65
|
+
| `slr43_2099` | OpenSLR-43 | read speech |
|
|
66
|
+
|
|
67
|
+
## Model
|
|
68
|
+
|
|
69
|
+
The first call auto-downloads the ONNX model (~60 MB) from
|
|
70
|
+
[ampixa/real-nepali-v0.2-kala](https://huggingface.co/ampixa/real-nepali-v0.2-kala)
|
|
71
|
+
via `huggingface_hub` and caches it locally. No internet needed after first run.
|
|
72
|
+
No GPU required.
|
|
73
|
+
|
|
74
|
+
- Architecture: VITS (ONNX FP32)
|
|
75
|
+
- Sample rate: 22050 Hz
|
|
76
|
+
- CPU RTF: ~0.020 (50× real-time on a laptop)
|
|
77
|
+
|
|
78
|
+
## License
|
|
79
|
+
|
|
80
|
+
MIT (code) — CC-BY-SA 4.0 (model weights and training data).
|
|
81
|
+
See [LICENSES.md](LICENSES.md) for full attribution.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""kala-tts: CPU-native Nepali text-to-speech using the real_nepali G2P.
|
|
2
|
+
|
|
3
|
+
Quick start::
|
|
4
|
+
|
|
5
|
+
from kala_tts import synthesize_to_file
|
|
6
|
+
synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
|
|
7
|
+
|
|
8
|
+
The first call downloads the ONNX model (~60 MB) from HuggingFace Hub
|
|
9
|
+
and caches it in the HF Hub cache directory.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from kala_tts._api import list_speakers, synthesize, synthesize_to_file
|
|
15
|
+
|
|
16
|
+
__all__ = ["synthesize", "synthesize_to_file", "list_speakers"]
|
|
17
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""CLI entry point: python -m kala_tts "..." or kala-tts "..." """
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from kala_tts._api import DEFAULT_SPEAKER, list_speakers, synthesize_to_file
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the ``kala-tts`` argument parser and parse a command line.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what a zero-argument call does.

    Returns:
        Namespace with ``text`` (``None`` when the positional is omitted),
        ``out`` (a ``Path``), ``speaker``, ``speed`` (a ``float``) and
        ``list_speakers`` (a ``bool``).
    """
    parser = argparse.ArgumentParser(
        prog="kala-tts",
        description="Synthesize Nepali text with the kala TTS model.",
    )
    # Optional positional: when it is missing the caller falls back to stdin.
    parser.add_argument(
        "text",
        nargs="?",
        help="Nepali text to synthesize (Devanagari). Read from stdin if omitted.",
    )
    # argparse runs string defaults through ``type``, so ``out`` is always a Path.
    parser.add_argument(
        "-o", "--out",
        default="out.wav",
        type=Path,
        metavar="FILE",
        help="Output WAV file path (default: out.wav).",
    )
    parser.add_argument(
        "--speaker",
        default=DEFAULT_SPEAKER,
        metavar="NAME",
        help=f"Speaker name (default: {DEFAULT_SPEAKER}). Use --list-speakers to see choices.",
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        metavar="X",
        help="Speaking speed multiplier: 0.8=slower, 1.0=natural, 1.3=faster (default: 1.0).",
    )
    parser.add_argument(
        "--list-speakers",
        action="store_true",
        help="Print available speaker names and exit.",
    )
    return parser.parse_args(argv)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def main() -> int:
    """Run the command-line interface and return a process exit status.

    Returns:
        ``0`` after listing speakers or writing the WAV file; ``1`` when no
        text was supplied or when synthesis rejects the request with a
        ``ValueError``.
    """
    args = _parse_args()

    if args.list_speakers:
        for name in list_speakers():
            print(name)
        return 0

    # Fall back to stdin when the positional argument was not given.
    text = args.text
    if text is None:
        text = sys.stdin.read()
    text = text.strip()
    if not text:
        print("error: no text provided", file=sys.stderr)
        return 1

    out_path: Path = args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        synthesize_to_file(text, out_path, speaker=args.speaker, speed=args.speed)
    except ValueError as exc:
        # The inference engine raises ValueError for an unknown speaker name
        # and for phones missing from the model vocabulary; show the message
        # in the same form as the other CLI errors instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1
    print(f"wrote {out_path}")
    return 0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""High-level public API for kala-tts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from kala_tts._download import config_path, model_path, speakers_path
|
|
9
|
+
from kala_tts._infer import DEFAULT_SPEAKER, KalaEngine
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Process-wide engine instance; stays None until _get_engine() first builds it.
_engine: KalaEngine | None = None
# Serializes construction of the shared engine.
_lock = threading.Lock()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _get_engine() -> KalaEngine:
    """Return the shared ``KalaEngine``, building it on first use.

    The first call resolves the model, config and speaker-map paths through
    the download helpers and constructs the engine; later calls return the
    cached instance. Construction happens under ``_lock`` and the cache is
    re-checked once the lock is held.
    """
    global _engine
    if _engine is not None:
        return _engine
    with _lock:
        if _engine is None:
            _engine = KalaEngine(
                model_path=model_path(),
                config_path=config_path(),
                speakers_path=speakers_path(),
            )
        return _engine
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def synthesize(
    text: str,
    *,
    speaker: str = DEFAULT_SPEAKER,
    speed: float = 1.0,
) -> bytes:
    """Render ``text`` to speech and return the audio as WAV bytes.

    Args:
        text: Text to speak; it is handed unchanged to the shared engine.
        speaker: Speaker name understood by the engine.
        speed: Speaking-rate multiplier. It is converted to the engine's
            ``length_scale`` as ``1 / speed``; values below ``0.1`` are
            treated as ``0.1``.

    Returns:
        The bytes of the WAV container produced by the engine.
    """
    clamped_speed = max(speed, 0.1)
    length_scale = 1.0 / clamped_speed
    engine = _get_engine()
    return engine.synthesize(text, speaker=speaker, length_scale=length_scale)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def synthesize_to_file(
    text: str,
    output_path: str | Path,
    *,
    speaker: str = DEFAULT_SPEAKER,
    speed: float = 1.0,
) -> None:
    """Render ``text`` with :func:`synthesize` and store the WAV at ``output_path``.

    Args:
        text: Text to speak.
        output_path: Destination file; its contents are replaced.
        speaker: Speaker name forwarded to :func:`synthesize`.
        speed: Speed multiplier forwarded to :func:`synthesize`.
    """
    audio_bytes = synthesize(text, speaker=speaker, speed=speed)
    destination = Path(output_path)
    destination.write_bytes(audio_bytes)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def list_speakers() -> tuple[str, ...]:
    """Return the speaker names reported by the shared engine.

    The names come from the engine itself, so the first call also triggers
    engine construction through ``_get_engine()``.
    """
    engine = _get_engine()
    return engine.speakers
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""HuggingFace Hub download helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from huggingface_hub import hf_hub_download
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Hugging Face Hub repository that hosts the model artifacts.
HF_REPO_ID = "ampixa/real-nepali-v0.2-kala"
# File names inside that repository, one per artifact the engine loads.
ONNX_FILENAME = "real_nepali_v02_kala.fp32.onnx"
CONFIG_FILENAME = "config.json"
SPEAKERS_FILENAME = "speaker_id_map.json"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _download(filename: str, repo_id: str = HF_REPO_ID) -> Path:
    """Resolve ``filename`` from ``repo_id`` via ``hf_hub_download``.

    Args:
        filename: Name of the file inside the repository.
        repo_id: Hub repository identifier; defaults to ``HF_REPO_ID``.

    Returns:
        The location reported by ``hf_hub_download``, wrapped in a ``Path``.
        No ``revision`` argument is passed, so the library default applies.
    """
    local_file = hf_hub_download(repo_id=repo_id, filename=filename)
    return Path(local_file)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def model_path() -> Path:
    """Return the local path of the ONNX model file (``ONNX_FILENAME``)."""
    onnx_file = _download(ONNX_FILENAME)
    return onnx_file
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def config_path() -> Path:
    """Return the local path of the model config file (``CONFIG_FILENAME``)."""
    config_file = _download(CONFIG_FILENAME)
    return config_file
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def speakers_path() -> Path:
    """Return the local path of the speaker-id map (``SPEAKERS_FILENAME``)."""
    speakers_file = _download(SPEAKERS_FILENAME)
    return speakers_file
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""ONNX inference core for the real_nepali_v02_kala model."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import json
|
|
7
|
+
import re
|
|
8
|
+
import time
|
|
9
|
+
import wave
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import onnxruntime as ort
|
|
16
|
+
|
|
17
|
+
from real_nepali import g2p as _g2p
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# One match = a run of non-punctuation characters (group 1) followed by the
# run of punctuation marks that ends it (group 2, possibly empty).
PUNCTUATION_PATTERN = re.compile(r"([^।॥?!.,،;:]+)([।॥?!.,،;:]*)")

# Seconds of silence inserted after a chunk, keyed by punctuation category.
PAUSE_SECONDS: dict[str, float] = {
    "period": 0.18,
    "question": 0.22,
    "exclaim": 0.18,
    "comma": 0.10,
}

# Speaker names exposed through KalaEngine.speakers, in display order.
PUBLIC_SPEAKERS = ("kala", "barsha", "slr143_F", "slr43_0546", "slr43_2099")
DEFAULT_SPEAKER = "kala"
# Sample rate (Hz) used for generated silence and for the WAV header.
SAMPLE_RATE = 22050
# Profile name passed to the real_nepali G2P.
G2P_PROFILE = "real_nepali_v0.2"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
|
|
36
|
+
class _Chunk:
|
|
37
|
+
text: str
|
|
38
|
+
punctuation: str
|
|
39
|
+
pause_s: float
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _pause_seconds(punctuation: str) -> float:
    """Return the pause length, in seconds, for a run of punctuation marks.

    Each recognised mark is mapped to a ``PAUSE_SECONDS`` category and the
    longest resulting pause is returned. Unrecognised characters are ignored.

    Args:
        punctuation: The punctuation characters that ended a chunk.

    Returns:
        The largest matching ``PAUSE_SECONDS`` value, or ``0.0`` when no
        character is recognised (including the empty string).
    """
    # NOTE(review): the previous set literal listed "?" twice. If a second
    # question-mark variant was intended, add it here and to
    # PUNCTUATION_PATTERN — TODO confirm.
    category_by_mark = {
        "?": "question",
        "!": "exclaim",
        ",": "comma",
        "،": "comma",
        ";": "comma",
        ":": "comma",
        ".": "period",
        "।": "period",
        "॥": "period",
    }
    pauses = [
        PAUSE_SECONDS[category_by_mark[mark]]
        for mark in punctuation
        if mark in category_by_mark
    ]
    return max(pauses, default=0.0)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _split_chunks(text: str) -> list[_Chunk]:
    """Split ``text`` into punctuation-delimited chunks with pause lengths.

    Every ``PUNCTUATION_PATTERN`` match yields one chunk: its text with
    whitespace collapsed, the punctuation that followed it, and the pause
    derived from that punctuation. Matches whose text is only whitespace are
    dropped. If nothing matches but ``text`` is not blank (for example, it
    consists solely of punctuation), the whole text becomes a single chunk
    with no pause.

    NOTE: every ``.``, ``,`` and ``:`` counts as a boundary, so a string such
    as ``3.14`` is split into two chunks.
    """
    pieces: list[_Chunk] = []
    for match in PUNCTUATION_PATTERN.finditer(text):
        body = " ".join(match.group(1).split())
        if body:
            marks = match.group(2) or ""
            pieces.append(
                _Chunk(text=body, punctuation=marks, pause_s=_pause_seconds(marks))
            )
    if pieces or not text.strip():
        return pieces
    pieces.append(_Chunk(text=" ".join(text.split()), punctuation="", pause_s=0.0))
    return pieces
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _phones_to_ids(phones: list[str], id_map: dict[str, list[int]]) -> list[int]:
|
|
70
|
+
ids = [id_map["^"][0]]
|
|
71
|
+
for phone in phones:
|
|
72
|
+
if phone in {".", "|"}:
|
|
73
|
+
continue
|
|
74
|
+
if phone not in id_map:
|
|
75
|
+
raise ValueError(f"phone not in model vocabulary: {phone!r}")
|
|
76
|
+
ids.extend(id_map[phone])
|
|
77
|
+
ids.append(id_map["$"][0])
|
|
78
|
+
return ids
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _phonemize_chunk(text: str, id_map: dict[str, list[int]]) -> list[int]:
    """Convert one chunk of text into model ids via the real_nepali G2P.

    Words for which the G2P reports no phones are skipped. The phones of the
    remaining words are concatenated with a ``"|"`` marker between consecutive
    words, then encoded with ``_phones_to_ids``.

    Args:
        text: Chunk text to phonemize.
        id_map: Mapping from phone symbol to its list of ids.
    """
    sequence: list[str] = []
    for word in _g2p.phonemize_text(text, profile=G2P_PROFILE):
        word_phones = word.phones
        if not word_phones:
            continue
        if sequence:
            # Word boundary between the previous word and this one.
            sequence.append("|")
        sequence.extend(word_phones)
    return _phones_to_ids(sequence, id_map)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _silence(seconds: float) -> np.ndarray:
    """Return ``seconds`` of float32 zeros at ``SAMPLE_RATE``.

    The sample count is rounded to the nearest integer, and a negative
    duration yields an empty array.
    """
    sample_count = int(round(seconds * SAMPLE_RATE))
    if sample_count < 0:
        sample_count = 0
    return np.zeros(sample_count, dtype=np.float32)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _audio_to_wav_bytes(audio: np.ndarray) -> bytes:
    """Encode a float waveform as a mono 16-bit PCM WAV at ``SAMPLE_RATE``.

    Args:
        audio: Samples of any shape; they are flattened and cast to float32.

    Returns:
        The complete WAV file contents, header included.
    """
    samples = audio.reshape(-1).astype(np.float32)
    # np.clip leaves NaN untouched and casting NaN to int16 is undefined, so
    # map NaN to silence first; infinities are then bounded by the clip below.
    samples = np.clip(np.nan_to_num(samples, nan=0.0), -1.0, 1.0)
    pcm = (samples * 32767.0).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(pcm.tobytes())
    return buf.getvalue()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class KalaEngine:
    """ONNX Runtime engine that turns text into WAV bytes.

    Construction parses two JSON files and creates an inference session, so
    build one instance and reuse it for every ``synthesize()`` call.
    """

    def __init__(self, model_path: Path, config_path: Path, speakers_path: Path) -> None:
        """Load the phone vocabulary, the speaker table and the ONNX model.

        Args:
            model_path: ONNX model file.
            config_path: UTF-8 JSON file with a ``"phoneme_id_map"`` object
                mapping phone symbols to lists of ids.
            speakers_path: UTF-8 JSON file mapping speaker names to ids.
        """
        config = json.loads(config_path.read_text(encoding="utf-8"))
        id_map: dict[str, list[int]] = {}
        for symbol, ids in config["phoneme_id_map"].items():
            id_map[str(symbol)] = [int(i) for i in ids]
        self._id_map = id_map

        speaker_table = json.loads(speakers_path.read_text(encoding="utf-8"))
        self._speaker_map: dict[str, int] = {
            str(name): int(index) for name, index in speaker_table.items()
        }

        # CPU-only session; the model's input names are recorded so that
        # synthesize() can tell whether the model takes a "sid" input.
        self._session = ort.InferenceSession(
            str(model_path),
            providers=["CPUExecutionProvider"],
        )
        self._input_names = {node.name for node in self._session.get_inputs()}

    @property
    def speakers(self) -> tuple[str, ...]:
        """Names from ``PUBLIC_SPEAKERS`` that exist in the loaded speaker table."""
        known = self._speaker_map
        return tuple(name for name in PUBLIC_SPEAKERS if name in known)

    def synthesize(
        self,
        text: str,
        *,
        speaker: str = DEFAULT_SPEAKER,
        length_scale: float = 1.0,
        noise_scale: float = 0.667,
        noise_scale_w: float = 0.8,
    ) -> bytes:
        """Synthesize ``text`` chunk by chunk and return WAV bytes.

        The text is whitespace-normalized and split at punctuation. Each chunk
        is phonemized and run through the model, and its audio is followed by
        that chunk's pause, if any.

        Args:
            text: Text to speak.
            speaker: Key of the loaded speaker table.
            length_scale: Second entry of the model's ``scales`` input.
            noise_scale: First entry of the model's ``scales`` input.
            noise_scale_w: Third entry of the model's ``scales`` input.

        Returns:
            WAV file contents; the file has zero frames when the text yields
            no chunks.

        Raises:
            ValueError: If ``speaker`` is not in the speaker table, or if a
                phone is missing from the model vocabulary.
        """
        if speaker not in self._speaker_map:
            valid = ", ".join(sorted(self._speaker_map))
            raise ValueError(f"unknown speaker {speaker!r}; valid: {valid}")

        speaker_id = np.array([self._speaker_map[speaker]], dtype=np.int64)
        scales = np.array([noise_scale, length_scale, noise_scale_w], dtype=np.float32)
        normalized = " ".join(text.split())

        segments: list[np.ndarray] = []
        for chunk in _split_chunks(normalized):
            ids = _phonemize_chunk(chunk.text, self._id_map)
            feed: dict[str, Any] = {
                "input": np.array([ids], dtype=np.int64),
                "input_lengths": np.array([len(ids)], dtype=np.int64),
                "scales": scales,
            }
            # Pass the speaker id only when the model declares a "sid" input.
            if "sid" in self._input_names:
                feed["sid"] = speaker_id
            waveform = self._session.run(None, feed)[0]
            segments.append(np.asarray(waveform).reshape(-1).astype(np.float32))
            if chunk.pause_s > 0:
                segments.append(_silence(chunk.pause_s))

        if segments:
            audio = np.concatenate(segments)
        else:
            audio = np.array([], dtype=np.float32)
        return _audio_to_wav_bytes(audio)

    def rtf(self, text: str, speaker: str = DEFAULT_SPEAKER) -> float:
        """Return the real-time factor for synthesizing ``text``.

        This is the wall-clock time of one full ``synthesize()`` call divided
        by the duration of the WAV it produced. Returns ``inf`` when the WAV
        contains no frames.
        """
        started = time.perf_counter()
        wav_bytes = self.synthesize(text, speaker=speaker)
        elapsed = time.perf_counter() - started
        with wave.open(io.BytesIO(wav_bytes)) as reader:
            seconds = reader.getnframes() / reader.getframerate()
        if seconds > 0:
            return elapsed / seconds
        return float("inf")
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kala-tts
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CPU-native Nepali TTS — real_nepali G2P + VITS ONNX inference
|
|
5
|
+
Author-email: Ampixa <hello@ampixa.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://tts.ampixa.com/kala/
|
|
8
|
+
Project-URL: Repository, https://github.com/voidash/kala-tts
|
|
9
|
+
Project-URL: Hugging Face, https://huggingface.co/ampixa/real-nepali-v0.2-kala
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/voidash/kala-tts/issues
|
|
11
|
+
Keywords: nepali,tts,text-to-speech,speech-synthesis,g2p,onnx
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: numpy>=1.24
|
|
26
|
+
Requires-Dist: onnxruntime>=1.17
|
|
27
|
+
Requires-Dist: huggingface_hub>=0.23
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# kala-tts
|
|
33
|
+
|
|
34
|
+
CPU-native Nepali text-to-speech. The first open-source Nepali TTS built without eSpeak.
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install kala-tts
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import kala_tts
|
|
42
|
+
|
|
43
|
+
kala_tts.synthesize_to_file("नमस्कार, कसरी हुनुहुन्छ?", "out.wav")
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## What makes it different
|
|
47
|
+
|
|
48
|
+
eSpeak-ng maps Nepali affricates to alveolar labels (`ts`, `tsh`) that do not match
|
|
49
|
+
how Kathmandu speakers actually produce **च** and **छ**. `kala-tts` uses the
|
|
50
|
+
`real_nepali` G2P — built from Khatiwada (2009) and a 48 000-entry curated lexicon —
|
|
51
|
+
which outputs palatal `ch`/`chh` as the acoustic target.
|
|
52
|
+
|
|
53
|
+
| Feature | eSpeak `ne` | kala-tts |
|
|
54
|
+
|---|---|---|
|
|
55
|
+
| च / छ | ts / tsh (alveolar) | ch / chh (palatal) |
|
|
56
|
+
| Gemination | often lost | explicit ː tokens |
|
|
57
|
+
| Schwa deletion | heuristic | rule-based, audited |
|
|
58
|
+
| Latin code-switch | undefined | letter-by-letter + lexicon |
|
|
59
|
+
| Lexicon | none | 48 000 entries |
|
|
60
|
+
|
|
61
|
+
## API
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import kala_tts
|
|
65
|
+
|
|
66
|
+
# Returns WAV bytes (16-bit PCM mono, 22050 Hz)
|
|
67
|
+
wav: bytes = kala_tts.synthesize("नेपाल सुन्दर देश हो।", speaker="kala")
|
|
68
|
+
|
|
69
|
+
# Write directly to file
|
|
70
|
+
kala_tts.synthesize_to_file("नमस्कार", "output.wav", speaker="barsha")
|
|
71
|
+
|
|
72
|
+
# Speed control
|
|
73
|
+
wav = kala_tts.synthesize("राम्रो दिन!", speed=0.85) # slower
|
|
74
|
+
|
|
75
|
+
# List speakers
|
|
76
|
+
print(kala_tts.list_speakers())
|
|
77
|
+
# ('kala', 'barsha', 'slr143_F', 'slr43_0546', 'slr43_2099')
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## CLI
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
kala-tts "नमस्कार, कसरी हुनुहुन्छ?" --speaker kala -o out.wav
|
|
84
|
+
kala-tts --list-speakers
|
|
85
|
+
echo "नेपाल सुन्दर देश हो।" | kala-tts -o out.wav
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Speakers
|
|
89
|
+
|
|
90
|
+
| Name | Data | Notes |
|
|
91
|
+
|---|---|---|
|
|
92
|
+
| `kala` | human studio | recommended |
|
|
93
|
+
| `barsha` | human recording | most data |
|
|
94
|
+
| `slr143_F` | OpenSLR-143 | neutral prosody |
|
|
95
|
+
| `slr43_0546` | OpenSLR-43 | read speech |
|
|
96
|
+
| `slr43_2099` | OpenSLR-43 | read speech |
|
|
97
|
+
|
|
98
|
+
## Model
|
|
99
|
+
|
|
100
|
+
The first call auto-downloads the ONNX model (~60 MB) from
|
|
101
|
+
[ampixa/real-nepali-v0.2-kala](https://huggingface.co/ampixa/real-nepali-v0.2-kala)
|
|
102
|
+
via `huggingface_hub` and caches it locally. No internet needed after first run.
|
|
103
|
+
No GPU required.
|
|
104
|
+
|
|
105
|
+
- Architecture: VITS (ONNX FP32)
|
|
106
|
+
- Sample rate: 22050 Hz
|
|
107
|
+
- CPU RTF: ~0.020 (50× real-time on a laptop)
|
|
108
|
+
|
|
109
|
+
## License
|
|
110
|
+
|
|
111
|
+
MIT (code) — CC-BY-SA 4.0 (model weights and training data).
|
|
112
|
+
See [LICENSES.md](LICENSES.md) for full attribution.
|