supersonic-tts 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: supersonic-tts
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Stdin-friendly CLI + Python API wrapper for Supertonic — lightning-fast, on-device, multilingual TTS
|
|
5
|
+
Project-URL: Homepage, https://github.com/jxsprt/supersonic-tts
|
|
6
|
+
Project-URL: Repository, https://github.com/jxsprt/supersonic-tts
|
|
7
|
+
Author-email: Jaspreet Singh <jaspreetsinghintp@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: on-device,onnx,supertonic,text-to-speech,tts
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Requires-Dist: supertonic>=1.0.0
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# supersonic-tts
|
|
25
|
+
|
|
26
|
+
**Stdin-friendly CLI + Python API wrapper for [Supertonic](https://github.com/supertone-inc/supertonic)** — lightning-fast, on-device, multilingual TTS via ONNX Runtime.
|
|
27
|
+
|
|
28
|
+
No cloud API. No data leaves your machine. 31 languages. 10 voices.
|
|
29
|
+
|
|
30
|
+
## Why supersonic-tts?
|
|
31
|
+
|
|
32
|
+
Supertonic's official CLI takes text as a positional argument — awkward for piped input, shell scripts, and AI agents. `supersonic-tts` wraps it with:
|
|
33
|
+
|
|
34
|
+
- **Stdin support** — pipe text directly: `echo "hello" | supersonic-tts -o out.wav`
|
|
35
|
+
- **Hermes Agent integration** — drop-in command-based TTS provider
|
|
36
|
+
- **Same engine** — uses supertonic under the hood (supertonic-3, ONNX)
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install supersonic-tts
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
First run auto-downloads the model (~305MB) from HuggingFace.
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
### CLI
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Read from argument
|
|
52
|
+
supersonic-tts "Hello world" -o output.wav
|
|
53
|
+
|
|
54
|
+
# Read from stdin (piped)
|
|
55
|
+
echo "Hello from stdin" | supersonic-tts -o output.wav
|
|
56
|
+
|
|
57
|
+
# Read from file
|
|
58
|
+
cat long_text.txt | supersonic-tts -o output.wav
|
|
59
|
+
|
|
60
|
+
# Choose voice
|
|
61
|
+
supersonic-tts "Crisp and confident" -o output.wav --voice F4
|
|
62
|
+
|
|
63
|
+
# Multilingual
|
|
64
|
+
supersonic-tts "Bonjour le monde" -o french.wav --lang fr --voice F1
|
|
65
|
+
|
|
66
|
+
# Adjust speed
|
|
67
|
+
supersonic-tts "Fast talk" -o fast.wav --speed 1.5
|
|
68
|
+
|
|
69
|
+
# Higher quality
|
|
70
|
+
supersonic-tts "Premium quality" -o premium.wav --steps 10
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Python API
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from supersonic_tts import SupersonicTTS
|
|
77
|
+
|
|
78
|
+
tts = SupersonicTTS()
|
|
79
|
+
wav, duration = tts.synthesize("Hello world", voice="F4", lang="en")
|
|
80
|
+
tts.save(wav, "output.wav")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Hermes Agent Integration
|
|
84
|
+
|
|
85
|
+
Add to `~/.hermes/config.yaml`:
|
|
86
|
+
|
|
87
|
+
```yaml
|
|
88
|
+
tts:
|
|
89
|
+
provider: supersonic-tts
|
|
90
|
+
supersonic-tts:
|
|
91
|
+
type: command
|
|
92
|
+
command: supersonic-tts -o {output_path} --voice {voice} < {input_path}
|
|
93
|
+
voice: F4
|
|
94
|
+
model: supertonic-3
|
|
95
|
+
output_format: wav
|
|
96
|
+
voice_compatible: true
|
|
97
|
+
timeout: 60
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Voices
|
|
101
|
+
|
|
102
|
+
| Voice | Style |
|
|
103
|
+
|-------|-------|
|
|
104
|
+
| M1 | Lively, upbeat male |
|
|
105
|
+
| M2 | Deep, calm male |
|
|
106
|
+
| M3 | Authoritative male |
|
|
107
|
+
| M4 | Soft, friendly male |
|
|
108
|
+
| M5 | Warm, storytelling male |
|
|
109
|
+
| F1 | Calm, composed female |
|
|
110
|
+
| F2 | Bright, cheerful female |
|
|
111
|
+
| F3 | Professional announcer female |
|
|
112
|
+
| **F4** | **Crisp, confident female** |
|
|
113
|
+
| F5 | Gentle, soothing female |
|
|
114
|
+
|
|
115
|
+
## Languages
|
|
116
|
+
|
|
117
|
+
31 languages (supertonic-3): en, ko, ja, ar, bg, cs, da, de, el, es, et, fi, fr, hi, hr, hu, id, it, lt, lv, nl, pl, pt, ro, ru, sk, sl, sv, tr, uk, vi, na (fallback)
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
MIT (code) — supertonic model uses OpenRAIL-M
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# supersonic-tts
|
|
2
|
+
|
|
3
|
+
**Stdin-friendly CLI + Python API wrapper for [Supertonic](https://github.com/supertone-inc/supertonic)** — lightning-fast, on-device, multilingual TTS via ONNX Runtime.
|
|
4
|
+
|
|
5
|
+
No cloud API. No data leaves your machine. 31 languages. 10 voices.
|
|
6
|
+
|
|
7
|
+
## Why supersonic-tts?
|
|
8
|
+
|
|
9
|
+
Supertonic's official CLI takes text as a positional argument — awkward for piped input, shell scripts, and AI agents. `supersonic-tts` wraps it with:
|
|
10
|
+
|
|
11
|
+
- **Stdin support** — pipe text directly: `echo "hello" | supersonic-tts -o out.wav`
|
|
12
|
+
- **Hermes Agent integration** — drop-in command-based TTS provider
|
|
13
|
+
- **Same engine** — uses supertonic under the hood (supertonic-3, ONNX)
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install supersonic-tts
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
First run auto-downloads the model (~305MB) from HuggingFace.
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
### CLI
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
# Read from argument
|
|
29
|
+
supersonic-tts "Hello world" -o output.wav
|
|
30
|
+
|
|
31
|
+
# Read from stdin (piped)
|
|
32
|
+
echo "Hello from stdin" | supersonic-tts -o output.wav
|
|
33
|
+
|
|
34
|
+
# Read from file
|
|
35
|
+
cat long_text.txt | supersonic-tts -o output.wav
|
|
36
|
+
|
|
37
|
+
# Choose voice
|
|
38
|
+
supersonic-tts "Crisp and confident" -o output.wav --voice F4
|
|
39
|
+
|
|
40
|
+
# Multilingual
|
|
41
|
+
supersonic-tts "Bonjour le monde" -o french.wav --lang fr --voice F1
|
|
42
|
+
|
|
43
|
+
# Adjust speed
|
|
44
|
+
supersonic-tts "Fast talk" -o fast.wav --speed 1.5
|
|
45
|
+
|
|
46
|
+
# Higher quality
|
|
47
|
+
supersonic-tts "Premium quality" -o premium.wav --steps 10
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Python API
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from supersonic_tts import SupersonicTTS
|
|
54
|
+
|
|
55
|
+
tts = SupersonicTTS()
|
|
56
|
+
wav, duration = tts.synthesize("Hello world", voice="F4", lang="en")
|
|
57
|
+
tts.save(wav, "output.wav")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Hermes Agent Integration
|
|
61
|
+
|
|
62
|
+
Add to `~/.hermes/config.yaml`:
|
|
63
|
+
|
|
64
|
+
```yaml
|
|
65
|
+
tts:
|
|
66
|
+
provider: supersonic-tts
|
|
67
|
+
supersonic-tts:
|
|
68
|
+
type: command
|
|
69
|
+
command: supersonic-tts -o {output_path} --voice {voice} < {input_path}
|
|
70
|
+
voice: F4
|
|
71
|
+
model: supertonic-3
|
|
72
|
+
output_format: wav
|
|
73
|
+
voice_compatible: true
|
|
74
|
+
timeout: 60
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Voices
|
|
78
|
+
|
|
79
|
+
| Voice | Style |
|
|
80
|
+
|-------|-------|
|
|
81
|
+
| M1 | Lively, upbeat male |
|
|
82
|
+
| M2 | Deep, calm male |
|
|
83
|
+
| M3 | Authoritative male |
|
|
84
|
+
| M4 | Soft, friendly male |
|
|
85
|
+
| M5 | Warm, storytelling male |
|
|
86
|
+
| F1 | Calm, composed female |
|
|
87
|
+
| F2 | Bright, cheerful female |
|
|
88
|
+
| F3 | Professional announcer female |
|
|
89
|
+
| **F4** | **Crisp, confident female** |
|
|
90
|
+
| F5 | Gentle, soothing female |
|
|
91
|
+
|
|
92
|
+
## Languages
|
|
93
|
+
|
|
94
|
+
31 languages (supertonic-3): en, ko, ja, ar, bg, cs, da, de, el, es, et, fi, fr, hi, hr, hu, id, it, lt, lv, nl, pl, pt, ro, ru, sk, sl, sv, tr, uk, vi, na (fallback)
|
|
95
|
+
|
|
96
|
+
## License
|
|
97
|
+
|
|
98
|
+
MIT (code) — supertonic model uses OpenRAIL-M
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "supersonic-tts"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Stdin-friendly CLI + Python API wrapper for Supertonic — lightning-fast, on-device, multilingual TTS"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Jaspreet Singh", email = "jaspreetsinghintp@gmail.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["tts", "text-to-speech", "supertonic", "on-device", "onnx"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
dependencies = [
|
|
30
|
+
"supertonic>=1.0.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/jxsprt/supersonic-tts"
|
|
35
|
+
Repository = "https://github.com/jxsprt/supersonic-tts"
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
supersonic-tts = "supersonic_tts.cli:main"
|
|
39
|
+
|
|
40
|
+
[tool.hatch.build.targets.wheel]
|
|
41
|
+
packages = ["src/supersonic_tts"]
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""CLI entry point for supersonic-tts.
|
|
3
|
+
|
|
4
|
+
Reads text from one of the sources below (precedence: --input, then the positional argument, then stdin):
|
|
5
|
+
1. First positional argument
|
|
6
|
+
2. Stdin (piped text)
|
|
7
|
+
3. A file via --input
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
supersonic-tts "Hello world" -o out.wav
|
|
11
|
+
echo "Hello" | supersonic-tts -o out.wav
|
|
12
|
+
cat file.txt | supersonic-tts -o out.wav --voice F4
|
|
13
|
+
supersonic-tts -o out.wav --input input.txt
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
from typing import Optional
|
|
21
|
+
|
|
22
|
+
from .core import SupersonicTTS
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def resolve_text(
    args_text: Optional[str],
    args_input: Optional[str],
) -> Optional[str]:
    """Resolve the text to synthesize from a file, an argument, or stdin.

    Sources are checked in this order and the first one supplied wins:

    1. ``args_input`` -- path of a UTF-8 text file (the ``--input`` option).
    2. ``args_text`` -- the positional command-line argument.
    3. Standard input, but only when it is not a TTY (i.e. text was piped).

    Args:
        args_text: Text given on the command line, or ``None``.
        args_input: Path of a file to read the text from, or ``None``.

    Returns:
        The text with surrounding whitespace stripped, or ``None`` when the
        selected source is empty or whitespace-only, or when no source was
        supplied at all.

    Raises:
        OSError: If ``args_input`` cannot be opened or read.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    # ``or None`` below turns an empty string into None so that every source
    # honours the same "None means no text" contract.

    # 1. From --input file
    if args_input:
        with open(args_input, "r", encoding="utf-8") as f:
            return f.read().strip() or None

    # 2. From positional argument
    if args_text:
        return args_text.strip() or None

    # 3. From stdin (only if stdin is not a TTY -- i.e. piped)
    if not sys.stdin.isatty():
        try:
            return sys.stdin.read().strip() or None
        except KeyboardInterrupt:
            # The user aborted while we were waiting for piped input.
            return None

    return None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``supersonic-tts`` command.

    The parser accepts an optional positional ``text`` plus the options
    ``-o/--output``, ``--input``, ``--voice``, ``--lang``, ``--steps``,
    ``--speed``, ``--list-voices``, ``--list-langs`` and ``-v/--verbose``.
    ``--voice`` is restricted to the names reported by
    ``SupersonicTTS.list_voices()``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    voice_names = sorted(SupersonicTTS.list_voices().keys())

    # One (flags, keyword-arguments) pair per option, in registration order
    # (the order also determines how the options appear in --help).
    option_specs = (
        (
            ("text",),
            {
                "nargs": "?",
                "default": None,
                "help": "Text to synthesize (omit to read from stdin or --input)",
            },
        ),
        (("-o", "--output"), {"help": "Output WAV file path"}),
        (("--input",), {"help": "Read input text from a file"}),
        (
            ("--voice",),
            {
                "default": "M1",
                "choices": voice_names,
                "help": "Voice style (default: M1)",
            },
        ),
        (
            ("--lang",),
            {
                "default": "en",
                "help": "Language code (default: en). Use --list-langs to see all.",
            },
        ),
        (
            ("--steps",),
            {
                "type": int,
                "default": 5,
                "help": "Synthesis quality steps (default: 5, higher = better)",
            },
        ),
        (
            ("--speed",),
            {
                "type": float,
                "default": 1.05,
                "help": "Speech speed (0.7-2.0, default: 1.05)",
            },
        ),
        (
            ("--list-voices",),
            {
                "action": "store_true",
                "help": "List available voice styles and exit",
            },
        ),
        (
            ("--list-langs",),
            {
                "action": "store_true",
                "help": "List supported language codes and exit",
            },
        ),
        (
            ("-v", "--verbose"),
            {
                "action": "store_true",
                "help": "Show generation timing info",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        prog="supersonic-tts",
        description="Lightning-fast, on-device, multilingual TTS",
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def main() -> None:
    """Run the ``supersonic-tts`` command-line interface.

    Parses ``sys.argv``, handles the informational ``--list-voices`` and
    ``--list-langs`` flags, then resolves the input text, synthesizes it and
    writes the audio to the path given by ``-o/--output``.

    Every failure (missing ``--output``, unreadable ``--input`` file, no input
    text, engine start-up, synthesis or save error) is reported on stderr as a
    single ``ERROR: ...`` line and ends the process with exit status 1.
    """
    parser = build_parser()
    args = parser.parse_args()

    # Handle info commands
    if args.list_voices:
        print("Available voices:")
        for name, desc in SupersonicTTS.list_voices().items():
            print(f"  {name}: {desc}")
        return

    if args.list_langs:
        print("Supported languages:")
        for code in SupersonicTTS.list_languages():
            print(f"  {code}")
        return

    # -o is only mandatory for synthesis, so it is checked here rather than
    # being declared required on the parser (the info commands do not need it).
    if not args.output:
        parser.print_help()
        print("\nERROR: --output/-o is required for synthesis", file=sys.stderr)
        sys.exit(1)

    # Resolve text. A missing/unreadable or non-UTF-8 --input file is reported
    # like every other failure instead of escaping as a traceback.
    try:
        text = resolve_text(args.text, args.input)
    except (OSError, UnicodeDecodeError) as e:
        print(f"ERROR: Failed to read input: {e}", file=sys.stderr)
        sys.exit(1)

    if not text:
        parser.print_help()
        print("\nERROR: No input text provided. Pass text as argument, pipe to stdin, or use --input.", file=sys.stderr)
        sys.exit(1)

    # Creating the engine can itself fail (e.g. the ImportError raised when
    # supertonic is missing), so it gets the same error treatment.
    try:
        tts = SupersonicTTS(verbose=args.verbose)
    except Exception as e:
        print(f"ERROR: Failed to initialize TTS engine: {e}", file=sys.stderr)
        sys.exit(1)

    # Synthesize
    try:
        wav, _duration = tts.synthesize(
            text,
            voice=args.voice,
            lang=args.lang,
            steps=args.steps,
            speed=args.speed,
        )
    except Exception as e:
        print(f"ERROR: Synthesis failed: {e}", file=sys.stderr)
        sys.exit(1)

    # Save
    try:
        tts.save(wav, args.output)
    except Exception as e:
        print(f"ERROR: Failed to save audio: {e}", file=sys.stderr)
        sys.exit(1)

    if args.verbose:
        print(f"Audio saved to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Core Python API for SupersonicTTS."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from typing import Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from supertonic import TTS as _TTS
|
|
12
|
+
except ImportError:
|
|
13
|
+
_TTS = None
|
|
14
|
+
|
|
15
|
+
__version__ = "0.1.0"
|
|
16
|
+
|
|
17
|
+
# Available voice styles
|
|
18
|
+
# Maps each voice name to a short description of its style: five male
# voices (M1-M5) followed by five female voices (F1-F5).
VOICES = dict(
    M1="Lively, upbeat male",
    M2="Deep, calm male",
    M3="Authoritative male",
    M4="Soft, friendly male",
    M5="Warm, storytelling male",
    F1="Calm, composed female",
    F2="Bright, cheerful female",
    F3="Professional announcer female",
    F4="Crisp, confident female",
    F5="Gentle, soothing female",
)
|
|
30
|
+
|
|
31
|
+
# Available languages
|
|
32
|
+
# Language codes accepted by the wrapper, in display order; built from
# whitespace-separated rows of ten codes each.
LANGUAGES = (
    "en ko ja ar bg cs da de el es "
    "et fi fr hi hr hu id it lt lv "
    "nl pl pt ro ru sk sl sv tr uk "
    "vi na"
).split()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SupersonicTTS:
    """High-level Python wrapper for Supertonic TTS.

    Usage:
        tts = SupersonicTTS()
        wav, duration = tts.synthesize("Hello world", voice="F4", lang="en")
        tts.save(wav, "output.wav")
    """

    def __init__(self, auto_download: bool = True, verbose: bool = False):
        """Create the underlying Supertonic engine.

        Args:
            auto_download: Passed through unchanged to ``supertonic.TTS``.
            verbose: When true, ``synthesize`` reports timing info on stderr.

        Raises:
            ImportError: If the ``supertonic`` package could not be imported.
        """
        if _TTS is None:
            raise ImportError(
                "supertonic is not installed. Run: pip install supertonic"
            )
        self._tts = _TTS(auto_download=auto_download)
        self._verbose = verbose

    def synthesize(
        self,
        text: str,
        voice: str = "M1",
        lang: str = "en",
        steps: int = 5,
        speed: float = 1.05,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Synthesize speech from text.

        Args:
            text: Text to synthesize.
            voice: Voice style name (M1-M5, F1-F5). Default: M1.
            lang: Language code. Default: en.
            steps: Synthesis quality steps. Default: 5.
            speed: Speech speed multiplier (0.7-2.0). Default: 1.05.

        Returns:
            Tuple of (waveform, duration) where waveform has shape (1, n_samples).
            Both values are returned exactly as produced by the engine.
        """
        style = self._tts.get_voice_style(voice_name=voice)

        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        wav, duration = self._tts.synthesize(
            text,
            voice_style=style,
            lang=lang,
            total_steps=steps,
            speed=speed,
        )
        elapsed = time.perf_counter() - started

        if self._verbose:
            # Imported here because ``sys`` is not among this module's
            # top-level imports; diagnostics go to stderr, not stdout.
            import sys

            print(
                f"Generated {float(duration[0]):.2f}s of audio "
                f"in {elapsed:.2f}s (voice={voice})",
                file=sys.stderr,
            )

        return wav, duration

    def save(self, waveform: np.ndarray, path: str) -> None:
        """Save waveform to a WAV file.

        Args:
            waveform: Audio array from synthesize().
            path: Output file path (should end in .wav).
        """
        self._tts.save_audio(waveform, path)

    @staticmethod
    def list_voices() -> dict:
        """Return a fresh dict of {voice_name: description}."""
        return dict(VOICES)

    @staticmethod
    def list_languages() -> list:
        """Return a fresh list of supported language codes."""
        return list(LANGUAGES)

    def __repr__(self) -> str:
        return f"SupersonicTTS(voices={len(VOICES)}, languages={len(LANGUAGES)})"
|