fastkokoro 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastkokoro-0.2.0/PKG-INFO +256 -0
- fastkokoro-0.2.0/README.md +235 -0
- fastkokoro-0.2.0/pyproject.toml +50 -0
- fastkokoro-0.2.0/src/fastkokoro/__init__.py +3 -0
- fastkokoro-0.2.0/src/fastkokoro/assets.py +78 -0
- fastkokoro-0.2.0/src/fastkokoro/audio.py +44 -0
- fastkokoro-0.2.0/src/fastkokoro/cli.py +16 -0
- fastkokoro-0.2.0/src/fastkokoro/config.py +89 -0
- fastkokoro-0.2.0/src/fastkokoro/engine.py +99 -0
- fastkokoro-0.2.0/src/fastkokoro/json.py +13 -0
- fastkokoro-0.2.0/src/fastkokoro/onnx.py +44 -0
- fastkokoro-0.2.0/src/fastkokoro/openai.py +30 -0
- fastkokoro-0.2.0/src/fastkokoro/py.typed +0 -0
- fastkokoro-0.2.0/src/fastkokoro/server.py +101 -0
- fastkokoro-0.2.0/src/fastkokoro/voices.py +154 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: fastkokoro
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Lightweight OpenAI-compatible Kokoro TTS server powered by ONNX Runtime
|
|
5
|
+
Author: Vilson Rodrigues
|
|
6
|
+
Author-email: Vilson Rodrigues <vilson@msgflux.com>
|
|
7
|
+
Requires-Dist: fastapi>=0.115.0
|
|
8
|
+
Requires-Dist: huggingface-hub>=0.36.0
|
|
9
|
+
Requires-Dist: kokoro-onnx>=0.5.0
|
|
10
|
+
Requires-Dist: numpy>=2.0.0
|
|
11
|
+
Requires-Dist: onnxruntime>=1.20.1
|
|
12
|
+
Requires-Dist: orjson>=3.10.0
|
|
13
|
+
Requires-Dist: pydantic>=2.0.0
|
|
14
|
+
Requires-Dist: soundfile>=0.13.0
|
|
15
|
+
Requires-Dist: uvicorn>=0.32.0
|
|
16
|
+
Requires-Dist: uvloop>=0.21.0 ; sys_platform != 'win32'
|
|
17
|
+
Requires-Dist: onnxruntime-gpu>=1.20.0 ; platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'gpu'
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Provides-Extra: gpu
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# fastkokoro
|
|
23
|
+
|
|
24
|
+
Lightweight OpenAI-compatible Kokoro TTS server powered by ONNX Runtime.
|
|
25
|
+
|
|
26
|
+
`fastkokoro` runs the 82M-parameter Kokoro text-to-speech model with low startup
|
|
27
|
+
overhead, fast local inference, and a small dependency footprint. It supports CPU
|
|
28
|
+
and GPU execution through ONNX Runtime providers, including CUDA, TensorRT, and
|
|
29
|
+
OpenVINO when the matching runtime package is installed. The default model is
|
|
30
|
+
NVIDIA's optimized ONNX export: `nvidia/kokoro-82M-onnx-opt`.
|
|
31
|
+
|
|
32
|
+
The NVIDIA repo's `voices.bin` uses a raw float32 layout. `fastkokoro` converts it
|
|
33
|
+
once into the `.npz` voice format expected by `kokoro-onnx`, so the default model
|
|
34
|
+
and voices both come from `nvidia/kokoro-82M-onnx-opt`.
|
|
35
|
+
|
|
36
|
+
## Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
uv sync
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
From PyPI:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install fastkokoro
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
For GPU builds on platforms supported by `onnxruntime-gpu`:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
uv sync --extra gpu
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Run
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
uv run fastkokoro
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
The server listens on all interfaces (`0.0.0.0`) on port `8880` by default; from the same machine, use `http://localhost:8880`.
|
|
61
|
+
|
|
62
|
+
Docker CPU:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
docker build -f Dockerfile.cpu -t fastkokoro:cpu .
|
|
66
|
+
docker run -p 8880:8880 fastkokoro:cpu
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Docker Hub CPU:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
docker run -p 8880:8880 msgflux/fastkokoro:cpu
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Docker GPU:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
docker build -f Dockerfile.gpu -t fastkokoro:gpu .
|
|
79
|
+
docker run --gpus all -p 8880:8880 fastkokoro:gpu
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Docker Hub GPU:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
docker run --gpus all -p 8880:8880 msgflux/fastkokoro:gpu
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Environment variables:
|
|
89
|
+
|
|
90
|
+
| Variable | Default |
|
|
91
|
+
| --- | --- |
|
|
92
|
+
| `FASTKOKORO_HOST` | `0.0.0.0` |
|
|
93
|
+
| `FASTKOKORO_PORT` | `8880` |
|
|
94
|
+
| `FASTKOKORO_MODEL_REPO` | `nvidia/kokoro-82M-onnx-opt` |
|
|
95
|
+
| `FASTKOKORO_MODEL_FILE` | `kokoro-82m-v1.0.onnx` |
|
|
96
|
+
| `FASTKOKORO_MODEL_PATH` | unset; downloads from Hugging Face |
|
|
97
|
+
| `FASTKOKORO_VOICES_FILE` | `voices.bin` |
|
|
98
|
+
| `FASTKOKORO_VOICES_INDEX_FILE` | `voices.txt` |
|
|
99
|
+
| `FASTKOKORO_VOICES_PATH` | unset; downloads and converts NVIDIA voices |
|
|
100
|
+
| `FASTKOKORO_DEFAULT_VOICE` | `af_heart` |
|
|
101
|
+
| `FASTKOKORO_DEFAULT_LANG` | `en-us` |
|
|
102
|
+
| `FASTKOKORO_WARMUP` | `true` |
|
|
103
|
+
| `FASTKOKORO_WARMUP_TEXT` | `hello` |
|
|
104
|
+
| `FASTKOKORO_ONNX_PROVIDERS` | `CPUExecutionProvider` |
|
|
105
|
+
| `FASTKOKORO_ONNX_AUTO_PROVIDERS` | `false` |
|
|
106
|
+
| `FASTKOKORO_ONNX_INTRA_OP_NUM_THREADS` | unset |
|
|
107
|
+
| `FASTKOKORO_ONNX_INTER_OP_NUM_THREADS` | unset |
|
|
108
|
+
|
|
109
|
+
`FASTKOKORO_WARMUP=true` runs a short synthesis during startup. This makes the
|
|
110
|
+
server take a little longer to become ready, but avoids paying most of the first
|
|
111
|
+
request latency on the first user request.
|
|
112
|
+
|
|
113
|
+
## ONNX Runtime Providers
|
|
114
|
+
|
|
115
|
+
`fastkokoro` creates the ONNX Runtime session directly, so provider selection is
|
|
116
|
+
explicit and predictable.
|
|
117
|
+
|
|
118
|
+
CPU:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
FASTKOKORO_ONNX_PROVIDERS=CPUExecutionProvider uv run fastkokoro
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
CUDA with CPU fallback:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
FASTKOKORO_ONNX_PROVIDERS=CUDAExecutionProvider,CPUExecutionProvider uv run fastkokoro
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
TensorRT with CUDA and CPU fallback:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
FASTKOKORO_ONNX_PROVIDERS=TensorrtExecutionProvider,CUDAExecutionProvider,CPUExecutionProvider uv run fastkokoro
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Intel/OpenVINO builds can use:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
FASTKOKORO_ONNX_PROVIDERS=OpenVINOExecutionProvider,CPUExecutionProvider uv run fastkokoro
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Set `FASTKOKORO_ONNX_AUTO_PROVIDERS=true` to pass every provider available in the
|
|
143
|
+
installed ONNX Runtime build to the session. Use this mostly for quick local
|
|
144
|
+
experiments; production deployments should pin an explicit provider order.
|
|
145
|
+
|
|
146
|
+
## API
|
|
147
|
+
|
|
148
|
+
Health:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
curl http://localhost:8880/health
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Models:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
curl http://localhost:8880/v1/models
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
The server exposes the local Kokoro model as `kokoro`. For client compatibility,
|
|
161
|
+
`/v1/audio/speech` also accepts `tts-1` and `gpt-4o-mini-tts` as aliases, but
|
|
162
|
+
they are not listed by `/v1/models` because the server is not running OpenAI TTS
|
|
163
|
+
models.
|
|
164
|
+
|
|
165
|
+
## Voices and Languages
|
|
166
|
+
|
|
167
|
+
The official Kokoro voice list maps voices to language codes. `fastkokoro`
|
|
168
|
+
accepts the Kokoro language code and common locale aliases, then validates that
|
|
169
|
+
the requested voice belongs to the resolved language.
|
|
170
|
+
|
|
171
|
+
| Language | Request `lang` values | Voices |
|
|
172
|
+
| --- | --- | --- |
|
|
173
|
+
| American English | `a`, `en-us`, `american` | `af_heart`, `af_alloy`, `af_aoede`, `af_bella`, `af_jessica`, `af_kore`, `af_nicole`, `af_nova`, `af_river`, `af_sarah`, `af_sky`, `am_adam`, `am_echo`, `am_eric`, `am_fenrir`, `am_liam`, `am_michael`, `am_onyx`, `am_puck`, `am_santa` |
|
|
174
|
+
| British English | `b`, `en-gb`, `british` | `bf_alice`, `bf_emma`, `bf_isabella`, `bf_lily`, `bm_daniel`, `bm_fable`, `bm_george`, `bm_lewis` |
|
|
175
|
+
| Japanese | `j`, `ja`, `ja-jp` | `jf_alpha`, `jf_gongitsune`, `jf_nezumi`, `jf_tebukuro`, `jm_kumo` |
|
|
176
|
+
| Mandarin Chinese | `z`, `zh`, `zh-cn`, `mandarin` | `zf_xiaobei`, `zf_xiaoni`, `zf_xiaoxiao`, `zf_xiaoyi`, `zm_yunjian`, `zm_yunxi`, `zm_yunxia`, `zm_yunyang` |
|
|
177
|
+
| Spanish | `e`, `es`, `es-es` | `ef_dora`, `em_alex`, `em_santa` |
|
|
178
|
+
| French | `f`, `fr`, `fr-fr` | `ff_siwis` |
|
|
179
|
+
| Hindi | `h`, `hi`, `hi-in` | `hf_alpha`, `hf_beta`, `hm_omega`, `hm_psi` |
|
|
180
|
+
| Italian | `i`, `it`, `it-it` | `if_sara`, `im_nicola` |
|
|
181
|
+
| Brazilian Portuguese | `p`, `pt`, `pt-br` | `pf_dora`, `pm_alex`, `pm_santa` |
|
|
182
|
+
|
|
183
|
+
Speech:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
curl http://localhost:8880/v1/audio/speech \
|
|
187
|
+
-H 'Content-Type: application/json' \
|
|
188
|
+
-d '{
|
|
189
|
+
"model": "kokoro",
|
|
190
|
+
"input": "Hello from fastkokoro.",
|
|
191
|
+
"voice": "af_heart",
|
|
192
|
+
"response_format": "wav"
|
|
193
|
+
}' \
|
|
194
|
+
--output speech.wav
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Streaming PCM:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
curl http://localhost:8880/v1/audio/speech \
|
|
201
|
+
-H 'Content-Type: application/json' \
|
|
202
|
+
-d '{
|
|
203
|
+
"model": "kokoro",
|
|
204
|
+
"input": "Streaming from fastkokoro.",
|
|
205
|
+
"voice": "af_heart",
|
|
206
|
+
"response_format": "pcm",
|
|
207
|
+
"stream": true
|
|
208
|
+
}' \
|
|
209
|
+
--output speech.pcm
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## OpenAI SDK Examples
|
|
213
|
+
|
|
214
|
+
The examples use inline script dependencies, so they can run directly with `uv`
|
|
215
|
+
without adding the OpenAI SDK to the project environment.
|
|
216
|
+
|
|
217
|
+
Start `fastkokoro` first:
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
uv run fastkokoro
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Save synthesized audio to a file:
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
uv run examples/tts_save_file.py
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Consume streamed audio chunks:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
uv run examples/tts_stream_chunks.py
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Useful environment variables:
|
|
236
|
+
|
|
237
|
+
| Variable | Default |
|
|
238
|
+
| --- | --- |
|
|
239
|
+
| `FASTKOKORO_BASE_URL` | `http://localhost:8880/v1` |
|
|
240
|
+
| `FASTKOKORO_API_KEY` | `fastkokoro` |
|
|
241
|
+
| `FASTKOKORO_VOICE` | `pf_dora` |
|
|
242
|
+
| `FASTKOKORO_TEXT` | `Ola, tudo bem?` |
|
|
243
|
+
| `FASTKOKORO_TTS_OUTPUT` | `speech.wav` |
|
|
244
|
+
|
|
245
|
+
## Python
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from fastkokoro import FastKokoro
|
|
249
|
+
|
|
250
|
+
engine = FastKokoro()
|
|
251
|
+
audio = engine.create(
|
|
252
|
+
"Hello from fastkokoro.",
|
|
253
|
+
voice="af_heart",
|
|
254
|
+
response_format="wav",
|
|
255
|
+
)
|
|
256
|
+
```
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# fastkokoro
|
|
2
|
+
|
|
3
|
+
Lightweight OpenAI-compatible Kokoro TTS server powered by ONNX Runtime.
|
|
4
|
+
|
|
5
|
+
`fastkokoro` runs the 82M-parameter Kokoro text-to-speech model with low startup
|
|
6
|
+
overhead, fast local inference, and a small dependency footprint. It supports CPU
|
|
7
|
+
and GPU execution through ONNX Runtime providers, including CUDA, TensorRT, and
|
|
8
|
+
OpenVINO when the matching runtime package is installed. The default model is
|
|
9
|
+
NVIDIA's optimized ONNX export: `nvidia/kokoro-82M-onnx-opt`.
|
|
10
|
+
|
|
11
|
+
The NVIDIA repo's `voices.bin` uses a raw float32 layout. `fastkokoro` converts it
|
|
12
|
+
once into the `.npz` voice format expected by `kokoro-onnx`, so the default model
|
|
13
|
+
and voices both come from `nvidia/kokoro-82M-onnx-opt`.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
uv sync
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
From PyPI:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install fastkokoro
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
For GPU builds on platforms supported by `onnxruntime-gpu`:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv sync --extra gpu
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Run
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
uv run fastkokoro
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
The server listens on all interfaces (`0.0.0.0`) on port `8880` by default; from the same machine, use `http://localhost:8880`.
|
|
40
|
+
|
|
41
|
+
Docker CPU:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
docker build -f Dockerfile.cpu -t fastkokoro:cpu .
|
|
45
|
+
docker run -p 8880:8880 fastkokoro:cpu
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Docker Hub CPU:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
docker run -p 8880:8880 msgflux/fastkokoro:cpu
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Docker GPU:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
docker build -f Dockerfile.gpu -t fastkokoro:gpu .
|
|
58
|
+
docker run --gpus all -p 8880:8880 fastkokoro:gpu
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Docker Hub GPU:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
docker run --gpus all -p 8880:8880 msgflux/fastkokoro:gpu
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Environment variables:
|
|
68
|
+
|
|
69
|
+
| Variable | Default |
|
|
70
|
+
| --- | --- |
|
|
71
|
+
| `FASTKOKORO_HOST` | `0.0.0.0` |
|
|
72
|
+
| `FASTKOKORO_PORT` | `8880` |
|
|
73
|
+
| `FASTKOKORO_MODEL_REPO` | `nvidia/kokoro-82M-onnx-opt` |
|
|
74
|
+
| `FASTKOKORO_MODEL_FILE` | `kokoro-82m-v1.0.onnx` |
|
|
75
|
+
| `FASTKOKORO_MODEL_PATH` | unset; downloads from Hugging Face |
|
|
76
|
+
| `FASTKOKORO_VOICES_FILE` | `voices.bin` |
|
|
77
|
+
| `FASTKOKORO_VOICES_INDEX_FILE` | `voices.txt` |
|
|
78
|
+
| `FASTKOKORO_VOICES_PATH` | unset; downloads and converts NVIDIA voices |
|
|
79
|
+
| `FASTKOKORO_DEFAULT_VOICE` | `af_heart` |
|
|
80
|
+
| `FASTKOKORO_DEFAULT_LANG` | `en-us` |
|
|
81
|
+
| `FASTKOKORO_WARMUP` | `true` |
|
|
82
|
+
| `FASTKOKORO_WARMUP_TEXT` | `hello` |
|
|
83
|
+
| `FASTKOKORO_ONNX_PROVIDERS` | `CPUExecutionProvider` |
|
|
84
|
+
| `FASTKOKORO_ONNX_AUTO_PROVIDERS` | `false` |
|
|
85
|
+
| `FASTKOKORO_ONNX_INTRA_OP_NUM_THREADS` | unset |
|
|
86
|
+
| `FASTKOKORO_ONNX_INTER_OP_NUM_THREADS` | unset |
|
|
87
|
+
|
|
88
|
+
`FASTKOKORO_WARMUP=true` runs a short synthesis during startup. This makes the
|
|
89
|
+
server take a little longer to become ready, but avoids paying most of the first
|
|
90
|
+
request latency on the first user request.
|
|
91
|
+
|
|
92
|
+
## ONNX Runtime Providers
|
|
93
|
+
|
|
94
|
+
`fastkokoro` creates the ONNX Runtime session directly, so provider selection is
|
|
95
|
+
explicit and predictable.
|
|
96
|
+
|
|
97
|
+
CPU:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
FASTKOKORO_ONNX_PROVIDERS=CPUExecutionProvider uv run fastkokoro
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
CUDA with CPU fallback:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
FASTKOKORO_ONNX_PROVIDERS=CUDAExecutionProvider,CPUExecutionProvider uv run fastkokoro
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
TensorRT with CUDA and CPU fallback:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
FASTKOKORO_ONNX_PROVIDERS=TensorrtExecutionProvider,CUDAExecutionProvider,CPUExecutionProvider uv run fastkokoro
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Intel/OpenVINO builds can use:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
FASTKOKORO_ONNX_PROVIDERS=OpenVINOExecutionProvider,CPUExecutionProvider uv run fastkokoro
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Set `FASTKOKORO_ONNX_AUTO_PROVIDERS=true` to pass every provider available in the
|
|
122
|
+
installed ONNX Runtime build to the session. Use this mostly for quick local
|
|
123
|
+
experiments; production deployments should pin an explicit provider order.
|
|
124
|
+
|
|
125
|
+
## API
|
|
126
|
+
|
|
127
|
+
Health:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
curl http://localhost:8880/health
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Models:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
curl http://localhost:8880/v1/models
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
The server exposes the local Kokoro model as `kokoro`. For client compatibility,
|
|
140
|
+
`/v1/audio/speech` also accepts `tts-1` and `gpt-4o-mini-tts` as aliases, but
|
|
141
|
+
they are not listed by `/v1/models` because the server is not running OpenAI TTS
|
|
142
|
+
models.
|
|
143
|
+
|
|
144
|
+
## Voices and Languages
|
|
145
|
+
|
|
146
|
+
The official Kokoro voice list maps voices to language codes. `fastkokoro`
|
|
147
|
+
accepts the Kokoro language code and common locale aliases, then validates that
|
|
148
|
+
the requested voice belongs to the resolved language.
|
|
149
|
+
|
|
150
|
+
| Language | Request `lang` values | Voices |
|
|
151
|
+
| --- | --- | --- |
|
|
152
|
+
| American English | `a`, `en-us`, `american` | `af_heart`, `af_alloy`, `af_aoede`, `af_bella`, `af_jessica`, `af_kore`, `af_nicole`, `af_nova`, `af_river`, `af_sarah`, `af_sky`, `am_adam`, `am_echo`, `am_eric`, `am_fenrir`, `am_liam`, `am_michael`, `am_onyx`, `am_puck`, `am_santa` |
|
|
153
|
+
| British English | `b`, `en-gb`, `british` | `bf_alice`, `bf_emma`, `bf_isabella`, `bf_lily`, `bm_daniel`, `bm_fable`, `bm_george`, `bm_lewis` |
|
|
154
|
+
| Japanese | `j`, `ja`, `ja-jp` | `jf_alpha`, `jf_gongitsune`, `jf_nezumi`, `jf_tebukuro`, `jm_kumo` |
|
|
155
|
+
| Mandarin Chinese | `z`, `zh`, `zh-cn`, `mandarin` | `zf_xiaobei`, `zf_xiaoni`, `zf_xiaoxiao`, `zf_xiaoyi`, `zm_yunjian`, `zm_yunxi`, `zm_yunxia`, `zm_yunyang` |
|
|
156
|
+
| Spanish | `e`, `es`, `es-es` | `ef_dora`, `em_alex`, `em_santa` |
|
|
157
|
+
| French | `f`, `fr`, `fr-fr` | `ff_siwis` |
|
|
158
|
+
| Hindi | `h`, `hi`, `hi-in` | `hf_alpha`, `hf_beta`, `hm_omega`, `hm_psi` |
|
|
159
|
+
| Italian | `i`, `it`, `it-it` | `if_sara`, `im_nicola` |
|
|
160
|
+
| Brazilian Portuguese | `p`, `pt`, `pt-br` | `pf_dora`, `pm_alex`, `pm_santa` |
|
|
161
|
+
|
|
162
|
+
Speech:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
curl http://localhost:8880/v1/audio/speech \
|
|
166
|
+
-H 'Content-Type: application/json' \
|
|
167
|
+
-d '{
|
|
168
|
+
"model": "kokoro",
|
|
169
|
+
"input": "Hello from fastkokoro.",
|
|
170
|
+
"voice": "af_heart",
|
|
171
|
+
"response_format": "wav"
|
|
172
|
+
}' \
|
|
173
|
+
--output speech.wav
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Streaming PCM:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
curl http://localhost:8880/v1/audio/speech \
|
|
180
|
+
-H 'Content-Type: application/json' \
|
|
181
|
+
-d '{
|
|
182
|
+
"model": "kokoro",
|
|
183
|
+
"input": "Streaming from fastkokoro.",
|
|
184
|
+
"voice": "af_heart",
|
|
185
|
+
"response_format": "pcm",
|
|
186
|
+
"stream": true
|
|
187
|
+
}' \
|
|
188
|
+
--output speech.pcm
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## OpenAI SDK Examples
|
|
192
|
+
|
|
193
|
+
The examples use inline script dependencies, so they can run directly with `uv`
|
|
194
|
+
without adding the OpenAI SDK to the project environment.
|
|
195
|
+
|
|
196
|
+
Start `fastkokoro` first:
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
uv run fastkokoro
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Save synthesized audio to a file:
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
uv run examples/tts_save_file.py
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Consume streamed audio chunks:
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
uv run examples/tts_stream_chunks.py
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
Useful environment variables:
|
|
215
|
+
|
|
216
|
+
| Variable | Default |
|
|
217
|
+
| --- | --- |
|
|
218
|
+
| `FASTKOKORO_BASE_URL` | `http://localhost:8880/v1` |
|
|
219
|
+
| `FASTKOKORO_API_KEY` | `fastkokoro` |
|
|
220
|
+
| `FASTKOKORO_VOICE` | `pf_dora` |
|
|
221
|
+
| `FASTKOKORO_TEXT` | `Ola, tudo bem?` |
|
|
222
|
+
| `FASTKOKORO_TTS_OUTPUT` | `speech.wav` |
|
|
223
|
+
|
|
224
|
+
## Python
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
from fastkokoro import FastKokoro
|
|
228
|
+
|
|
229
|
+
engine = FastKokoro()
|
|
230
|
+
audio = engine.create(
|
|
231
|
+
"Hello from fastkokoro.",
|
|
232
|
+
voice="af_heart",
|
|
233
|
+
response_format="wav",
|
|
234
|
+
)
|
|
235
|
+
```
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fastkokoro"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Lightweight OpenAI-compatible Kokoro TTS server powered by ONNX Runtime"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Vilson Rodrigues", email = "vilson@msgflux.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"fastapi>=0.115.0",
|
|
12
|
+
"huggingface-hub>=0.36.0",
|
|
13
|
+
"kokoro-onnx>=0.5.0",
|
|
14
|
+
"numpy>=2.0.0",
|
|
15
|
+
"onnxruntime>=1.20.1",
|
|
16
|
+
"orjson>=3.10.0",
|
|
17
|
+
"pydantic>=2.0.0",
|
|
18
|
+
"soundfile>=0.13.0",
|
|
19
|
+
"uvicorn>=0.32.0",
|
|
20
|
+
"uvloop>=0.21.0 ; sys_platform != 'win32'",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
gpu = [
|
|
25
|
+
"onnxruntime-gpu>=1.20.0; platform_machine == 'x86_64' and sys_platform != 'darwin'",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
fastkokoro = "fastkokoro.cli:main"
|
|
30
|
+
|
|
31
|
+
[dependency-groups]
|
|
32
|
+
dev = [
|
|
33
|
+
"httpx>=0.28.0",
|
|
34
|
+
"packaging>=24.0",
|
|
35
|
+
"pytest>=8.0.0",
|
|
36
|
+
"pytest-asyncio>=1.0.0",
|
|
37
|
+
"ruff>=0.12.0",
|
|
38
|
+
"twine>=6.0.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[tool.ruff]
|
|
42
|
+
line-length = 88
|
|
43
|
+
target-version = "py312"
|
|
44
|
+
|
|
45
|
+
[tool.ruff.lint]
|
|
46
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
47
|
+
|
|
48
|
+
[build-system]
|
|
49
|
+
requires = ["uv_build>=0.11.14,<0.12.0"]
|
|
50
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from huggingface_hub import hf_hub_download
|
|
7
|
+
|
|
8
|
+
from fastkokoro.config import Settings
|
|
9
|
+
|
|
10
|
+
VOICE_STYLE_SHAPE = (510, 1, 256)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def resolve_model_path(settings: Settings) -> Path:
    """Return the filesystem path of the ONNX model file.

    An explicitly configured ``settings.model_path`` is returned unchanged.
    Otherwise the file named ``settings.model_file`` is requested from the
    ``settings.model_repo`` repository through ``hf_hub_download``, with
    ``settings.cache_dir`` passed as the download cache directory.
    """
    explicit_path = settings.model_path
    if explicit_path is None:
        downloaded = hf_hub_download(
            repo_id=settings.model_repo,
            filename=settings.model_file,
            cache_dir=settings.cache_dir,
        )
        return Path(downloaded)
    return explicit_path
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def resolve_voices_path(settings: Settings) -> Path:
    """Return the path of the voices archive to load.

    An explicitly configured ``settings.voices_path`` is returned unchanged.
    Otherwise the raw voices file and its index file are downloaded from
    ``settings.model_repo`` (raw file first, then the index) and handed to
    ``convert_raw_voices_to_npz`` together with ``settings.cache_dir``.
    """
    explicit_path = settings.voices_path
    if explicit_path is not None:
        return explicit_path

    def fetch(filename: str) -> Path:
        # Both assets come from the same repository and share one cache dir.
        return Path(
            hf_hub_download(
                repo_id=settings.model_repo,
                filename=filename,
                cache_dir=settings.cache_dir,
            )
        )

    raw_voices = fetch(settings.voices_file)
    voice_index = fetch(settings.voices_index_file)
    return convert_raw_voices_to_npz(raw_voices, voice_index, settings.cache_dir)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def convert_raw_voices_to_npz(
    voices_bin: Path, voices_index: Path, cache_dir: Path
) -> Path:
    """Convert a raw float32 voices file into a cached ``.npz`` archive.

    The archive is stored as ``voices-kokoro-onnx.npz`` inside ``cache_dir``.
    If that file already exists it is returned immediately and neither input
    file is read.

    Args:
        voices_bin: File read as a flat array of float32 values.
        voices_index: Text file listing the voice names, parsed with
            ``parse_voice_names``; the order of names defines which slice of
            the raw data belongs to which voice.
        cache_dir: Directory that holds the converted archive; created if
            missing.

    Returns:
        Path of the ``.npz`` archive, one array of shape
        ``VOICE_STYLE_SHAPE`` per voice name.

    Raises:
        ValueError: If the number of float32 values in ``voices_bin`` is not
            ``len(names) * prod(VOICE_STYLE_SHAPE)``.
    """
    import tempfile

    destination = cache_dir / "voices-kokoro-onnx.npz"
    if destination.exists():
        return destination

    names = parse_voice_names(voices_index)
    raw = np.fromfile(voices_bin, dtype=np.float32)
    expected_values = len(names) * int(np.prod(VOICE_STYLE_SHAPE))
    if raw.size != expected_values:
        raise ValueError(
            "Unexpected voices.bin shape: "
            f"got {raw.size} float32 values for {len(names)} voices, "
            f"expected {expected_values}."
        )

    styles = raw.reshape((len(names), *VOICE_STYLE_SHAPE))
    voices = {name: styles[index] for index, name in enumerate(names)}
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Write to a uniquely named temporary file in the same directory and
    # rename it into place. Writing directly to ``destination`` would leave a
    # truncated archive behind if the process died mid-write, and the
    # ``exists()`` shortcut above would then keep returning that broken file.
    handle = tempfile.NamedTemporaryFile(
        dir=destination.parent,
        prefix=destination.name + ".",
        suffix=".tmp",
        delete=False,
    )
    partial = Path(handle.name)
    try:
        with handle:
            np.savez(handle, **voices)
        partial.replace(destination)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
    return destination
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse_voice_names(path: Path) -> list[str]:
    """Read voice names from a ``key=name`` index file, in file order.

    Blank lines are skipped. For every other line, the text after the first
    ``=`` is taken as the voice name, with surrounding whitespace removed so
    that ``0 = af_heart`` and ``0=af_heart`` give the same name.

    Args:
        path: UTF-8 text file with one ``key=name`` entry per line.

    Returns:
        The voice names in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line has no ``=`` or an empty name.
    """
    names: list[str] = []
    lines = path.read_text(encoding="utf-8").splitlines()
    for line_number, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            continue
        _, separator, name = line.partition("=")
        name = name.strip()
        if not separator or not name:
            # Fail with the offending location instead of an opaque
            # "not enough values to unpack" error or a silent empty name.
            raise ValueError(
                f"Malformed voice index entry at {path}:{line_number}: "
                f"{raw_line!r}"
            )
        names.append(name)
    return names
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import soundfile as sf
|
|
8
|
+
|
|
9
|
+
AudioFormat = Literal["pcm", "wav", "mp3", "opus", "flac"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def encode_audio(
    samples: np.ndarray, sample_rate: int, audio_format: AudioFormat
) -> bytes:
    """Encode audio samples into the requested output format.

    For ``"pcm"`` the samples are clipped to ``[-1.0, 1.0]``, scaled by
    32767 and returned as headerless little-endian signed 16-bit integers.
    Every other format is written with ``soundfile`` into an in-memory
    buffer whose contents are returned.

    Args:
        samples: Audio samples to encode.
        sample_rate: Sample rate passed to ``soundfile`` (unused for PCM).
        audio_format: One of the ``AudioFormat`` literals.

    Returns:
        The encoded audio bytes.
    """
    if audio_format == "pcm":
        bounded = np.clip(samples, -1.0, 1.0)
        return (bounded * 32767).astype("<i2").tobytes()

    # (container, subtype) handed to soundfile for each named format; any
    # other value falls back to its upper-cased name with no subtype.
    known_targets = {
        "wav": ("WAV", "PCM_16"),
        "mp3": ("MP3", None),
        "opus": ("OGG", "OPUS"),
        "flac": ("FLAC", None),
    }
    container, subtype = known_targets.get(
        audio_format, (audio_format.upper(), None)
    )

    with BytesIO() as sink:
        sf.write(sink, samples, sample_rate, format=container, subtype=subtype)
        return sink.getvalue()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def media_type(audio_format: AudioFormat) -> str:
    """Return the MIME type string for ``audio_format``.

    Raises:
        KeyError: If ``audio_format`` is not one of the known formats.
    """
    mime_by_format = dict(
        pcm="audio/pcm",
        wav="audio/wav",
        mp3="audio/mpeg",
        opus="audio/ogg",
        flac="audio/flac",
    )
    return mime_by_format[audio_format]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import uvicorn
|
|
4
|
+
|
|
5
|
+
from fastkokoro.config import Settings
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main() -> None:
    """Console entry point: run the fastkokoro server under uvicorn.

    Host and port are taken from ``Settings.from_env()``; the application is
    referenced by its import string ``fastkokoro.server:app``.
    """
    env_settings = Settings.from_env()
    server_options = {
        "host": env_settings.host,
        "port": env_settings.port,
        "loop": "auto",
        "reload": False,
    }
    uvicorn.run("fastkokoro.server:app", **server_options)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
DEFAULT_MODEL_REPO = "nvidia/kokoro-82M-onnx-opt"
|
|
8
|
+
DEFAULT_MODEL_FILE = "kokoro-82m-v1.0.onnx"
|
|
9
|
+
DEFAULT_VOICES_FILE = "voices.bin"
|
|
10
|
+
DEFAULT_VOICES_INDEX_FILE = "voices.txt"
|
|
11
|
+
DEFAULT_VOICE = "af_heart"
|
|
12
|
+
DEFAULT_LANG = "en-us"
|
|
13
|
+
DEFAULT_HOST = "0.0.0.0"
|
|
14
|
+
DEFAULT_PORT = 8880
|
|
15
|
+
DEFAULT_ONNX_PROVIDERS = ("CPUExecutionProvider",)
|
|
16
|
+
DEFAULT_WARMUP_TEXT = "hello"
|
|
17
|
+
SAMPLE_RATE = 24000
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration for fastkokoro.

    Instances are normally built with ``Settings.from_env()``, which reads
    the ``FASTKOKORO_*`` environment variables and falls back to the
    module-level defaults.
    """

    # Repository and file names used when assets have to be downloaded.
    model_repo: str
    model_file: str
    voices_file: str
    voices_index_file: str
    # Explicit local paths; ``None`` when the environment variable is unset
    # or empty.
    model_path: Path | None
    voices_path: Path | None
    # Directory for downloads and converted assets.
    cache_dir: Path
    # Voice and language applied when a request does not specify them.
    default_voice: str
    default_lang: str
    # Server bind address.
    host: str
    port: int
    # ONNX Runtime options.
    onnx_providers: tuple[str, ...]
    onnx_auto_providers: bool
    onnx_intra_op_num_threads: int | None
    onnx_inter_op_num_threads: int | None
    # Startup warmup options.
    warmup: bool
    warmup_text: str

    @classmethod
    def from_env(cls) -> Settings:
        """Build settings from ``FASTKOKORO_*`` environment variables.

        Raises:
            ValueError: If ``FASTKOKORO_PORT`` or a thread-count variable is
                set to something ``int()`` cannot parse.
        """
        env = os.getenv

        def optional_path(raw: str | None) -> Path | None:
            # Unset and empty values both mean "no explicit path".
            return Path(raw).expanduser() if raw else None

        raw_cache_dir = env("FASTKOKORO_CACHE_DIR")
        raw_providers = env("FASTKOKORO_ONNX_PROVIDERS")

        return cls(
            model_repo=env("FASTKOKORO_MODEL_REPO", DEFAULT_MODEL_REPO),
            model_file=env("FASTKOKORO_MODEL_FILE", DEFAULT_MODEL_FILE),
            voices_file=env("FASTKOKORO_VOICES_FILE", DEFAULT_VOICES_FILE),
            voices_index_file=env(
                "FASTKOKORO_VOICES_INDEX_FILE", DEFAULT_VOICES_INDEX_FILE
            ),
            model_path=optional_path(env("FASTKOKORO_MODEL_PATH")),
            voices_path=optional_path(env("FASTKOKORO_VOICES_PATH")),
            cache_dir=Path(raw_cache_dir or "~/.cache/fastkokoro").expanduser(),
            default_voice=env("FASTKOKORO_DEFAULT_VOICE", DEFAULT_VOICE),
            default_lang=env("FASTKOKORO_DEFAULT_LANG", DEFAULT_LANG),
            host=env("FASTKOKORO_HOST", DEFAULT_HOST),
            port=int(env("FASTKOKORO_PORT", str(DEFAULT_PORT))),
            onnx_providers=parse_csv(raw_providers) or DEFAULT_ONNX_PROVIDERS,
            onnx_auto_providers=parse_bool(env("FASTKOKORO_ONNX_AUTO_PROVIDERS")),
            onnx_intra_op_num_threads=parse_optional_int(
                env("FASTKOKORO_ONNX_INTRA_OP_NUM_THREADS")
            ),
            onnx_inter_op_num_threads=parse_optional_int(
                env("FASTKOKORO_ONNX_INTER_OP_NUM_THREADS")
            ),
            warmup=parse_bool(env("FASTKOKORO_WARMUP"), default=True),
            warmup_text=env("FASTKOKORO_WARMUP_TEXT", DEFAULT_WARMUP_TEXT),
        )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def parse_csv(value: str | None) -> tuple[str, ...]:
    """Split a comma-separated string into a tuple of trimmed items.

    ``None`` and the empty string give an empty tuple; items that are empty
    after trimming are dropped.
    """
    if value:
        trimmed = (piece.strip() for piece in value.split(","))
        return tuple(piece for piece in trimmed if piece)
    return ()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def parse_bool(value: str | None, *, default: bool = False) -> bool:
    """Interpret an environment-style string as a boolean.

    Args:
        value: Raw string, or ``None`` when the variable is unset.
        default: Result for an unset or blank value.

    Returns:
        ``default`` when ``value`` is ``None`` or contains only whitespace;
        otherwise ``True`` exactly when the trimmed, lower-cased value is one
        of ``"1"``, ``"true"``, ``"yes"`` or ``"on"``.
    """
    if value is None:
        return default
    normalized = value.strip().lower()
    if not normalized:
        # A blank variable counts as unset, matching how parse_csv and
        # parse_optional_int treat empty input, so VAR= does not silently
        # flip an option whose default is True.
        return default
    return normalized in {"1", "true", "yes", "on"}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def parse_optional_int(value: str | None) -> int | None:
    """Parse an optional integer setting.

    Returns ``None`` when ``value`` is ``None`` or contains only whitespace;
    otherwise returns ``int(value)``.

    Raises:
        ValueError: If a non-blank value is not a valid integer.
    """
    if value is not None and value.strip():
        return int(value)
    return None
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import AsyncGenerator
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from kokoro_onnx import Kokoro
|
|
8
|
+
|
|
9
|
+
from fastkokoro.assets import resolve_model_path, resolve_voices_path
|
|
10
|
+
from fastkokoro.audio import AudioFormat, encode_audio
|
|
11
|
+
from fastkokoro.config import Settings
|
|
12
|
+
from fastkokoro.onnx import create_session
|
|
13
|
+
from fastkokoro.voices import normalize_language, validate_voice_language
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("uvicorn.error")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FastKokoro:
    """Text-to-speech engine wrapping a ``Kokoro`` instance.

    Construction resolves the model and voices paths, creates the ONNX
    session and builds the ``Kokoro`` object from that session.
    """

    def __init__(self, settings: Settings | None = None):
        """Initialise the engine.

        Args:
            settings: Configuration to use; when omitted (or falsy),
                ``Settings.from_env()`` is used instead.
        """
        self.settings = settings or Settings.from_env()
        self.model_path = resolve_model_path(self.settings)
        self.voices_path = resolve_voices_path(self.settings)
        self.session = create_session(self.model_path, self.settings)
        self.kokoro = Kokoro.from_session(self.session, str(self.voices_path))

        cfg = self.settings
        logger.info(
            "fastkokoro engine initialized: model_repo=%s model_file=%s "
            "model_path=%s voices_path=%s active_providers=%s "
            "default_voice=%s default_lang=%s warmup=%s",
            cfg.model_repo,
            cfg.model_file,
            self.model_path,
            self.voices_path,
            self.session.get_providers(),
            cfg.default_voice,
            cfg.default_lang,
            cfg.warmup,
        )

    def voices(self) -> list[str]:
        """Return the voice names reported by the underlying Kokoro object."""
        available = self.kokoro.get_voices()
        return available

    def warmup(self) -> None:
        """Run one PCM synthesis of the configured warmup text.

        Uses the default voice and language; the audio is discarded.
        """
        cfg = self.settings
        self.create(
            cfg.warmup_text,
            voice=cfg.default_voice,
            response_format="pcm",
            lang=cfg.default_lang,
        )

    def resolve_request(self, voice: str | None, lang: str | None) -> tuple[str, str]:
        """Resolve and validate the voice/language pair for a request.

        A missing (or empty) ``voice`` falls back to the default voice. The
        language is produced by ``normalize_language`` and the pair is then
        checked by ``validate_voice_language`` against the available voices.

        Returns:
            ``(voice, language)`` as resolved.
        """
        chosen_voice = voice or self.settings.default_voice
        chosen_lang = normalize_language(
            lang, chosen_voice, self.settings.default_lang
        )
        available = set(self.voices())
        validate_voice_language(chosen_voice, chosen_lang, available)
        return chosen_voice, chosen_lang

    def create(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        response_format: AudioFormat = "mp3",
        lang: str | None = None,
    ) -> bytes:
        """Synthesize ``text`` and return it encoded as ``response_format``.

        Args:
            text: Text to synthesize.
            voice: Voice name; defaults to the configured default voice.
            speed: Speed value forwarded to ``Kokoro.create``.
            response_format: Output encoding passed to ``encode_audio``.
            lang: Language; resolved through ``resolve_request``.

        Returns:
            The encoded audio bytes.
        """
        chosen_voice, chosen_lang = self.resolve_request(voice, lang)
        audio, rate = self.kokoro.create(
            text,
            voice=chosen_voice,
            speed=speed,
            lang=chosen_lang,
        )
        return encode_audio(audio, rate, response_format)

    async def create_stream(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        response_format: AudioFormat = "pcm",
        lang: str | None = None,
    ) -> AsyncGenerator[bytes, None]:
        """Synthesize ``text`` and yield encoded audio chunk by chunk.

        Each item produced by ``Kokoro.create_stream`` is cast to float32
        and encoded independently with ``encode_audio``. Because this is an
        async generator, voice/language resolution runs when iteration
        starts, not when the method is called.
        """
        chosen_voice, chosen_lang = self.resolve_request(voice, lang)
        chunks = self.kokoro.create_stream(
            text,
            voice=chosen_voice,
            speed=speed,
            lang=chosen_lang,
        )
        async for chunk, chunk_rate in chunks:
            yield encode_audio(
                chunk.astype(np.float32),
                chunk_rate,
                response_format,
            )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import orjson
|
|
6
|
+
from starlette.responses import Response
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FastJSONResponse(Response):
    """Starlette ``Response`` whose body is serialized with ``orjson``."""

    media_type = "application/json"

    def render(self, content: Any) -> bytes:
        """Encode ``content`` as JSON bytes using ``orjson.dumps``."""
        encoded = orjson.dumps(content)
        return encoded
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import onnxruntime as ort
|
|
7
|
+
|
|
8
|
+
from fastkokoro.config import Settings
|
|
9
|
+
|
|
10
|
+
# Session messages go to the logger named "uvicorn.error" rather than to a
# module-specific logger.
logger = logging.getLogger("uvicorn.error")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_session(model_path: Path, settings: Settings) -> ort.InferenceSession:
    """Create the ONNX Runtime inference session for the model.

    Args:
        model_path: Location of the ONNX model file.
        settings: Supplies ``onnx_auto_providers``, ``onnx_providers`` and the
            optional ``onnx_intra_op_num_threads`` /
            ``onnx_inter_op_num_threads`` values.

    Returns:
        The initialised ``onnxruntime.InferenceSession``.

    Raises:
        ValueError: If a requested provider is not among the providers that
            ONNX Runtime reports as available.
    """
    available = ort.get_available_providers()
    if settings.onnx_auto_providers:
        # Auto mode: request everything ONNX Runtime says it can offer.
        providers = available
    else:
        providers = list(settings.onnx_providers)

    unavailable = [name for name in providers if name not in available]
    if unavailable:
        raise ValueError(
            "Requested ONNX Runtime provider(s) are not available: "
            f"{', '.join(unavailable)}. Available providers: {', '.join(available)}"
        )

    options = ort.SessionOptions()
    # A thread count of None leaves the corresponding option untouched.
    intra_threads = settings.onnx_intra_op_num_threads
    if intra_threads is not None:
        options.intra_op_num_threads = intra_threads
    inter_threads = settings.onnx_inter_op_num_threads
    if inter_threads is not None:
        options.inter_op_num_threads = inter_threads

    session = ort.InferenceSession(
        str(model_path),
        providers=providers,
        sess_options=options,
    )
    logger.info(
        "ONNX Runtime session initialized: model=%s requested_providers=%s "
        "active_providers=%s available_providers=%s",
        model_path,
        providers,
        session.get_providers(),
        available,
    )
    return session
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from fastkokoro.audio import AudioFormat
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SpeechRequest(BaseModel):
    """Request body accepted by the speech synthesis endpoint.

    ``input`` must be non-empty and ``speed`` must lie in ``[0.5, 2.0]``;
    both constraints are enforced by pydantic during validation.
    """

    # Model name requested by the client.
    model: str = "kokoro"
    # Text to synthesize; at least one character.
    input: str = Field(min_length=1)
    # None (voice omitted) lets the engine's resolve_request fall back to the
    # configured settings.default_voice. A hard-coded voice name here would
    # always win over that setting, because resolve_request only substitutes
    # the default for a falsy voice.
    voice: str | None = None
    # Output audio format.
    response_format: AudioFormat = "mp3"
    # Speed value, restricted to the inclusive range 0.5..2.0.
    speed: float = Field(default=1.0, ge=0.5, le=2.0)
    # Only an explicit True selects the streaming response.
    stream: bool | None = None
    # Optional language; None lets the engine derive it from the voice or the
    # configured default language.
    lang: str | None = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ModelObject(BaseModel):
    """One entry of the model listing."""

    # Model identifier.
    id: str
    # Fixed discriminator; always the literal "model".
    object: Literal["model"] = Field(default="model")
    # Unix timestamp in whole seconds, taken when the instance is created.
    created: int = Field(default_factory=lambda: int(time.time()))
    # Owner label reported for the model.
    owned_by: str = Field(default="fastkokoro")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ModelList(BaseModel):
    """Envelope returned by the model listing endpoint."""

    # Fixed discriminator; always the literal "list".
    object: Literal["list"] = Field(default="list")
    # The listed models.
    data: list[ModelObject]
|
|
File without changes
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import AsyncGenerator
|
|
4
|
+
from contextlib import asynccontextmanager
|
|
5
|
+
|
|
6
|
+
from fastapi import FastAPI, HTTPException
|
|
7
|
+
from fastapi.responses import Response, StreamingResponse
|
|
8
|
+
|
|
9
|
+
from fastkokoro.audio import media_type
|
|
10
|
+
from fastkokoro.engine import FastKokoro
|
|
11
|
+
from fastkokoro.json import FastJSONResponse
|
|
12
|
+
from fastkokoro.openai import ModelList, ModelObject, SpeechRequest
|
|
13
|
+
from fastkokoro.voices import KOKORO_MODEL_ID, SUPPORTED_MODEL_IDS
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_app(engine: FastKokoro | None = None) -> FastAPI:
    """Build the FastAPI application that serves the speech API.

    Args:
        engine: Engine used to serve requests. When ``None``, a
            ``FastKokoro`` is constructed the first time one is needed
            (lifespan startup, or the first handler that asks for it).

    Returns:
        The configured application with the ``/health``, ``/v1/models``,
        ``/v1/audio/voices`` and ``/v1/audio/speech`` routes registered.
    """
    # Local import keeps this function self-contained.
    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version as distribution_version

    try:
        # Advertise the installed distribution's version so the value shown
        # by the app cannot drift from the released package version.
        app_version = distribution_version("fastkokoro")
    except PackageNotFoundError:
        # No installed metadata (e.g. running from a plain source tree).
        app_version = "0.2.0"

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Resolve the engine at startup and optionally run one warmup
        # synthesis before requests are served.
        app_engine = get_engine()
        if app_engine.settings.warmup:
            app_engine.warmup()
        yield

    app = FastAPI(
        title="fastkokoro",
        version=app_version,
        default_response_class=FastJSONResponse,
        lifespan=lifespan,
    )
    app.state.engine = engine

    def get_engine() -> FastKokoro:
        """Return the application's engine, creating it on first use."""
        if app.state.engine is None:
            app.state.engine = FastKokoro()
        return app.state.engine

    @app.get("/health")
    def health() -> dict[str, str]:
        """Report a static healthy status."""
        return {"status": "healthy"}

    @app.get("/v1/models")
    def models() -> ModelList:
        """List the single Kokoro model."""
        return ModelList(
            data=[
                ModelObject(id=KOKORO_MODEL_ID),
            ]
        )

    @app.get("/v1/audio/voices")
    def voices() -> dict[str, list[str]]:
        """List the voice names known to the engine."""
        return {"voices": get_engine().voices()}

    @app.post("/v1/audio/speech")
    async def speech(request: SpeechRequest) -> Response:
        """Synthesize speech for the request, streamed or as one body.

        Raises:
            HTTPException: 400 for an unsupported model, an invalid
                voice/language combination, or a synthesis error raised by
                the non-streaming path.
        """
        if request.model not in SUPPORTED_MODEL_IDS:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported model {request.model!r}. Use {KOKORO_MODEL_ID!r}.",
            )

        engine = get_engine()
        content_type = media_type(request.response_format)
        should_stream = request.stream is True

        # Validate up front so a bad voice/language becomes a 400 response
        # before any streaming response has been started.
        try:
            resolved_voice, resolved_lang = engine.resolve_request(
                request.voice, request.lang
            )
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc

        if should_stream:

            async def chunks() -> AsyncGenerator[bytes, None]:
                async for chunk in engine.create_stream(
                    request.input,
                    voice=resolved_voice,
                    speed=request.speed,
                    response_format=request.response_format,
                    lang=resolved_lang,
                ):
                    yield chunk

            return StreamingResponse(chunks(), media_type=content_type)

        try:
            # NOTE(review): engine.create is a synchronous call made directly
            # inside this coroutine.
            audio = engine.create(
                request.input,
                voice=resolved_voice,
                speed=request.speed,
                response_format=request.response_format,
                lang=resolved_lang,
            )
        except (AssertionError, RuntimeError, ValueError) as exc:
            # AssertionError is presumably caught because the synthesis
            # library validates input with assert statements -- confirm.
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        return Response(content=audio, media_type=content_type)

    return app
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Module-level application instance; no engine is passed, so one is created
# on first use.
app = create_app()
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
# Identifier under which the Kokoro model is exposed.
KOKORO_MODEL_ID = "kokoro"
# OpenAI model names accepted in addition to the Kokoro identifier.
OPENAI_MODEL_ALIASES = frozenset(("tts-1", "gpt-4o-mini-tts"))
# Every model name accepted: the Kokoro identifier plus the aliases.
SUPPORTED_MODEL_IDS = OPENAI_MODEL_ALIASES | {KOKORO_MODEL_ID}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class LanguageSpec:
    """Immutable record describing one supported language."""

    # Single-letter Kokoro language code, e.g. "a".
    kokoro_code: str
    # Language string returned after normalisation, e.g. "en-us".
    runtime_lang: str
    # Names accepted for this language in requests.
    aliases: tuple[str, ...]
    # Voice names that belong to this language.
    voices: tuple[str, ...]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Catalogue of supported languages with their accepted aliases and voices.
LANGUAGES: tuple[LanguageSpec, ...] = (
    LanguageSpec(
        kokoro_code="a",
        runtime_lang="en-us",
        aliases=("a", "en-us", "en_us", "american", "american-english"),
        voices=(
            "af_heart", "af_alloy", "af_aoede", "af_bella", "af_jessica",
            "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah",
            "af_sky",
            "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam",
            "am_michael", "am_onyx", "am_puck", "am_santa",
        ),
    ),
    LanguageSpec(
        kokoro_code="b",
        runtime_lang="en-gb",
        aliases=("b", "en-gb", "en_gb", "british", "british-english"),
        voices=(
            "bf_alice", "bf_emma", "bf_isabella", "bf_lily",
            "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
        ),
    ),
    LanguageSpec(
        kokoro_code="j",
        runtime_lang="ja",
        aliases=("j", "ja", "ja-jp", "ja_jp", "japanese"),
        voices=(
            "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro",
            "jm_kumo",
        ),
    ),
    LanguageSpec(
        kokoro_code="z",
        runtime_lang="zh",
        aliases=("z", "zh", "zh-cn", "zh_cn", "mandarin", "mandarin-chinese"),
        voices=(
            "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi",
            "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang",
        ),
    ),
    LanguageSpec(
        kokoro_code="e",
        runtime_lang="es",
        aliases=("e", "es", "es-es", "es_es", "spanish"),
        voices=("ef_dora", "em_alex", "em_santa"),
    ),
    LanguageSpec(
        kokoro_code="f",
        runtime_lang="fr-fr",
        aliases=("f", "fr", "fr-fr", "fr_fr", "french"),
        voices=("ff_siwis",),
    ),
    LanguageSpec(
        kokoro_code="h",
        runtime_lang="hi",
        aliases=("h", "hi", "hi-in", "hi_in", "hindi"),
        voices=("hf_alpha", "hf_beta", "hm_omega", "hm_psi"),
    ),
    LanguageSpec(
        kokoro_code="i",
        runtime_lang="it",
        aliases=("i", "it", "it-it", "it_it", "italian"),
        voices=("if_sara", "im_nicola"),
    ),
    LanguageSpec(
        kokoro_code="p",
        runtime_lang="pt-br",
        aliases=("p", "pt", "pt-br", "pt_br", "brazilian-portuguese", "portuguese"),
        voices=("pf_dora", "pm_alex", "pm_santa"),
    ),
)

# Alias -> language record, built from every alias of every language.
LANGUAGE_BY_ALIAS = {
    name: spec for spec in LANGUAGES for name in spec.aliases
}
# Voice name -> language record, built from every voice of every language.
VOICE_TO_LANGUAGE = {
    voice_name: spec for spec in LANGUAGES for voice_name in spec.voices
}
# All accepted alias strings (the keys of LANGUAGE_BY_ALIAS).
SUPPORTED_LANGUAGE_ALIASES = frozenset(LANGUAGE_BY_ALIAS.keys())
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def normalize_language(lang: str | None, voice: str | None, default_lang: str) -> str:
    """Resolve the runtime language string for a request.

    Resolution order:

    1. ``lang``, when it is non-empty after stripping, looked up as an alias.
    2. The language that owns ``voice``, when the voice is a known one.
    3. ``default_lang``, looked up as an alias.

    Alias lookups are case-insensitive, ignore surrounding whitespace and
    treat ``_`` as ``-``.

    Args:
        lang: Language requested by the caller, if any.
        voice: Voice requested by the caller, if any.
        default_lang: Fallback language alias.

    Returns:
        The ``runtime_lang`` of the matching language record.

    Raises:
        ValueError: If ``lang`` is given but is not a supported alias, or if
            the fallback ``default_lang`` is not a supported alias.
    """

    def _alias_key(raw: str) -> str:
        # Canonical form used for alias lookups.
        return raw.strip().lower().replace("_", "-")

    requested_key = _alias_key(lang or "")
    if requested_key:
        requested_spec = LANGUAGE_BY_ALIAS.get(requested_key)
        if requested_spec is not None:
            return requested_spec.runtime_lang
        raise ValueError(
            "Unsupported language. Supported values: "
            + ", ".join(sorted(SUPPORTED_LANGUAGE_ALIASES))
        )

    # No explicit language: infer it from the voice when the voice is known.
    voice_spec = VOICE_TO_LANGUAGE.get(voice) if voice else None
    if voice_spec is not None:
        return voice_spec.runtime_lang

    fallback_spec = LANGUAGE_BY_ALIAS.get(_alias_key(default_lang))
    if fallback_spec is None:
        raise ValueError(f"Unsupported default language: {default_lang}")
    return fallback_spec.runtime_lang
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def validate_voice_language(voice: str, lang: str, available_voices: set[str]) -> None:
    """Check that ``voice`` exists and is compatible with ``lang``.

    A voice with no entry in the voice-to-language table is accepted with any
    language.

    Args:
        voice: Voice name to check.
        lang: Already-normalised runtime language string.
        available_voices: Voice names that can currently be used.

    Raises:
        ValueError: If ``voice`` is not in ``available_voices``, or if it
            belongs to a language other than ``lang``.
    """
    if voice not in available_voices:
        raise ValueError(f"Voice {voice!r} is not available")

    owner = VOICE_TO_LANGUAGE.get(voice)
    if owner is None or owner.runtime_lang == lang:
        return
    raise ValueError(
        f"Voice {voice!r} belongs to language {owner.runtime_lang!r}, "
        f"but request language resolved to {lang!r}"
    )
|