madcat-tts 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- madcat_tts-0.3.0/.github/workflows/main.yml +29 -0
- madcat_tts-0.3.0/.gitignore +32 -0
- madcat_tts-0.3.0/AGENTS.md +148 -0
- madcat_tts-0.3.0/PKG-INFO +19 -0
- madcat_tts-0.3.0/README.md +157 -0
- madcat_tts-0.3.0/conf/carts/bt7274.toml +60 -0
- madcat_tts-0.3.0/conf/carts/lessac.toml +14 -0
- madcat_tts-0.3.0/conf/madcat-tts.service +23 -0
- madcat_tts-0.3.0/conf/personas/bt7274.pcart/cart.toml +40 -0
- madcat_tts-0.3.0/conf/personas/lessac.pcart/cart.toml +28 -0
- madcat_tts-0.3.0/conf/vllm-tts.service +19 -0
- madcat_tts-0.3.0/conf/vllm-tts.yaml +30 -0
- madcat_tts-0.3.0/docs/PLAN.md +185 -0
- madcat_tts-0.3.0/install.sh +77 -0
- madcat_tts-0.3.0/pyproject.toml +40 -0
- madcat_tts-0.3.0/src/madcat_tts/__init__.py +9 -0
- madcat_tts-0.3.0/src/madcat_tts/__main__.py +46 -0
- madcat_tts-0.3.0/src/madcat_tts/chunker.py +185 -0
- madcat_tts-0.3.0/src/madcat_tts/config.py +178 -0
- madcat_tts-0.3.0/src/madcat_tts/engines/__init__.py +5 -0
- madcat_tts-0.3.0/src/madcat_tts/engines/base.py +54 -0
- madcat_tts-0.3.0/src/madcat_tts/engines/chatterbox.py +81 -0
- madcat_tts-0.3.0/src/madcat_tts/engines/chatterbox_turbo.py +98 -0
- madcat_tts-0.3.0/src/madcat_tts/engines/piper.py +80 -0
- madcat_tts-0.3.0/src/madcat_tts/engines/xtts_proxy.py +172 -0
- madcat_tts-0.3.0/src/madcat_tts/main.py +489 -0
- madcat_tts-0.3.0/src/madcat_tts/normalize.py +136 -0
- madcat_tts-0.3.0/src/madcat_tts/pcart.py +452 -0
- madcat_tts-0.3.0/src/madcat_tts/postprocess.py +117 -0
- madcat_tts-0.3.0/src/madcat_tts/registry.py +264 -0
- madcat_tts-0.3.0/src/madcat_tts/schemas.py +101 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
publish:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
|
|
22
|
+
- name: Install build tools
|
|
23
|
+
run: pip install --upgrade build
|
|
24
|
+
|
|
25
|
+
- name: Build sdist and wheel
|
|
26
|
+
run: python -m build
|
|
27
|
+
|
|
28
|
+
- name: Publish to PyPI
|
|
29
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
.eggs/
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
.venv
|
|
10
|
+
.venv/
|
|
11
|
+
venv
|
|
12
|
+
venv/
|
|
13
|
+
.python-version
|
|
14
|
+
|
|
15
|
+
# uv
|
|
16
|
+
uv.lock
|
|
17
|
+
|
|
18
|
+
# Editors
|
|
19
|
+
.vscode/
|
|
20
|
+
.idea/
|
|
21
|
+
*.swp
|
|
22
|
+
.DS_Store
|
|
23
|
+
|
|
24
|
+
# Local state / outputs
|
|
25
|
+
/state.toml
|
|
26
|
+
/tmp/
|
|
27
|
+
*.wav
|
|
28
|
+
*.onnx
|
|
29
|
+
*.onnx.json
|
|
30
|
+
|
|
31
|
+
# OS
|
|
32
|
+
.DS_Store
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# madcat-tts Agent Guide
|
|
2
|
+
|
|
3
|
+
## Workspace Structure
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
madcat-tts/
|
|
7
|
+
├── src/madcat_tts/
|
|
8
|
+
│ ├── __main__.py # Entry point: uvicorn launcher, env var parsing
|
|
9
|
+
│ ├── main.py # FastAPI app, EnginePool, route handlers
|
|
10
|
+
│ ├── schemas.py # Pydantic models (Voice, Cart, AudioSpeechRequest, …)
|
|
11
|
+
│ ├── registry.py # Cart catalog loader (TOML) + state persistence
|
|
12
|
+
│ ├── normalize.py # LLM-based text normalizer (vllm-tts LoRA via httpx)
|
|
13
|
+
│ └── engines/
|
|
14
|
+
│ ├── base.py # Engine ABC + SynthResult dataclass
|
|
15
|
+
│ ├── chatterbox.py # ChatterboxMultilingualTTS, GPU-resident
|
|
16
|
+
│ ├── piper.py # PiperVoice ONNX, CPU-resident, lazy per-path cache
|
|
17
|
+
│ └── xtts_proxy.py # HTTP proxy to remote Auralis/XTTS-v2 backend
|
|
18
|
+
├── conf/
|
|
19
|
+
│ ├── carts/
|
|
20
|
+
│ │ ├── bt7274.toml # BT-7274 persona — chatterbox EN/PL + piper + xtts voices
|
|
21
|
+
│ │ └── lessac.toml # Piper CPU-only fallback voice
|
|
22
|
+
│ └── madcat-tts.service # systemd user unit (port 14099)
|
|
23
|
+
├── install.sh # Idempotent deploy script for sinanju
|
|
24
|
+
├── pyproject.toml
|
|
25
|
+
└── README.md
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Runtime paths (XDG-based, not in-repo):
|
|
29
|
+
- Cart catalog: `~/.config/madcat-tts/carts/*.toml`
|
|
30
|
+
- Active state: `~/.local/state/madcat-tts/state.toml`
|
|
31
|
+
- Voice refs (chatterbox WAVs): `~/.local/share/bt7274/*.wav`
|
|
32
|
+
- Piper models: `~/piper-voices/*.onnx` (sibling `.onnx.json` required)
|
|
33
|
+
|
|
34
|
+
## Key Commands
|
|
35
|
+
|
|
36
|
+
```sh
|
|
37
|
+
# Install / deploy to sinanju (idempotent)
|
|
38
|
+
./install.sh
|
|
39
|
+
|
|
40
|
+
# Reuse existing CUDA torch venv (for chatterbox on GB10/DGX/aarch64 CUDA 13)
|
|
41
|
+
MADCAT_TTS_VENV=$HOME/cb ./install.sh
|
|
42
|
+
|
|
43
|
+
# Run directly
|
|
44
|
+
python -m madcat_tts
|
|
45
|
+
# or
|
|
46
|
+
madcat-tts
|
|
47
|
+
|
|
48
|
+
# Build/sync venv
|
|
49
|
+
uv sync
|
|
50
|
+
|
|
51
|
+
# Systemd user unit management
|
|
52
|
+
systemctl --user start madcat-tts
|
|
53
|
+
systemctl --user restart madcat-tts
|
|
54
|
+
systemctl --user status madcat-tts
|
|
55
|
+
journalctl --user -u madcat-tts -f
|
|
56
|
+
|
|
57
|
+
# Health check
|
|
58
|
+
curl http://localhost:14099/health | python3 -m json.tool
|
|
59
|
+
|
|
60
|
+
# List carts
|
|
61
|
+
curl http://localhost:14099/carts
|
|
62
|
+
|
|
63
|
+
# Switch active cart
|
|
64
|
+
curl -X POST http://localhost:14099/carts/active -H 'Content-Type: application/json' -d '{"id":"bt7274"}'
|
|
65
|
+
|
|
66
|
+
# Synthesize
|
|
67
|
+
curl -X POST http://localhost:14099/v1/audio/speech \
|
|
68
|
+
-H 'Content-Type: application/json' \
|
|
69
|
+
-d '{"input":"Hello world","voice":"bt7274-en","response_format":"wav"}' \
|
|
70
|
+
--output out.wav
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Critical Implementation Details
|
|
74
|
+
|
|
75
|
+
**Engine pool serialization:** Each engine is locked per-call (`threading.Lock`). GPU contention from chatterbox makes single-threading correct. Piper also benefits. Never add async synthesis — model init and inference are not async-safe.
|
|
76
|
+
|
|
77
|
+
**Single worker only:** uvicorn runs `workers=1`. Engines are model-resident and not fork-safe. Do not change this.
|
|
78
|
+
|
|
79
|
+
**Leading silence padding:** 300ms of zeros is prepended to every response (`MADCAT_TTS_LEADING_SILENCE_MS`, default 300). Compensates for AVAudioPlayer / afplay decode warmup latency on macOS. Override with `MADCAT_TTS_LEADING_SILENCE_MS=0` to disable.
|
|
80
|
+
|
|
81
|
+
**XTTS proxy pattern:** Coqui XTTS-v2 requires `transformers==5.9.0`, chatterbox requires `transformers==5.2.0` — incompatible. XTTS runs in a separate process (Auralis on junkpile:8020). The `XttsProxyEngine` base64-encodes the local WAV ref, POSTs to the remote, decodes the audio response. Reference WAV is read from the daemon's local filesystem.
|
|
82
|
+
|
|
83
|
+
**Auralis upstream bug workaround:** `AudioSpeechGenerationRequest.speed` has a trailing-comma tuple default in Auralis 0.2.8.post2 that raises `TypeError` on `<=` comparison when `speed` is absent. The proxy **always** sends `speed` explicitly. This is load-bearing — do not remove.
|
|
84
|
+
|
|
85
|
+
**Voice resolution priority:**
|
|
86
|
+
1. Exact voice id (e.g. `bt7274-en`)
|
|
87
|
+
2. Cart tag (e.g. `bt7274`) → resolves to that cart's `default_voice`
|
|
88
|
+
3. `null` / omitted → active voice from `state.toml`
|
|
89
|
+
4. `list[str]` → ad-hoc base64 WAV clone (chatterbox only, first element used)
|
|
90
|
+
|
|
91
|
+
**Engine init failures are non-fatal:** If chatterbox fails to load (CUDA unavailable, model missing), the daemon continues and returns HTTP 503 only for requests targeting that engine. Piper + xtts remain available.
|
|
92
|
+
|
|
93
|
+
**Text normalization:** `TextNormalizer` calls a vLLM instance at `MADCAT_TTS_NORMALIZER_URL` (default `http://localhost:8002`) using the `qwen3-4b-tts-norm` served model (see `conf/vllm-tts.yaml`; plain instruct model, no LoRA). Any failure falls back to raw text — normalization is best-effort. Results are kept in an LRU cache (512 entries, configurable via `MADCAT_TTS_NORMALIZER_CACHE_SIZE`). Disable with `MADCAT_TTS_NORMALIZE=0`.
|
|
94
|
+
|
|
95
|
+
**Audio formats:** Only `wav`, `flac`, `pcm` are actually supported despite `ResponseFormat` including `mp3`. Format table in `main.py` controls what soundfile gets asked to write.
|
|
96
|
+
|
|
97
|
+
**State persistence:** `Registry._write_state()` uses atomic write (tmp file + rename). Reads back on daemon restart. If `state.toml` references a no-longer-existing cart/voice, it is ignored and the first cart's default voice is used.
|
|
98
|
+
|
|
99
|
+
**Chatterbox params:** `cfg_weight` (0–1, default 0.5) and `exaggeration` (0–2, default 0.5). Cross-language references (e.g. EN voice for PL synthesis) often need `cfg_weight` lowered. `t3_model` param selects alternate checkpoint.
|
|
100
|
+
|
|
101
|
+
**Piper voice caching:** `PiperEngine` caches loaded `PiperVoice` instances keyed by path. Multiple carts can share an ONNX model file without reloading.
|
|
102
|
+
|
|
103
|
+
## Database / Storage
|
|
104
|
+
|
|
105
|
+
No database. Two TOML-based stores:
|
|
106
|
+
|
|
107
|
+
| Path | Purpose | Access |
|
|
108
|
+
|------|---------|--------|
|
|
109
|
+
| `~/.config/madcat-tts/carts/*.toml` | Cart catalog (voices, engine, params) | Read-only at runtime |
|
|
110
|
+
| `~/.local/state/madcat-tts/state.toml` | Active cart + voice selection | Read/write by daemon |
|
|
111
|
+
|
|
112
|
+
Cart file schema (per-file): `default_voice` string + `[[voices]]` array. Each voice: `id`, `engine`, `lang`, `description`, `voice_path`, `[params]`.
|
|
113
|
+
|
|
114
|
+
## Development Notes
|
|
115
|
+
|
|
116
|
+
- **Language:** Python 3.11+, uses `from __future__ import annotations` throughout
|
|
117
|
+
- **Python version in use:** 3.14 (CPython, aarch64, sinanju/GB10)
|
|
118
|
+
- **Framework:** FastAPI + uvicorn (single worker)
|
|
119
|
+
- **Package manager:** uv — `uv sync` for fresh venv, `uv pip install` for reuse path
|
|
120
|
+
- **Key deps:** `chatterbox-tts`, `piper-tts>=1.4.0`, `torch`, `soundfile`, `fastapi`, `httpx`, `pydantic>=2.7`, `tomli-w`
|
|
121
|
+
- **TOML:** Read via stdlib `tomllib` (py3.11+), write via `tomli-w`
|
|
122
|
+
- **Target host:** sinanju (DGX Spark / GB10 / aarch64 / CUDA 13)
|
|
123
|
+
- **Service port:** 14099 (default)
|
|
124
|
+
|
|
125
|
+
## Conventions
|
|
126
|
+
|
|
127
|
+
- Engine names are lowercase string literals: `"chatterbox"`, `"piper"`, `"xtts"` — the `EngineName` `Literal` type in schemas.py is the source of truth
|
|
128
|
+
- Voice IDs follow `<persona>-<lang>[-<engine>]` convention: `bt7274-en`, `bt7274-pl-xtts`, `bt7274-en-piper`
|
|
129
|
+
- Cart tag = TOML filename stem: `bt7274.toml` → tag `bt7274`
|
|
130
|
+
- All engine `synth()` calls return `SynthResult(audio: np.ndarray float32 [-1,1], sample_rate: int)` — encoding to WAV/FLAC/PCM happens in the HTTP layer
|
|
131
|
+
- Response headers carry synthesis metadata: `X-Madcat-Cart`, `X-Madcat-Voice`, `X-Madcat-Engine`, `X-Madcat-Sample-Rate`, `X-Madcat-Duration-Sec`, `X-Madcat-Leading-Silence-Ms`
|
|
132
|
+
- No multi-worker, no async synth — model-resident GPU engines are intentionally synchronous
|
|
133
|
+
- Env vars follow `MADCAT_TTS_*` prefix convention
|
|
134
|
+
|
|
135
|
+
## Entry Points
|
|
136
|
+
|
|
137
|
+
| Surface | Detail |
|
|
138
|
+
|---------|--------|
|
|
139
|
+
| `python -m madcat_tts` | Main daemon entry (`__main__.py` → uvicorn) |
|
|
140
|
+
| `madcat-tts` | Console script alias (same) |
|
|
141
|
+
| `POST /v1/audio/speech` | OpenAI-compatible synthesis endpoint |
|
|
142
|
+
| `GET /health` | Engine availability + normalizer stats |
|
|
143
|
+
| `GET /carts` | Full cart catalog |
|
|
144
|
+
| `POST /carts/active` | Switch active voice/cart |
|
|
145
|
+
| `GET /docs` | FastAPI Swagger UI |
|
|
146
|
+
| `Engine` ABC | `engines/base.py` — implement for new backends |
|
|
147
|
+
| `Registry` | `registry.py` — cart catalog and state |
|
|
148
|
+
| `EnginePool` | `main.py` — engine lifecycle, lock management |
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: madcat-tts
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: TTS daemon (Chatterbox + Piper in-process; XTTS proxied) for the MADCAT OS substrate.
|
|
5
|
+
Author: chi
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: chatterbox-tts
|
|
9
|
+
Requires-Dist: fastapi>=0.115
|
|
10
|
+
Requires-Dist: httpx>=0.27
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Requires-Dist: piper-tts>=1.4.0
|
|
13
|
+
Requires-Dist: pydantic>=2.7
|
|
14
|
+
Requires-Dist: setproctitle>=1.3
|
|
15
|
+
Requires-Dist: soundfile
|
|
16
|
+
Requires-Dist: tomli-w>=1.0
|
|
17
|
+
Requires-Dist: torch
|
|
18
|
+
Requires-Dist: torchaudio
|
|
19
|
+
Requires-Dist: uvicorn[standard]>=0.30
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# madcat-tts
|
|
2
|
+
|
|
3
|
+
Single TTS daemon fronting four engines behind one OpenAI-compatible HTTP
|
|
4
|
+
endpoint.
|
|
5
|
+
|
|
6
|
+
| engine | location | strengths |
|
|
7
|
+
| ------------------ | ------------------ | -------------------------------------------- |
|
|
8
|
+
| `chatterbox` | in-process, GPU | Voice cloning, EN + PL, multilingual |
|
|
9
|
+
| `chatterbox-turbo` | in-process, GPU | Fast single-step, paralinguistic tags (EN) |
|
|
10
|
+
| `piper` | in-process, CPU | Fast, deterministic, no GPU |
|
|
11
|
+
| `xtts` | HTTP-proxied | XTTS-v2 zero-shot clone via remote backend |
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```sh
|
|
16
|
+
pip install madcat-tts
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
20
|
+
|
|
21
|
+
```sh
|
|
22
|
+
uv pip install madcat-tts
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick start
|
|
26
|
+
|
|
27
|
+
```sh
|
|
28
|
+
# Start the daemon
|
|
29
|
+
madcat-tts
|
|
30
|
+
|
|
31
|
+
# Health check
|
|
32
|
+
curl http://localhost:14099/health
|
|
33
|
+
|
|
34
|
+
# Synthesize
|
|
35
|
+
curl -X POST http://localhost:14099/v1/audio/speech \
|
|
36
|
+
-H 'Content-Type: application/json' \
|
|
37
|
+
-d '{"input":"Hello world","voice":"bt7274-en","response_format":"wav"}' \
|
|
38
|
+
--output out.wav
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Configuration
|
|
42
|
+
|
|
43
|
+
Config lives in `~/.config/madcat/config.toml`:
|
|
44
|
+
|
|
45
|
+
```toml
|
|
46
|
+
[tts]
|
|
47
|
+
host = "0.0.0.0"
|
|
48
|
+
port = 14099
|
|
49
|
+
log_level = "info"
|
|
50
|
+
leading_silence_ms = 300
|
|
51
|
+
|
|
52
|
+
[tts.normalizer]
|
|
53
|
+
url = "http://localhost:8002"
|
|
54
|
+
model = "qwen3-4b-tts-norm"
|
|
55
|
+
enabled = true
|
|
56
|
+
|
|
57
|
+
[tts.chunking]
|
|
58
|
+
chunk_size = 250
|
|
59
|
+
chunk_gap_ms = 200
|
|
60
|
+
trim_silence = true
|
|
61
|
+
max_silence_ms = 400
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
All settings have env var overrides (`MADCAT_TTS_HOST`, `MADCAT_TTS_PORT`,
|
|
65
|
+
etc.) and sensible defaults. The daemon runs without a config file.
|
|
66
|
+
|
|
67
|
+
## Endpoints
|
|
68
|
+
|
|
69
|
+
| route | method | purpose |
|
|
70
|
+
| ----------------------- | ------ | --------------------------------------------------------------- |
|
|
71
|
+
| `/v1/audio/speech` | POST | OpenAI-compat synthesis. `voice` = cart/voice id or base64 WAVs |
|
|
72
|
+
| `/carts` | GET | List cart catalog with active marker |
|
|
73
|
+
| `/carts/active` | GET | Current active cart + voice |
|
|
74
|
+
| `/carts/active` | POST | Switch active cart/voice |
|
|
75
|
+
| `/health` | GET | Liveness + per-engine availability + normalizer stats |
|
|
76
|
+
| `/docs` | GET | Swagger UI |
|
|
77
|
+
|
|
78
|
+
## Carts
|
|
79
|
+
|
|
80
|
+
A **cart** maps a persona to one or more voices. Each voice binds an engine,
|
|
81
|
+
language, reference audio, and engine-specific params. Catalog lives at
|
|
82
|
+
`~/.config/madcat/carts/*.toml` (one file per cart).
|
|
83
|
+
|
|
84
|
+
Example cart (`bt7274.toml`):
|
|
85
|
+
|
|
86
|
+
```toml
|
|
87
|
+
default_voice = "bt7274-en-piper"
|
|
88
|
+
|
|
89
|
+
[[voices]]
|
|
90
|
+
id = "bt7274-en"
|
|
91
|
+
engine = "chatterbox"
|
|
92
|
+
lang = "en"
|
|
93
|
+
description = "English voice — Chatterbox GPU cloning"
|
|
94
|
+
voice_path = "/path/to/english.wav"
|
|
95
|
+
|
|
96
|
+
[voices.params]
|
|
97
|
+
cfg_weight = 0.5
|
|
98
|
+
exaggeration = 0.5
|
|
99
|
+
|
|
100
|
+
[[voices]]
|
|
101
|
+
id = "bt7274-en-piper"
|
|
102
|
+
engine = "piper"
|
|
103
|
+
lang = "en"
|
|
104
|
+
description = "English voice — Piper CPU fallback"
|
|
105
|
+
voice_path = "/path/to/english.onnx"
|
|
106
|
+
|
|
107
|
+
[voices.params]
|
|
108
|
+
length_scale = 1.0
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Active selection persists to `~/.local/share/madcat/tts-state.toml`.
|
|
112
|
+
|
|
113
|
+
## Text normalization
|
|
114
|
+
|
|
115
|
+
Optional LLM-based preprocessor that expands numbers, acronyms, and symbols
|
|
116
|
+
before synthesis. Calls any OpenAI-compatible `/v1/chat/completions` endpoint.
|
|
117
|
+
Disable with `MADCAT_TTS_NORMALIZE=0` or per-request `"normalize": false`.
|
|
118
|
+
|
|
119
|
+
## Engine notes
|
|
120
|
+
|
|
121
|
+
- **chatterbox**: Multilingual voice cloning. Requires CUDA GPU. 10s mono
|
|
122
|
+
reference WAV. Params: `cfg_weight` (0-1), `exaggeration` (0-2).
|
|
123
|
+
- **chatterbox-turbo**: 350M parameter English-only model. Supports
|
|
124
|
+
paralinguistic tags: `[laugh]`, `[chuckle]`, `[cough]`, `[sigh]`,
|
|
125
|
+
`[gasp]`, `[groan]`. Single-step decoder, lighter VRAM.
|
|
126
|
+
- **piper**: CPU ONNX inference. Deterministic. Voice = `.onnx` file with
|
|
127
|
+
sibling `.onnx.json` config. Params: `length_scale`, `noise_scale`.
|
|
128
|
+
- **xtts**: HTTP proxy to a remote XTTS-v2 backend (e.g. Auralis). Configure
|
|
129
|
+
via `MADCAT_TTS_XTTS_URL` (default `http://localhost:8020`).
|
|
130
|
+
|
|
131
|
+
Engine init failures are non-fatal — the daemon stays up and returns 503
|
|
132
|
+
only for requests targeting unavailable engines.
|
|
133
|
+
|
|
134
|
+
## Layout
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
src/madcat_tts/
|
|
138
|
+
├── __main__.py # uvicorn launcher
|
|
139
|
+
├── main.py # FastAPI app, EnginePool, routes
|
|
140
|
+
├── schemas.py # Pydantic models
|
|
141
|
+
├── registry.py # Cart catalog + state persistence
|
|
142
|
+
├── config.py # Config reader (~/.config/madcat/config.toml)
|
|
143
|
+
├── normalize.py # LLM text normalizer
|
|
144
|
+
├── chunker.py # Sentence-boundary chunking
|
|
145
|
+
├── postprocess.py # Audio post-processing (silence trim/compress)
|
|
146
|
+
├── pcart.py # Cart format utilities
|
|
147
|
+
└── engines/
|
|
148
|
+
├── base.py # Engine ABC + SynthResult
|
|
149
|
+
├── chatterbox.py # ChatterboxMultilingualTTS
|
|
150
|
+
├── chatterbox_turbo.py # ChatterboxTurboTTS (paralinguistic)
|
|
151
|
+
├── piper.py # PiperVoice ONNX
|
|
152
|
+
└── xtts_proxy.py # HTTP proxy to remote backend
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## License
|
|
156
|
+
|
|
157
|
+
MIT
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# BT-7274 — Vanguard-class Titan AI
|
|
2
|
+
# Voices across multiple TTS engines for the canonical persona.
|
|
3
|
+
|
|
4
|
+
default_voice = "bt7274-en-piper"
|
|
5
|
+
|
|
6
|
+
[[voices]]
|
|
7
|
+
id = "bt7274-pl"
|
|
8
|
+
engine = "chatterbox"
|
|
9
|
+
lang = "pl"
|
|
10
|
+
description = "Canonical Polish voice — Chatterbox GPU cloning, 10s reference"
|
|
11
|
+
voice_path = "/home/madcat/.local/share/bt7274/canonical_pl_ref_22k_mono.wav"
|
|
12
|
+
|
|
13
|
+
[voices.params]
|
|
14
|
+
cfg_weight = 0.5
|
|
15
|
+
exaggeration = 0.5
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
[[voices]]
|
|
19
|
+
id = "bt7274-en"
|
|
20
|
+
engine = "chatterbox"
|
|
21
|
+
lang = "en"
|
|
22
|
+
description = "Canonical English voice — Chatterbox GPU cloning, 10s reference"
|
|
23
|
+
voice_path = "/home/madcat/.local/share/bt7274/canonical_en_ref_22k_mono.wav"
|
|
24
|
+
|
|
25
|
+
[voices.params]
|
|
26
|
+
cfg_weight = 0.5
|
|
27
|
+
exaggeration = 0.5
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
[[voices]]
|
|
31
|
+
id = "bt7274-en-xtts"
|
|
32
|
+
engine = "xtts"
|
|
33
|
+
lang = "en"
|
|
34
|
+
description = "English via XTTS-v2 zero-shot clone — proxied to Auralis on junkpile"
|
|
35
|
+
voice_path = "/home/madcat/.local/share/bt7274/canonical_en_ref_22k_mono.wav"
|
|
36
|
+
|
|
37
|
+
[voices.params]
|
|
38
|
+
speed = 1.0
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
[[voices]]
|
|
42
|
+
id = "bt7274-pl-xtts"
|
|
43
|
+
engine = "xtts"
|
|
44
|
+
lang = "pl"
|
|
45
|
+
description = "Polish via XTTS-v2 zero-shot clone — proxied to Auralis on junkpile"
|
|
46
|
+
voice_path = "/home/madcat/.local/share/bt7274/canonical_pl_ref_22k_mono.wav"
|
|
47
|
+
|
|
48
|
+
[voices.params]
|
|
49
|
+
speed = 1.0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
[[voices]]
|
|
53
|
+
id = "bt7274-en-piper"
|
|
54
|
+
engine = "piper"
|
|
55
|
+
lang = "en"
|
|
56
|
+
description = "English via Piper ONNX — fast CPU fallback, custom trained"
|
|
57
|
+
voice_path = "/home/madcat/piper-voices/bt7274.onnx"
|
|
58
|
+
|
|
59
|
+
[voices.params]
|
|
60
|
+
length_scale = 1.0
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Lessac — Piper CPU fallback voice
|
|
2
|
+
# No cloning, fast deterministic synthesis. CPU-only baseline.
|
|
3
|
+
|
|
4
|
+
default_voice = "lessac"
|
|
5
|
+
|
|
6
|
+
[[voices]]
|
|
7
|
+
id = "lessac"
|
|
8
|
+
engine = "piper"
|
|
9
|
+
lang = "en"
|
|
10
|
+
description = "Piper lessac medium — fast EN fallback, no GPU required"
|
|
11
|
+
voice_path = "/home/madcat/piper-voices/en_US-lessac-medium.onnx"
|
|
12
|
+
|
|
13
|
+
[voices.params]
|
|
14
|
+
length_scale = 1.0
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[Unit]
|
|
2
|
+
Description=madcat-tts — MADCAT OS TTS daemon (Chatterbox + Piper in-process)
|
|
3
|
+
After=network-online.target
|
|
4
|
+
Wants=network-online.target
|
|
5
|
+
|
|
6
|
+
[Service]
|
|
7
|
+
Type=simple
|
|
8
|
+
WorkingDirectory=%h/Projects/madcat-tts
|
|
9
|
+
Environment=PYTHONUNBUFFERED=1
|
|
10
|
+
Environment=MADCAT_TTS_HOST=0.0.0.0
|
|
11
|
+
Environment=MADCAT_TTS_PORT=14099
|
|
12
|
+
Environment=MADCAT_TTS_LOG=info
|
|
13
|
+
# XTTS proxy target. The sinanju host runs Auralis locally on :8020.
|
|
14
|
+
Environment=MADCAT_TTS_XTTS_URL=http://localhost:8020
|
|
15
|
+
# LLM text normalizer (vllm-tts LoRA). Set MADCAT_TTS_NORMALIZE=0 to bypass.
|
|
16
|
+
Environment=MADCAT_TTS_NORMALIZER_URL=http://localhost:8002
|
|
17
|
+
# uv-managed venv at $WorkingDirectory/.venv
|
|
18
|
+
ExecStart=%h/Projects/madcat-tts/.venv/bin/python -m madcat_tts
|
|
19
|
+
Restart=on-failure
|
|
20
|
+
RestartSec=5s
|
|
21
|
+
|
|
22
|
+
[Install]
|
|
23
|
+
WantedBy=default.target
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
schema_version = 1
|
|
2
|
+
|
|
3
|
+
[persona]
|
|
4
|
+
slug = "bt7274"
|
|
5
|
+
name = "bt7274"
|
|
6
|
+
type = "assistant"
|
|
7
|
+
tagline = ""
|
|
8
|
+
prompt = "prompt.md"
|
|
9
|
+
default_lang = "pl"
|
|
10
|
+
|
|
11
|
+
[theme]
|
|
12
|
+
palette = "default"
|
|
13
|
+
typography = "sans"
|
|
14
|
+
mode = "dark"
|
|
15
|
+
accent = ""
|
|
16
|
+
|
|
17
|
+
[sink]
|
|
18
|
+
default = ""
|
|
19
|
+
|
|
20
|
+
[voices.pl]
|
|
21
|
+
id = "bt7274-pl"
|
|
22
|
+
backend = "chatterbox"
|
|
23
|
+
lang = "pl"
|
|
24
|
+
voice_path = "/home/madcat/.local/share/bt7274/canonical_pl_ref_22k_mono.wav"
|
|
25
|
+
description = "Canonical Polish voice — Chatterbox GPU cloning, 10s reference"
|
|
26
|
+
|
|
27
|
+
[voices.pl.params]
|
|
28
|
+
cfg_weight = 0.5
|
|
29
|
+
exaggeration = 0.5
|
|
30
|
+
|
|
31
|
+
[voices.en]
|
|
32
|
+
id = "bt7274-en"
|
|
33
|
+
backend = "chatterbox"
|
|
34
|
+
lang = "en"
|
|
35
|
+
voice_path = "/home/madcat/.local/share/bt7274/canonical_en_ref_22k_mono.wav"
|
|
36
|
+
description = "Canonical English voice — Chatterbox GPU cloning, 10s reference"
|
|
37
|
+
|
|
38
|
+
[voices.en.params]
|
|
39
|
+
cfg_weight = 0.5
|
|
40
|
+
exaggeration = 0.5
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
schema_version = 1
|
|
2
|
+
|
|
3
|
+
[persona]
|
|
4
|
+
slug = "lessac"
|
|
5
|
+
name = "lessac"
|
|
6
|
+
type = "assistant"
|
|
7
|
+
tagline = ""
|
|
8
|
+
prompt = "prompt.md"
|
|
9
|
+
default_lang = "en"
|
|
10
|
+
|
|
11
|
+
[theme]
|
|
12
|
+
palette = "default"
|
|
13
|
+
typography = "sans"
|
|
14
|
+
mode = "dark"
|
|
15
|
+
accent = ""
|
|
16
|
+
|
|
17
|
+
[sink]
|
|
18
|
+
default = ""
|
|
19
|
+
|
|
20
|
+
[voices.en]
|
|
21
|
+
id = "lessac"
|
|
22
|
+
backend = "piper-remote"
|
|
23
|
+
lang = "en"
|
|
24
|
+
voice_path = "/home/madcat/piper-voices/en_US-lessac-medium.onnx"
|
|
25
|
+
description = "Piper lessac medium — fast EN fallback, no GPU required"
|
|
26
|
+
|
|
27
|
+
[voices.en.params]
|
|
28
|
+
length_scale = 1.0
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[Unit]
|
|
2
|
+
Description=vLLM TTS Normalizer (Qwen3-4B-Instruct-2507)
|
|
3
|
+
After=network-online.target
|
|
4
|
+
Wants=network-online.target
|
|
5
|
+
|
|
6
|
+
[Service]
|
|
7
|
+
Type=simple
|
|
8
|
+
User=madcat
|
|
9
|
+
Group=sudo
|
|
10
|
+
ExecStart=/usr/local/bin/vllm serve --config /etc/vllm/tts.yaml
|
|
11
|
+
Restart=on-failure
|
|
12
|
+
RestartSec=5
|
|
13
|
+
Environment="HF_HOME=/home/madcat/.cache/huggingface"
|
|
14
|
+
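# Secrets must not live in the unit file (it is world-readable and ships in the sdist).
# Put HF_TOKEN=<token> in a root-owned, mode-0600 file; the leading "-" makes the file optional.
EnvironmentFile=-/etc/vllm/tts.env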
|
|
15
|
+
Environment="CUDA_VISIBLE_DEVICES=0"
|
|
16
|
+
Environment="VLLM_USAGE_SOURCE=production"
|
|
17
|
+
|
|
18
|
+
[Install]
|
|
19
|
+
WantedBy=multi-user.target
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# vLLM config — TTS text normalizer (Qwen3-4B-Instruct-2507)
|
|
2
|
+
# Replaces Qwen2.5-1.5B: stronger multilingual (119 langs, incl. Polish),
|
|
3
|
+
# no LoRA needed — base instruct model handles TTS normalization via prompting.
|
|
4
|
+
# Non-thinking-only variant: no CoT overhead, fast inference for text transforms.
|
|
5
|
+
|
|
6
|
+
# ── Model ────────────────────────────────────────────────────
|
|
7
|
+
model: Qwen/Qwen3-4B-Instruct-2507
|
|
8
|
+
served-model-name: qwen3-4b-tts-norm
|
|
9
|
+
dtype: auto
|
|
10
|
+
|
|
11
|
+
# ── Context & Batching ───────────────────────────────────────
|
|
12
|
+
# Normalization is short-form: inputs rarely exceed 200 tokens.
|
|
13
|
+
max-model-len: 1024
|
|
14
|
+
max-num-seqs: 4
|
|
15
|
+
max-num-batched-tokens: 1024
|
|
16
|
+
|
|
17
|
+
# ── Memory Management ───────────────────────────────────────
|
|
18
|
+
# 4B model is ~8GiB in bf16. On GB10's 128GB unified memory,
|
|
19
|
+
# 0.15 = ~19GiB — enough for weights + KV cache + activations.
|
|
20
|
+
# Old value 0.08 (10GiB) left barely 2GiB for KV cache.
|
|
21
|
+
gpu-memory-utilization: 0.15
|
|
22
|
+
|
|
23
|
+
# ── Performance ──────────────────────────────────────────────
|
|
24
|
+
# Prefix caching helps when the same system prompt is reused
|
|
25
|
+
# across all normalization calls (which it is).
|
|
26
|
+
enable-prefix-caching: true
|
|
27
|
+
|
|
28
|
+
# ── Network ──────────────────────────────────────────────────
|
|
29
|
+
host: 0.0.0.0
|
|
30
|
+
port: 8002
|