madcat-tts 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. madcat_tts-0.3.0/.github/workflows/main.yml +29 -0
  2. madcat_tts-0.3.0/.gitignore +32 -0
  3. madcat_tts-0.3.0/AGENTS.md +148 -0
  4. madcat_tts-0.3.0/PKG-INFO +19 -0
  5. madcat_tts-0.3.0/README.md +157 -0
  6. madcat_tts-0.3.0/conf/carts/bt7274.toml +60 -0
  7. madcat_tts-0.3.0/conf/carts/lessac.toml +14 -0
  8. madcat_tts-0.3.0/conf/madcat-tts.service +23 -0
  9. madcat_tts-0.3.0/conf/personas/bt7274.pcart/cart.toml +40 -0
  10. madcat_tts-0.3.0/conf/personas/lessac.pcart/cart.toml +28 -0
  11. madcat_tts-0.3.0/conf/vllm-tts.service +19 -0
  12. madcat_tts-0.3.0/conf/vllm-tts.yaml +30 -0
  13. madcat_tts-0.3.0/docs/PLAN.md +185 -0
  14. madcat_tts-0.3.0/install.sh +77 -0
  15. madcat_tts-0.3.0/pyproject.toml +40 -0
  16. madcat_tts-0.3.0/src/madcat_tts/__init__.py +9 -0
  17. madcat_tts-0.3.0/src/madcat_tts/__main__.py +46 -0
  18. madcat_tts-0.3.0/src/madcat_tts/chunker.py +185 -0
  19. madcat_tts-0.3.0/src/madcat_tts/config.py +178 -0
  20. madcat_tts-0.3.0/src/madcat_tts/engines/__init__.py +5 -0
  21. madcat_tts-0.3.0/src/madcat_tts/engines/base.py +54 -0
  22. madcat_tts-0.3.0/src/madcat_tts/engines/chatterbox.py +81 -0
  23. madcat_tts-0.3.0/src/madcat_tts/engines/chatterbox_turbo.py +98 -0
  24. madcat_tts-0.3.0/src/madcat_tts/engines/piper.py +80 -0
  25. madcat_tts-0.3.0/src/madcat_tts/engines/xtts_proxy.py +172 -0
  26. madcat_tts-0.3.0/src/madcat_tts/main.py +489 -0
  27. madcat_tts-0.3.0/src/madcat_tts/normalize.py +136 -0
  28. madcat_tts-0.3.0/src/madcat_tts/pcart.py +452 -0
  29. madcat_tts-0.3.0/src/madcat_tts/postprocess.py +117 -0
  30. madcat_tts-0.3.0/src/madcat_tts/registry.py +264 -0
  31. madcat_tts-0.3.0/src/madcat_tts/schemas.py +101 -0
@@ -0,0 +1,29 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ permissions:
9
+ contents: read
10
+ id-token: write
11
+
12
+ jobs:
13
+ publish:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+
22
+ - name: Install build tools
23
+ run: pip install --upgrade build
24
+
25
+ - name: Build sdist and wheel
26
+ run: python -m build
27
+
28
+ - name: Publish to PyPI
29
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,32 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ .eggs/
7
+ build/
8
+ dist/
9
+ .venv
10
+ .venv/
11
+ venv
12
+ venv/
13
+ .python-version
14
+
15
+ # uv
16
+ uv.lock
17
+
18
+ # Editors
19
+ .vscode/
20
+ .idea/
21
+ *.swp
22
+ .DS_Store
23
+
24
+ # Local state / outputs
25
+ /state.toml
26
+ /tmp/
27
+ *.wav
28
+ *.onnx
29
+ *.onnx.json
30
+
31
+ # OS
32
+ .DS_Store
@@ -0,0 +1,148 @@
1
+ # madcat-tts Agent Guide
2
+
3
+ ## Workspace Structure
4
+
5
+ ```
6
+ madcat-tts/
7
+ ├── src/madcat_tts/
8
+ │ ├── __main__.py # Entry point: uvicorn launcher, env var parsing
9
+ │ ├── main.py # FastAPI app, EnginePool, route handlers
10
+ │ ├── schemas.py # Pydantic models (Voice, Cart, AudioSpeechRequest, …)
11
+ │ ├── registry.py # Cart catalog loader (TOML) + state persistence
12
+ │ ├── normalize.py # LLM-based text normalizer (OpenAI-compatible chat-completions endpoint via httpx)
13
+ │ └── engines/
14
+ │ ├── base.py # Engine ABC + SynthResult dataclass
15
+ │ ├── chatterbox.py # ChatterboxMultilingualTTS, GPU-resident
16
+ │ ├── piper.py # PiperVoice ONNX, CPU-resident, lazy per-path cache
17
+ │ └── xtts_proxy.py # HTTP proxy to remote Auralis/XTTS-v2 backend
18
+ ├── conf/
19
+ │ ├── carts/
20
+ │ │ ├── bt7274.toml # BT-7274 persona — chatterbox EN/PL + piper + xtts voices
21
+ │ │ └── lessac.toml # Piper CPU-only fallback voice
22
+ │ └── madcat-tts.service # systemd user unit (port 14099)
23
+ ├── install.sh # Idempotent deploy script for sinanju
24
+ ├── pyproject.toml
25
+ └── README.md
26
+ ```
27
+
28
+ Runtime paths (XDG-based, not in-repo):
29
+ - Cart catalog: `~/.config/madcat-tts/carts/*.toml`
30
+ - Active state: `~/.local/state/madcat-tts/state.toml`
31
+ - Voice refs (chatterbox WAVs): `~/.local/share/bt7274/*.wav`
32
+ - Piper models: `~/piper-voices/*.onnx` (sibling `.onnx.json` required)
33
+
34
+ ## Key Commands
35
+
36
+ ```sh
37
+ # Install / deploy to sinanju (idempotent)
38
+ ./install.sh
39
+
40
+ # Reuse existing CUDA torch venv (for chatterbox on GB10/DGX/aarch64 CUDA 13)
41
+ MADCAT_TTS_VENV=$HOME/cb ./install.sh
42
+
43
+ # Run directly
44
+ python -m madcat_tts
45
+ # or
46
+ madcat-tts
47
+
48
+ # Build/sync venv
49
+ uv sync
50
+
51
+ # Systemd user unit management
52
+ systemctl --user start madcat-tts
53
+ systemctl --user restart madcat-tts
54
+ systemctl --user status madcat-tts
55
+ journalctl --user -u madcat-tts -f
56
+
57
+ # Health check
58
+ curl http://localhost:14099/health | python3 -m json.tool
59
+
60
+ # List carts
61
+ curl http://localhost:14099/carts
62
+
63
+ # Switch active cart
64
+ curl -X POST http://localhost:14099/carts/active -H 'Content-Type: application/json' -d '{"id":"bt7274"}'
65
+
66
+ # Synthesize
67
+ curl -X POST http://localhost:14099/v1/audio/speech \
68
+ -H 'Content-Type: application/json' \
69
+ -d '{"input":"Hello world","voice":"bt7274-en","response_format":"wav"}' \
70
+ --output out.wav
71
+ ```
72
+
73
+ ## Critical Implementation Details
74
+
75
+ **Engine pool serialization:** Each engine is locked per-call (`threading.Lock`). GPU contention from chatterbox makes single-threading correct. Piper also benefits. Never add async synthesis — model init and inference are not async-safe.
76
+
77
+ **Single worker only:** uvicorn runs `workers=1`. Engines are model-resident and not fork-safe. Do not change this.
78
+
79
+ **Leading silence padding:** 300ms of zeros is prepended to every response (`MADCAT_TTS_LEADING_SILENCE_MS`, default 300). Compensates for AVAudioPlayer / afplay decode warmup latency on macOS. Override with `MADCAT_TTS_LEADING_SILENCE_MS=0` to disable.
80
+
81
+ **XTTS proxy pattern:** Coqui XTTS-v2 requires `transformers==5.9.0`, chatterbox requires `transformers==5.2.0` — incompatible. XTTS runs in a separate process (Auralis on junkpile:8020). The `XttsProxyEngine` base64-encodes the local WAV ref, POSTs to the remote, decodes the audio response. Reference WAV is read from the daemon's local filesystem.
82
+
83
+ **Auralis upstream bug workaround:** `AudioSpeechGenerationRequest.speed` has a trailing-comma tuple default in Auralis 0.2.8.post2 that raises `TypeError` on `<=` comparison when `speed` is absent. The proxy **always** sends `speed` explicitly. This is load-bearing — do not remove.
84
+
85
+ **Voice resolution priority:**
86
+ 1. Exact voice id (e.g. `bt7274-en`)
87
+ 2. Cart tag (e.g. `bt7274`) → resolves to that cart's `default_voice`
88
+ 3. `null` / omitted → active voice from `state.toml`
89
+ 4. `list[str]` → ad-hoc base64 WAV clone (chatterbox only, first element used)
90
+
91
+ **Engine init failures are non-fatal:** If chatterbox fails to load (CUDA unavailable, model missing), the daemon continues and returns HTTP 503 only for requests targeting that engine. Piper + xtts remain available.
92
+
93
+ **Text normalization:** `TextNormalizer` calls a vLLM instance at `MADCAT_TTS_NORMALIZER_URL` (default `http://localhost:8002`) using the `qwen3-4b-tts-norm` served model (Qwen3-4B-Instruct-2507, prompted directly — no LoRA; see `conf/vllm-tts.yaml`). Any failure falls back to the raw text — normalization is best-effort. Results are kept in an LRU cache (512 entries, configurable via `MADCAT_TTS_NORMALIZER_CACHE_SIZE`). Disable with `MADCAT_TTS_NORMALIZE=0`.
94
+
95
+ **Audio formats:** Only `wav`, `flac`, `pcm` are actually supported despite `ResponseFormat` including `mp3`. Format table in `main.py` controls what soundfile gets asked to write.
96
+
97
+ **State persistence:** `Registry._write_state()` uses an atomic write (tmp file + rename). The state is read back on daemon restart. If `state.toml` references a cart or voice that no longer exists, it is ignored and the first cart's default voice is used.
98
+
99
+ **Chatterbox params:** `cfg_weight` (0–1, default 0.5) and `exaggeration` (0–2, default 0.5). Cross-language references (e.g. EN voice for PL synthesis) often need `cfg_weight` lowered. `t3_model` param selects alternate checkpoint.
100
+
101
+ **Piper voice caching:** `PiperEngine` caches loaded `PiperVoice` instances keyed by path. Multiple carts can share an ONNX model file without reloading.
102
+
103
+ ## Database / Storage
104
+
105
+ No database. Two TOML-based stores:
106
+
107
+ | Path | Purpose | Access |
108
+ |------|---------|--------|
109
+ | `~/.config/madcat-tts/carts/*.toml` | Cart catalog (voices, engine, params) | Read-only at runtime |
110
+ | `~/.local/state/madcat-tts/state.toml` | Active cart + voice selection | Read/write by daemon |
111
+
112
+ Cart file schema (per-file): `default_voice` string + `[[voices]]` array. Each voice: `id`, `engine`, `lang`, `description`, `voice_path`, `[params]`.
113
+
114
+ ## Development Notes
115
+
116
+ - **Language:** Python 3.11+, uses `from __future__ import annotations` throughout
117
+ - **Python version in use:** 3.14 (CPython, aarch64, sinanju/GB10)
118
+ - **Framework:** FastAPI + uvicorn (single worker)
119
+ - **Package manager:** uv — `uv sync` for fresh venv, `uv pip install` for reuse path
120
+ - **Key deps:** `chatterbox-tts`, `piper-tts>=1.4.0`, `torch`, `soundfile`, `fastapi`, `httpx`, `pydantic>=2.7`, `tomli-w`
121
+ - **TOML:** Read via stdlib `tomllib` (py3.11+), write via `tomli-w`
122
+ - **Target host:** sinanju (DGX Spark / GB10 / aarch64 / CUDA 13)
123
+ - **Service port:** 14099 (default)
124
+
125
+ ## Conventions
126
+
127
+ - Engine names are lowercase string literals: `"chatterbox"`, `"piper"`, `"xtts"` — the `EngineName` `Literal` type in schemas.py is the source of truth
128
+ - Voice IDs follow `<persona>-<lang>[-<engine>]` convention: `bt7274-en`, `bt7274-pl-xtts`, `bt7274-en-piper`
129
+ - Cart tag = TOML filename stem: `bt7274.toml` → tag `bt7274`
130
+ - All engine `synth()` calls return `SynthResult(audio: np.ndarray float32 [-1,1], sample_rate: int)` — encoding to WAV/FLAC/PCM happens in the HTTP layer
131
+ - Response headers carry synthesis metadata: `X-Madcat-Cart`, `X-Madcat-Voice`, `X-Madcat-Engine`, `X-Madcat-Sample-Rate`, `X-Madcat-Duration-Sec`, `X-Madcat-Leading-Silence-Ms`
132
+ - No multi-worker, no async synth — model-resident GPU engines are intentionally synchronous
133
+ - Env vars follow `MADCAT_TTS_*` prefix convention
134
+
135
+ ## Entry Points
136
+
137
+ | Surface | Detail |
138
+ |---------|--------|
139
+ | `python -m madcat_tts` | Main daemon entry (`__main__.py` → uvicorn) |
140
+ | `madcat-tts` | Console script alias (same) |
141
+ | `POST /v1/audio/speech` | OpenAI-compatible synthesis endpoint |
142
+ | `GET /health` | Engine availability + normalizer stats |
143
+ | `GET /carts` | Full cart catalog |
144
+ | `POST /carts/active` | Switch active voice/cart |
145
+ | `GET /docs` | FastAPI Swagger UI |
146
+ | `Engine` ABC | `engines/base.py` — implement for new backends |
147
+ | `Registry` | `registry.py` — cart catalog and state |
148
+ | `EnginePool` | `main.py` — engine lifecycle, lock management |
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: madcat-tts
3
+ Version: 0.3.0
4
+ Summary: TTS daemon (Chatterbox + Piper in-process; XTTS proxied) for the MADCAT OS substrate.
5
+ Author: chi
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: chatterbox-tts
9
+ Requires-Dist: fastapi>=0.115
10
+ Requires-Dist: httpx>=0.27
11
+ Requires-Dist: numpy
12
+ Requires-Dist: piper-tts>=1.4.0
13
+ Requires-Dist: pydantic>=2.7
14
+ Requires-Dist: setproctitle>=1.3
15
+ Requires-Dist: soundfile
16
+ Requires-Dist: tomli-w>=1.0
17
+ Requires-Dist: torch
18
+ Requires-Dist: torchaudio
19
+ Requires-Dist: uvicorn[standard]>=0.30
@@ -0,0 +1,157 @@
1
+ # madcat-tts
2
+
3
+ Single TTS daemon fronting four engines behind one OpenAI-compatible HTTP
4
+ endpoint.
5
+
6
+ | engine | location | strengths |
7
+ | ------------------ | ------------------ | -------------------------------------------- |
8
+ | `chatterbox` | in-process, GPU | Voice cloning, EN + PL, multilingual |
9
+ | `chatterbox-turbo` | in-process, GPU | Fast single-step, paralinguistic tags (EN) |
10
+ | `piper` | in-process, CPU | Fast, deterministic, no GPU |
11
+ | `xtts` | HTTP-proxied | XTTS-v2 zero-shot clone via remote backend |
12
+
13
+ ## Install
14
+
15
+ ```sh
16
+ pip install madcat-tts
17
+ ```
18
+
19
+ Or with [uv](https://docs.astral.sh/uv/):
20
+
21
+ ```sh
22
+ uv pip install madcat-tts
23
+ ```
24
+
25
+ ## Quick start
26
+
27
+ ```sh
28
+ # Start the daemon
29
+ madcat-tts
30
+
31
+ # Health check
32
+ curl http://localhost:14099/health
33
+
34
+ # Synthesize
35
+ curl -X POST http://localhost:14099/v1/audio/speech \
36
+ -H 'Content-Type: application/json' \
37
+ -d '{"input":"Hello world","voice":"bt7274-en","response_format":"wav"}' \
38
+ --output out.wav
39
+ ```
40
+
41
+ ## Configuration
42
+
43
+ Config lives in `~/.config/madcat/config.toml`:
44
+
45
+ ```toml
46
+ [tts]
47
+ host = "0.0.0.0"
48
+ port = 14099
49
+ log_level = "info"
50
+ leading_silence_ms = 300
51
+
52
+ [tts.normalizer]
53
+ url = "http://localhost:8002"
54
+ model = "qwen3-4b-tts-norm"
55
+ enabled = true
56
+
57
+ [tts.chunking]
58
+ chunk_size = 250
59
+ chunk_gap_ms = 200
60
+ trim_silence = true
61
+ max_silence_ms = 400
62
+ ```
63
+
64
+ All settings have env var overrides (`MADCAT_TTS_HOST`, `MADCAT_TTS_PORT`,
65
+ etc.) and sensible defaults. The daemon runs without a config file.
66
+
67
+ ## Endpoints
68
+
69
+ | route | method | purpose |
70
+ | ----------------------- | ------ | --------------------------------------------------------------- |
71
+ | `/v1/audio/speech` | POST | OpenAI-compat synthesis. `voice` = cart/voice id or base64 WAVs |
72
+ | `/carts` | GET | List cart catalog with active marker |
73
+ | `/carts/active` | GET | Current active cart + voice |
74
+ | `/carts/active` | POST | Switch active cart/voice |
75
+ | `/health` | GET | Liveness + per-engine availability + normalizer stats |
76
+ | `/docs` | GET | Swagger UI |
77
+
78
+ ## Carts
79
+
80
+ A **cart** maps a persona to one or more voices. Each voice binds an engine,
81
+ language, reference audio, and engine-specific params. Catalog lives at
82
+ `~/.config/madcat/carts/*.toml` (one file per cart).
83
+
84
+ Example cart (`bt7274.toml`):
85
+
86
+ ```toml
87
+ default_voice = "bt7274-en-piper"
88
+
89
+ [[voices]]
90
+ id = "bt7274-en"
91
+ engine = "chatterbox"
92
+ lang = "en"
93
+ description = "English voice — Chatterbox GPU cloning"
94
+ voice_path = "/path/to/english.wav"
95
+
96
+ [voices.params]
97
+ cfg_weight = 0.5
98
+ exaggeration = 0.5
99
+
100
+ [[voices]]
101
+ id = "bt7274-en-piper"
102
+ engine = "piper"
103
+ lang = "en"
104
+ description = "English voice — Piper CPU fallback"
105
+ voice_path = "/path/to/english.onnx"
106
+
107
+ [voices.params]
108
+ length_scale = 1.0
109
+ ```
110
+
111
+ Active selection persists to `~/.local/share/madcat/tts-state.toml`.
112
+
113
+ ## Text normalization
114
+
115
+ Optional LLM-based preprocessor that expands numbers, acronyms, and symbols
116
+ before synthesis. Calls any OpenAI-compatible `/v1/chat/completions` endpoint.
117
+ Disable with `MADCAT_TTS_NORMALIZE=0` or per-request `"normalize": false`.
118
+
119
+ ## Engine notes
120
+
121
+ - **chatterbox**: Multilingual voice cloning. Requires CUDA GPU. 10s mono
122
+ reference WAV. Params: `cfg_weight` (0-1), `exaggeration` (0-2).
123
+ - **chatterbox-turbo**: 350M parameter English-only model. Supports
124
+ paralinguistic tags: `[laugh]`, `[chuckle]`, `[cough]`, `[sigh]`,
125
+ `[gasp]`, `[groan]`. Single-step decoder, lighter VRAM.
126
+ - **piper**: CPU ONNX inference. Deterministic. Voice = `.onnx` file with
127
+ sibling `.onnx.json` config. Params: `length_scale`, `noise_scale`.
128
+ - **xtts**: HTTP proxy to a remote XTTS-v2 backend (e.g. Auralis). Configure
129
+ via `MADCAT_TTS_XTTS_URL` (default `http://localhost:8020`).
130
+
131
+ Engine init failures are non-fatal — the daemon stays up and returns 503
132
+ only for requests targeting unavailable engines.
133
+
134
+ ## Layout
135
+
136
+ ```
137
+ src/madcat_tts/
138
+ ├── __main__.py # uvicorn launcher
139
+ ├── main.py # FastAPI app, EnginePool, routes
140
+ ├── schemas.py # Pydantic models
141
+ ├── registry.py # Cart catalog + state persistence
142
+ ├── config.py # Config reader (~/.config/madcat/config.toml)
143
+ ├── normalize.py # LLM text normalizer
144
+ ├── chunker.py # Sentence-boundary chunking
145
+ ├── postprocess.py # Audio post-processing (silence trim/compress)
146
+ ├── pcart.py # Cart format utilities
147
+ └── engines/
148
+ ├── base.py # Engine ABC + SynthResult
149
+ ├── chatterbox.py # ChatterboxMultilingualTTS
150
+ ├── chatterbox_turbo.py # ChatterboxTurboTTS (paralinguistic)
151
+ ├── piper.py # PiperVoice ONNX
152
+ └── xtts_proxy.py # HTTP proxy to remote backend
153
+ ```
154
+
155
+ ## License
156
+
157
+ MIT
@@ -0,0 +1,60 @@
1
+ # BT-7274 — Vanguard-class Titan AI
2
+ # Voices across multiple TTS engines for the canonical persona.
3
+
4
+ default_voice = "bt7274-en-piper"
5
+
6
+ [[voices]]
7
+ id = "bt7274-pl"
8
+ engine = "chatterbox"
9
+ lang = "pl"
10
+ description = "Canonical Polish voice — Chatterbox GPU cloning, 10s reference"
11
+ voice_path = "/home/madcat/.local/share/bt7274/canonical_pl_ref_22k_mono.wav"
12
+
13
+ [voices.params]
14
+ cfg_weight = 0.5
15
+ exaggeration = 0.5
16
+
17
+
18
+ [[voices]]
19
+ id = "bt7274-en"
20
+ engine = "chatterbox"
21
+ lang = "en"
22
+ description = "Canonical English voice — Chatterbox GPU cloning, 10s reference"
23
+ voice_path = "/home/madcat/.local/share/bt7274/canonical_en_ref_22k_mono.wav"
24
+
25
+ [voices.params]
26
+ cfg_weight = 0.5
27
+ exaggeration = 0.5
28
+
29
+
30
+ [[voices]]
31
+ id = "bt7274-en-xtts"
32
+ engine = "xtts"
33
+ lang = "en"
34
+ description = "English via XTTS-v2 zero-shot clone — proxied to Auralis on junkpile"
35
+ voice_path = "/home/madcat/.local/share/bt7274/canonical_en_ref_22k_mono.wav"
36
+
37
+ [voices.params]
38
+ speed = 1.0
39
+
40
+
41
+ [[voices]]
42
+ id = "bt7274-pl-xtts"
43
+ engine = "xtts"
44
+ lang = "pl"
45
+ description = "Polish via XTTS-v2 zero-shot clone — proxied to Auralis on junkpile"
46
+ voice_path = "/home/madcat/.local/share/bt7274/canonical_pl_ref_22k_mono.wav"
47
+
48
+ [voices.params]
49
+ speed = 1.0
50
+
51
+
52
+ [[voices]]
53
+ id = "bt7274-en-piper"
54
+ engine = "piper"
55
+ lang = "en"
56
+ description = "English via Piper ONNX — fast CPU fallback, custom trained"
57
+ voice_path = "/home/madcat/piper-voices/bt7274.onnx"
58
+
59
+ [voices.params]
60
+ length_scale = 1.0
@@ -0,0 +1,14 @@
1
+ # Lessac — Piper CPU fallback voice
2
+ # No cloning, fast deterministic synthesis. CPU-only baseline.
3
+
4
+ default_voice = "lessac"
5
+
6
+ [[voices]]
7
+ id = "lessac"
8
+ engine = "piper"
9
+ lang = "en"
10
+ description = "Piper lessac medium — fast EN fallback, no GPU required"
11
+ voice_path = "/home/madcat/piper-voices/en_US-lessac-medium.onnx"
12
+
13
+ [voices.params]
14
+ length_scale = 1.0
@@ -0,0 +1,23 @@
1
+ [Unit]
2
+ Description=madcat-tts — MADCAT OS TTS daemon (Chatterbox + Piper in-process)
3
+ After=network-online.target
4
+ Wants=network-online.target
5
+
6
+ [Service]
7
+ Type=simple
8
+ WorkingDirectory=%h/Projects/madcat-tts
9
+ Environment=PYTHONUNBUFFERED=1
10
+ Environment=MADCAT_TTS_HOST=0.0.0.0
11
+ Environment=MADCAT_TTS_PORT=14099
12
+ Environment=MADCAT_TTS_LOG=info
13
+ # XTTS proxy target. Sin runs Auralis locally on :8020.
14
+ Environment=MADCAT_TTS_XTTS_URL=http://localhost:8020
15
+ # LLM text normalizer (vllm-tts service, see conf/vllm-tts.service). Set MADCAT_TTS_NORMALIZE=0 to bypass.
16
+ Environment=MADCAT_TTS_NORMALIZER_URL=http://localhost:8002
17
+ # uv-managed venv at $WorkingDirectory/.venv
18
+ ExecStart=%h/Projects/madcat-tts/.venv/bin/python -m madcat_tts
19
+ Restart=on-failure
20
+ RestartSec=5s
21
+
22
+ [Install]
23
+ WantedBy=default.target
@@ -0,0 +1,40 @@
1
+ schema_version = 1
2
+
3
+ [persona]
4
+ slug = "bt7274"
5
+ name = "bt7274"
6
+ type = "assistant"
7
+ tagline = ""
8
+ prompt = "prompt.md"
9
+ default_lang = "pl"
10
+
11
+ [theme]
12
+ palette = "default"
13
+ typography = "sans"
14
+ mode = "dark"
15
+ accent = ""
16
+
17
+ [sink]
18
+ default = ""
19
+
20
+ [voices.pl]
21
+ id = "bt7274-pl"
22
+ backend = "chatterbox"
23
+ lang = "pl"
24
+ voice_path = "/home/madcat/.local/share/bt7274/canonical_pl_ref_22k_mono.wav"
25
+ description = "Canonical Polish voice — Chatterbox GPU cloning, 10s reference"
26
+
27
+ [voices.pl.params]
28
+ cfg_weight = 0.5
29
+ exaggeration = 0.5
30
+
31
+ [voices.en]
32
+ id = "bt7274-en"
33
+ backend = "chatterbox"
34
+ lang = "en"
35
+ voice_path = "/home/madcat/.local/share/bt7274/canonical_en_ref_22k_mono.wav"
36
+ description = "Canonical English voice — Chatterbox GPU cloning, 10s reference"
37
+
38
+ [voices.en.params]
39
+ cfg_weight = 0.5
40
+ exaggeration = 0.5
@@ -0,0 +1,28 @@
1
+ schema_version = 1
2
+
3
+ [persona]
4
+ slug = "lessac"
5
+ name = "lessac"
6
+ type = "assistant"
7
+ tagline = ""
8
+ prompt = "prompt.md"
9
+ default_lang = "en"
10
+
11
+ [theme]
12
+ palette = "default"
13
+ typography = "sans"
14
+ mode = "dark"
15
+ accent = ""
16
+
17
+ [sink]
18
+ default = ""
19
+
20
+ [voices.en]
21
+ id = "lessac"
22
+ backend = "piper-remote"
23
+ lang = "en"
24
+ voice_path = "/home/madcat/piper-voices/en_US-lessac-medium.onnx"
25
+ description = "Piper lessac medium — fast EN fallback, no GPU required"
26
+
27
+ [voices.en.params]
28
+ length_scale = 1.0
@@ -0,0 +1,19 @@
1
+ [Unit]
2
+ Description=vLLM TTS Normalizer (Qwen3-4B-Instruct-2507)
3
+ After=network-online.target
4
+ Wants=network-online.target
5
+
6
+ [Service]
7
+ Type=simple
8
+ User=madcat
9
+ Group=sudo
10
+ ExecStart=/usr/local/bin/vllm serve --config /etc/vllm/tts.yaml
11
+ Restart=on-failure
12
+ RestartSec=5
13
+ Environment="HF_HOME=/home/madcat/.cache/huggingface"
14
+ EnvironmentFile=/etc/vllm/tts.env
15
+ Environment="CUDA_VISIBLE_DEVICES=0"
16
+ Environment="VLLM_USAGE_SOURCE=production"
17
+
18
+ [Install]
19
+ WantedBy=multi-user.target
@@ -0,0 +1,30 @@
1
+ # vLLM config — TTS text normalizer (Qwen3-4B-Instruct-2507)
2
+ # Replaces Qwen2.5-1.5B: stronger multilingual (119 langs, incl. Polish),
3
+ # no LoRA needed — base instruct model handles TTS normalization via prompting.
4
+ # Non-thinking-only variant: no CoT overhead, fast inference for text transforms.
5
+
6
+ # ── Model ────────────────────────────────────────────────────
7
+ model: Qwen/Qwen3-4B-Instruct-2507
8
+ served-model-name: qwen3-4b-tts-norm
9
+ dtype: auto
10
+
11
+ # ── Context & Batching ───────────────────────────────────────
12
+ # Normalization is short-form: inputs rarely exceed 200 tokens.
13
+ max-model-len: 1024
14
+ max-num-seqs: 4
15
+ max-num-batched-tokens: 1024
16
+
17
+ # ── Memory Management ───────────────────────────────────────
18
+ # 4B model is ~8GiB in bf16. On GB10's 128GB unified memory,
19
+ # 0.15 = ~19GiB — enough for weights + KV cache + activations.
20
+ # Old value 0.08 (10GiB) left barely 2GiB for KV cache.
21
+ gpu-memory-utilization: 0.15
22
+
23
+ # ── Performance ──────────────────────────────────────────────
24
+ # Prefix caching helps when the same system prompt is reused
25
+ # across all normalization calls (which it is).
26
+ enable-prefix-caching: true
27
+
28
+ # ── Network ──────────────────────────────────────────────────
29
+ host: 0.0.0.0
30
+ port: 8002