pipecat-local-tts-server 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2026, Varun Singh
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,293 @@
1
+ Metadata-Version: 2.3
2
+ Name: pipecat-local-tts-server
3
+ Version: 0.1.0
4
+ Summary: Standalone local WebSocket text-to-speech (TTS) server, client, and pluggable mlx-audio backends for the Pipecat ecosystem
5
+ Keywords: tts,text-to-speech,kokoro,mlx,websocket,pipecat,speech,audio
6
+ Author: Varun Singh
7
+ Author-email: Varun Singh <varun@varunsingh.net>
8
+ License: BSD 2-Clause License
9
+
10
+ Copyright (c) 2026, Varun Singh
11
+
12
+ Redistribution and use in source and binary forms, with or without
13
+ modification, are permitted provided that the following conditions are met:
14
+
15
+ 1. Redistributions of source code must retain the above copyright notice, this
16
+ list of conditions and the following disclaimer.
17
+
18
+ 2. Redistributions in binary form must reproduce the above copyright notice,
19
+ this list of conditions and the following disclaimer in the documentation
20
+ and/or other materials provided with the distribution.
21
+
22
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: BSD License
35
+ Classifier: Operating System :: MacOS :: MacOS X
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Programming Language :: Python :: 3.13
39
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
40
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
41
+ Requires-Dist: websockets>=13.0
42
+ Requires-Dist: websockets>=13.0 ; extra == 'client'
43
+ Requires-Dist: websockets>=13.0 ; extra == 'examples'
44
+ Requires-Dist: pipecat-ai==1.4.0 ; extra == 'examples'
45
+ Requires-Dist: websockets>=13.0 ; extra == 'kokoro'
46
+ Requires-Dist: mlx-audio==0.4.4 ; extra == 'kokoro'
47
+ Requires-Dist: misaki[en] ; extra == 'kokoro'
48
+ Requires-Python: >=3.12
49
+ Project-URL: Homepage, https://github.com/vr000m/pipecat-local-tts-server
50
+ Project-URL: Issues, https://github.com/vr000m/pipecat-local-tts-server/issues
51
+ Project-URL: Repository, https://github.com/vr000m/pipecat-local-tts-server
52
+ Provides-Extra: client
53
+ Provides-Extra: examples
54
+ Provides-Extra: kokoro
55
+ Description-Content-Type: text/markdown
56
+
57
+ # pipecat-local-tts-server
58
+
59
+ Standalone, local WebSocket **text-to-speech (TTS) server** — text in, audio
60
+ out — for the Pipecat ecosystem. It mirrors the sibling
61
+ [`pipecat-local-stt-server`](https://github.com/vr000m/pipecat-local-stt-server):
62
+ same websocket transport, an OpenAI-Realtime-inspired protocol subset, a
63
+ pluggable backend abstraction, and lazy-imported per-model backends behind
64
+ optional extras so a client-only consumer never pulls the heavy TTS runtime.
65
+
66
+ Distributed as `pipecat-local-tts-server`; the import name is `tts_server`
67
+ (every `import tts_server` / `python -m tts_server` invocation works).
68
+ **Kokoro-first** (mlx-audio, Apple Silicon); more backends land later.
69
+
70
+ The server owes its client exactly two things (the contracts everything else is
71
+ built around):
72
+
73
+ 1. **An exact, stable advertised rate.** `server.hello.audio.rate` is the true
74
+ model rate (Kokoro = 24000 Hz). Every audio frame is int16-LE mono PCM at
75
+ *exactly* that rate, with no per-utterance drift. The client resamples
76
+ model-rate → device-rate off this single value.
77
+ 2. **A steady in-response stream.** Once a response starts, audio arrives
78
+ continuously (each model segment is emitted as it completes) so the client's
79
+ playback buffer never starves.
80
+
81
+ ## Install
82
+
83
+ The base package is lean — `websockets` only. Backends live behind extras.
84
+
85
+ From [PyPI](https://pypi.org/project/pipecat-local-tts-server/) (consumers):
86
+
87
+ ```sh
88
+ # client-only lean base (websockets) — for a bot that just talks to a server
89
+ uv add pipecat-local-tts-server # or: pip install pipecat-local-tts-server
90
+
91
+ # Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en])
92
+ uv add "pipecat-local-tts-server[kokoro]" # or: pip install "pipecat-local-tts-server[kokoro]"
93
+ ```
94
+
95
+ From source (development):
96
+
97
+ ```sh
98
+ # client-only (lean base: websockets) — for a bot that just talks to a server
99
+ uv sync --extra client
100
+
101
+ # Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en], which
102
+ # drags in spacy/num2words/torch — heavy by design, kept out of lean base)
103
+ uv sync --extra kokoro
104
+
105
+ # the reference Pipecat adapter example (pulls the Pipecat framework)
106
+ uv sync --extra examples
107
+ ```
108
+
109
+ > Run every command through `uv run` (or activate the venv once per shell). A
110
+ > bare `python -m tts_server …` uses the system interpreter and fails with
111
+ > `ModuleNotFoundError: No module named 'websockets'`.
112
+
113
+ ## Running the server
114
+
115
+ ```sh
116
+ # Kokoro over a Unix domain socket (recommended for local use)
117
+ uv run python -m tts_server serve --backend kokoro \
118
+ --socket-path ~/Library/Caches/pipecat-tts/tts.sock
119
+
120
+ # pick a specific model (any compatible mlx-community Kokoro repo id)
121
+ uv run python -m tts_server serve --backend kokoro \
122
+ --model mlx-community/Kokoro-82M-bf16 \
123
+ --socket-path ~/Library/Caches/pipecat-tts/tts.sock
124
+
125
+ # loopback TCP instead of a socket (choose any free port)
126
+ uv run python -m tts_server serve --backend kokoro --host 127.0.0.1 --port 8765
127
+ ```
128
+
129
+ The server logs the resolved backend + model at startup, *before* the
130
+ (potentially slow) model load, so you can see what is being loaded. The rate is
131
+ read from the loaded model, so model load completes before the first
132
+ `server.hello` is sent.
133
+
134
+ `--log-level` (default `INFO`) sets the server's logging verbosity (any standard
135
+ Python level name, e.g. `DEBUG`/`WARNING`).
136
+
137
+ On startup over a Unix socket, the server **auto-clears a stale socket** left by
138
+ a previous crash (`SIGKILL` / power loss), so the documented restart works
139
+ without manual `rm`. It refuses to start — surfacing a diagnostic instead of
140
+ clobbering — if a **non-socket file** exists at the path, or if a **live server**
141
+ is already listening there (the socket is genuinely in use).
142
+
143
+ ### Environment variables
144
+
145
+ Endpoint precedence (server and client both): **URI > socket > host+port** — `serve` has no URI form, so only socket > host+port applies to it. The
146
+ `TTS_WS_*` vars mirror `STT_WS_*`.
147
+
148
+ | Variable | Side | Purpose |
149
+ |---|---|---|
150
+ | `TTS_WS_URI` | client | Full `ws://`/`wss://` URI; highest endpoint precedence. |
151
+ | `TTS_WS_SOCKET` | client | Unix-socket path (used when no URI). |
152
+ | `TTS_WS_HOST` | client | TCP host (used when no URI/socket). |
153
+ | `TTS_WS_PORT` | client | TCP port (paired with host). |
154
+ | `TTS_WS_TOKEN` | client | Bearer token the client/probe sends. Never falls back to the server var. |
155
+ | `TTS_WS_DEFAULT_SOCKET` | client | Explicit fallback socket for `status` when nothing else is set. |
156
+ | `PIPECAT_TTS_AUTH_TOKEN` | server | Bearer token the server requires (optional auth). |
157
+ | `PIPECAT_TTS_KOKORO_EXTRA_LANGS` | server | Comma-separated ISO codes (e.g. `ja,zh`) to advertise after installing their extra G2P package. See [Kokoro language support](#kokoro-language-support-advertised--synthesizable). |
158
+
159
+ Auth notes: the server reads `PIPECAT_TTS_AUTH_TOKEN`; the client/probe reads
160
+ `TTS_WS_TOKEN` (the two are deliberately separate so a probe can never mask a
161
+ client 401 or leak the server secret to a remote host). A plaintext
162
+ `--auth-token` flag is intentionally unsupported (`ps` exposure) — use
163
+ `--auth-token-file`. A token-less server bound to a non-loopback TCP address logs
164
+ a cleartext-remote warning; a Unix socket does not. Sending a bearer over
165
+ cleartext `ws://` to a remote host also warns client-side — use `wss://` or a
166
+ Unix socket.
167
+
168
+ ## Checking server health
169
+
170
+ ```sh
171
+ uv run python -m tts_server status \
172
+ --socket-path ~/Library/Caches/pipecat-tts/tts.sock
173
+ ```
174
+
175
+ `status` connects, performs the handshake, requests a `server.status` snapshot,
176
+ and prints the backend, model, audio format/rate, capabilities
177
+ (streaming / binary_audio / voice_count), session id, synthesis **queue depth**,
178
+ the **voice list**, buffered chars, uptime, and pid. It exits non-zero if no
179
+ server is reachable.
180
+
181
+ `status` resolves its endpoint with the same **URI > socket > host+port**
182
+ precedence as the client and additionally accepts `--uri ws://…`/`wss://…` (the
183
+ serve path has no `--uri`, since it builds its listener from socket-path/host+port).
184
+ Two probe-only flags: `--timeout` (overall probe budget in seconds, default `3.0`)
185
+ and `--json` (emit the raw `hello`+`status` JSON instead of the text summary).
186
+
187
+ For day-to-day operation on macOS the [`justfile`](justfile) carries read-only
188
+ operator recipes mirroring the sibling stt server: `just tts-list` lists every
189
+ `pipecat.tts-server*` launchd agent with state, pid, and live backend, and
190
+ `just tts-status` runs the wire `status` probe against the canonical socket
191
+ (override with `just tts-status socket=…`).
192
+
193
+ ## Protocol
194
+
195
+ The full wire contract is in [`docs/protocol.md`](docs/protocol.md). In brief:
196
+ every message is a JSON text frame with a `type` field. The client drives the
197
+ session — `session.update` → `input_text.append`* → `input_text.commit` — and
198
+ the server streams `response.audio.delta` frames (base64 pcm16, `seq` from 0, no
199
+ gaps) ending in `response.audio.done`. `response.cancel` is barge-in. Audio is
200
+ base64-in-JSON for v1 (`binary_audio: false`); binary frames are a later
201
+ optimization.
202
+
203
+ **The client segments the text; the commit is the unit of work.** Using
204
+ `capabilities.streaming` and `capabilities.ideal_words`, the client splits long
205
+ text into commits, rounding `ideal_words` up to the next **sentence boundary**
206
+ — never splitting mid-sentence (a half-sentence commit makes the model apply
207
+ sentence-final prosody mid-phrase). `max_text_chars` is the hard server cap.
208
+
209
+ ### Kokoro capabilities (as shipped)
210
+
211
+ Built per-backend (`server.hello.capabilities`). Verified against
212
+ mlx-community/Kokoro-82M-bf16 (mlx-audio 0.4.4):
213
+
214
+ | Field | Value | Note |
215
+ |---|---|---|
216
+ | rate | **24000** | from `server.hello.audio.rate`, not capabilities |
217
+ | `streaming` | `false` | no sub-segment streaming; segments still stream per `\n+` |
218
+ | `binary_audio` | `false` | base64-in-JSON for v1 |
219
+ | `text_formats` | `["plain"]` | ssml/ipa not supported |
220
+ | `languages` | `["en","es","fr","hi","it","pt"]` | from voice prefixes, minus languages needing extra G2P — **`ja`/`zh` are off by default** (opt-in, see below) |
221
+ | `voice_count` | `54` | full list via `status` |
222
+ | `extras` | `["speed"]` | Kokoro's only effective `generate()` kwarg |
223
+ | `ideal_words` | `40` | soft target; client rounds up to a sentence boundary |
224
+ | `max_text_chars` | `2000` | hard server cap |
225
+
226
+ ### Kokoro language support (advertised = synthesizable)
227
+
228
+ The advertised `languages` list reflects what this deployment can actually
229
+ synthesize, not just what voices the model ships. The default `kokoro` extra
230
+ installs `misaki[en]` only; verified live against mlx-community/Kokoro-82M-bf16:
231
+
232
+ - **`en`** uses misaki[en]; **`es`/`fr`/`it`/`pt`/`hi`** route through the
233
+ espeak-ng G2P bundled with misaki[en] — all synthesize fine and are advertised.
234
+ (`hi`'s first call loads its G2P lazily and can exceed a 60 s client timeout.)
235
+ - **`ja` and `zh` need an extra G2P package** — `misaki[ja]` (`pyopenjtalk`) and
236
+ `misaki[zh]` (`ordered_set`) respectively, which the `kokoro` extra does not
237
+ install. Because synthesis would fail at runtime (`response.failed`,
238
+ `code=backend_error`) without them, **they are not advertised by default** and
239
+ a request for them is rejected up front with `invalid_config` (before a
240
+ synthesis slot is consumed) rather than failing mid-response.
241
+
242
+ **Enabling `ja` / `zh`** is a two-step, deploy-time decision:
243
+
244
+ 1. Install the G2P package(s):
245
+ `uv pip install "misaki[ja]" "misaki[zh]"`
246
+ 2. Opt the language(s) back into the advertised set:
247
+ `export PIPECAT_TTS_KOKORO_EXTRA_LANGS=ja,zh`
248
+
249
+ The server logs its advertised language set at startup (including a reminder of
250
+ any languages left disabled). The opt-in only *re-adds* a language the model
251
+ already ships voices for; it cannot advertise one the model lacks. If you set the
252
+ env var without installing the package, that language is advertised again and
253
+ will fail at synthesis — install first.
254
+
255
+ See [`tests/smoke/`](tests/smoke/) for the live end-to-end smoke scripts that
256
+ verify this (`just smoke-tone` / `just smoke-kokoro` / `just smoke-multilingual`).
257
+
258
+ ### Kokoro cancellation caveat
259
+
260
+ Kokoro yields one segment per `\n+` boundary and the cancel flag is only checked
261
+ at a segment boundary. The **client-visible** cancel is prompt regardless: a
262
+ `response.cancel` is acknowledged with `response.cancelled` in ~1 ms (measured on
263
+ Apple Silicon), and no audio follows it. What runs to the segment boundary is the
264
+ backend worker / Metal lock: a **long single-segment** commit keeps the lock
265
+ until its `generate()` reaches the yield (≈ the full single-segment synthesis
266
+ time — a few seconds for a ~1700-char segment, bounded by `drain_timeout_seconds`),
267
+ so the *next* commit can't start synthesizing until then. To free the lock sooner
268
+ for back-to-back commits, **clients should chunk at sentence/newline boundaries**
269
+ for Kokoro. The server's hard guarantee is "no more audio after
270
+ `response.cancelled`". (See the dev plan's *Phase 2 measured results* for the full
271
+ re-measurement; the earlier "≈ tens of seconds" figure was a bridge-bug artifact.)
272
+
273
+ ## Examples
274
+
275
+ - [`examples/reference_client.py`](examples/reference_client.py) — a lightweight
276
+ stdlib + `websockets` oracle (no `tts_server` install, no Pipecat). It speaks
277
+ the wire protocol directly and writes the reassembled audio to a WAV. Useful
278
+ for manual end-to-end smoke checks once a server is running.
279
+ - [`examples/pipecat_tts_service.py`](examples/pipecat_tts_service.py) — a
280
+ reference Pipecat-framework `TTSService` adapter (`LocalTTSService`) that wraps
281
+ the async `tts_server.client.TTSClient` so a bot pipeline can speak through a
282
+ running server. Streams `TTSAudioRawFrame`s at the server-advertised rate and
283
+ sends `response.cancel` on interruption. Requires the Pipecat framework
284
+ (`uv sync --extra examples`, which pins `pipecat-ai==1.4.0`).
285
+
286
+ ## Layout
287
+
288
+ - `tts_server/` — protocol, backend abstraction, server, async client, CLI.
289
+ - `tts_server/backends/` — lazy-imported per-model backends (Kokoro first).
290
+ - `examples/` — the stdlib oracle and the Pipecat service adapter.
291
+ - `justfile` — macOS operator recipes (`tts-list`, `tts-status`).
292
+ - `docs/protocol.md` — the wire protocol specification.
293
+ - `docs/dev_plans/` — development plans.
@@ -0,0 +1,237 @@
1
+ # pipecat-local-tts-server
2
+
3
+ Standalone, local WebSocket **text-to-speech (TTS) server** — text in, audio
4
+ out — for the Pipecat ecosystem. It mirrors the sibling
5
+ [`pipecat-local-stt-server`](https://github.com/vr000m/pipecat-local-stt-server):
6
+ same websocket transport, an OpenAI-Realtime-inspired protocol subset, a
7
+ pluggable backend abstraction, and lazy-imported per-model backends behind
8
+ optional extras so a client-only consumer never pulls the heavy TTS runtime.
9
+
10
+ Distributed as `pipecat-local-tts-server`; the import name is `tts_server`
11
+ (every `import tts_server` / `python -m tts_server` invocation works).
12
+ **Kokoro-first** (mlx-audio, Apple Silicon); more backends land later.
13
+
14
+ The server owes its client exactly two things (the contracts everything else is
15
+ built around):
16
+
17
+ 1. **An exact, stable advertised rate.** `server.hello.audio.rate` is the true
18
+ model rate (Kokoro = 24000 Hz). Every audio frame is int16-LE mono PCM at
19
+ *exactly* that rate, with no per-utterance drift. The client resamples
20
+ model-rate → device-rate off this single value.
21
+ 2. **A steady in-response stream.** Once a response starts, audio arrives
22
+ continuously (each model segment is emitted as it completes) so the client's
23
+ playback buffer never starves.
24
+
25
+ ## Install
26
+
27
+ The base package is lean — `websockets` only. Backends live behind extras.
28
+
29
+ From [PyPI](https://pypi.org/project/pipecat-local-tts-server/) (consumers):
30
+
31
+ ```sh
32
+ # client-only lean base (websockets) — for a bot that just talks to a server
33
+ uv add pipecat-local-tts-server # or: pip install pipecat-local-tts-server
34
+
35
+ # Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en])
36
+ uv add "pipecat-local-tts-server[kokoro]" # or: pip install "pipecat-local-tts-server[kokoro]"
37
+ ```
38
+
39
+ From source (development):
40
+
41
+ ```sh
42
+ # client-only (lean base: websockets) — for a bot that just talks to a server
43
+ uv sync --extra client
44
+
45
+ # Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en], which
46
+ # drags in spacy/num2words/torch — heavy by design, kept out of lean base)
47
+ uv sync --extra kokoro
48
+
49
+ # the reference Pipecat adapter example (pulls the Pipecat framework)
50
+ uv sync --extra examples
51
+ ```
52
+
53
+ > Run every command through `uv run` (or activate the venv once per shell). A
54
+ > bare `python -m tts_server …` uses the system interpreter and fails with
55
+ > `ModuleNotFoundError: No module named 'websockets'`.
56
+
57
+ ## Running the server
58
+
59
+ ```sh
60
+ # Kokoro over a Unix domain socket (recommended for local use)
61
+ uv run python -m tts_server serve --backend kokoro \
62
+ --socket-path ~/Library/Caches/pipecat-tts/tts.sock
63
+
64
+ # pick a specific model (any compatible mlx-community Kokoro repo id)
65
+ uv run python -m tts_server serve --backend kokoro \
66
+ --model mlx-community/Kokoro-82M-bf16 \
67
+ --socket-path ~/Library/Caches/pipecat-tts/tts.sock
68
+
69
+ # loopback TCP instead of a socket (choose any free port)
70
+ uv run python -m tts_server serve --backend kokoro --host 127.0.0.1 --port 8765
71
+ ```
72
+
73
+ The server logs the resolved backend + model at startup, *before* the
74
+ (potentially slow) model load, so you can see what is being loaded. The rate is
75
+ read from the loaded model, so model load completes before the first
76
+ `server.hello` is sent.
77
+
78
+ `--log-level` (default `INFO`) sets the server's logging verbosity (any standard
79
+ Python level name, e.g. `DEBUG`/`WARNING`).
80
+
81
+ On startup over a Unix socket, the server **auto-clears a stale socket** left by
82
+ a previous crash (`SIGKILL` / power loss), so the documented restart works
83
+ without manual `rm`. It refuses to start — surfacing a diagnostic instead of
84
+ clobbering — if a **non-socket file** exists at the path, or if a **live server**
85
+ is already listening there (the socket is genuinely in use).
86
+
87
+ ### Environment variables
88
+
89
+ Endpoint precedence (server and client both): **URI > socket > host+port** — `serve` has no URI form, so only socket > host+port applies to it. The
90
+ `TTS_WS_*` vars mirror `STT_WS_*`.
91
+
92
+ | Variable | Side | Purpose |
93
+ |---|---|---|
94
+ | `TTS_WS_URI` | client | Full `ws://`/`wss://` URI; highest endpoint precedence. |
95
+ | `TTS_WS_SOCKET` | client | Unix-socket path (used when no URI). |
96
+ | `TTS_WS_HOST` | client | TCP host (used when no URI/socket). |
97
+ | `TTS_WS_PORT` | client | TCP port (paired with host). |
98
+ | `TTS_WS_TOKEN` | client | Bearer token the client/probe sends. Never falls back to the server var. |
99
+ | `TTS_WS_DEFAULT_SOCKET` | client | Explicit fallback socket for `status` when nothing else is set. |
100
+ | `PIPECAT_TTS_AUTH_TOKEN` | server | Bearer token the server requires (optional auth). |
101
+ | `PIPECAT_TTS_KOKORO_EXTRA_LANGS` | server | Comma-separated ISO codes (e.g. `ja,zh`) to advertise after installing their extra G2P package. See [Kokoro language support](#kokoro-language-support-advertised--synthesizable). |
102
+
103
+ Auth notes: the server reads `PIPECAT_TTS_AUTH_TOKEN`; the client/probe reads
104
+ `TTS_WS_TOKEN` (the two are deliberately separate so a probe can never mask a
105
+ client 401 or leak the server secret to a remote host). A plaintext
106
+ `--auth-token` flag is intentionally unsupported (`ps` exposure) — use
107
+ `--auth-token-file`. A token-less server bound to a non-loopback TCP address logs
108
+ a cleartext-remote warning; a Unix socket does not. Sending a bearer over
109
+ cleartext `ws://` to a remote host also warns client-side — use `wss://` or a
110
+ Unix socket.
111
+
112
+ ## Checking server health
113
+
114
+ ```sh
115
+ uv run python -m tts_server status \
116
+ --socket-path ~/Library/Caches/pipecat-tts/tts.sock
117
+ ```
118
+
119
+ `status` connects, performs the handshake, requests a `server.status` snapshot,
120
+ and prints the backend, model, audio format/rate, capabilities
121
+ (streaming / binary_audio / voice_count), session id, synthesis **queue depth**,
122
+ the **voice list**, buffered chars, uptime, and pid. It exits non-zero if no
123
+ server is reachable.
124
+
125
+ `status` resolves its endpoint with the same **URI > socket > host+port**
126
+ precedence as the client and additionally accepts `--uri ws://…`/`wss://…` (the
127
+ serve path has no `--uri`, since it builds its listener from socket-path/host+port).
128
+ Two probe-only flags: `--timeout` (overall probe budget in seconds, default `3.0`)
129
+ and `--json` (emit the raw `hello`+`status` JSON instead of the text summary).
130
+
131
+ For day-to-day operation on macOS the [`justfile`](justfile) carries read-only
132
+ operator recipes mirroring the sibling stt server: `just tts-list` lists every
133
+ `pipecat.tts-server*` launchd agent with state, pid, and live backend, and
134
+ `just tts-status` runs the wire `status` probe against the canonical socket
135
+ (override with `just tts-status socket=…`).
136
+
137
+ ## Protocol
138
+
139
+ The full wire contract is in [`docs/protocol.md`](docs/protocol.md). In brief:
140
+ every message is a JSON text frame with a `type` field. The client drives the
141
+ session — `session.update` → `input_text.append`* → `input_text.commit` — and
142
+ the server streams `response.audio.delta` frames (base64 pcm16, `seq` from 0, no
143
+ gaps) ending in `response.audio.done`. `response.cancel` is barge-in. Audio is
144
+ base64-in-JSON for v1 (`binary_audio: false`); binary frames are a later
145
+ optimization.
146
+
147
+ **The client segments the text; the commit is the unit of work.** Using
148
+ `capabilities.streaming` and `capabilities.ideal_words`, the client splits long
149
+ text into commits, rounding `ideal_words` up to the next **sentence boundary**
150
+ — never splitting mid-sentence (a half-sentence commit makes the model apply
151
+ sentence-final prosody mid-phrase). `max_text_chars` is the hard server cap.
152
+
153
+ ### Kokoro capabilities (as shipped)
154
+
155
+ Built per-backend (`server.hello.capabilities`). Verified against
156
+ mlx-community/Kokoro-82M-bf16 (mlx-audio 0.4.4):
157
+
158
+ | Field | Value | Note |
159
+ |---|---|---|
160
+ | rate | **24000** | from `server.hello.audio.rate`, not capabilities |
161
+ | `streaming` | `false` | no sub-segment streaming; segments still stream per `\n+` |
162
+ | `binary_audio` | `false` | base64-in-JSON for v1 |
163
+ | `text_formats` | `["plain"]` | ssml/ipa not supported |
164
+ | `languages` | `["en","es","fr","hi","it","pt"]` | from voice prefixes, minus languages needing extra G2P — **`ja`/`zh` are off by default** (opt-in, see below) |
165
+ | `voice_count` | `54` | full list via `status` |
166
+ | `extras` | `["speed"]` | Kokoro's only effective `generate()` kwarg |
167
+ | `ideal_words` | `40` | soft target; client rounds up to a sentence boundary |
168
+ | `max_text_chars` | `2000` | hard server cap |
169
+
170
+ ### Kokoro language support (advertised = synthesizable)
171
+
172
+ The advertised `languages` list reflects what this deployment can actually
173
+ synthesize, not just what voices the model ships. The default `kokoro` extra
174
+ installs `misaki[en]` only; verified live against mlx-community/Kokoro-82M-bf16:
175
+
176
+ - **`en`** uses misaki[en]; **`es`/`fr`/`it`/`pt`/`hi`** route through the
177
+ espeak-ng G2P bundled with misaki[en] — all synthesize fine and are advertised.
178
+ (`hi`'s first call loads its G2P lazily and can exceed a 60 s client timeout.)
179
+ - **`ja` and `zh` need an extra G2P package** — `misaki[ja]` (`pyopenjtalk`) and
180
+ `misaki[zh]` (`ordered_set`) respectively, which the `kokoro` extra does not
181
+ install. Because synthesis would fail at runtime (`response.failed`,
182
+ `code=backend_error`) without them, **they are not advertised by default** and
183
+ a request for them is rejected up front with `invalid_config` (before a
184
+ synthesis slot is consumed) rather than failing mid-response.
185
+
186
+ **Enabling `ja` / `zh`** is a two-step, deploy-time decision:
187
+
188
+ 1. Install the G2P package(s):
189
+ `uv pip install "misaki[ja]" "misaki[zh]"`
190
+ 2. Opt the language(s) back into the advertised set:
191
+ `export PIPECAT_TTS_KOKORO_EXTRA_LANGS=ja,zh`
192
+
193
+ The server logs its advertised language set at startup (including a reminder of
194
+ any languages left disabled). The opt-in only *re-adds* a language the model
195
+ already ships voices for; it cannot advertise one the model lacks. If you set the
196
+ env var without installing the package, that language is advertised again and
197
+ will fail at synthesis — install first.
198
+
199
+ See [`tests/smoke/`](tests/smoke/) for the live end-to-end smoke scripts that
200
+ verify this (`just smoke-tone` / `just smoke-kokoro` / `just smoke-multilingual`).
201
+
202
+ ### Kokoro cancellation caveat
203
+
204
+ Kokoro yields one segment per `\n+` boundary and the cancel flag is only checked
205
+ at a segment boundary. The **client-visible** cancel is prompt regardless: a
206
+ `response.cancel` is acknowledged with `response.cancelled` in ~1 ms (measured on
207
+ Apple Silicon), and no audio follows it. What runs to the segment boundary is the
208
+ backend worker / Metal lock: a **long single-segment** commit keeps the lock
209
+ until its `generate()` reaches the yield (≈ the full single-segment synthesis
210
+ time — a few seconds for a ~1700-char segment, bounded by `drain_timeout_seconds`),
211
+ so the *next* commit can't start synthesizing until then. To free the lock sooner
212
+ for back-to-back commits, **clients should chunk at sentence/newline boundaries**
213
+ for Kokoro. The server's hard guarantee is "no more audio after
214
+ `response.cancelled`". (See the dev plan's *Phase 2 measured results* for the full
215
+ re-measurement; the earlier "≈ tens of seconds" figure was a bridge-bug artifact.)
216
+
217
+ ## Examples
218
+
219
+ - [`examples/reference_client.py`](examples/reference_client.py) — a lightweight
220
+ stdlib + `websockets` oracle (no `tts_server` install, no Pipecat). It speaks
221
+ the wire protocol directly and writes the reassembled audio to a WAV. Useful
222
+ for manual end-to-end smoke checks once a server is running.
223
+ - [`examples/pipecat_tts_service.py`](examples/pipecat_tts_service.py) — a
224
+ reference Pipecat-framework `TTSService` adapter (`LocalTTSService`) that wraps
225
+ the async `tts_server.client.TTSClient` so a bot pipeline can speak through a
226
+ running server. Streams `TTSAudioRawFrame`s at the server-advertised rate and
227
+ sends `response.cancel` on interruption. Requires the Pipecat framework
228
+ (`uv sync --extra examples`, which pins `pipecat-ai==1.4.0`).
229
+
230
+ ## Layout
231
+
232
+ - `tts_server/` — protocol, backend abstraction, server, async client, CLI.
233
+ - `tts_server/backends/` — lazy-imported per-model backends (Kokoro first).
234
+ - `examples/` — the stdlib oracle and the Pipecat service adapter.
235
+ - `justfile` — macOS operator recipes (`tts-list`, `tts-status`).
236
+ - `docs/protocol.md` — the wire protocol specification.
237
+ - `docs/dev_plans/` — development plans.
@@ -0,0 +1,104 @@
1
+ [project]
2
+ name = "pipecat-local-tts-server"
3
+ version = "0.1.0"
4
+ description = "Standalone local WebSocket text-to-speech (TTS) server, client, and pluggable mlx-audio backends for the Pipecat ecosystem"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = { file = "LICENSE" }
8
+ authors = [{ name = "Varun Singh", email = "varun@varunsingh.net" }]
9
+ keywords = ["tts", "text-to-speech", "kokoro", "mlx", "websocket", "pipecat", "speech", "audio"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: BSD License",
14
+ "Operating System :: MacOS :: MacOS X",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.12",
17
+ "Programming Language :: Python :: 3.13",
18
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
19
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
20
+ ]
21
+ dependencies = [
22
+ # The wire transport. The client (TTSClient) and the server runtime both
23
+ # depend on websockets; it is the only hard runtime dependency of the base
24
+ # package. Backend TTS engines live behind the [kokoro] (and later
25
+ # [voxtral_tts] / [pocket_tts] / [dia]) extras so a client-only consumer
26
+ # (e.g. a bot that just talks to a running server) does not pull mlx-audio
27
+ # or its heavy transitive deps (torch, spacy, ...). numpy is dev-only and
28
+ # is NEVER a runtime dependency — ToneBackend and the shared float->pcm16
29
+ # converter are stdlib-only.
30
+ "websockets>=13.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ # Pure client deps — what a downstream consumer of `tts_server.client` needs
35
+ # without pulling any mlx-audio runtime.
36
+ client = ["websockets>=13.0"]
37
+ # Full server install with the Kokoro mlx-audio backend.
38
+ #
39
+ # mlx-audio is pinned to 0.4.4 (R8): the TTS API drifted between 0.3.0 and
40
+ # 0.4.4 (streaming-chunk fields appeared, voxtral_tts was added, pocket_tts
41
+ # kwargs changed). An unpinned bump can silently break facts verified via
42
+ # scripts/verify_mlx_tts_api.py against 0.4.4 — re-run that script before
43
+ # widening the pin. Importing Kokoro pulls misaki (G2P) -> num2words/spacy/
44
+ # en_core_web_sm AND torch, so this extra is heavy by design; keep it out of
45
+ # lean base.
46
+ #
47
+ # mlx-audio 0.4.4 imports misaki lazily (a helpful ImportError at synth time if
48
+ # absent), so it is NOT a transitive hard dep of mlx-audio — Kokoro's G2P needs
49
+ # it declared explicitly. ``misaki[en]`` drags in num2words/spacy/en_core_web_sm
50
+ # AND torch, which is what makes this extra heavy (R3); keep it behind the extra.
51
+ kokoro = ["websockets>=13.0", "mlx-audio==0.4.4", "misaki[en]"]
52
+ # Reference Pipecat-framework adapter (``examples/pipecat_tts_service.py``).
53
+ # This is for running the *example*, not the server itself — it pulls the
54
+ # Pipecat framework (and ``client`` for ``tts_server.client``). It is NOT a
55
+ # dependency of the server or the lean client; a consumer that already has
56
+ # Pipecat installed does not need this extra.
57
+ #
58
+ # pipecat-ai is pinned to ==1.4.0 (same policy as mlx-audio): the adapter must
59
+ # override TTSService's READ-ONLY ``sample_rate`` property by writing its private
60
+ # ``_sample_rate`` backing field (no public post-handshake setter exists), so a
61
+ # version skew can silently mis-negotiate the audio rate. 1.4.0 is the version
62
+ # verified by tests/test_pipecat_adapter.py and exercised by the test-examples CI
63
+ # job. LocalTTSService._update_sample_rate also guards the write at runtime
64
+ # (raises if the field is gone). Re-run the adapter tests before widening this pin.
65
+ examples = ["websockets>=13.0", "pipecat-ai==1.4.0"]
66
+
67
+ [dependency-groups]
68
+ dev = [
69
+ "pytest>=8.0.0",
70
+ "pytest-asyncio>=0.24.0",
71
+ "ruff>=0.8.0",
72
+ # Test/verification infrastructure only. numpy is used by the mlx-audio
73
+ # verification scripts (scripts/verify_mlx_tts_api.py) and by tests that
74
+ # explicitly opt into it. It is intentionally NOT a runtime dependency:
75
+ # runtime package code (ToneBackend, the float->pcm16 converter) stays
76
+ # stdlib-only so the lean base never requires a dev-only dependency.
77
+ "numpy>=1.26",
78
+ ]
79
+
80
+ [project.urls]
81
+ Homepage = "https://github.com/vr000m/pipecat-local-tts-server"
82
+ Repository = "https://github.com/vr000m/pipecat-local-tts-server"
83
+ Issues = "https://github.com/vr000m/pipecat-local-tts-server/issues"
84
+
85
+ [build-system]
86
+ requires = ["uv_build>=0.9,<0.10"]
87
+ build-backend = "uv_build"
88
+
89
+ [tool.uv.build-backend]
90
+ # Import name stays `tts_server` so every `import tts_server` callsite, every
91
+ # test, and the `python -m tts_server` invocation work without rename.
92
+ module-name = "tts_server"
93
+ module-root = ""
94
+
95
+ [tool.pytest.ini_options]
96
+ asyncio_mode = "auto"
97
+ testpaths = ["tests"]
98
+ # Verification scripts under scripts/ are importable under pytest (mirrors the
99
+ # stt repo), so tests that drive them can `from scripts... import ...`.
100
+ pythonpath = ["."]
101
+
102
+ [tool.ruff]
103
+ target-version = "py312"
104
+ line-length = 100