PyPI - pipecat-local-tts-server - Versions diffs - 0.1.0__tar.gz - Mend

pipecat-local-tts-server 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

pipecat_local_tts_server-0.1.0/LICENSE +24 -0
pipecat_local_tts_server-0.1.0/PKG-INFO +293 -0
pipecat_local_tts_server-0.1.0/README.md +237 -0
pipecat_local_tts_server-0.1.0/pyproject.toml +104 -0
pipecat_local_tts_server-0.1.0/tts_server/__init__.py +22 -0
pipecat_local_tts_server-0.1.0/tts_server/__main__.py +333 -0
pipecat_local_tts_server-0.1.0/tts_server/_audio.py +70 -0
pipecat_local_tts_server-0.1.0/tts_server/backend.py +320 -0
pipecat_local_tts_server-0.1.0/tts_server/backends/__init__.py +45 -0
pipecat_local_tts_server-0.1.0/tts_server/backends/_stream_util.py +266 -0
pipecat_local_tts_server-0.1.0/tts_server/backends/kokoro.py +597 -0
pipecat_local_tts_server-0.1.0/tts_server/client.py +228 -0
pipecat_local_tts_server-0.1.0/tts_server/env.py +140 -0
pipecat_local_tts_server-0.1.0/tts_server/protocol.py +157 -0
pipecat_local_tts_server-0.1.0/tts_server/server.py +1606 -0

pipecat_local_tts_server-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,24 @@
+BSD 2-Clause License
+Copyright (c) 2026, Varun Singh
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

pipecat_local_tts_server-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,293 @@
+Metadata-Version: 2.3
+Name: pipecat-local-tts-server
+Version: 0.1.0
+Summary: Standalone local WebSocket text-to-speech (TTS) server, client, and pluggable mlx-audio backends for the Pipecat ecosystem
+Keywords: tts,text-to-speech,kokoro,mlx,websocket,pipecat,speech,audio
+Author: Varun Singh
+Author-email: Varun Singh <varun@varunsingh.net>
+License: BSD 2-Clause License
+         Copyright (c) 2026, Varun Singh
+         Redistribution and use in source and binary forms, with or without
+         modification, are permitted provided that the following conditions are met:
+         1. Redistributions of source code must retain the above copyright notice, this
+            list of conditions and the following disclaimer.
+         2. Redistributions in binary form must reproduce the above copyright notice,
+            this list of conditions and the following disclaimer in the documentation
+            and/or other materials provided with the distribution.
+         THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+         AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+         IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+         DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+         FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+         DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+         SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+         CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+         OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+         OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Dist: websockets>=13.0
+Requires-Dist: websockets>=13.0 ; extra == 'client'
+Requires-Dist: websockets>=13.0 ; extra == 'examples'
+Requires-Dist: pipecat-ai==1.4.0 ; extra == 'examples'
+Requires-Dist: websockets>=13.0 ; extra == 'kokoro'
+Requires-Dist: mlx-audio==0.4.4 ; extra == 'kokoro'
+Requires-Dist: misaki[en] ; extra == 'kokoro'
+Requires-Python: >=3.12
+Project-URL: Homepage, https://github.com/vr000m/pipecat-local-tts-server
+Project-URL: Issues, https://github.com/vr000m/pipecat-local-tts-server/issues
+Project-URL: Repository, https://github.com/vr000m/pipecat-local-tts-server
+Provides-Extra: client
+Provides-Extra: examples
+Provides-Extra: kokoro
+Description-Content-Type: text/markdown
+# pipecat-local-tts-server
+Standalone, local WebSocket **text-to-speech (TTS) server** — text in, audio
+out — for the Pipecat ecosystem. It mirrors the sibling
+[`pipecat-local-stt-server`](https://github.com/vr000m/pipecat-local-stt-server):
+same websocket transport, an OpenAI-Realtime-inspired protocol subset, a
+pluggable backend abstraction, and lazy-imported per-model backends behind
+optional extras so a client-only consumer never pulls the heavy TTS runtime.
+Distributed as `pipecat-local-tts-server`; the import name is `tts_server`
+(every `import tts_server` / `python -m tts_server` invocation works).
+**Kokoro-first** (mlx-audio, Apple Silicon); more backends land later.
+The server owes its client exactly two things (the contracts everything else is
+built around):
+1. **An exact, stable advertised rate.** `server.hello.audio.rate` is the true
+   model rate (Kokoro = 24000 Hz). Every audio frame is int16-LE mono PCM at
+   *exactly* that rate, with no per-utterance drift. The client resamples
+   model-rate → device-rate off this single value.
+2. **A steady in-response stream.** Once a response starts, audio arrives
+   continuously (each model segment is emitted as it completes) so the client's
+   playback buffer never starves.
+## Install
+The base package is lean — `websockets` only. Backends live behind extras.
+From [PyPI](https://pypi.org/project/pipecat-local-tts-server/) (consumers):
+```sh
+# client-only lean base (websockets) — for a bot that just talks to a server
+uv add pipecat-local-tts-server            # or: pip install pipecat-local-tts-server
+# Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en])
+uv add "pipecat-local-tts-server[kokoro]"  # or: pip install "pipecat-local-tts-server[kokoro]"
+```
+From source (development):
+```sh
+# client-only (lean base: websockets) — for a bot that just talks to a server
+uv sync --extra client
+# Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en], which
+# drags in spacy/num2words/torch — heavy by design, kept out of lean base)
+uv sync --extra kokoro
+# the reference Pipecat adapter example (pulls the Pipecat framework)
+uv sync --extra examples
+```
+> Run every command through `uv run` (or activate the venv once per shell). A
+> bare `python -m tts_server …` uses the system interpreter and fails with
+> `ModuleNotFoundError: No module named 'websockets'`.
+## Running the server
+```sh
+# Kokoro over a Unix domain socket (recommended for local use)
+uv run python -m tts_server serve --backend kokoro \
+    --socket-path ~/Library/Caches/pipecat-tts/tts.sock
+# pick a specific model (any compatible mlx-community Kokoro repo id)
+uv run python -m tts_server serve --backend kokoro \
+    --model mlx-community/Kokoro-82M-bf16 \
+    --socket-path ~/Library/Caches/pipecat-tts/tts.sock
+# loopback TCP instead of a socket (choose any free port)
+uv run python -m tts_server serve --backend kokoro --host 127.0.0.1 --port 8765
+```
+The server logs the resolved backend + model at startup, *before* the
+(potentially slow) model load, so you can see what is being loaded. The rate is
+read from the loaded model, so model load completes before the first
+`server.hello` is sent.
+`--log-level` (default `INFO`) sets the server's logging verbosity (any standard
+Python level name, e.g. `DEBUG`/`WARNING`).
+On startup over a Unix socket, the server **auto-clears a stale socket** left by
+a previous crash (`SIGKILL` / power loss), so the documented restart works
+without manual `rm`. It refuses to start — surfacing a diagnostic instead of
+clobbering — if a **non-socket file** exists at the path, or if a **live server**
+is already listening there (the socket is genuinely in use).
+### Environment variables
+Endpoint precedence for the client and the `status` probe: **URI > socket >
+host+port**. The `serve` path has no URI form; it builds its listener from
+socket-path/host+port. The `TTS_WS_*` vars mirror `STT_WS_*`.
+| Variable | Side | Purpose |
+|---|---|---|
+| `TTS_WS_URI` | client | Full `ws://`/`wss://` URI; highest endpoint precedence. |
+| `TTS_WS_SOCKET` | client | Unix-socket path (used when no URI). |
+| `TTS_WS_HOST` | client | TCP host (used when no URI/socket). |
+| `TTS_WS_PORT` | client | TCP port (paired with host). |
+| `TTS_WS_TOKEN` | client | Bearer token the client/probe sends. Never falls back to the server var. |
+| `TTS_WS_DEFAULT_SOCKET` | client | Explicit fallback socket for `status` when nothing else is set. |
+| `PIPECAT_TTS_AUTH_TOKEN` | server | Bearer token the server requires (optional auth). |
+| `PIPECAT_TTS_KOKORO_EXTRA_LANGS` | server | Comma-separated ISO codes (e.g. `ja,zh`) to advertise after installing their extra G2P package. See [Kokoro language support](#kokoro-language-support-advertised--synthesizable). |
+Auth notes: the server reads `PIPECAT_TTS_AUTH_TOKEN`; the client/probe reads
+`TTS_WS_TOKEN` (the two are deliberately separate so a probe can never mask a
+client 401 or leak the server secret to a remote host). A plaintext
+`--auth-token` flag is intentionally unsupported (`ps` exposure) — use
+`--auth-token-file`. A token-less server bound to a non-loopback TCP address logs
+a cleartext-remote warning; a Unix socket does not. Sending a bearer over
+cleartext `ws://` to a remote host also warns client-side — use `wss://` or a
+Unix socket.
+## Checking server health
+```sh
+uv run python -m tts_server status \
+    --socket-path ~/Library/Caches/pipecat-tts/tts.sock
+```
+`status` connects, performs the handshake, requests a `server.status` snapshot,
+and prints the backend, model, audio format/rate, capabilities
+(streaming / binary_audio / voice_count), session id, synthesis **queue depth**,
+the **voice list**, buffered chars, uptime, and pid. It exits non-zero if no
+server is reachable.
+`status` resolves its endpoint with the same **URI > socket > host+port**
+precedence as the client and additionally accepts `--uri ws://…`/`wss://…` (the
+serve path has no `--uri`, since it builds its listener from socket-path/host+port).
+Two probe-only flags: `--timeout` (overall probe budget in seconds, default `3.0`)
+and `--json` (emit the raw `hello`+`status` JSON instead of the text summary).
+For day-to-day operation on macOS the [`justfile`](justfile) carries read-only
+operator recipes mirroring the sibling stt server: `just tts-list` lists every
+`pipecat.tts-server*` launchd agent with state, pid, and live backend, and
+`just tts-status` runs the wire `status` probe against the canonical socket
+(override with `just tts-status socket=…`).
+## Protocol
+The full wire contract is in [`docs/protocol.md`](docs/protocol.md). In brief:
+every message is a JSON text frame with a `type` field. The client drives the
+session — `session.update` → `input_text.append` (repeatable) →
+`input_text.commit` — and the server streams `response.audio.delta` frames
+(base64 pcm16, `seq` starting at 0 with no gaps) ending in
+`response.audio.done`. `response.cancel` is the barge-in signal: the client
+sends it to interrupt an in-flight response. Audio is base64-in-JSON for v1
+(`binary_audio: false`); binary frames are a later optimization.
+**The client segments the text; the commit is the unit of work.** Using
+`capabilities.streaming` and `capabilities.ideal_words`, the client splits long
+text into commits of roughly `ideal_words` words, extending each one to the next
+**sentence boundary** — never splitting mid-sentence (a half-sentence commit
+makes the model apply sentence-final prosody mid-phrase). `max_text_chars` is
+the hard server cap.
+### Kokoro capabilities (as shipped)
+Built per-backend (`server.hello.capabilities`). Verified against
+mlx-community/Kokoro-82M-bf16 (mlx-audio 0.4.4):
+| Field | Value | Note |
+|---|---|---|
+| rate | **24000** | from `server.hello.audio.rate`, not capabilities |
+| `streaming` | `false` | no sub-segment streaming; segments still stream per `\n+` |
+| `binary_audio` | `false` | base64-in-JSON for v1 |
+| `text_formats` | `["plain"]` | ssml/ipa not supported |
+| `languages` | `["en","es","fr","hi","it","pt"]` | from voice prefixes, minus languages needing extra G2P — **`ja`/`zh` are off by default** (opt-in, see below) |
+| `voice_count` | `54` | full list via `status` |
+| `extras` | `["speed"]` | Kokoro's only effective `generate()` kwarg |
+| `ideal_words` | `40` | soft target; client rounds up to a sentence boundary |
+| `max_text_chars` | `2000` | hard server cap |
+### Kokoro language support (advertised = synthesizable)
+The advertised `languages` list reflects what this deployment can actually
+synthesize, not just what voices the model ships. The default `kokoro` extra
+pins `misaki[en]` only; verified live against mlx-community/Kokoro-82M-bf16:
+- **`en`** uses misaki[en]; **`es`/`fr`/`it`/`pt`/`hi`** route through the
+  espeak-ng G2P bundled with misaki[en] — all synthesize fine and are advertised.
+  (`hi`'s first call loads its G2P lazily and can exceed a 60 s client timeout.)
+- **`ja` and `zh` need an extra G2P package** — `misaki[ja]` (`pyopenjtalk`) and
+  `misaki[zh]` (`ordered_set`) respectively, which the `kokoro` extra does not
+  install. Because synthesis would fail at runtime (`response.failed`,
+  `code=backend_error`) without them, **they are not advertised by default** and
+  a request for them is rejected up front with `invalid_config` (before a
+  synthesis slot is consumed) rather than failing mid-response.
+**Enabling `ja` / `zh`** is a two-step, build-time decision:
+1. Install the G2P package(s):
+   `uv pip install "misaki[ja]" "misaki[zh]"`
+2. Opt the language(s) back into the advertised set:
+   `export PIPECAT_TTS_KOKORO_EXTRA_LANGS=ja,zh`
+The server logs its advertised language set at startup (including a reminder of
+any languages left disabled). The opt-in only *re-adds* a language the model
+already ships voices for; it cannot advertise one the model lacks. If you set the
+env var without installing the package, that language is advertised again and
+will fail at synthesis — install first.
+See [`tests/smoke/`](tests/smoke/) for the live end-to-end smoke scripts that
+verify this (`just smoke-tone` / `just smoke-kokoro` / `just smoke-multilingual`).
+### Kokoro cancellation caveat
+Kokoro yields one segment per `\n+` boundary and the cancel flag is only checked
+at a segment boundary. The **client-visible** cancel is prompt regardless: a
+`response.cancel` is acknowledged with `response.cancelled` in ~1 ms (measured on
+Apple Silicon), and no audio follows it. What runs to the segment boundary is the
+backend worker / Metal lock: a **long single-segment** commit keeps the lock
+until its `generate()` reaches the yield (≈ the full single-segment synthesis
+time — a few seconds for a ~1700-char segment, bounded by `drain_timeout_seconds`),
+so the *next* commit can't start synthesizing until then. To free the lock sooner
+for back-to-back commits, **clients should chunk at sentence/newline boundaries**
+for Kokoro. The server's hard guarantee is "no more audio after
+`response.cancelled`". (See the dev plan's *Phase 2 measured results* for the full
+re-measurement; the earlier "≈ tens of seconds" figure was a bridge-bug artifact.)
+## Examples
+- [`examples/reference_client.py`](examples/reference_client.py) — a lightweight
+  stdlib + `websockets` oracle (no `tts_server` install, no Pipecat). It speaks
+  the wire protocol directly and writes the reassembled audio to a WAV. Useful
+  for manual end-to-end smoke checks once a server is running.
+- [`examples/pipecat_tts_service.py`](examples/pipecat_tts_service.py) — a
+  reference Pipecat-framework `TTSService` adapter (`LocalTTSService`) that wraps
+  the async `tts_server.client.TTSClient` so a bot pipeline can speak through a
+  running server. Streams `TTSAudioRawFrame`s at the server-advertised rate and
+  sends `response.cancel` on interruption. Requires the Pipecat framework
+  (`uv sync --extra examples`, which pins `pipecat-ai==1.4.0`).
+## Layout
+- `tts_server/` — protocol, backend abstraction, server, async client, CLI.
+- `tts_server/backends/` — lazy-imported per-model backends (Kokoro first).
+- `examples/` — the stdlib oracle and the Pipecat service adapter.
+- `justfile` — macOS operator recipes (`tts-list`, `tts-status`).
+- `docs/protocol.md` — the wire protocol specification.
+- `docs/dev_plans/` — development plans.

pipecat_local_tts_server-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,237 @@
+# pipecat-local-tts-server
+Standalone, local WebSocket **text-to-speech (TTS) server** — text in, audio
+out — for the Pipecat ecosystem. It mirrors the sibling
+[`pipecat-local-stt-server`](https://github.com/vr000m/pipecat-local-stt-server):
+same websocket transport, an OpenAI-Realtime-inspired protocol subset, a
+pluggable backend abstraction, and lazy-imported per-model backends behind
+optional extras so a client-only consumer never pulls the heavy TTS runtime.
+Distributed as `pipecat-local-tts-server`; the import name is `tts_server`
+(every `import tts_server` / `python -m tts_server` invocation works).
+**Kokoro-first** (mlx-audio, Apple Silicon); more backends land later.
+The server owes its client exactly two things (the contracts everything else is
+built around):
+1. **An exact, stable advertised rate.** `server.hello.audio.rate` is the true
+   model rate (Kokoro = 24000 Hz). Every audio frame is int16-LE mono PCM at
+   *exactly* that rate, with no per-utterance drift. The client resamples
+   model-rate → device-rate off this single value.
+2. **A steady in-response stream.** Once a response starts, audio arrives
+   continuously (each model segment is emitted as it completes) so the client's
+   playback buffer never starves.
+## Install
+The base package is lean — `websockets` only. Backends live behind extras.
+From [PyPI](https://pypi.org/project/pipecat-local-tts-server/) (consumers):
+```sh
+# client-only lean base (websockets) — for a bot that just talks to a server
+uv add pipecat-local-tts-server            # or: pip install pipecat-local-tts-server
+# Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en])
+uv add "pipecat-local-tts-server[kokoro]"  # or: pip install "pipecat-local-tts-server[kokoro]"
+```
+From source (development):
+```sh
+# client-only (lean base: websockets) — for a bot that just talks to a server
+uv sync --extra client
+# Kokoro backend (Apple Silicon; pulls mlx-audio==0.4.4 + misaki[en], which
+# drags in spacy/num2words/torch — heavy by design, kept out of lean base)
+uv sync --extra kokoro
+# the reference Pipecat adapter example (pulls the Pipecat framework)
+uv sync --extra examples
+```
+> Run every command through `uv run` (or activate the venv once per shell). A
+> bare `python -m tts_server …` uses the system interpreter and fails with
+> `ModuleNotFoundError: No module named 'websockets'`.
+## Running the server
+```sh
+# Kokoro over a Unix domain socket (recommended for local use)
+uv run python -m tts_server serve --backend kokoro \
+    --socket-path ~/Library/Caches/pipecat-tts/tts.sock
+# pick a specific model (any compatible mlx-community Kokoro repo id)
+uv run python -m tts_server serve --backend kokoro \
+    --model mlx-community/Kokoro-82M-bf16 \
+    --socket-path ~/Library/Caches/pipecat-tts/tts.sock
+# loopback TCP instead of a socket (choose any free port)
+uv run python -m tts_server serve --backend kokoro --host 127.0.0.1 --port 8765
+```
+The server logs the resolved backend + model at startup, *before* the
+(potentially slow) model load, so you can see what is being loaded. The rate is
+read from the loaded model, so model load completes before the first
+`server.hello` is sent.
+`--log-level` (default `INFO`) sets the server's logging verbosity (any standard
+Python level name, e.g. `DEBUG`/`WARNING`).
+On startup over a Unix socket, the server **auto-clears a stale socket** left by
+a previous crash (`SIGKILL` / power loss), so the documented restart works
+without manual `rm`. It refuses to start — surfacing a diagnostic instead of
+clobbering — if a **non-socket file** exists at the path, or if a **live server**
+is already listening there (the socket is genuinely in use).
+### Environment variables
+Endpoint precedence for the client and the `status` probe: **URI > socket >
+host+port**. The `serve` path has no URI form; it builds its listener from
+socket-path/host+port. The `TTS_WS_*` vars mirror `STT_WS_*`.
+| Variable | Side | Purpose |
+|---|---|---|
+| `TTS_WS_URI` | client | Full `ws://`/`wss://` URI; highest endpoint precedence. |
+| `TTS_WS_SOCKET` | client | Unix-socket path (used when no URI). |
+| `TTS_WS_HOST` | client | TCP host (used when no URI/socket). |
+| `TTS_WS_PORT` | client | TCP port (paired with host). |
+| `TTS_WS_TOKEN` | client | Bearer token the client/probe sends. Never falls back to the server var. |
+| `TTS_WS_DEFAULT_SOCKET` | client | Explicit fallback socket for `status` when nothing else is set. |
+| `PIPECAT_TTS_AUTH_TOKEN` | server | Bearer token the server requires (optional auth). |
+| `PIPECAT_TTS_KOKORO_EXTRA_LANGS` | server | Comma-separated ISO codes (e.g. `ja,zh`) to advertise after installing their extra G2P package. See [Kokoro language support](#kokoro-language-support-advertised--synthesizable). |
+Auth notes: the server reads `PIPECAT_TTS_AUTH_TOKEN`; the client/probe reads
+`TTS_WS_TOKEN` (the two are deliberately separate so a probe can never mask a
+client 401 or leak the server secret to a remote host). A plaintext
+`--auth-token` flag is intentionally unsupported (`ps` exposure) — use
+`--auth-token-file`. A token-less server bound to a non-loopback TCP address logs
+a cleartext-remote warning; a Unix socket does not. Sending a bearer over
+cleartext `ws://` to a remote host also warns client-side — use `wss://` or a
+Unix socket.
+## Checking server health
+```sh
+uv run python -m tts_server status \
+    --socket-path ~/Library/Caches/pipecat-tts/tts.sock
+```
+`status` connects, performs the handshake, requests a `server.status` snapshot,
+and prints the backend, model, audio format/rate, capabilities
+(streaming / binary_audio / voice_count), session id, synthesis **queue depth**,
+the **voice list**, buffered chars, uptime, and pid. It exits non-zero if no
+server is reachable.
+`status` resolves its endpoint with the same **URI > socket > host+port**
+precedence as the client and additionally accepts `--uri ws://…`/`wss://…` (the
+serve path has no `--uri`, since it builds its listener from socket-path/host+port).
+Two probe-only flags: `--timeout` (overall probe budget in seconds, default `3.0`)
+and `--json` (emit the raw `hello`+`status` JSON instead of the text summary).
+For day-to-day operation on macOS the [`justfile`](justfile) carries read-only
+operator recipes mirroring the sibling stt server: `just tts-list` lists every
+`pipecat.tts-server*` launchd agent with state, pid, and live backend, and
+`just tts-status` runs the wire `status` probe against the canonical socket
+(override with `just tts-status socket=…`).
+## Protocol
+The full wire contract is in [`docs/protocol.md`](docs/protocol.md). In brief:
+every message is a JSON text frame with a `type` field. The client drives the
+session — `session.update` → `input_text.append` (repeatable) →
+`input_text.commit` — and the server streams `response.audio.delta` frames
+(base64 pcm16, `seq` starting at 0 with no gaps) ending in
+`response.audio.done`. `response.cancel` is the barge-in signal: the client
+sends it to interrupt an in-flight response. Audio is base64-in-JSON for v1
+(`binary_audio: false`); binary frames are a later optimization.
+**The client segments the text; the commit is the unit of work.** Using
+`capabilities.streaming` and `capabilities.ideal_words`, the client splits long
+text into commits of roughly `ideal_words` words, extending each one to the next
+**sentence boundary** — never splitting mid-sentence (a half-sentence commit
+makes the model apply sentence-final prosody mid-phrase). `max_text_chars` is
+the hard server cap.
+### Kokoro capabilities (as shipped)
+Built per-backend (`server.hello.capabilities`). Verified against
+mlx-community/Kokoro-82M-bf16 (mlx-audio 0.4.4):
+| Field | Value | Note |
+|---|---|---|
+| rate | **24000** | from `server.hello.audio.rate`, not capabilities |
+| `streaming` | `false` | no sub-segment streaming; segments still stream per `\n+` |
+| `binary_audio` | `false` | base64-in-JSON for v1 |
+| `text_formats` | `["plain"]` | ssml/ipa not supported |
+| `languages` | `["en","es","fr","hi","it","pt"]` | from voice prefixes, minus languages needing extra G2P — **`ja`/`zh` are off by default** (opt-in, see below) |
+| `voice_count` | `54` | full list via `status` |
+| `extras` | `["speed"]` | Kokoro's only effective `generate()` kwarg |
+| `ideal_words` | `40` | soft target; client rounds up to a sentence boundary |
+| `max_text_chars` | `2000` | hard server cap |
+### Kokoro language support (advertised = synthesizable)
+The advertised `languages` list reflects what this deployment can actually
+synthesize, not just what voices the model ships. The default `kokoro` extra
+pins `misaki[en]` only; verified live against mlx-community/Kokoro-82M-bf16:
+- **`en`** uses misaki[en]; **`es`/`fr`/`it`/`pt`/`hi`** route through the
+  espeak-ng G2P bundled with misaki[en] — all synthesize fine and are advertised.
+  (`hi`'s first call loads its G2P lazily and can exceed a 60 s client timeout.)
+- **`ja` and `zh` need an extra G2P package** — `misaki[ja]` (`pyopenjtalk`) and
+  `misaki[zh]` (`ordered_set`) respectively, which the `kokoro` extra does not
+  install. Because synthesis would fail at runtime (`response.failed`,
+  `code=backend_error`) without them, **they are not advertised by default** and
+  a request for them is rejected up front with `invalid_config` (before a
+  synthesis slot is consumed) rather than failing mid-response.
+**Enabling `ja` / `zh`** is a two-step, build-time decision:
+1. Install the G2P package(s):
+   `uv pip install "misaki[ja]" "misaki[zh]"`
+2. Opt the language(s) back into the advertised set:
+   `export PIPECAT_TTS_KOKORO_EXTRA_LANGS=ja,zh`
+The server logs its advertised language set at startup (including a reminder of
+any languages left disabled). The opt-in only *re-adds* a language the model
+already ships voices for; it cannot advertise one the model lacks. If you set the
+env var without installing the package, that language is advertised again and
+will fail at synthesis — install first.
+See [`tests/smoke/`](tests/smoke/) for the live end-to-end smoke scripts that
+verify this (`just smoke-tone` / `just smoke-kokoro` / `just smoke-multilingual`).
+### Kokoro cancellation caveat
+Kokoro yields one segment per `\n+` boundary and the cancel flag is only checked
+at a segment boundary. The **client-visible** cancel is prompt regardless: a
+`response.cancel` is acknowledged with `response.cancelled` in ~1 ms (measured on
+Apple Silicon), and no audio follows it. What runs to the segment boundary is the
+backend worker / Metal lock: a **long single-segment** commit keeps the lock
+until its `generate()` reaches the yield (≈ the full single-segment synthesis
+time — a few seconds for a ~1700-char segment, bounded by `drain_timeout_seconds`),
+so the *next* commit can't start synthesizing until then. To free the lock sooner
+for back-to-back commits, **clients should chunk at sentence/newline boundaries**
+for Kokoro. The server's hard guarantee is "no more audio after
+`response.cancelled`". (See the dev plan's *Phase 2 measured results* for the full
+re-measurement; the earlier "≈ tens of seconds" figure was a bridge-bug artifact.)
+## Examples
+- [`examples/reference_client.py`](examples/reference_client.py) — a lightweight
+  stdlib + `websockets` oracle (no `tts_server` install, no Pipecat). It speaks
+  the wire protocol directly and writes the reassembled audio to a WAV. Useful
+  for manual end-to-end smoke checks once a server is running.
+- [`examples/pipecat_tts_service.py`](examples/pipecat_tts_service.py) — a
+  reference Pipecat-framework `TTSService` adapter (`LocalTTSService`) that wraps
+  the async `tts_server.client.TTSClient` so a bot pipeline can speak through a
+  running server. Streams `TTSAudioRawFrame`s at the server-advertised rate and
+  sends `response.cancel` on interruption. Requires the Pipecat framework
+  (`uv sync --extra examples`, which pins `pipecat-ai==1.4.0`).
+## Layout
+- `tts_server/` — protocol, backend abstraction, server, async client, CLI.
+- `tts_server/backends/` — lazy-imported per-model backends (Kokoro first).
+- `examples/` — the stdlib oracle and the Pipecat service adapter.
+- `justfile` — macOS operator recipes (`tts-list`, `tts-status`).
+- `docs/protocol.md` — the wire protocol specification.
+- `docs/dev_plans/` — development plans.

pipecat_local_tts_server-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,104 @@
+[project]
+name = "pipecat-local-tts-server"
+version = "0.1.0"
+description = "Standalone local WebSocket text-to-speech (TTS) server, client, and pluggable mlx-audio backends for the Pipecat ecosystem"
+readme = "README.md"
+requires-python = ">=3.12"
+license = { file = "LICENSE" }
+authors = [{ name = "Varun Singh", email = "varun@varunsingh.net" }]
+keywords = ["tts", "text-to-speech", "kokoro", "mlx", "websocket", "pipecat", "speech", "audio"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: BSD License",
+    "Operating System :: MacOS :: MacOS X",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    # The wire transport. The client (TTSClient) and the server runtime both
+    # depend on websockets; it is the only hard runtime dependency of the base
+    # package. Backend TTS engines live behind the [kokoro] (and later
+    # [voxtral_tts] / [pocket_tts] / [dia]) extras so a client-only consumer
+    # (e.g. a bot that just talks to a running server) does not pull mlx-audio
+    # or its heavy transitive deps (torch, spacy, ...). numpy is dev-only and
+    # is NEVER a runtime dependency — ToneBackend and the shared float->pcm16
+    # converter are stdlib-only.
+    "websockets>=13.0",
+]
+[project.optional-dependencies]
+# Pure client deps — what a downstream consumer of `tts_server.client` needs
+# without pulling any mlx-audio runtime.
+client = ["websockets>=13.0"]
+# Full server install with the Kokoro mlx-audio backend.
+#
+# mlx-audio is pinned to 0.4.4 (R8): the TTS API drifted between 0.3.0 and
+# 0.4.4 (streaming-chunk fields appeared, voxtral_tts was added, pocket_tts
+# kwargs changed). An unpinned bump can silently invalidate the facts verified
+# via scripts/verify_mlx_tts_api.py against 0.4.4 — re-run that script before
+# widening the pin. Importing Kokoro pulls misaki (G2P) -> num2words/spacy/
+# en_core_web_sm AND torch, so this extra is heavy by design; keep it out of
+# the lean base.
+#
+# mlx-audio 0.4.4 imports misaki lazily (a helpful ImportError at synth time if
+# absent), so it is NOT a transitive hard dep of mlx-audio — Kokoro's G2P needs
+# it declared explicitly. ``misaki[en]`` drags in num2words/spacy/en_core_web_sm
+# AND torch, which is what makes this extra heavy (R3); keep it behind the extra.
+kokoro = ["websockets>=13.0", "mlx-audio==0.4.4", "misaki[en]"]
+# Reference Pipecat-framework adapter (``examples/pipecat_tts_service.py``).
+# This is for running the *example*, not the server itself — it pulls the
+# Pipecat framework plus the same ``websockets`` requirement as ``client``
+# (needed by ``tts_server.client``). It is NOT a dependency of the server or
+# the lean client; a consumer that already has Pipecat installed does not need
+# this extra.
+#
+# pipecat-ai is pinned to ==1.4.0 (same policy as mlx-audio): the adapter must
+# override TTSService's READ-ONLY ``sample_rate`` property by writing its private
+# ``_sample_rate`` backing field (no public post-handshake setter exists), so a
+# version skew can silently mis-negotiate the audio rate. 1.4.0 is the version
+# verified by tests/test_pipecat_adapter.py and exercised by the test-examples CI
+# job. LocalTTSService._update_sample_rate also guards the write at runtime
+# (raises if the field is gone). Re-run the adapter tests before widening this pin.
+examples = ["websockets>=13.0", "pipecat-ai==1.4.0"]
+[dependency-groups]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.24.0",
+    "ruff>=0.8.0",
+    # Test/verification infrastructure only. numpy is used by the mlx-audio
+    # verification scripts (scripts/verify_mlx_tts_api.py) and by tests that
+    # explicitly opt into it. It is intentionally NOT a runtime dependency:
+    # runtime package code (ToneBackend, the float->pcm16 converter) stays
+    # stdlib-only so the lean base never requires a dev-only dependency.
+    "numpy>=1.26",
+]
+[project.urls]
+Homepage = "https://github.com/vr000m/pipecat-local-tts-server"
+Repository = "https://github.com/vr000m/pipecat-local-tts-server"
+Issues = "https://github.com/vr000m/pipecat-local-tts-server/issues"
+[build-system]
+requires = ["uv_build>=0.9,<0.10"]
+build-backend = "uv_build"
+[tool.uv.build-backend]
+# Import name stays `tts_server` so every `import tts_server` callsite, every
+# test, and the `python -m tts_server` invocation work without a rename.
+module-name = "tts_server"
+module-root = ""
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]
+# Verification scripts under scripts/ are importable under pytest (mirrors the
+# stt repo), so tests that drive them can `from scripts... import ...`.
+pythonpath = ["."]
+[tool.ruff]
+target-version = "py312"
+line-length = 100