voxcaster 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 ibrahim Alfa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,261 @@
1
+ Metadata-Version: 2.4
2
+ Name: voxcaster
3
+ Version: 0.2.0
4
+ Summary: Local text-to-speech daemon and CLI for speech notifications from scripts, builds, cron jobs, and ML training runs, powered by Kokoro TTS with GPU offload and espeak fallback.
5
+ Author: ibrahim Alfa
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/I-Alpha/speakd
8
+ Project-URL: Repository, https://github.com/I-Alpha/speakd
9
+ Project-URL: Issues, https://github.com/I-Alpha/speakd/issues
10
+ Project-URL: Changelog, https://github.com/I-Alpha/speakd/blob/master/CHANGELOG.md
11
+ Keywords: tts,text-to-speech,speech-synthesis,speech-notifications,kokoro,kokoro-tts,daemon,unix-socket,cli,narration,notifications,machine-learning,gpu,cuda,developer-tools
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: No Input/Output (Daemon)
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Operating System :: MacOS
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
23
+ Classifier: Topic :: System :: Monitoring
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: kokoro>=0.9
28
+ Requires-Dist: soundfile>=0.12
29
+ Requires-Dist: numpy>=1.24
30
+ Requires-Dist: tomli>=2.0; python_version < "3.11"
31
+ Dynamic: license-file
32
+
33
+ # speakd
34
+
35
+ [![PyPI](https://img.shields.io/pypi/v/voxcaster)](https://pypi.org/project/voxcaster/)
36
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/voxcaster)](https://pypi.org/project/voxcaster/)
37
+
38
+ **A local text-to-speech daemon and CLI for speech notifications from
39
+ long-running jobs.**
40
+
41
+ `speakd` is a Python TTS daemon powered by
42
+ [Kokoro](https://github.com/hexgrad/kokoro) (a fast, high-quality local
43
+ text-to-speech model). Shell scripts, machine-learning training runs, builds,
44
+ cron jobs, CI hooks, and Python programs can send fire-and-forget speech
45
+ notifications over a Unix socket; the caller returns in about a millisecond
46
+ while the daemon queues, synthesizes, and plays each line in order. If anything
47
+ in the audio stack fails, the line degrades to espeak instead of disappearing.
48
+
49
+ It was built to narrate machine-learning training runs on a single-GPU
50
+ workstation, which shaped its defining feature: **the TTS model dynamically
51
+ offloads itself from the GPU** when narration goes quiet, so it never holds
52
+ VRAM hostage from the workload it is narrating.
53
+
54
+ ```
55
+ $ pip install voxcaster
56
+ $ speak "training started" # daemon auto-spawns on first use
57
+ $ speak --interrupt "loss is NaN" # cuts off whatever is playing, speaks NOW
58
+ $ make 2>&1 | tail -1 | speak # pipe-friendly
59
+ ```
60
+
61
+ ## Use cases
62
+
63
+ - Add voice alerts to machine-learning training runs when epochs finish,
64
+ checkpoints save, loss becomes NaN, or jobs crash.
65
+ - Turn shell scripts, Makefiles, cron jobs, and CI hooks into spoken status
66
+ updates.
67
+ - Use Kokoro TTS locally without blocking the process that asked for speech.
68
+ - Share one text-to-speech queue across multiple processes so messages do not
69
+ overlap.
70
+ - Release GPU VRAM after narration bursts with dynamic CPU/GPU offload.
71
+
72
+ ## Why a daemon?
73
+
74
+ Calling a TTS library inline is the obvious approach and the wrong one for
75
+ narration: it blocks the caller for seconds per line, loads a model per
76
+ process, and overlapping lines talk over each other. `speakd` inverts this:
77
+
78
+ - **~1 ms per call.** The client writes one line to a Unix socket and returns.
79
+ Narration can sit inside hot loops and signal handlers.
80
+ - **One model, one queue.** A single daemon owns the model and serialises
81
+ playback. Ten processes can narrate concurrently without crosstalk.
82
+ - **Failure-proof by design.** Daemon down? The client spawns it. Spawn fails?
83
+ espeak fallback. No audio at all? The caller still never raises.
84
+
85
+ ## Architecture
86
+
87
+ ```
88
+ any process, any language speakd daemon (one per socket, flock-enforced)
89
+ ┌──────────────────────┐ ┌───────────────────────────────────────────────┐
90
+ │ speak "epoch done" │──┐ │ asyncio Unix-socket server │
91
+ └──────────────────────┘ │ │ │ │
92
+ ┌──────────────────────┐ │ UTF-8 │ ├── volume msg ──▶ live volume │
93
+ │ Python: speak(...) │──┼─ line ─▶│ ├── interrupt ───▶ drain queue + │
94
+ └──────────────────────┘ │ over │ │ kill playback │
95
+ ┌──────────────────────┐ │ socket │ ▼ │
96
+ │ CI job, cron, hook │──┘ │ FIFO queue ──▶ worker (thread executor) │
97
+ └──────────────────────┘ │ │ │
98
+ ▲ │ ▼ │
99
+ │ "OK\n" ack │ Kokoro TTS ──▶ wav ──▶ mpv ──▶ 🔊 │
100
+ │ (blocking mode only) │ CPU ⇄ GPU │
101
+ └─────────────────────────│ (offloads after idle keepalive) │
102
+ │ │
103
+ │ any failure ──▶ espeak fallback │
104
+ └───────────────────────────────────────────────┘
105
+ ```
106
+
107
+ ## Features
108
+
109
+ - **Fire-and-forget socket design** — newline-terminated UTF-8 over a Unix
110
+ domain socket; trivially scriptable from any language. Optional `OK` ack
111
+ for blocking callers.
112
+ - **Dynamic GPU offload with keepalive** — the model loads on CPU, hops onto
113
+ the GPU for narration bursts, and releases its VRAM (~3 GB) after a
114
+ configurable idle period. If the GPU is full (another job grabbed it), that
115
+ request simply synthesizes on CPU instead of failing.
116
+ - **Interrupt protocol** — an urgent line drains the pending queue, kills
117
+ in-flight playback mid-word, and speaks immediately.
118
+ - **Live volume control** — one socket message, applies from the next line;
119
+ no restart.
120
+ - **Singleton via `flock(2)`** — clients can race to auto-spawn the daemon;
121
+ exactly one wins, the rest exit cleanly. Stale sockets are detected and
122
+ removed on startup.
123
+ - **Graceful fallback** — Kokoro import error, synthesis failure, playback
124
+ failure, or daemon unreachable: the line is spoken by espeak and the event
125
+ is logged. Narration degrades; it never silently vanishes.
126
+ - **One TOML file, env-var overrides, zero-config defaults** — works out of
127
+ the box on CPU with no config file at all.
128
+
129
+ ## Requirements
130
+
131
+ - Linux or macOS (Unix sockets + `flock`), Python ≥ 3.10
132
+ - [mpv](https://mpv.io/) for playback (`apt install mpv`) — or any player,
133
+ via config
134
+ - [espeak](https://espeak.sourceforge.net/) for the fallback voice
135
+ (`apt install espeak`) — optional but recommended
136
+ - A CUDA-capable GPU is **optional**; everything works on CPU
137
+
138
+ ## Install
139
+
140
+ ```bash
141
+ pip install voxcaster
142
+ ```
143
+
144
+ This installs the `kokoro` TTS package (which pulls in PyTorch) and two
145
+ console commands: `voxcaster` (the daemon) and `speak` (the client).
146
+
147
+ To install from source:
148
+
149
+ ```bash
150
+ git clone https://github.com/I-Alpha/speakd && cd speakd
151
+ pip install .
152
+ ```
153
+
154
+ ## Quickstart
155
+
156
+ ```bash
157
+ # 1. Just speak — the daemon auto-spawns on first use:
158
+ speak "hello from speakd"
159
+
160
+ # 2. Or run the daemon in the foreground to watch it work:
161
+ voxcaster --device cpu --voice af_heart
162
+
163
+ # 3. Script it:
164
+ speak --blocking "waits until this has been spoken"
165
+ speak --interrupt "queue drained, this plays immediately"
166
+ speak --volume 60 "quieter from now on"
167
+ echo "pipes work too" | speak
168
+ ```
169
+
170
+ From Python:
171
+
172
+ ```python
173
+ from voxcaster import speak, set_volume
174
+
175
+ speak("checkpoint saved") # ~1 ms, non-blocking
176
+ speak("eval finished", blocking=True) # wait until spoken
177
+ speak("loss is NaN — stopping", interrupt=True) # jump the queue
178
+ set_volume(85)
179
+ ```
180
+
181
+ See [`examples/`](examples/) for runnable demos of narration, interrupts,
182
+ and volume control.
183
+
184
+ ## Configuration
185
+
186
+ Defaults work with no config at all. To customise, copy
187
+ [`config.example.toml`](config.example.toml) to `~/.config/speakd/config.toml`
188
+ (or point `$SPEAKD_CONFIG` at any path). Environment variables override the
189
+ file; CLI flags override both.
190
+
191
+ | TOML key | Env override | Default | Meaning |
192
+ |---|---|---|---|
193
+ | `tts.voice` | `SPEAKD_VOICE` | `af_heart` | Kokoro voice id (`af_*`, `am_*`, `bf_*`, `bm_*`, ...) |
194
+ | `tts.speed` | `SPEAKD_SPEED` | `1.0` | Speech-rate multiplier |
195
+ | `tts.lang_code` | `SPEAKD_LANG` | `a` | Kokoro language code (`a` US English, `b` UK English) |
196
+ | `device.policy` | `SPEAKD_DEVICE` | `auto` | `auto` (dynamic offload) / `cpu` / `gpu` |
197
+ | `device.keepalive_seconds` | `SPEAKD_KEEPALIVE` | `180` | Idle seconds before GPU→CPU offload |
198
+ | `daemon.socket_path` | `SPEAKD_SOCKET` | `$XDG_RUNTIME_DIR/speakd.sock` | Unix socket path |
199
+ | `daemon.socket_mode` | — | `"600"` | Octal permissions on the socket file |
200
+ | `daemon.log_file` | `SPEAKD_LOG_FILE` | `~/.local/state/speakd/daemon.log` | Log target for auto-spawned daemons |
201
+ | `audio.volume` | `SPEAKD_VOLUME` | `100` | Playback volume `0–130` (mpv scale) |
202
+ | `audio.max_playback_seconds` | — | `120` | Kill a single line's playback after this |
203
+ | `audio.player` | — | mpv template | Player argv; `{file}` and `{volume}` are substituted |
204
+ | `fallback.command` | — | espeak template | Fallback argv; `{text}` is substituted; `[]` disables |
205
+ | `client.connect_timeout` | — | `0.5` | Socket connect/send timeout (s) |
206
+ | `client.ack_timeout` | — | `300.0` | `--blocking` wait for the spoken-ack (s) |
207
+ | `client.spawn_wait` | — | `4.0` | Wait for an auto-spawned daemon (s) |
208
+
209
+ `voxcaster --print-config` shows the fully-resolved effective configuration.
210
+
211
+ ## Wire protocol
212
+
213
+ One newline-terminated UTF-8 line per connection — easy to speak from any
214
+ language without a client library:
215
+
216
+ | Message | Bytes | Effect |
217
+ |---|---|---|
218
+ | Speak | `<text>\n` | Queue the line; daemon replies `OK\n` when spoken |
219
+ | Interrupt | `\x01INTERRUPT\x01<text>\n` | Drain queue, kill playback, speak now |
220
+ | Volume | `\x02VOLUME\x02<int>\n` | Set live volume (0–130) |
221
+
222
+ ```bash
223
+ # speak from raw shell, no client needed:
224
+ printf 'hello from netcat\n' | nc -U "$XDG_RUNTIME_DIR/speakd.sock"
225
+ ```
226
+
227
+ The control markers are ASCII SOH/STX characters that cannot occur in normal
228
+ text, so no escaping is ever needed.
229
+
230
+ ## GPU offload in detail
231
+
232
+ The `auto` policy exists for machines where the GPU has a day job:
233
+
234
+ 1. The model loads on **CPU** at first request.
235
+ 2. Each synthesis tries to move it to the **GPU** first (a few hundred ms,
236
+ then synthesis is much faster). If CUDA is busy or OOM, that line
237
+ synthesizes on CPU — no error, just slower.
238
+ 3. After `keepalive_seconds` (default 180 s) without a request, an idle timer
239
+ moves the model back to **CPU** and calls `torch.cuda.empty_cache()`,
240
+ releasing the VRAM.
241
+
242
+ The effect: during an active narration burst the voice is snappy and
243
+ GPU-accelerated; ten minutes into a silent stretch, your training job has its
244
+ VRAM back. All device moves are serialised with synthesis under one lock, so
245
+ the model can never be moved mid-utterance.
246
+
247
+ ## Troubleshooting
248
+
249
+ | Symptom | Likely cause / fix |
250
+ |---|---|
251
+ | `speak` says *fallback engine used* | Daemon failed to start — check `~/.local/state/speakd/daemon.log`. Most common: `kokoro` not installed in the Python that spawned it (set `SPEAKD_DAEMON_CMD="/path/to/python -m voxcaster.daemon"`). |
252
+ | No audio, no errors | Is `mpv` installed and does it play a wav from your terminal? Swap `audio.player` if you use a different player. |
253
+ | First line is slow | Cold start: model weights load on first request (a few seconds). Subsequent lines are fast. |
254
+ | Robotic voice instead of Kokoro | That *is* the espeak fallback working as designed — see the first row. |
255
+ | Two daemons after a crash | They cannot coexist: the flock singleton makes the second exit immediately, and stale sockets are cleaned on startup. Delete `<socket>.lock` only if a machine crash left it owned by a dead PID holder (flock releases on process death, so this is near-impossible). |
256
+ | `daemon already running (pid N)` | Working as intended — the running daemon serves all clients. |
257
+ | GPU memory not released | The model offloads after `device.keepalive_seconds` of *no requests*; lower it, or run with `--device cpu`. |
258
+
259
+ ## License
260
+
261
+ [MIT](LICENSE) © 2026 ibrahim Alfa
@@ -0,0 +1,229 @@
1
+ # speakd
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/voxcaster)](https://pypi.org/project/voxcaster/)
4
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/voxcaster)](https://pypi.org/project/voxcaster/)
5
+
6
+ **A local text-to-speech daemon and CLI for speech notifications from
7
+ long-running jobs.**
8
+
9
+ `speakd` is a Python TTS daemon powered by
10
+ [Kokoro](https://github.com/hexgrad/kokoro) (a fast, high-quality local
11
+ text-to-speech model). Shell scripts, machine-learning training runs, builds,
12
+ cron jobs, CI hooks, and Python programs can send fire-and-forget speech
13
+ notifications over a Unix socket; the caller returns in about a millisecond
14
+ while the daemon queues, synthesizes, and plays each line in order. If anything
15
+ in the audio stack fails, the line degrades to espeak instead of disappearing.
16
+
17
+ It was built to narrate machine-learning training runs on a single-GPU
18
+ workstation, which shaped its defining feature: **the TTS model dynamically
19
+ offloads itself from the GPU** when narration goes quiet, so it never holds
20
+ VRAM hostage from the workload it is narrating.
21
+
22
+ ```
23
+ $ pip install voxcaster
24
+ $ speak "training started" # daemon auto-spawns on first use
25
+ $ speak --interrupt "loss is NaN" # cuts off whatever is playing, speaks NOW
26
+ $ make 2>&1 | tail -1 | speak # pipe-friendly
27
+ ```
28
+
29
+ ## Use cases
30
+
31
+ - Add voice alerts to machine-learning training runs when epochs finish,
32
+ checkpoints save, loss becomes NaN, or jobs crash.
33
+ - Turn shell scripts, Makefiles, cron jobs, and CI hooks into spoken status
34
+ updates.
35
+ - Use Kokoro TTS locally without blocking the process that asked for speech.
36
+ - Share one text-to-speech queue across multiple processes so messages do not
37
+ overlap.
38
+ - Release GPU VRAM after narration bursts with dynamic CPU/GPU offload.
39
+
40
+ ## Why a daemon?
41
+
42
+ Calling a TTS library inline is the obvious approach and the wrong one for
43
+ narration: it blocks the caller for seconds per line, loads a model per
44
+ process, and overlapping lines talk over each other. `speakd` inverts this:
45
+
46
+ - **~1 ms per call.** The client writes one line to a Unix socket and returns.
47
+ Narration can sit inside hot loops and signal handlers.
48
+ - **One model, one queue.** A single daemon owns the model and serialises
49
+ playback. Ten processes can narrate concurrently without crosstalk.
50
+ - **Failure-proof by design.** Daemon down? The client spawns it. Spawn fails?
51
+ espeak fallback. No audio at all? The caller still never raises.
52
+
53
+ ## Architecture
54
+
55
+ ```
56
+ any process, any language speakd daemon (one per socket, flock-enforced)
57
+ ┌──────────────────────┐ ┌───────────────────────────────────────────────┐
58
+ │ speak "epoch done" │──┐ │ asyncio Unix-socket server │
59
+ └──────────────────────┘ │ │ │ │
60
+ ┌──────────────────────┐ │ UTF-8 │ ├── volume msg ──▶ live volume │
61
+ │ Python: speak(...) │──┼─ line ─▶│ ├── interrupt ───▶ drain queue + │
62
+ └──────────────────────┘ │ over │ │ kill playback │
63
+ ┌──────────────────────┐ │ socket │ ▼ │
64
+ │ CI job, cron, hook │──┘ │ FIFO queue ──▶ worker (thread executor) │
65
+ └──────────────────────┘ │ │ │
66
+ ▲ │ ▼ │
67
+ │ "OK\n" ack │ Kokoro TTS ──▶ wav ──▶ mpv ──▶ 🔊 │
68
+ │ (blocking mode only) │ CPU ⇄ GPU │
69
+ └─────────────────────────│ (offloads after idle keepalive) │
70
+ │ │
71
+ │ any failure ──▶ espeak fallback │
72
+ └───────────────────────────────────────────────┘
73
+ ```
74
+
75
+ ## Features
76
+
77
+ - **Fire-and-forget socket design** — newline-terminated UTF-8 over a Unix
78
+ domain socket; trivially scriptable from any language. Optional `OK` ack
79
+ for blocking callers.
80
+ - **Dynamic GPU offload with keepalive** — the model loads on CPU, hops onto
81
+ the GPU for narration bursts, and releases its VRAM (~3 GB) after a
82
+ configurable idle period. If the GPU is full (another job grabbed it), that
83
+ request simply synthesizes on CPU instead of failing.
84
+ - **Interrupt protocol** — an urgent line drains the pending queue, kills
85
+ in-flight playback mid-word, and speaks immediately.
86
+ - **Live volume control** — one socket message, applies from the next line;
87
+ no restart.
88
+ - **Singleton via `flock(2)`** — clients can race to auto-spawn the daemon;
89
+ exactly one wins, the rest exit cleanly. Stale sockets are detected and
90
+ removed on startup.
91
+ - **Graceful fallback** — Kokoro import error, synthesis failure, playback
92
+ failure, or daemon unreachable: the line is spoken by espeak and the event
93
+ is logged. Narration degrades; it never silently vanishes.
94
+ - **One TOML file, env-var overrides, zero-config defaults** — works out of
95
+ the box on CPU with no config file at all.
96
+
97
+ ## Requirements
98
+
99
+ - Linux or macOS (Unix sockets + `flock`), Python ≥ 3.10
100
+ - [mpv](https://mpv.io/) for playback (`apt install mpv`) — or any player,
101
+ via config
102
+ - [espeak](https://espeak.sourceforge.net/) for the fallback voice
103
+ (`apt install espeak`) — optional but recommended
104
+ - A CUDA-capable GPU is **optional**; everything works on CPU
105
+
106
+ ## Install
107
+
108
+ ```bash
109
+ pip install voxcaster
110
+ ```
111
+
112
+ This installs the `kokoro` TTS package (which pulls in PyTorch) and two
113
+ console commands: `voxcaster` (the daemon) and `speak` (the client).
114
+
115
+ To install from source:
116
+
117
+ ```bash
118
+ git clone https://github.com/I-Alpha/speakd && cd speakd
119
+ pip install .
120
+ ```
121
+
122
+ ## Quickstart
123
+
124
+ ```bash
125
+ # 1. Just speak — the daemon auto-spawns on first use:
126
+ speak "hello from speakd"
127
+
128
+ # 2. Or run the daemon in the foreground to watch it work:
129
+ voxcaster --device cpu --voice af_heart
130
+
131
+ # 3. Script it:
132
+ speak --blocking "waits until this has been spoken"
133
+ speak --interrupt "queue drained, this plays immediately"
134
+ speak --volume 60 "quieter from now on"
135
+ echo "pipes work too" | speak
136
+ ```
137
+
138
+ From Python:
139
+
140
+ ```python
141
+ from voxcaster import speak, set_volume
142
+
143
+ speak("checkpoint saved") # ~1 ms, non-blocking
144
+ speak("eval finished", blocking=True) # wait until spoken
145
+ speak("loss is NaN — stopping", interrupt=True) # jump the queue
146
+ set_volume(85)
147
+ ```
148
+
149
+ See [`examples/`](examples/) for runnable demos of narration, interrupts,
150
+ and volume control.
151
+
152
+ ## Configuration
153
+
154
+ Defaults work with no config at all. To customise, copy
155
+ [`config.example.toml`](config.example.toml) to `~/.config/speakd/config.toml`
156
+ (or point `$SPEAKD_CONFIG` at any path). Environment variables override the
157
+ file; CLI flags override both.
158
+
159
+ | TOML key | Env override | Default | Meaning |
160
+ |---|---|---|---|
161
+ | `tts.voice` | `SPEAKD_VOICE` | `af_heart` | Kokoro voice id (`af_*`, `am_*`, `bf_*`, `bm_*`, ...) |
162
+ | `tts.speed` | `SPEAKD_SPEED` | `1.0` | Speech-rate multiplier |
163
+ | `tts.lang_code` | `SPEAKD_LANG` | `a` | Kokoro language code (`a` US English, `b` UK English) |
164
+ | `device.policy` | `SPEAKD_DEVICE` | `auto` | `auto` (dynamic offload) / `cpu` / `gpu` |
165
+ | `device.keepalive_seconds` | `SPEAKD_KEEPALIVE` | `180` | Idle seconds before GPU→CPU offload |
166
+ | `daemon.socket_path` | `SPEAKD_SOCKET` | `$XDG_RUNTIME_DIR/speakd.sock` | Unix socket path |
167
+ | `daemon.socket_mode` | — | `"600"` | Octal permissions on the socket file |
168
+ | `daemon.log_file` | `SPEAKD_LOG_FILE` | `~/.local/state/speakd/daemon.log` | Log target for auto-spawned daemons |
169
+ | `audio.volume` | `SPEAKD_VOLUME` | `100` | Playback volume `0–130` (mpv scale) |
170
+ | `audio.max_playback_seconds` | — | `120` | Kill a single line's playback after this |
171
+ | `audio.player` | — | mpv template | Player argv; `{file}` and `{volume}` are substituted |
172
+ | `fallback.command` | — | espeak template | Fallback argv; `{text}` is substituted; `[]` disables |
173
+ | `client.connect_timeout` | — | `0.5` | Socket connect/send timeout (s) |
174
+ | `client.ack_timeout` | — | `300.0` | `--blocking` wait for the spoken-ack (s) |
175
+ | `client.spawn_wait` | — | `4.0` | Wait for an auto-spawned daemon (s) |
176
+
177
+ `voxcaster --print-config` shows the fully-resolved effective configuration.
178
+
179
+ ## Wire protocol
180
+
181
+ One newline-terminated UTF-8 line per connection — easy to speak from any
182
+ language without a client library:
183
+
184
+ | Message | Bytes | Effect |
185
+ |---|---|---|
186
+ | Speak | `<text>\n` | Queue the line; daemon replies `OK\n` when spoken |
187
+ | Interrupt | `\x01INTERRUPT\x01<text>\n` | Drain queue, kill playback, speak now |
188
+ | Volume | `\x02VOLUME\x02<int>\n` | Set live volume (0–130) |
189
+
190
+ ```bash
191
+ # speak from raw shell, no client needed:
192
+ printf 'hello from netcat\n' | nc -U "$XDG_RUNTIME_DIR/speakd.sock"
193
+ ```
194
+
195
+ The control markers are ASCII SOH/STX characters that cannot occur in normal
196
+ text, so no escaping is ever needed.
197
+
198
+ ## GPU offload in detail
199
+
200
+ The `auto` policy exists for machines where the GPU has a day job:
201
+
202
+ 1. The model loads on **CPU** at first request.
203
+ 2. Each synthesis tries to move it to the **GPU** first (a few hundred ms,
204
+ then synthesis is much faster). If CUDA is busy or OOM, that line
205
+ synthesizes on CPU — no error, just slower.
206
+ 3. After `keepalive_seconds` (default 180 s) without a request, an idle timer
207
+ moves the model back to **CPU** and calls `torch.cuda.empty_cache()`,
208
+ releasing the VRAM.
209
+
210
+ The effect: during an active narration burst the voice is snappy and
211
+ GPU-accelerated; ten minutes into a silent stretch, your training job has its
212
+ VRAM back. All device moves are serialised with synthesis under one lock, so
213
+ the model can never be moved mid-utterance.
214
+
215
+ ## Troubleshooting
216
+
217
+ | Symptom | Likely cause / fix |
218
+ |---|---|
219
+ | `speak` says *fallback engine used* | Daemon failed to start — check `~/.local/state/speakd/daemon.log`. Most common: `kokoro` not installed in the Python that spawned it (set `SPEAKD_DAEMON_CMD="/path/to/python -m voxcaster.daemon"`). |
220
+ | No audio, no errors | Is `mpv` installed and does it play a wav from your terminal? Swap `audio.player` if you use a different player. |
221
+ | First line is slow | Cold start: model weights load on first request (a few seconds). Subsequent lines are fast. |
222
+ | Robotic voice instead of Kokoro | That *is* the espeak fallback working as designed — see the first row. |
223
+ | Two daemons after a crash | They cannot coexist: the flock singleton makes the second exit immediately, and stale sockets are cleaned on startup. Delete `<socket>.lock` only if a machine crash left it owned by a dead PID holder (flock releases on process death, so this is near-impossible). |
224
+ | `daemon already running (pid N)` | Working as intended — the running daemon serves all clients. |
225
+ | GPU memory not released | The model offloads after `device.keepalive_seconds` of *no requests*; lower it, or run with `--device cpu`. |
226
+
227
+ ## License
228
+
229
+ [MIT](LICENSE) © 2026 ibrahim Alfa
@@ -0,0 +1,62 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "voxcaster"
7
+ version = "0.2.0"
8
+ description = "Local text-to-speech daemon and CLI for speech notifications from scripts, builds, cron jobs, and ML training runs, powered by Kokoro TTS with GPU offload and espeak fallback."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "ibrahim Alfa" }]
12
+ requires-python = ">=3.10"
13
+ keywords = [
14
+ "tts",
15
+ "text-to-speech",
16
+ "speech-synthesis",
17
+ "speech-notifications",
18
+ "kokoro",
19
+ "kokoro-tts",
20
+ "daemon",
21
+ "unix-socket",
22
+ "cli",
23
+ "narration",
24
+ "notifications",
25
+ "machine-learning",
26
+ "gpu",
27
+ "cuda",
28
+ "developer-tools",
29
+ ]
30
+ classifiers = [
31
+ "Development Status :: 4 - Beta",
32
+ "Environment :: No Input/Output (Daemon)",
33
+ "Intended Audience :: Developers",
34
+ "License :: OSI Approved :: MIT License",
35
+ "Operating System :: POSIX :: Linux",
36
+ "Operating System :: MacOS",
37
+ "Programming Language :: Python :: 3",
38
+ "Programming Language :: Python :: 3.10",
39
+ "Programming Language :: Python :: 3.11",
40
+ "Programming Language :: Python :: 3.12",
41
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
42
+ "Topic :: System :: Monitoring",
43
+ ]
44
+ dependencies = [
45
+ "kokoro>=0.9",
46
+ "soundfile>=0.12",
47
+ "numpy>=1.24",
48
+ "tomli>=2.0; python_version < '3.11'",
49
+ ]
50
+
51
+ [project.urls]
52
+ Homepage = "https://github.com/I-Alpha/speakd"
53
+ Repository = "https://github.com/I-Alpha/speakd"
54
+ Issues = "https://github.com/I-Alpha/speakd/issues"
55
+ Changelog = "https://github.com/I-Alpha/speakd/blob/master/CHANGELOG.md"
56
+
57
+ [project.scripts]
58
+ voxcaster = "voxcaster.daemon:main"
59
+ speak = "voxcaster.client:main"
60
+
61
+ [tool.setuptools.packages.find]
62
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,47 @@
1
"""speakd — fire-and-forget local TTS narration over a Unix socket.

The distribution and the import package are both named ``voxcaster``.

A small daemon that turns text lines into speech with `Kokoro
<https://github.com/hexgrad/kokoro>`_, plus a zero-dependency client.
Designed for narrating long-running work (training runs, builds, pipelines)
without ever blocking or crashing the thing doing the work.

Quickstart::

    from voxcaster import speak
    speak("experiment finished")            # fire-and-forget
    speak("loss is NaN — stopping", interrupt=True)
"""
from typing import TYPE_CHECKING

# Must match ``version`` in pyproject.toml; bump both together on release.
__version__ = "0.2.0"

if TYPE_CHECKING:  # real imports for type checkers / IDEs
    from .client import ensure_daemon, ping, set_volume, speak
    from .config import Config, load_config
    from .markdown import extract_tts_summary, preprocess_for_speech, strip_markdown

__all__ = [
    "speak", "ping", "set_volume", "ensure_daemon",
    "Config", "load_config",
    "strip_markdown", "extract_tts_summary", "preprocess_for_speech",
    "__version__",
]

# Public names grouped by the submodule that defines them. The module-level
# ``__getattr__`` consults these tuples to pick which submodule to import
# on first access, so nothing below is imported eagerly at runtime.
_CLIENT_ATTRS = ("speak", "ping", "set_volume", "ensure_daemon")
_CONFIG_ATTRS = ("Config", "load_config")
_MARKDOWN_ATTRS = ("strip_markdown", "extract_tts_summary", "preprocess_for_speech")
33
+
34
+
35
def __getattr__(name: str):
    """Resolve a re-exported public name on first access (PEP 562).

    The submodule that defines ``name`` is imported only when the attribute
    is actually requested. This keeps importing the package itself cheap and
    avoids eagerly loading submodules that are re-executed when they are run
    with ``python -m <package>.<mod>``.

    Raises:
        AttributeError: if ``name`` is not one of the lazily re-exported names.
    """
    if name in _CLIENT_ATTRS:
        from . import client as owner
    elif name in _CONFIG_ATTRS:
        from . import config as owner
    elif name in _MARKDOWN_ATTRS:
        from . import markdown as owner
    else:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return getattr(owner, name)