voicecc 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/.claude-plugin/plugin.json +6 -0
  2. package/README.md +48 -0
  3. package/bin/voicecc.js +39 -0
  4. package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
  5. package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
  6. package/dashboard/dist/audio-processor.js +126 -0
  7. package/dashboard/dist/index.html +13 -0
  8. package/dashboard/routes/auth.ts +119 -0
  9. package/dashboard/routes/browser-call.ts +87 -0
  10. package/dashboard/routes/claude-md.ts +50 -0
  11. package/dashboard/routes/conversations.ts +203 -0
  12. package/dashboard/routes/integrations.ts +154 -0
  13. package/dashboard/routes/mcp-servers.ts +198 -0
  14. package/dashboard/routes/settings.ts +64 -0
  15. package/dashboard/routes/tunnel.ts +66 -0
  16. package/dashboard/routes/twilio.ts +120 -0
  17. package/dashboard/routes/voice.ts +48 -0
  18. package/dashboard/routes/webrtc.ts +85 -0
  19. package/dashboard/server.ts +130 -0
  20. package/dashboard/tsconfig.json +13 -0
  21. package/init/CLAUDE.md +18 -0
  22. package/package.json +59 -0
  23. package/run.ts +68 -0
  24. package/scripts/postinstall.js +228 -0
  25. package/services/browser-call-manager.ts +106 -0
  26. package/services/device-pairing.ts +176 -0
  27. package/services/env.ts +88 -0
  28. package/services/tunnel.ts +204 -0
  29. package/services/twilio-manager.ts +126 -0
  30. package/sidecar/assets/startup.pcm +0 -0
  31. package/sidecar/audio-adapter.ts +60 -0
  32. package/sidecar/audio-capture.ts +220 -0
  33. package/sidecar/browser-audio-playback.test.ts +149 -0
  34. package/sidecar/browser-audio.ts +147 -0
  35. package/sidecar/browser-server.ts +331 -0
  36. package/sidecar/chime.test.ts +69 -0
  37. package/sidecar/chime.ts +54 -0
  38. package/sidecar/claude-session.ts +295 -0
  39. package/sidecar/endpointing.ts +163 -0
  40. package/sidecar/index.ts +83 -0
  41. package/sidecar/local-audio.ts +126 -0
  42. package/sidecar/mic-vpio +0 -0
  43. package/sidecar/mic-vpio.swift +484 -0
  44. package/sidecar/mock-tts-server-tagged.mjs +132 -0
  45. package/sidecar/narration.ts +204 -0
  46. package/sidecar/scripts/generate-startup-audio.py +79 -0
  47. package/sidecar/session-lock.ts +123 -0
  48. package/sidecar/sherpa-onnx-node.d.ts +4 -0
  49. package/sidecar/stt.ts +199 -0
  50. package/sidecar/tts-server.py +193 -0
  51. package/sidecar/tts.ts +481 -0
  52. package/sidecar/twilio-audio.ts +338 -0
  53. package/sidecar/twilio-server.ts +436 -0
  54. package/sidecar/types.ts +210 -0
  55. package/sidecar/vad.ts +101 -0
  56. package/sidecar/voice-loop-bugs.test.ts +522 -0
  57. package/sidecar/voice-session.ts +523 -0
  58. package/skills/voice/SKILL.md +26 -0
  59. package/tsconfig.json +22 -0
@@ -0,0 +1,193 @@
1
+ """
2
+ Persistent Python TTS subprocess for the voice sidecar.
3
+
4
+ Loads a Kokoro (or other mlx-audio) model once on the Apple Silicon GPU,
5
+ then accepts JSON commands on stdin and writes length-prefixed raw PCM
6
+ audio to stdout.
7
+
8
+ Responsibilities:
9
+ - Load the TTS model on startup via mlx-audio
10
+ - Accept generate/interrupt/quit commands on stdin (JSON lines)
11
+ - Stream raw 16-bit signed PCM audio chunks to stdout (length-prefixed)
12
+ - Support interruption of in-progress generation
13
+
14
+ Protocol:
15
+ stdin (JSON lines):
16
+ {"cmd": "generate", "text": "Hello world"}
17
+ {"cmd": "interrupt"}
18
+ {"cmd": "quit"}
19
+
20
+ stdout (binary, length-prefixed):
21
+ [4 bytes uint32 BE = chunk length] [N bytes raw int16 PCM at 24kHz mono]
22
+ [4 bytes 0x00000000] = end of generation
23
+
24
+ stderr (text lines):
25
+ READY
26
+ ERROR: <message>
27
+ (plus any log output)
28
+ """
29
+
30
+ import sys
31
+ import json
32
+ import struct
33
+ import signal
34
+ import threading
35
+ import queue
36
+ import numpy as np
37
+
38
+ # ============================================================================
39
+ # CONSTANTS
40
+ # ============================================================================
41
+
42
+ SAMPLE_RATE = 24000
43
+ DEFAULT_MODEL = "prince-canuma/Kokoro-82M"
44
+ DEFAULT_VOICE = "af_heart"
45
+
46
+ # ============================================================================
47
+ # MAIN HANDLERS
48
+ # ============================================================================
49
+
50
def main():
    """
    Load the model, signal readiness, then serve commands until told to quit.

    Command-line arguments (both optional):
      argv[1] - model ID (defaults to DEFAULT_MODEL)
      argv[2] - voice ID (defaults to DEFAULT_VOICE)

    A background thread reads JSON lines from stdin so that "interrupt" can
    take effect while the main thread is busy inside handle_generate. All
    other commands are queued and processed in order on the main thread.
    """
    model_id = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_MODEL
    voice = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_VOICE

    # Load model
    log(f"Loading model: {model_id}")
    try:
        from mlx_audio.tts.utils import load_model
        model = load_model(model_id)
    except Exception as e:
        log(f"ERROR: Failed to load model: {e}")
        sys.exit(1)

    log(f"Model loaded (sample_rate={model.sample_rate})")

    # Warm-up: run one short generation to prime the GPU pipeline
    log("Warming up...")
    try:
        for _ in model.generate(text="Hello.", voice=voice):
            pass
        log("Warm-up done")
    except Exception as e:
        # A failed warm-up is not fatal; the first real request will just be slower.
        log(f"WARNING: Warm-up failed: {e}")

    # Signal readiness (the parent waits for this exact token on stderr)
    sys.stderr.write("READY\n")
    sys.stderr.flush()

    # State shared between stdin reader thread and main thread
    interrupted = threading.Event()
    command_queue = queue.Queue()

    # Ignore SIGINT — let the parent Node.js process handle it
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Read stdin on a background thread so interrupt commands are processed
    # immediately, even while handle_generate is running on the main thread.
    def stdin_reader():
        try:
            for line in sys.stdin:
                line = line.strip()
                if not line:
                    continue
                try:
                    cmd = json.loads(line)
                except json.JSONDecodeError as e:
                    log(f"ERROR: Invalid JSON: {e}")
                    continue

                # Valid JSON that is not an object (e.g. a list or a number)
                # has no .get(); reject it instead of crashing this thread.
                if not isinstance(cmd, dict):
                    log(f"ERROR: Command must be a JSON object, got {type(cmd).__name__}")
                    continue

                if cmd.get("cmd") == "interrupt":
                    interrupted.set()
                else:
                    command_queue.put(cmd)
        finally:
            # stdin reached EOF (or the reader failed): the parent is gone or
            # has closed the pipe. Stop any running generation and unblock the
            # main loop so the process exits instead of waiting forever.
            interrupted.set()
            command_queue.put({"cmd": "quit"})

    reader = threading.Thread(target=stdin_reader, daemon=True)
    reader.start()

    # Main thread: process generate/quit commands from the queue
    while True:
        cmd = command_queue.get()
        name = cmd.get("cmd")

        if name == "generate":
            text = cmd.get("text", "")
            if not isinstance(text, str):
                # The reader on the other side waits for an end marker after
                # every generate command, so always answer with one.
                log(f"ERROR: 'text' must be a string, got {type(text).__name__}")
                write_end_marker()
                continue
            interrupted.clear()
            handle_generate(model, text, voice, interrupted)
        elif name == "quit":
            break
        else:
            log(f"ERROR: Unknown command: {name}")

    log("Shutting down")
120
+
121
+
122
def handle_generate(model, text: str, voice: str, interrupted: threading.Event):
    """
    Generate audio for the given text and write PCM chunks to stdout.

    Exactly one end marker is written per call, including for empty text and
    after a generation error, so the reader can always find the frame boundary.

    @param model - The loaded mlx-audio TTS model
    @param text - Text to synthesize
    @param voice - Voice ID (e.g. "af_heart")
    @param interrupted - Event flag set when generation should stop
    """
    if not text.strip():
        write_end_marker()
        return

    try:
        results = model.generate(text=text, voice=voice, stream=True)

        for result in results:
            if interrupted.is_set():
                break

            # np.asarray avoids a copy when possible but still copies when it
            # has to; np.array(..., copy=False) raises in that case on NumPy 2.
            audio = np.asarray(result.audio)

            # A zero-length frame is the end-of-generation marker in the wire
            # protocol, so an empty result must never be written as a chunk.
            if audio.size == 0:
                continue

            pcm = float32_to_int16_pcm(audio)
            write_audio_chunk(pcm)

    except Exception as e:
        log(f"ERROR: Generation failed: {e}")

    write_end_marker()
150
+
151
+
152
+ # ============================================================================
153
+ # HELPER FUNCTIONS
154
+ # ============================================================================
155
+
156
def float32_to_int16_pcm(audio: np.ndarray) -> bytes:
    """
    Convert float32 audio samples (-1.0..1.0) to 16-bit signed PCM bytes.

    Out-of-range samples are clamped to [-1.0, 1.0]; NaN samples become
    silence (0). Scaling truncates toward zero.

    @param audio - numpy array of float32 samples
    @returns Raw bytes of int16 little-endian PCM
    """
    # NaN survives np.clip and has no defined int16 value, so zero it first.
    finite = np.nan_to_num(audio, nan=0.0)
    clamped = np.clip(finite, -1.0, 1.0)
    # "<i2" pins the byte order to little-endian regardless of the host CPU.
    int16 = (clamped * 32767).astype("<i2")
    return int16.tobytes()
166
+
167
+
168
def write_audio_chunk(pcm_bytes: bytes):
    """
    Write a length-prefixed audio chunk to stdout.

    The frame is a 4-byte big-endian unsigned length followed by the payload.
    Empty payloads are skipped: a zero length is reserved for the
    end-of-generation marker and would desynchronise the reader.

    @param pcm_bytes - Raw PCM bytes to write
    """
    if not pcm_bytes:
        return

    header = struct.pack(">I", len(pcm_bytes))
    out = sys.stdout.buffer
    out.write(header)
    out.write(pcm_bytes)
    out.flush()
178
+
179
+
180
def write_end_marker():
    """Emit the end-of-generation frame: a 4-byte big-endian length of zero."""
    out = sys.stdout.buffer
    out.write((0).to_bytes(4, byteorder="big"))
    out.flush()
184
+
185
+
186
def log(msg: str):
    """Emit one "[tts-server]"-prefixed line on stderr and flush it immediately."""
    print(f"[tts-server] {msg}", file=sys.stderr, flush=True)
190
+
191
+
192
+ if __name__ == "__main__":
193
+ main()
package/sidecar/tts.ts ADDED
@@ -0,0 +1,481 @@
1
+ /**
2
+ * Local text-to-speech via mlx-audio (Chatterbox Turbo) with VPIO playback.
3
+ *
4
+ * Spawns a persistent Python subprocess (tts-server.py) that loads the TTS model
5
+ * once on the Apple Silicon GPU via MLX, then generates audio on demand. Text is
6
+ * buffered into sentences before being sent to the subprocess. Audio is received
7
+ * as length-prefixed raw PCM and written to the VPIO speaker stream for playback
8
+ * with echo cancellation.
9
+ *
10
+ * Responsibilities:
11
+ * - Spawn and manage the tts-server.py Python subprocess lifecycle
12
+ * - Buffer streaming text deltas into complete sentences for generation
13
+ * - Read length-prefixed PCM audio chunks from the subprocess stdout
14
+ * - Write audio to the VPIO speaker stream (echo cancellation handled by VPIO)
15
+ * - Support interruption via VPIO ring buffer clear
16
+ */
17
+
18
+ import { ChildProcess, spawn } from "child_process";
19
+ import { join, dirname } from "path";
20
+ import { fileURLToPath } from "url";
21
+
22
+ import type { Writable } from "stream";
23
+ import type { TtsConfig, TextChunk } from "./types.js";
24
+
25
+ // ============================================================================
26
+ // INTERFACES
27
+ // ============================================================================
28
+
29
/**
 * Handle returned by createTts: turns text into audio on the speaker stream.
 */
export interface TtsPlayer {
  /**
   * Synthesize a single piece of text and send its audio to the speakers.
   * @param text - Text to be spoken
   * @returns Settles once every PCM chunk has been handed to the speaker stream
   */
  speak(text: string): Promise<void>;

  /**
   * Speak an incoming stream of text, sentence by sentence, so playback of
   * early sentences can start before later text has arrived.
   * @param texts - Async iterable of chunks (plain string = buffered, { text, flush } = spoken immediately)
   * @returns Settles once every chunk has been handed to the speaker stream
   */
  speakStream(texts: AsyncIterable<TextChunk>): Promise<void>;

  /**
   * Stop playback right away and cancel any generation in progress.
   */
  interrupt(): void;

  /**
   * Report whether a speak/speakStream call is currently running.
   * @returns true while such a call is active
   */
  isSpeaking(): boolean;

  /**
   * Shut the player down and terminate the TTS subprocess.
   */
  destroy(): void;
}
65
+
66
+ // ============================================================================
67
+ // CONSTANTS
68
+ // ============================================================================
69
+
70
+ /** TTS output sample rate in Hz (Chatterbox outputs at 24kHz) */
71
+ const TTS_SAMPLE_RATE = 24000;
72
+
73
+ /** Speaker audio configuration */
74
+ const SPEAKER_CHANNELS = 1;
75
+ const SPEAKER_BIT_DEPTH = 16;
76
+
77
+ /** Path to the Python TTS server script */
78
+ const __dirname = dirname(fileURLToPath(import.meta.url));
79
+ const TTS_SERVER_SCRIPT = join(__dirname, "tts-server.py");
80
+
81
+ /** Path to the Python venv binary */
82
+ const PYTHON_BIN = join(__dirname, ".venv", "bin", "python3");
83
+
84
+ /** Timeout for waiting for the Python subprocess to be ready (ms) */
85
+ const READY_TIMEOUT_MS = 120_000;
86
+
87
+ /** Sentence-ending punctuation pattern: .!? followed by at least one whitespace character (punctuation at the very end of the buffer is not matched) */
88
+ const SENTENCE_END_RE = /[.!?][\s]+/;
89
+
90
+ /** Minimum sentence length before we'll split on punctuation */
91
+ const MIN_SENTENCE_LENGTH = 20;
92
+
93
+ // ============================================================================
94
+ // MAIN HANDLERS
95
+ // ============================================================================
96
+
97
/**
 * Initialize the TTS subprocess and create a TtsPlayer instance.
 *
 * Spawns `config.serverCommand` when provided, otherwise the bundled
 * tts-server.py with the configured model and voice, and waits for it to
 * report READY. If the server never becomes ready the subprocess is killed
 * before the error is rethrown.
 *
 * @param config - TTS configuration (model ID, voice, speaker stream, playback callbacks)
 * @returns A TtsPlayer instance ready for playback
 * @throws Error if subprocess fails to start or does not become ready
 */
export async function createTts(config: TtsConfig): Promise<TtsPlayer> {
  const cmd = config.serverCommand ?? [PYTHON_BIN, TTS_SERVER_SCRIPT, config.model, config.voice];

  const proc = spawn(cmd[0], cmd.slice(1), {
    stdio: ["pipe", "pipe", "pipe"],
  });

  try {
    await waitForReady(proc);
  } catch (err) {
    // Don't leave a half-started subprocess behind (e.g. after a READY timeout).
    proc.kill("SIGTERM");
    throw err;
  }

  const { speakerInput, interruptPlayback, resumePlayback } = config;
  let destroyed = false;
  let speaking = false;
  let interruptFlag = false;
  let wasInterrupted = false;
  // True from the moment a generate command is sent until its end marker has
  // been consumed; while true, stdout still holds frames of an old request.
  let midGeneration = false;

  /**
   * Reset per-call state before a new generation is requested.
   * Discards frames left over from a generation that was abandoned (interrupt
   * or a failed speaker write) and resumes playback after an interrupt.
   */
  async function prepareForPlayback(): Promise<void> {
    interruptFlag = false;
    if (midGeneration) {
      await drainStaleChunks(proc);
      midGeneration = false;
    }
    if (wasInterrupted) {
      resumePlayback();
      wasInterrupted = false;
    }
  }

  /**
   * Generate audio for a single text string and play it.
   * @param text - The text to speak
   */
  async function speak(text: string): Promise<void> {
    if (destroyed) throw new Error("TtsPlayer has been destroyed");

    speaking = true;
    try {
      // Inside the try so a failed drain cannot leave `speaking` stuck at true.
      await prepareForPlayback();

      sendCommand(proc, { cmd: "generate", text });
      midGeneration = true;

      let loopBroken = false;
      for await (const pcmBuffer of readPcmChunks(proc)) {
        if (interruptFlag) { loopBroken = true; break; }
        await writePcm(speakerInput, pcmBuffer);
      }
      if (!loopBroken) midGeneration = false;
    } finally {
      speaking = false;
    }
  }

  /**
   * Stream text chunks into TTS for pipelined playback.
   * Buffers text deltas into sentences, generates audio per sentence,
   * and writes PCM to the speaker stream. After the last sentence it waits
   * for the estimated playback time of the written audio to elapse.
   * @param texts - Async iterable of text chunks from the narrator
   */
  async function speakStream(texts: AsyncIterable<TextChunk>): Promise<void> {
    if (destroyed) throw new Error("TtsPlayer has been destroyed");

    const t0 = Date.now();
    let firstTextLogged = false;
    let chunkIndex = 0;
    let playbackFinishAt = 0;

    speaking = true;
    try {
      // Inside the try so a failed drain cannot leave `speaking` stuck at true.
      await prepareForPlayback();

      for await (const sentence of bufferSentences(texts)) {
        if (interruptFlag) break;

        if (!firstTextLogged) {
          console.log(`[tts] first sentence at +${Date.now() - t0}ms: "${sentence.slice(0, 50)}${sentence.length > 50 ? "..." : ""}"`);
          firstTextLogged = true;
        }

        const sentAt = Date.now();
        sendCommand(proc, { cmd: "generate", text: sentence });
        midGeneration = true;

        let loopBroken = false;
        for await (const pcmBuffer of readPcmChunks(proc)) {
          if (interruptFlag) { loopBroken = true; break; }

          const now = Date.now() - t0;
          const audioDurationMs =
            (pcmBuffer.length / (TTS_SAMPLE_RATE * (SPEAKER_BIT_DEPTH / 8) * SPEAKER_CHANNELS)) * 1000;
          const genMs = Date.now() - sentAt;
          console.log(
            `[tts] chunk ${chunkIndex} at +${now}ms (${(audioDurationMs / 1000).toFixed(1)}s audio, generated in ${genMs}ms)`
          );
          chunkIndex++;

          await writePcm(speakerInput, pcmBuffer);

          // Track estimated playback end. If the speaker buffer drained during a
          // gap (e.g. tool call), new audio starts from now, not after previous audio.
          playbackFinishAt = Math.max(playbackFinishAt, Date.now()) + audioDurationMs;
        }

        if (!loopBroken) midGeneration = false;
        if (interruptFlag) break;
      }

      // Wait for buffered audio to finish playing through the speakers
      if (!interruptFlag && playbackFinishAt > 0) {
        const remainingMs = playbackFinishAt - Date.now();
        if (remainingMs > 0) {
          console.log(`[tts] waiting ${(remainingMs / 1000).toFixed(1)}s for playback to finish`);
          await new Promise<void>((resolve) => {
            // Whichever fires first (deadline or interrupt poll) tears down
            // both timers, so nothing is left pending afterwards.
            const finish = (): void => {
              clearTimeout(deadline);
              clearInterval(poll);
              resolve();
            };
            const deadline = setTimeout(finish, remainingMs);
            const poll = setInterval(() => {
              if (interruptFlag) finish();
            }, 50);
          });
        }
      }
    } finally {
      speaking = false;
    }
  }

  /**
   * Interrupt current playback and generation immediately.
   * Invokes the playback-interrupt callback and tells the subprocess to stop generating.
   */
  function interrupt(): void {
    if (destroyed) return;
    interruptFlag = true;
    wasInterrupted = true;
    interruptPlayback();
    sendCommand(proc, { cmd: "interrupt" });
  }

  /**
   * Check whether TTS is currently active.
   */
  function checkIsSpeaking(): boolean {
    return speaking;
  }

  /**
   * Free all resources: stop playback and kill the subprocess.
   */
  function destroyPlayer(): void {
    if (destroyed) return;
    // interrupt() is a no-op once `destroyed` is set, so it must run first.
    interrupt();
    destroyed = true;
    sendCommand(proc, { cmd: "quit" });
    proc.kill("SIGTERM");
  }

  return {
    speak,
    speakStream,
    interrupt,
    isSpeaking: checkIsSpeaking,
    destroy: destroyPlayer,
  };
}
281
+
282
+ // ============================================================================
283
+ // HELPER FUNCTIONS
284
+ // ============================================================================
285
+
286
/**
 * Resolve once the subprocess has written READY to stderr.
 *
 * Everything the subprocess prints on stderr is echoed to the console with a
 * "[tts-server]" prefix, both before and after READY (the READY line itself
 * is not echoed during startup).
 *
 * @param proc - The child process to monitor
 * @throws Error if the subprocess errors, exits, or times out before READY
 */
function waitForReady(proc: ChildProcess): Promise<void> {
  // Echo each non-empty line of `raw`, optionally hiding the READY token.
  const echoLines = (raw: string, hideReady: boolean): void => {
    for (const piece of raw.split("\n")) {
      const line = piece.trim();
      if (!line) continue;
      if (hideReady && line === "READY") continue;
      console.log(`[tts-server] ${line}`);
    }
  };

  return new Promise<void>((resolve, reject) => {
    const timeout = setTimeout(() => {
      reject(new Error(`tts-server.py did not become ready within ${READY_TIMEOUT_MS}ms`));
    }, READY_TIMEOUT_MS);

    // Accumulated startup output, so READY is found even if split across reads.
    let seenSoFar = "";

    function handleStartupOutput(data: Buffer): void {
      const text = data.toString();
      seenSoFar += text;
      echoLines(text, true);

      if (!seenSoFar.includes("READY")) return;

      clearTimeout(timeout);
      proc.stderr!.off("data", handleStartupOutput);
      // Keep forwarding stderr for the rest of the subprocess lifetime.
      proc.stderr!.on("data", (d: Buffer) => echoLines(d.toString(), false));
      resolve();
    }

    proc.stderr!.on("data", handleStartupOutput);

    proc.on("error", (err) => {
      clearTimeout(timeout);
      reject(new Error(`tts-server.py failed to start: ${err.message}`));
    });

    proc.on("exit", (code) => {
      clearTimeout(timeout);
      reject(new Error(`tts-server.py exited with code ${code} before READY`));
    });
  });
}
340
+
341
/**
 * Serialize a command as one JSON line and write it to the subprocess stdin.
 * @param proc - The child process
 * @param cmd - The command object to send
 */
function sendCommand(proc: ChildProcess, cmd: Record<string, unknown>): void {
  const line = `${JSON.stringify(cmd)}\n`;
  proc.stdin!.write(line);
}
349
+
350
/**
 * Throw away whatever PCM frames are still pending on the subprocess stdout.
 * Consumes frames up to and including the next end marker (0-length frame).
 * @param proc - The child process to drain from
 */
async function drainStaleChunks(proc: ChildProcess): Promise<void> {
  const frames = readPcmChunks(proc);
  let step = await frames.next();
  while (!step.done) {
    // Frame content is intentionally ignored.
    step = await frames.next();
  }
}
360
+
361
/**
 * Async generator over the length-prefixed PCM frames on the subprocess stdout.
 * Each frame is a 4-byte big-endian length followed by that many payload bytes;
 * a length of 0 terminates the sequence.
 * @param proc - The child process to read from
 * @yields Buffer holding the payload of one frame
 */
async function* readPcmChunks(proc: ChildProcess): AsyncGenerator<Buffer> {
  const source = proc.stdout!;

  for (;;) {
    const prefix = await readExactly(source, 4);
    const frameLength = prefix.readUInt32BE(0);
    if (frameLength === 0) break;

    yield await readExactly(source, frameLength);
  }
}
380
+
381
/**
 * Read exactly N bytes from a readable stream.
 *
 * Pulls data with `read()` in paused mode and waits on "readable" until
 * enough bytes are available. Rejects if the stream errors, ends, or closes
 * first — including when it had already ended or been destroyed before the
 * call, in which case no further events would ever arrive.
 *
 * @param stream - The readable stream
 * @param size - Number of bytes to read
 * @returns Buffer containing exactly size bytes
 * @throws Error if the stream fails or finishes before size bytes were read
 */
function readExactly(stream: NodeJS.ReadableStream, size: number): Promise<Buffer> {
  const source = stream as NodeJS.ReadableStream & {
    readableEnded?: boolean;
    destroyed?: boolean;
  };

  return new Promise<Buffer>((resolve, reject) => {
    if (size === 0) {
      resolve(Buffer.alloc(0));
      return;
    }

    // A finished stream emits neither "end" nor "readable" again, so waiting
    // on it would never settle.
    if (source.readableEnded || source.destroyed) {
      reject(new Error("Stream ended before reading enough bytes"));
      return;
    }

    const chunks: Buffer[] = [];
    let received = 0;

    // Detach every listener this call may have registered.
    const cleanup = (): void => {
      stream.removeListener("readable", tryRead);
      stream.removeListener("error", onError);
      stream.removeListener("end", onEnd);
      stream.removeListener("close", onEnd);
    };

    const onError = (err: Error): void => {
      cleanup();
      reject(err);
    };

    const onEnd = (): void => {
      cleanup();
      reject(new Error("Stream ended before reading enough bytes"));
    };

    const tryRead = (): void => {
      while (received < size) {
        const chunk = source.read(size - received) as Buffer | null;
        if (chunk === null) {
          // Not enough buffered yet; try again when more data arrives.
          stream.once("readable", tryRead);
          return;
        }
        chunks.push(chunk);
        received += chunk.length;
      }

      cleanup();
      resolve(Buffer.concat(chunks).subarray(0, size));
    };

    stream.once("error", onError);
    stream.once("end", onEnd);
    stream.once("close", onEnd);

    tryRead();
  });
}
426
+
427
/**
 * Hand a PCM buffer to the speaker stream, honouring backpressure.
 * Resolves immediately when the stream accepts the write, otherwise once the
 * stream emits "drain". Rejects if the write callback reports an error before
 * the promise has settled.
 * @param stream - The speaker writable stream
 * @param pcmBuffer - Raw PCM bytes to write
 */
function writePcm(stream: Writable, pcmBuffer: Buffer): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const onWritten = (err: Error | null | undefined): void => {
      if (err) reject(err);
    };

    const accepted = stream.write(pcmBuffer, onWritten);
    if (!accepted) {
      // Internal buffer is full; hold the caller until it empties.
      stream.once("drain", () => resolve());
      return;
    }
    resolve();
  });
}
444
+
445
/**
 * Turn a stream of text deltas into sentence-sized strings for TTS.
 *
 * - Plain string chunks accumulate in a buffer. Whenever the buffer holds a
 *   sentence terminator at or beyond position MIN_SENTENCE_LENGTH - 1, the
 *   text up to and including it is emitted (trimmed).
 * - Non-string chunks bypass the buffer: any buffered text is emitted first,
 *   then the chunk's `text` is emitted as-is.
 * - Whatever remains when the input ends is emitted (trimmed) if non-empty.
 *
 * @param texts - Async iterable of TextChunk from the narrator
 * @yields Complete sentences ready for TTS
 */
async function* bufferSentences(texts: AsyncIterable<TextChunk>): AsyncGenerator<string> {
  // Sentence terminators earlier than this offset are ignored so that very
  // short fragments are not split off on their own.
  const searchFrom = MIN_SENTENCE_LENGTH - 1;
  let pending = "";

  for await (const chunk of texts) {
    if (typeof chunk === "string") {
      pending += chunk;

      while (pending.length >= MIN_SENTENCE_LENGTH) {
        const hit = SENTENCE_END_RE.exec(pending.slice(searchFrom));
        if (!hit) break;

        const cut = searchFrom + hit.index + hit[0].length;
        const sentence = pending.slice(0, cut).trim();
        pending = pending.slice(cut);

        if (sentence) yield sentence;
      }
      continue;
    }

    // Immediate chunk: emit what is buffered first to preserve ordering.
    const buffered = pending.trim();
    if (buffered) {
      yield buffered;
      pending = "";
    }
    yield chunk.text;
  }

  const tail = pending.trim();
  if (tail) yield tail;
}