voicecc 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +6 -0
- package/README.md +48 -0
- package/bin/voicecc.js +39 -0
- package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
- package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
- package/dashboard/dist/audio-processor.js +126 -0
- package/dashboard/dist/index.html +13 -0
- package/dashboard/routes/auth.ts +119 -0
- package/dashboard/routes/browser-call.ts +87 -0
- package/dashboard/routes/claude-md.ts +50 -0
- package/dashboard/routes/conversations.ts +203 -0
- package/dashboard/routes/integrations.ts +154 -0
- package/dashboard/routes/mcp-servers.ts +198 -0
- package/dashboard/routes/settings.ts +64 -0
- package/dashboard/routes/tunnel.ts +66 -0
- package/dashboard/routes/twilio.ts +120 -0
- package/dashboard/routes/voice.ts +48 -0
- package/dashboard/routes/webrtc.ts +85 -0
- package/dashboard/server.ts +130 -0
- package/dashboard/tsconfig.json +13 -0
- package/init/CLAUDE.md +18 -0
- package/package.json +59 -0
- package/run.ts +68 -0
- package/scripts/postinstall.js +228 -0
- package/services/browser-call-manager.ts +106 -0
- package/services/device-pairing.ts +176 -0
- package/services/env.ts +88 -0
- package/services/tunnel.ts +204 -0
- package/services/twilio-manager.ts +126 -0
- package/sidecar/assets/startup.pcm +0 -0
- package/sidecar/audio-adapter.ts +60 -0
- package/sidecar/audio-capture.ts +220 -0
- package/sidecar/browser-audio-playback.test.ts +149 -0
- package/sidecar/browser-audio.ts +147 -0
- package/sidecar/browser-server.ts +331 -0
- package/sidecar/chime.test.ts +69 -0
- package/sidecar/chime.ts +54 -0
- package/sidecar/claude-session.ts +295 -0
- package/sidecar/endpointing.ts +163 -0
- package/sidecar/index.ts +83 -0
- package/sidecar/local-audio.ts +126 -0
- package/sidecar/mic-vpio +0 -0
- package/sidecar/mic-vpio.swift +484 -0
- package/sidecar/mock-tts-server-tagged.mjs +132 -0
- package/sidecar/narration.ts +204 -0
- package/sidecar/scripts/generate-startup-audio.py +79 -0
- package/sidecar/session-lock.ts +123 -0
- package/sidecar/sherpa-onnx-node.d.ts +4 -0
- package/sidecar/stt.ts +199 -0
- package/sidecar/tts-server.py +193 -0
- package/sidecar/tts.ts +481 -0
- package/sidecar/twilio-audio.ts +338 -0
- package/sidecar/twilio-server.ts +436 -0
- package/sidecar/types.ts +210 -0
- package/sidecar/vad.ts +101 -0
- package/sidecar/voice-loop-bugs.test.ts +522 -0
- package/sidecar/voice-session.ts +523 -0
- package/skills/voice/SKILL.md +26 -0
- package/tsconfig.json +22 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Persistent Python TTS subprocess for the voice sidecar.
|
|
3
|
+
|
|
4
|
+
Loads a Kokoro (or other mlx-audio) model once on the Apple Silicon GPU,
|
|
5
|
+
then accepts JSON commands on stdin and writes length-prefixed raw PCM
|
|
6
|
+
audio to stdout.
|
|
7
|
+
|
|
8
|
+
Responsibilities:
|
|
9
|
+
- Load the TTS model on startup via mlx-audio
|
|
10
|
+
- Accept generate/interrupt/quit commands on stdin (JSON lines)
|
|
11
|
+
- Stream raw 16-bit signed PCM audio chunks to stdout (length-prefixed)
|
|
12
|
+
- Support interruption of in-progress generation
|
|
13
|
+
|
|
14
|
+
Protocol:
|
|
15
|
+
stdin (JSON lines):
|
|
16
|
+
{"cmd": "generate", "text": "Hello world"}
|
|
17
|
+
{"cmd": "interrupt"}
|
|
18
|
+
{"cmd": "quit"}
|
|
19
|
+
|
|
20
|
+
stdout (binary, length-prefixed):
|
|
21
|
+
[4 bytes uint32 BE = chunk length] [N bytes raw int16 PCM at 24kHz mono]
|
|
22
|
+
[4 bytes 0x00000000] = end of generation
|
|
23
|
+
|
|
24
|
+
stderr (text lines):
|
|
25
|
+
READY
|
|
26
|
+
ERROR: <message>
|
|
27
|
+
(plus any log output)
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import sys
|
|
31
|
+
import json
|
|
32
|
+
import struct
|
|
33
|
+
import signal
|
|
34
|
+
import threading
|
|
35
|
+
import queue
|
|
36
|
+
import numpy as np
|
|
37
|
+
|
|
38
|
+
# ============================================================================
# CONSTANTS
# ============================================================================

# PCM sample rate in Hz of the audio this server emits, per the stdout
# protocol described in the module docstring. NOTE(review): not referenced
# elsewhere in this script — kept as documentation of the output format.
SAMPLE_RATE = 24000
# HuggingFace model ID loaded when no model argument is given on argv.
DEFAULT_MODEL = "prince-canuma/Kokoro-82M"
# Voice ID passed to model.generate() when no voice argument is given on argv.
DEFAULT_VOICE = "af_heart"
|
|
45
|
+
|
|
46
|
+
# ============================================================================
|
|
47
|
+
# MAIN HANDLERS
|
|
48
|
+
# ============================================================================
|
|
49
|
+
|
|
50
|
+
def main():
    """Load the TTS model, warm it up, then run the stdin command loop.

    argv[1] (optional) overrides the model ID; argv[2] (optional) overrides
    the voice. Exits with status 1 if the model fails to load. Writes the
    literal line "READY" to stderr once the model is loaded and warmed up,
    which the parent process waits for before sending commands.
    """
    model_id = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_MODEL
    voice = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_VOICE

    # Load model. The import is done lazily here so import errors surface as
    # a logged ERROR + exit(1) rather than an unformatted traceback.
    log(f"Loading model: {model_id}")
    try:
        from mlx_audio.tts.utils import load_model
        model = load_model(model_id)
    except Exception as e:
        log(f"ERROR: Failed to load model: {e}")
        sys.exit(1)

    log(f"Model loaded (sample_rate={model.sample_rate})")

    # Warm-up: run one short generation to prime the GPU pipeline.
    # Failure here is non-fatal — the server still starts, just slower on
    # the first real generation.
    log("Warming up...")
    try:
        for _ in model.generate(text="Hello.", voice=voice):
            pass
        log("Warm-up done")
    except Exception as e:
        log(f"WARNING: Warm-up failed: {e}")

    # Signal readiness. Must come after warm-up so the parent's READY wait
    # covers the full startup cost.
    sys.stderr.write("READY\n")
    sys.stderr.flush()

    # State shared between stdin reader thread and main thread.
    # `interrupted` is the only cross-thread signal; Queue is thread-safe.
    interrupted = threading.Event()
    command_queue = queue.Queue()

    # Ignore SIGINT — let the parent Node.js process handle it
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Read stdin on a background thread so interrupt commands are processed
    # immediately, even while handle_generate is running on the main thread.
    def stdin_reader():
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            try:
                cmd = json.loads(line)
            except json.JSONDecodeError as e:
                log(f"ERROR: Invalid JSON: {e}")
                continue

            # "interrupt" bypasses the queue: it must take effect mid-generation.
            if cmd.get("cmd") == "interrupt":
                interrupted.set()
            else:
                command_queue.put(cmd)

    reader = threading.Thread(target=stdin_reader, daemon=True)
    reader.start()

    # Main thread: process generate/quit commands from the queue.
    # Commands are handled strictly one at a time; a "generate" blocks the
    # loop until handle_generate returns (interruptible via the Event).
    while True:
        cmd = command_queue.get()

        if cmd.get("cmd") == "generate":
            # Clear any stale interrupt from a previous generation before starting.
            interrupted.clear()
            handle_generate(model, cmd.get("text", ""), voice, interrupted)
        elif cmd.get("cmd") == "quit":
            break
        else:
            log(f"ERROR: Unknown command: {cmd.get('cmd')}")

    log("Shutting down")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def handle_generate(model, text: str, voice: str, interrupted: threading.Event):
    """
    Synthesize the given text and stream PCM chunks to stdout.

    Empty/whitespace-only text produces only the end marker. A generation
    error is logged but still terminated with an end marker, so the parent
    reader never hangs waiting for one.

    @param model - The loaded mlx-audio TTS model
    @param text - Text to synthesize
    @param voice - Voice ID (e.g. "af_heart")
    @param interrupted - Event flag set when generation should stop
    """
    if not text.strip():
        write_end_marker()
        return

    try:
        # Check the interrupt flag between chunks; an interrupt cannot stop
        # a chunk that is already being generated.
        for result in model.generate(text=text, voice=voice, stream=True):
            if interrupted.is_set():
                break
            samples = np.array(result.audio, copy=False)
            write_audio_chunk(float32_to_int16_pcm(samples))
    except Exception as e:
        log(f"ERROR: Generation failed: {e}")

    # Always emit the end marker, even after interruption or failure.
    write_end_marker()
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ============================================================================
|
|
153
|
+
# HELPER FUNCTIONS
|
|
154
|
+
# ============================================================================
|
|
155
|
+
|
|
156
|
+
def float32_to_int16_pcm(audio: np.ndarray) -> bytes:
    """
    Convert float32 samples in [-1.0, 1.0] to raw 16-bit signed PCM.

    Out-of-range samples are clipped rather than wrapped, so loud
    model output cannot produce overflow artifacts.

    @param audio - numpy array of float32 samples
    @returns Raw bytes of int16 little-endian PCM
    """
    scaled = np.clip(audio, -1.0, 1.0) * 32767
    return scaled.astype(np.int16).tobytes()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def write_audio_chunk(pcm_bytes: bytes):
    """
    Write one length-prefixed audio frame to stdout.

    Frame layout: 4-byte big-endian uint32 length, then the raw PCM
    payload. Flushed immediately so the parent can start playback
    without waiting for the next frame.

    @param pcm_bytes - Raw PCM bytes to write
    """
    out = sys.stdout.buffer
    out.write(struct.pack(">I", len(pcm_bytes)))
    out.write(pcm_bytes)
    out.flush()
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def write_end_marker():
    """Signal end of a generation by emitting a zero-length frame header."""
    sys.stdout.buffer.write((0).to_bytes(4, "big"))
    sys.stdout.buffer.flush()
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def log(msg: str):
    """Emit a tagged diagnostic line on stderr, flushed immediately."""
    print(f"[tts-server] {msg}", file=sys.stderr, flush=True)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# Entry point when launched directly as the sidecar's TTS subprocess.
if __name__ == "__main__":
    main()
|
package/sidecar/tts.ts
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local text-to-speech via mlx-audio (Chatterbox Turbo) with VPIO playback.
|
|
3
|
+
*
|
|
4
|
+
* Spawns a persistent Python subprocess (tts-server.py) that loads the TTS model
|
|
5
|
+
* once on the Apple Silicon GPU via MLX, then generates audio on demand. Text is
|
|
6
|
+
* buffered into sentences before being sent to the subprocess. Audio is received
|
|
7
|
+
* as length-prefixed raw PCM and written to the VPIO speaker stream for playback
|
|
8
|
+
* with echo cancellation.
|
|
9
|
+
*
|
|
10
|
+
* Responsibilities:
|
|
11
|
+
* - Spawn and manage the tts-server.py Python subprocess lifecycle
|
|
12
|
+
* - Buffer streaming text deltas into complete sentences for generation
|
|
13
|
+
* - Read length-prefixed PCM audio chunks from the subprocess stdout
|
|
14
|
+
* - Write audio to the VPIO speaker stream (echo cancellation handled by VPIO)
|
|
15
|
+
* - Support interruption via VPIO ring buffer clear
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { ChildProcess, spawn } from "child_process";
|
|
19
|
+
import { join, dirname } from "path";
|
|
20
|
+
import { fileURLToPath } from "url";
|
|
21
|
+
|
|
22
|
+
import type { Writable } from "stream";
|
|
23
|
+
import type { TtsConfig, TextChunk } from "./types.js";
|
|
24
|
+
|
|
25
|
+
// ============================================================================
|
|
26
|
+
// INTERFACES
|
|
27
|
+
// ============================================================================
|
|
28
|
+
|
|
29
|
+
/**
 * TTS player instance that converts text to spoken audio output.
 * Created by {@link createTts}; backed by a persistent Python subprocess.
 */
export interface TtsPlayer {
  /**
   * Convert text to audio and play it through the speakers.
   * @param text - The text to speak
   * @returns Resolves when all PCM has been sent to the speaker stream
   * (not when playback has audibly finished)
   */
  speak(text: string): Promise<void>;

  /**
   * Stream text chunks into TTS for incremental playback.
   * First audio plays while later chunks are still generating.
   * @param texts - Async iterable of text chunks (plain string = buffer, { text, flush } = immediate)
   * @returns Resolves when all chunks have been sent to the speaker stream
   */
  speakStream(texts: AsyncIterable<TextChunk>): Promise<void>;

  /**
   * Interrupt current playback immediately.
   * Clears the VPIO ring buffer and cancels in-progress generation.
   */
  interrupt(): void;

  /**
   * Check whether TTS is currently generating and playing audio.
   * @returns true if a speak/speakStream call is active
   */
  isSpeaking(): boolean;

  /**
   * Free all TTS model resources and kill the subprocess.
   * The player must not be used after this call.
   */
  destroy(): void;
}
|
|
65
|
+
|
|
66
|
+
// ============================================================================
// CONSTANTS
// ============================================================================

/** TTS output sample rate in Hz (must match SAMPLE_RATE in tts-server.py) */
const TTS_SAMPLE_RATE = 24000;

/** Speaker audio configuration: mono, 16-bit PCM */
const SPEAKER_CHANNELS = 1;
const SPEAKER_BIT_DEPTH = 16;

/** Path to the Python TTS server script, resolved relative to this module */
const __dirname = dirname(fileURLToPath(import.meta.url));
const TTS_SERVER_SCRIPT = join(__dirname, "tts-server.py");

/** Path to the Python venv binary (venv lives alongside this module) */
const PYTHON_BIN = join(__dirname, ".venv", "bin", "python3");

/**
 * Timeout for waiting for the Python subprocess to be ready (ms).
 * Generous because the first run downloads the model from HuggingFace.
 */
const READY_TIMEOUT_MS = 120_000;

/** Sentence-ending punctuation pattern: .!? followed by whitespace */
const SENTENCE_END_RE = /[.!?][\s]+/;

/** Minimum sentence length before we'll split on punctuation */
const MIN_SENTENCE_LENGTH = 20;
|
|
92
|
+
|
|
93
|
+
// ============================================================================
|
|
94
|
+
// MAIN HANDLERS
|
|
95
|
+
// ============================================================================
|
|
96
|
+
|
|
97
|
+
/**
 * Initialize the mlx-audio TTS subprocess and create a TtsPlayer instance.
 *
 * Spawns tts-server.py which loads the model on the Apple Silicon GPU.
 * First run downloads the model from HuggingFace (~3GB for fp16).
 *
 * State machine notes: `interruptFlag` stops the current read loop,
 * `wasInterrupted` defers cleanup (draining stale PCM, resuming playback)
 * to the NEXT speak/speakStream call, and `midGeneration` tracks whether
 * the subprocess still owes us chunks up to an end marker.
 *
 * @param config - TTS configuration (model ID, voice, speaker stream, interrupt callback)
 * @returns A TtsPlayer instance ready for playback
 * @throws Error if subprocess fails to start or model fails to load
 */
export async function createTts(config: TtsConfig): Promise<TtsPlayer> {
  // NOTE(review): assumes serverCommand, when provided, is non-empty — confirm at call sites.
  const cmd = config.serverCommand ?? [PYTHON_BIN, TTS_SERVER_SCRIPT, config.model, config.voice];

  const proc = spawn(cmd[0], cmd.slice(1), {
    stdio: ["pipe", "pipe", "pipe"],
  });

  // Blocks until the subprocess prints READY on stderr (model loaded + warmed up).
  await waitForReady(proc);

  const { speakerInput, interruptPlayback, resumePlayback } = config;
  let destroyed = false;
  let speaking = false;
  let interruptFlag = false;    // set by interrupt(); checked between chunks
  let wasInterrupted = false;   // deferred-cleanup flag for the next speak call
  let midGeneration = false;    // subprocess still streaming chunks for a pending generate

  /**
   * Generate audio for a single text string and play it.
   * @param text - The text to speak
   */
  async function speak(text: string): Promise<void> {
    if (destroyed) throw new Error("TtsPlayer has been destroyed");

    interruptFlag = false;
    speaking = true;
    // A previous interrupt may have left stale PCM in the pipe; drain it
    // before issuing a new generate so frames don't interleave.
    if (wasInterrupted) {
      if (midGeneration) {
        await drainStaleChunks(proc);
        midGeneration = false;
      }
      resumePlayback();
      wasInterrupted = false;
    }

    sendCommand(proc, { cmd: "generate", text });
    midGeneration = true;

    try {
      let loopBroken = false;
      for await (const pcmBuffer of readPcmChunks(proc)) {
        if (interruptFlag) { loopBroken = true; break; }
        await writePcm(speakerInput, pcmBuffer);
      }
      // Only clear midGeneration when we consumed through the end marker.
      if (!loopBroken) midGeneration = false;
    } finally {
      speaking = false;
    }
  }

  /**
   * Stream text chunks into TTS for pipelined playback.
   * Buffers text deltas into sentences, generates audio per sentence,
   * and writes PCM to the VPIO speaker stream.
   * @param texts - Async iterable of text chunks from the narrator
   */
  async function speakStream(texts: AsyncIterable<TextChunk>): Promise<void> {
    if (destroyed) throw new Error("TtsPlayer has been destroyed");

    const t0 = Date.now();
    let firstTextLogged = false;
    let chunkIndex = 0;
    let playbackFinishAt = 0;

    interruptFlag = false;
    speaking = true;
    // Same deferred-interrupt cleanup as speak() — see comment there.
    if (wasInterrupted) {
      if (midGeneration) {
        await drainStaleChunks(proc);
        midGeneration = false;
      }
      resumePlayback();
      wasInterrupted = false;
    }

    try {
      for await (const sentence of bufferSentences(texts)) {
        if (interruptFlag) break;

        if (!firstTextLogged) {
          console.log(`[tts] first sentence at +${Date.now() - t0}ms: "${sentence.slice(0, 50)}${sentence.length > 50 ? "..." : ""}"`);
          firstTextLogged = true;
        }

        const sentAt = Date.now();
        sendCommand(proc, { cmd: "generate", text: sentence });
        midGeneration = true;

        let loopBroken = false;
        for await (const pcmBuffer of readPcmChunks(proc)) {
          if (interruptFlag) { loopBroken = true; break; }

          const now = Date.now() - t0;
          // Duration implied by the chunk size at 24kHz mono 16-bit.
          const audioDurationMs =
            (pcmBuffer.length / (TTS_SAMPLE_RATE * (SPEAKER_BIT_DEPTH / 8) * SPEAKER_CHANNELS)) * 1000;
          const genMs = Date.now() - sentAt;
          console.log(
            `[tts] chunk ${chunkIndex} at +${now}ms (${(audioDurationMs / 1000).toFixed(1)}s audio, generated in ${genMs}ms)`
          );
          chunkIndex++;

          await writePcm(speakerInput, pcmBuffer);

          // Track estimated playback end. If the speaker buffer drained during a
          // gap (e.g. tool call), new audio starts from now, not after previous audio.
          playbackFinishAt = Math.max(playbackFinishAt, Date.now()) + audioDurationMs;
        }

        if (!loopBroken) midGeneration = false;
        if (interruptFlag) break;
      }

      // Wait for buffered audio to finish playing through the speakers.
      // This is an estimate based on chunk durations, not a hardware callback.
      if (!interruptFlag && playbackFinishAt > 0) {
        const remainingMs = playbackFinishAt - Date.now();
        if (remainingMs > 0) {
          console.log(`[tts] waiting ${(remainingMs / 1000).toFixed(1)}s for playback to finish`);
          await new Promise<void>((resolve) => {
            const timer = setTimeout(resolve, remainingMs);
            // Poll every 50ms so interrupt() can cancel the wait early.
            const check = setInterval(() => {
              if (interruptFlag) {
                clearTimeout(timer);
                clearInterval(check);
                resolve();
              }
            }, 50);
            // Clean up interval when timer fires naturally
            setTimeout(() => clearInterval(check), remainingMs + 100);
          });
        }
      }
    } finally {
      speaking = false;
    }
  }

  /**
   * Interrupt current playback and generation immediately.
   * Clears the VPIO ring buffer and cancels TTS generation.
   */
  function interrupt(): void {
    if (destroyed) return;
    interruptFlag = true;
    wasInterrupted = true;
    interruptPlayback();
    sendCommand(proc, { cmd: "interrupt" });
  }

  /**
   * Check whether TTS is currently active.
   */
  function checkIsSpeaking(): boolean {
    return speaking;
  }

  /**
   * Free all resources: interrupt playback, ask the subprocess to quit,
   * then SIGTERM it for good measure.
   */
  function destroyPlayer(): void {
    if (destroyed) return;
    destroyed = true;
    interrupt();
    sendCommand(proc, { cmd: "quit" });
    proc.kill("SIGTERM");
  }

  return {
    speak,
    speakStream,
    interrupt,
    isSpeaking: checkIsSpeaking,
    destroy: destroyPlayer,
  };
}
|
|
281
|
+
|
|
282
|
+
// ============================================================================
|
|
283
|
+
// HELPER FUNCTIONS
|
|
284
|
+
// ============================================================================
|
|
285
|
+
|
|
286
|
+
/**
 * Wait for the Python subprocess to print READY on stderr.
 * All stderr output before (and after) READY is forwarded to the console
 * so model-download progress is visible.
 * @param proc - The child process to monitor
 * @throws Error if the subprocess exits or times out before READY
 */
function waitForReady(proc: ChildProcess): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const timeout = setTimeout(() => {
      reject(new Error(`tts-server.py did not become ready within ${READY_TIMEOUT_MS}ms`));
    }, READY_TIMEOUT_MS);

    // Accumulate stderr so READY is detected even if split across chunks.
    let stderrBuffer = "";

    const onData = (data: Buffer) => {
      const text = data.toString();
      stderrBuffer += text;

      // Log all stderr output (model download progress, etc.)
      for (const line of text.split("\n")) {
        const trimmed = line.trim();
        if (trimmed && trimmed !== "READY") {
          console.log(`[tts-server] ${trimmed}`);
        }
      }

      if (stderrBuffer.includes("READY")) {
        clearTimeout(timeout);
        proc.stderr!.off("data", onData);

        // Continue logging stderr after READY
        proc.stderr!.on("data", (d: Buffer) => {
          for (const line of d.toString().split("\n")) {
            const trimmed = line.trim();
            if (trimmed) console.log(`[tts-server] ${trimmed}`);
          }
        });

        resolve();
      }
    };

    proc.stderr!.on("data", onData);

    // The error/exit handlers stay attached after resolve; a later reject on
    // the already-settled promise is a no-op per Promise semantics.
    proc.on("error", (err) => {
      clearTimeout(timeout);
      reject(new Error(`tts-server.py failed to start: ${err.message}`));
    });

    proc.on("exit", (code) => {
      clearTimeout(timeout);
      reject(new Error(`tts-server.py exited with code ${code} before READY`));
    });
  });
}
|
|
340
|
+
|
|
341
|
+
/**
|
|
342
|
+
* Send a JSON command to the Python subprocess stdin.
|
|
343
|
+
* @param proc - The child process
|
|
344
|
+
* @param cmd - The command object to send
|
|
345
|
+
*/
|
|
346
|
+
function sendCommand(proc: ChildProcess, cmd: Record<string, unknown>): void {
|
|
347
|
+
proc.stdin!.write(JSON.stringify(cmd) + "\n");
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
/**
|
|
351
|
+
* Drain stale PCM data from the subprocess stdout after an interruption.
|
|
352
|
+
* Reads and discards remaining chunks until the end marker (0-length frame).
|
|
353
|
+
* @param proc - The child process to drain from
|
|
354
|
+
*/
|
|
355
|
+
async function drainStaleChunks(proc: ChildProcess): Promise<void> {
|
|
356
|
+
for await (const _chunk of readPcmChunks(proc)) {
|
|
357
|
+
// Discard stale chunks until end marker
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
/**
|
|
362
|
+
* Async generator that reads length-prefixed PCM chunks from the subprocess stdout.
|
|
363
|
+
* Yields Buffer objects until a 0-length end marker is received.
|
|
364
|
+
* @param proc - The child process to read from
|
|
365
|
+
* @yields Buffer of raw 16-bit signed PCM audio
|
|
366
|
+
*/
|
|
367
|
+
async function* readPcmChunks(proc: ChildProcess): AsyncGenerator<Buffer> {
|
|
368
|
+
const stdout = proc.stdout!;
|
|
369
|
+
|
|
370
|
+
while (true) {
|
|
371
|
+
const header = await readExactly(stdout, 4);
|
|
372
|
+
const length = header.readUInt32BE(0);
|
|
373
|
+
|
|
374
|
+
if (length === 0) return;
|
|
375
|
+
|
|
376
|
+
const pcmData = await readExactly(stdout, length);
|
|
377
|
+
yield pcmData;
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
/**
|
|
382
|
+
* Read exactly N bytes from a readable stream.
|
|
383
|
+
* @param stream - The readable stream
|
|
384
|
+
* @param size - Number of bytes to read
|
|
385
|
+
* @returns Buffer containing exactly size bytes
|
|
386
|
+
*/
|
|
387
|
+
function readExactly(stream: NodeJS.ReadableStream, size: number): Promise<Buffer> {
|
|
388
|
+
return new Promise<Buffer>((resolve, reject) => {
|
|
389
|
+
const chunks: Buffer[] = [];
|
|
390
|
+
let received = 0;
|
|
391
|
+
|
|
392
|
+
const onError = (err: Error) => {
|
|
393
|
+
stream.removeListener("end", onEnd);
|
|
394
|
+
reject(err);
|
|
395
|
+
};
|
|
396
|
+
|
|
397
|
+
const onEnd = () => {
|
|
398
|
+
stream.removeListener("error", onError);
|
|
399
|
+
reject(new Error("Stream ended before reading enough bytes"));
|
|
400
|
+
};
|
|
401
|
+
|
|
402
|
+
const tryRead = () => {
|
|
403
|
+
while (received < size) {
|
|
404
|
+
const remaining = size - received;
|
|
405
|
+
const chunk = (stream as any).read(remaining) as Buffer | null;
|
|
406
|
+
if (chunk === null) {
|
|
407
|
+
stream.once("readable", tryRead);
|
|
408
|
+
return;
|
|
409
|
+
}
|
|
410
|
+
chunks.push(chunk);
|
|
411
|
+
received += chunk.length;
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
stream.removeListener("error", onError);
|
|
415
|
+
stream.removeListener("end", onEnd);
|
|
416
|
+
const result = Buffer.concat(chunks);
|
|
417
|
+
resolve(result.subarray(0, size));
|
|
418
|
+
};
|
|
419
|
+
|
|
420
|
+
stream.once("error", onError);
|
|
421
|
+
stream.once("end", onEnd);
|
|
422
|
+
|
|
423
|
+
tryRead();
|
|
424
|
+
});
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
/**
|
|
428
|
+
* Write a PCM buffer to the speaker stream, respecting backpressure.
|
|
429
|
+
* @param stream - The VPIO speaker writable stream
|
|
430
|
+
* @param pcmBuffer - Raw PCM bytes to write
|
|
431
|
+
*/
|
|
432
|
+
function writePcm(stream: Writable, pcmBuffer: Buffer): Promise<void> {
|
|
433
|
+
return new Promise<void>((resolve, reject) => {
|
|
434
|
+
const ok = stream.write(pcmBuffer, (err: Error | null | undefined) => {
|
|
435
|
+
if (err) reject(err);
|
|
436
|
+
});
|
|
437
|
+
if (ok) {
|
|
438
|
+
resolve();
|
|
439
|
+
} else {
|
|
440
|
+
stream.once("drain", () => resolve());
|
|
441
|
+
}
|
|
442
|
+
});
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
/**
 * Buffer streaming text deltas into complete sentences for TTS generation.
 * Chunks tagged with { flush: true } are yielded immediately (e.g. tool narration),
 * after first flushing any buffered text so ordering is preserved.
 * Plain string chunks are buffered and split on sentence-ending punctuation.
 * @param texts - Async iterable of TextChunk from the narrator
 * @yields Complete sentences ready for TTS
 */
async function* bufferSentences(texts: AsyncIterable<TextChunk>): AsyncGenerator<string> {
  let buffer = "";

  for await (const raw of texts) {
    if (typeof raw !== "string") {
      // Non-string chunk = { text, flush }: emit buffered text first, then the
      // chunk itself, bypassing sentence splitting entirely.
      if (buffer.trim()) {
        yield buffer.trim();
        buffer = "";
      }
      yield raw.text;
      continue;
    }

    buffer += raw;

    while (buffer.length >= MIN_SENTENCE_LENGTH) {
      // Search for sentence-ending punctuation only from position
      // MIN_SENTENCE_LENGTH-1 onward, so tiny fragments like "Dr. " at the
      // start of the buffer don't trigger a split.
      const match = SENTENCE_END_RE.exec(buffer.slice(MIN_SENTENCE_LENGTH - 1));
      if (!match) break;

      // Map the match index back into buffer coordinates; the split point is
      // just past the punctuation + trailing whitespace.
      const splitIndex = MIN_SENTENCE_LENGTH - 1 + match.index + match[0].length;
      const sentence = buffer.slice(0, splitIndex).trim();
      buffer = buffer.slice(splitIndex);

      if (sentence) yield sentence;
    }
  }

  // Flush whatever remains when the input stream ends.
  const remaining = buffer.trim();
  if (remaining) yield remaining;
}
|