voicecc 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +6 -0
- package/README.md +48 -0
- package/bin/voicecc.js +39 -0
- package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
- package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
- package/dashboard/dist/audio-processor.js +126 -0
- package/dashboard/dist/index.html +13 -0
- package/dashboard/routes/auth.ts +119 -0
- package/dashboard/routes/browser-call.ts +87 -0
- package/dashboard/routes/claude-md.ts +50 -0
- package/dashboard/routes/conversations.ts +203 -0
- package/dashboard/routes/integrations.ts +154 -0
- package/dashboard/routes/mcp-servers.ts +198 -0
- package/dashboard/routes/settings.ts +64 -0
- package/dashboard/routes/tunnel.ts +66 -0
- package/dashboard/routes/twilio.ts +120 -0
- package/dashboard/routes/voice.ts +48 -0
- package/dashboard/routes/webrtc.ts +85 -0
- package/dashboard/server.ts +130 -0
- package/dashboard/tsconfig.json +13 -0
- package/init/CLAUDE.md +18 -0
- package/package.json +59 -0
- package/run.ts +68 -0
- package/scripts/postinstall.js +228 -0
- package/services/browser-call-manager.ts +106 -0
- package/services/device-pairing.ts +176 -0
- package/services/env.ts +88 -0
- package/services/tunnel.ts +204 -0
- package/services/twilio-manager.ts +126 -0
- package/sidecar/assets/startup.pcm +0 -0
- package/sidecar/audio-adapter.ts +60 -0
- package/sidecar/audio-capture.ts +220 -0
- package/sidecar/browser-audio-playback.test.ts +149 -0
- package/sidecar/browser-audio.ts +147 -0
- package/sidecar/browser-server.ts +331 -0
- package/sidecar/chime.test.ts +69 -0
- package/sidecar/chime.ts +54 -0
- package/sidecar/claude-session.ts +295 -0
- package/sidecar/endpointing.ts +163 -0
- package/sidecar/index.ts +83 -0
- package/sidecar/local-audio.ts +126 -0
- package/sidecar/mic-vpio +0 -0
- package/sidecar/mic-vpio.swift +484 -0
- package/sidecar/mock-tts-server-tagged.mjs +132 -0
- package/sidecar/narration.ts +204 -0
- package/sidecar/scripts/generate-startup-audio.py +79 -0
- package/sidecar/session-lock.ts +123 -0
- package/sidecar/sherpa-onnx-node.d.ts +4 -0
- package/sidecar/stt.ts +199 -0
- package/sidecar/tts-server.py +193 -0
- package/sidecar/tts.ts +481 -0
- package/sidecar/twilio-audio.ts +338 -0
- package/sidecar/twilio-server.ts +436 -0
- package/sidecar/types.ts +210 -0
- package/sidecar/vad.ts +101 -0
- package/sidecar/voice-loop-bugs.test.ts +522 -0
- package/sidecar/voice-session.ts +523 -0
- package/skills/voice/SKILL.md +26 -0
- package/tsconfig.json +22 -0
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTTP + WebSocket server that accepts Twilio phone calls and creates a
|
|
3
|
+
* voice session per call.
|
|
4
|
+
*
|
|
5
|
+
* Standalone entry point for the Twilio voice call path. Runs as a separate
|
|
6
|
+
* process from the local mic path (index.ts).
|
|
7
|
+
*
|
|
8
|
+
* Responsibilities:
|
|
9
|
+
* - Start HTTP server on TWILIO_PORT for Twilio webhooks
|
|
10
|
+
* - Validate incoming call webhooks via Twilio signature verification
|
|
11
|
+
* - Generate per-call UUID tokens for secure WebSocket upgrade
|
|
12
|
+
* - Accept Twilio media stream WebSocket connections
|
|
13
|
+
* - Create a TwilioAudioAdapter + VoiceSession per call
|
|
14
|
+
* - Enforce global session limit via session locks
|
|
15
|
+
* - Tear down sessions on hangup, stop phrase, or error
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import "dotenv/config";
|
|
19
|
+
|
|
20
|
+
import { randomUUID } from "crypto";
|
|
21
|
+
import { createServer, request as httpRequest } from "http";
|
|
22
|
+
import { homedir } from "os";
|
|
23
|
+
import { join } from "path";
|
|
24
|
+
|
|
25
|
+
import twilio from "twilio";
|
|
26
|
+
import { WebSocketServer } from "ws";
|
|
27
|
+
|
|
28
|
+
import { createTwilioAudioAdapter } from "./twilio-audio.js";
|
|
29
|
+
import { createVoiceSession } from "./voice-session.js";
|
|
30
|
+
|
|
31
|
+
import type { IncomingMessage, ServerResponse } from "http";
|
|
32
|
+
import type { Duplex } from "stream";
|
|
33
|
+
import type { WebSocket } from "ws";
|
|
34
|
+
import type { VoiceSession } from "./voice-session.js";
|
|
35
|
+
|
|
36
|
+
// ============================================================================
|
|
37
|
+
// CONSTANTS
|
|
38
|
+
// ============================================================================
|
|
39
|
+
|
|
40
|
+
/** Default port for the Twilio HTTP/WebSocket server */
const DEFAULT_PORT = 8080;

/** Interruption threshold for phone calls (higher than local mic due to no VPIO echo cancellation) */
const PHONE_INTERRUPTION_THRESHOLD_MS = 2000;

/** Default voice session config for phone calls (same as index.ts DEFAULT_CONFIG but with phone-tuned threshold) */
const DEFAULT_CONFIG = {
  // Saying this phrase ends the voice session.
  stopPhrase: "stop listening",
  // Model locations under the user's home directory (shared with the local mic path).
  sttModelPath: join(homedir(), ".claude-voice-models", "whisper-small"),
  ttsModel: "prince-canuma/Kokoro-82M",
  ttsVoice: "af_heart",
  modelCacheDir: join(homedir(), ".claude-voice-models"),
  interruptionThresholdMs: PHONE_INTERRUPTION_THRESHOLD_MS,
  endpointing: {
    // Silence (ms) before considering the caller's turn complete.
    silenceThresholdMs: 700,
    // Hard cap (ms) before forcing end-of-turn regardless of other signals.
    maxSilenceBeforeTimeoutMs: 1200,
    // Utterances with at least this many words skip the semantic check.
    minWordCountForFastPath: 2,
    enableHaikuFallback: false,
  },
  narration: {
    // Interval (ms) between progress summaries during long tool runs.
    summaryIntervalMs: 12000,
  },
  claudeSession: {
    // Empty list means all tools are allowed (see ClaudeSessionConfig in types.ts).
    allowedTools: [] as string[],
    permissionMode: "bypassPermissions",
    systemPrompt:
      "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.",
  },
};
|
|
70
|
+
|
|
71
|
+
// ============================================================================
|
|
72
|
+
// TYPES
|
|
73
|
+
// ============================================================================
|
|
74
|
+
|
|
75
|
+
/** Tracks an active phone call from Twilio webhook through WebSocket session */
interface ActiveCall {
  /** Twilio call SID (empty string until the WebSocket start event arrives) */
  callSid: string;
  /** Voice session handle (null until WebSocket start event creates it) */
  session: VoiceSession | null;
}
|
|
82
|
+
|
|
83
|
+
// ============================================================================
|
|
84
|
+
// STATE
|
|
85
|
+
// ============================================================================
|
|
86
|
+
|
|
87
|
+
/**
 * Active calls keyed by per-call UUID token. Entries are created by the
 * incoming-call webhook and removed when the call session is cleaned up.
 */
const activeCalls = new Map<string, ActiveCall>();
|
|
89
|
+
|
|
90
|
+
// ============================================================================
|
|
91
|
+
// MAIN ENTRYPOINT
|
|
92
|
+
// ============================================================================
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* Start the Twilio HTTP + WebSocket server.
|
|
96
|
+
*
|
|
97
|
+
* Loads configuration from .env via dotenv. Throws immediately if required
|
|
98
|
+
* env vars (TWILIO_AUTH_TOKEN, TWILIO_WEBHOOK_URL) are missing.
|
|
99
|
+
* Creates an HTTP server for the /twilio/incoming-call webhook and a
|
|
100
|
+
* WebSocket server for Twilio media stream connections.
|
|
101
|
+
*
|
|
102
|
+
* @returns Resolves when the server is listening
|
|
103
|
+
* @throws Error if TWILIO_AUTH_TOKEN or TWILIO_WEBHOOK_URL are not set
|
|
104
|
+
*/
|
|
105
|
+
async function startTwilioServer(): Promise<void> {
|
|
106
|
+
const authToken = process.env.TWILIO_AUTH_TOKEN;
|
|
107
|
+
const webhookUrl = process.env.TWILIO_WEBHOOK_URL;
|
|
108
|
+
const port = parseInt(process.env.TWILIO_PORT ?? "", 10) || DEFAULT_PORT;
|
|
109
|
+
|
|
110
|
+
if (!authToken) {
|
|
111
|
+
throw new Error("TWILIO_AUTH_TOKEN is required in .env");
|
|
112
|
+
}
|
|
113
|
+
if (!webhookUrl) {
|
|
114
|
+
throw new Error("TWILIO_WEBHOOK_URL is required in .env");
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// Extract the host from the webhook URL for TwiML WebSocket URLs
|
|
118
|
+
const webhookHost = new URL(webhookUrl).host;
|
|
119
|
+
|
|
120
|
+
const dashboardPort = parseInt(process.env.DASHBOARD_PORT ?? "", 10);
|
|
121
|
+
|
|
122
|
+
// Create HTTP server
|
|
123
|
+
const server = createServer((req, res) => {
|
|
124
|
+
if (req.method === "POST" && req.url === "/twilio/incoming-call") {
|
|
125
|
+
handleIncomingCall(req, res, authToken, webhookUrl, webhookHost);
|
|
126
|
+
return;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Proxy all other requests to the dashboard server
|
|
130
|
+
if (dashboardPort) {
|
|
131
|
+
proxyToDashboard(req, res, dashboardPort);
|
|
132
|
+
return;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
res.writeHead(404, { "Content-Type": "text/plain" });
|
|
136
|
+
res.end("Not Found");
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
// Create WebSocket server (no automatic HTTP handling -- upgrades only)
|
|
140
|
+
const wss = new WebSocketServer({ noServer: true });
|
|
141
|
+
|
|
142
|
+
// Handle WebSocket upgrade requests
|
|
143
|
+
server.on("upgrade", (req: IncomingMessage, socket: Duplex, head: Buffer) => {
|
|
144
|
+
handleWebSocketUpgrade(req, socket, head, wss);
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
// Start listening
|
|
148
|
+
return new Promise<void>((resolve) => {
|
|
149
|
+
server.listen(port, () => {
|
|
150
|
+
console.log(`Twilio server listening on port ${port}`);
|
|
151
|
+
console.log(`Webhook URL: ${webhookUrl}`);
|
|
152
|
+
resolve();
|
|
153
|
+
});
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// ============================================================================
|
|
158
|
+
// MAIN HANDLERS
|
|
159
|
+
// ============================================================================
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Handle an incoming call webhook from Twilio (POST /twilio/incoming-call).
|
|
163
|
+
*
|
|
164
|
+
* Validates the Twilio request signature, generates a per-call token, and
|
|
165
|
+
* responds with TwiML that tells Twilio to connect a media stream WebSocket.
|
|
166
|
+
*
|
|
167
|
+
* @param req - HTTP request from Twilio
|
|
168
|
+
* @param res - HTTP response to send TwiML back
|
|
169
|
+
* @param authToken - Twilio auth token for signature validation
|
|
170
|
+
* @param webhookUrl - Public webhook URL for signature validation
|
|
171
|
+
* @param webhookHost - Host portion of the webhook URL for WebSocket URLs
|
|
172
|
+
*/
|
|
173
|
+
function handleIncomingCall(
|
|
174
|
+
req: IncomingMessage,
|
|
175
|
+
res: ServerResponse,
|
|
176
|
+
authToken: string,
|
|
177
|
+
webhookUrl: string,
|
|
178
|
+
webhookHost: string,
|
|
179
|
+
): void {
|
|
180
|
+
// Collect the POST body for signature validation
|
|
181
|
+
let body = "";
|
|
182
|
+
req.on("data", (chunk: Buffer) => {
|
|
183
|
+
body += chunk.toString();
|
|
184
|
+
});
|
|
185
|
+
|
|
186
|
+
req.on("end", () => {
|
|
187
|
+
// Parse URL-encoded POST body into key-value params
|
|
188
|
+
const params = parseUrlEncodedBody(body);
|
|
189
|
+
|
|
190
|
+
// Validate Twilio signature (use full URL -- Twilio signs against the complete endpoint URL)
|
|
191
|
+
const validationUrl = webhookUrl.replace(/\/$/, "") + req.url;
|
|
192
|
+
const signature = req.headers["x-twilio-signature"] as string;
|
|
193
|
+
if (!signature || !twilio.validateRequest(authToken, signature, validationUrl, params)) {
|
|
194
|
+
console.log("Rejected incoming call: invalid Twilio signature");
|
|
195
|
+
res.writeHead(403, { "Content-Type": "text/plain" });
|
|
196
|
+
res.end("Forbidden");
|
|
197
|
+
return;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Generate per-call token and register in active calls
|
|
201
|
+
const token = randomUUID();
|
|
202
|
+
activeCalls.set(token, { callSid: "", session: null });
|
|
203
|
+
|
|
204
|
+
console.log(`Incoming call accepted, token: ${token}`);
|
|
205
|
+
|
|
206
|
+
// Respond with TwiML to connect a media stream
|
|
207
|
+
const twiml = [
|
|
208
|
+
'<?xml version="1.0" encoding="UTF-8"?>',
|
|
209
|
+
"<Response>",
|
|
210
|
+
" <Connect>",
|
|
211
|
+
` <Stream url="wss://${webhookHost}/media/${token}" />`,
|
|
212
|
+
" </Connect>",
|
|
213
|
+
"</Response>",
|
|
214
|
+
].join("\n");
|
|
215
|
+
|
|
216
|
+
res.writeHead(200, { "Content-Type": "text/xml" });
|
|
217
|
+
res.end(twiml);
|
|
218
|
+
});
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* Handle a WebSocket upgrade request for the Twilio media stream.
|
|
223
|
+
*
|
|
224
|
+
* Extracts the per-call token from the URL path, validates it against
|
|
225
|
+
* the activeCalls map, and either accepts or rejects the connection.
|
|
226
|
+
*
|
|
227
|
+
* @param req - HTTP upgrade request
|
|
228
|
+
* @param socket - Underlying TCP socket
|
|
229
|
+
* @param head - First packet of the upgraded stream
|
|
230
|
+
* @param wss - WebSocketServer instance to accept the upgrade
|
|
231
|
+
*/
|
|
232
|
+
function handleWebSocketUpgrade(
|
|
233
|
+
req: IncomingMessage,
|
|
234
|
+
socket: Duplex,
|
|
235
|
+
head: Buffer,
|
|
236
|
+
wss: WebSocketServer,
|
|
237
|
+
): void {
|
|
238
|
+
// Extract token from URL path: /media/:token
|
|
239
|
+
const url = req.url ?? "";
|
|
240
|
+
const match = url.match(/^\/media\/([a-f0-9-]+)$/);
|
|
241
|
+
|
|
242
|
+
if (!match) {
|
|
243
|
+
console.log(`Rejected WebSocket upgrade: invalid path ${url}`);
|
|
244
|
+
socket.destroy();
|
|
245
|
+
return;
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
const token = match[1];
|
|
249
|
+
|
|
250
|
+
if (!activeCalls.has(token)) {
|
|
251
|
+
console.log(`Rejected WebSocket upgrade: unknown token ${token}`);
|
|
252
|
+
socket.destroy();
|
|
253
|
+
return;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
// Accept the WebSocket connection
|
|
257
|
+
wss.handleUpgrade(req, socket, head, (ws: WebSocket) => {
|
|
258
|
+
wss.emit("connection", ws, req);
|
|
259
|
+
handleCallSession(ws, token);
|
|
260
|
+
});
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
/**
|
|
264
|
+
* Handle a connected Twilio media stream WebSocket session.
|
|
265
|
+
*
|
|
266
|
+
* Listens for Twilio WebSocket events (start, media, stop) and manages
|
|
267
|
+
* the voice session lifecycle. On the "start" event, creates a
|
|
268
|
+
* TwilioAudioAdapter and VoiceSession. On "stop" or WebSocket close,
|
|
269
|
+
* tears down the session and cleans up.
|
|
270
|
+
*
|
|
271
|
+
* @param ws - Connected WebSocket for the Twilio media stream
|
|
272
|
+
* @param token - Per-call UUID token identifying this call
|
|
273
|
+
*/
|
|
274
|
+
function handleCallSession(ws: WebSocket, token: string): void {
|
|
275
|
+
let cleaned = false;
|
|
276
|
+
|
|
277
|
+
/**
|
|
278
|
+
* Clean up the call session. Stops the voice session, removes from
|
|
279
|
+
* activeCalls map. Uses cleaned flag to prevent double-cleanup.
|
|
280
|
+
*/
|
|
281
|
+
async function cleanup(): Promise<void> {
|
|
282
|
+
if (cleaned) return;
|
|
283
|
+
cleaned = true;
|
|
284
|
+
|
|
285
|
+
const call = activeCalls.get(token);
|
|
286
|
+
if (call?.session) {
|
|
287
|
+
await call.session.stop();
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
activeCalls.delete(token);
|
|
291
|
+
console.log(`Call session cleaned up, token: ${token}`);
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// WebSocket close handler -- always runs cleanup regardless of cause
|
|
295
|
+
ws.on("close", () => {
|
|
296
|
+
cleanup().catch((err) => {
|
|
297
|
+
console.error(`Error during call cleanup: ${err}`);
|
|
298
|
+
});
|
|
299
|
+
});
|
|
300
|
+
|
|
301
|
+
ws.on("error", (err) => {
|
|
302
|
+
console.error(`WebSocket error for token ${token}: ${err}`);
|
|
303
|
+
});
|
|
304
|
+
|
|
305
|
+
// Listen for Twilio media stream events
|
|
306
|
+
ws.on("message", (data: Buffer | string) => {
|
|
307
|
+
const msg = JSON.parse(typeof data === "string" ? data : data.toString("utf-8"));
|
|
308
|
+
|
|
309
|
+
if (msg.event === "start") {
|
|
310
|
+
handleStreamStart(ws, token, msg).catch((err) => {
|
|
311
|
+
console.error(`Error handling stream start: ${err}`);
|
|
312
|
+
});
|
|
313
|
+
return;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
if (msg.event === "stop") {
|
|
317
|
+
console.log(`Twilio stream stopped for token: ${token}`);
|
|
318
|
+
ws.close();
|
|
319
|
+
return;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// "connected" and "media" events are handled elsewhere:
|
|
323
|
+
// - "connected": informational, no action needed
|
|
324
|
+
// - "media": handled by TwilioAudioAdapter's onAudio listener
|
|
325
|
+
});
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// ============================================================================
|
|
329
|
+
// HELPER FUNCTIONS
|
|
330
|
+
// ============================================================================
|
|
331
|
+
|
|
332
|
+
/**
|
|
333
|
+
* Handle the Twilio "start" event on a media stream WebSocket.
|
|
334
|
+
*
|
|
335
|
+
* Extracts the streamSid and callSid, creates a TwilioAudioAdapter and
|
|
336
|
+
* VoiceSession. If session creation fails (e.g. limit reached), logs the
|
|
337
|
+
* error and closes the WebSocket.
|
|
338
|
+
*
|
|
339
|
+
* @param ws - Connected WebSocket for the Twilio media stream
|
|
340
|
+
* @param token - Per-call UUID token
|
|
341
|
+
* @param msg - Parsed Twilio "start" event message
|
|
342
|
+
*/
|
|
343
|
+
async function handleStreamStart(
|
|
344
|
+
ws: WebSocket,
|
|
345
|
+
token: string,
|
|
346
|
+
msg: { start: { streamSid: string; callSid: string } },
|
|
347
|
+
): Promise<void> {
|
|
348
|
+
const { streamSid, callSid } = msg.start;
|
|
349
|
+
console.log(`Stream started -- callSid: ${callSid}, streamSid: ${streamSid}`);
|
|
350
|
+
|
|
351
|
+
// Update the active call entry with the callSid
|
|
352
|
+
const call = activeCalls.get(token);
|
|
353
|
+
if (!call) return;
|
|
354
|
+
call.callSid = callSid;
|
|
355
|
+
|
|
356
|
+
try {
|
|
357
|
+
// Create the Twilio audio adapter
|
|
358
|
+
const adapter = createTwilioAudioAdapter({ ws, streamSid });
|
|
359
|
+
|
|
360
|
+
// Create the voice session (acquires a session lock -- may throw if limit reached)
|
|
361
|
+
const session = await createVoiceSession(adapter, {
|
|
362
|
+
...DEFAULT_CONFIG,
|
|
363
|
+
onSessionEnd: () => ws.close(),
|
|
364
|
+
});
|
|
365
|
+
|
|
366
|
+
call.session = session;
|
|
367
|
+
} catch (err) {
|
|
368
|
+
console.error(`Failed to create voice session for call ${callSid}: ${err}`);
|
|
369
|
+
|
|
370
|
+
// Send a TwiML-style rejection message over the WebSocket is not possible,
|
|
371
|
+
// so just close the WebSocket. The caller will hear silence and Twilio will
|
|
372
|
+
// eventually disconnect.
|
|
373
|
+
ws.close();
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/**
|
|
378
|
+
* Parse a URL-encoded POST body into a key-value record.
|
|
379
|
+
*
|
|
380
|
+
* @param body - URL-encoded string (e.g. "key1=value1&key2=value2")
|
|
381
|
+
* @returns Record of decoded key-value pairs
|
|
382
|
+
*/
|
|
383
|
+
function parseUrlEncodedBody(body: string): Record<string, string> {
|
|
384
|
+
const params: Record<string, string> = {};
|
|
385
|
+
|
|
386
|
+
if (!body) return params;
|
|
387
|
+
|
|
388
|
+
for (const pair of body.split("&")) {
|
|
389
|
+
const [key, value] = pair.split("=");
|
|
390
|
+
if (key) {
|
|
391
|
+
params[decodeURIComponent(key)] = decodeURIComponent(value ?? "");
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
return params;
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
/**
|
|
399
|
+
* Proxy an HTTP request to the dashboard server on localhost.
|
|
400
|
+
* Forwards the request method, path, headers, and body.
|
|
401
|
+
*
|
|
402
|
+
* @param req - Original incoming request
|
|
403
|
+
* @param res - Response to write the proxied result to
|
|
404
|
+
* @param dashboardPort - Port the dashboard server is listening on
|
|
405
|
+
*/
|
|
406
|
+
function proxyToDashboard(req: IncomingMessage, res: ServerResponse, dashboardPort: number): void {
|
|
407
|
+
const proxyReq = httpRequest(
|
|
408
|
+
{
|
|
409
|
+
hostname: "127.0.0.1",
|
|
410
|
+
port: dashboardPort,
|
|
411
|
+
path: req.url,
|
|
412
|
+
method: req.method,
|
|
413
|
+
headers: req.headers,
|
|
414
|
+
},
|
|
415
|
+
(proxyRes) => {
|
|
416
|
+
res.writeHead(proxyRes.statusCode ?? 502, proxyRes.headers);
|
|
417
|
+
proxyRes.pipe(res);
|
|
418
|
+
},
|
|
419
|
+
);
|
|
420
|
+
|
|
421
|
+
proxyReq.on("error", () => {
|
|
422
|
+
res.writeHead(502, { "Content-Type": "text/plain" });
|
|
423
|
+
res.end("Dashboard unavailable");
|
|
424
|
+
});
|
|
425
|
+
|
|
426
|
+
req.pipe(proxyReq);
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
// ============================================================================
|
|
430
|
+
// ENTRY POINT
|
|
431
|
+
// ============================================================================
|
|
432
|
+
|
|
433
|
+
// Launch the server immediately on module load; any startup failure
// (missing env vars, port in use) is fatal for this process.
startTwilioServer().catch((err) => {
  console.error(`Twilio server failed: ${err}`);
  process.exit(1);
});
|
package/sidecar/types.ts
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the Claude Code voice sidecar.
|
|
3
|
+
*
|
|
4
|
+
* Defines all DTOs and interfaces used across the voice pipeline modules:
|
|
5
|
+
* - Voice loop configuration and state
|
|
6
|
+
* - Audio frame representation
|
|
7
|
+
* - VAD (voice activity detection) events
|
|
8
|
+
* - STT (speech-to-text) results
|
|
9
|
+
* - Endpointing decisions for turn detection
|
|
10
|
+
* - Claude session streaming events
|
|
11
|
+
* - TTS (text-to-speech) configuration
|
|
12
|
+
* - Narration configuration
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// CONFIGURATION INTERFACES
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
/**
 * Top-level configuration for the voice loop.
 * Passed to `startVoiceLoop` to initialize all modules.
 */
export interface VoiceLoopConfig {
  /** Path to the sherpa-onnx Whisper ONNX model directory */
  sttModelPath: string;
  /** mlx-audio model ID for TTS (e.g. "prince-canuma/Kokoro-82M") */
  ttsModel: string;
  /** TTS voice ID (e.g. "af_heart" for Kokoro) */
  ttsVoice: string;
  /** Directory for cached model files */
  modelCacheDir: string;
  /** Endpointing configuration for turn detection */
  endpointing: EndpointingConfig;
  /** Narration configuration for Claude response processing */
  narration: NarrationConfig;
  /** Claude Agent SDK session configuration */
  claudeSession: ClaudeSessionConfig;
  /** Phrase that stops the voice loop when spoken */
  stopPhrase: string;
}

/**
 * Configuration for the endpointing module.
 * Controls how the system decides when the user is done speaking.
 */
export interface EndpointingConfig {
  /** Silence duration (ms) before considering speech complete */
  silenceThresholdMs: number;
  /** Maximum silence duration (ms) before forcing completion regardless */
  maxSilenceBeforeTimeoutMs: number;
  /** Minimum word count for the VAD fast path (skips Haiku check) */
  minWordCountForFastPath: number;
  /** Whether to use Haiku API for ambiguous short utterances */
  enableHaikuFallback: boolean;
}

/**
 * Configuration for the Claude Agent SDK session.
 */
export interface ClaudeSessionConfig {
  /** List of allowed tool names (empty array means all tools allowed) */
  allowedTools: string[];
  /**
   * Permission mode -- must be "bypassPermissions" for voice loop.
   * NOTE(review): typed as plain string rather than a literal union, so the
   * compiler cannot enforce the "bypassPermissions" requirement -- callers
   * must uphold it.
   */
  permissionMode: string;
  /** System prompt instructing Claude to respond concisely for voice output */
  systemPrompt: string;
}

/**
 * Configuration for the narration module.
 * Controls how Claude's streaming output is processed into speakable text.
 */
export interface NarrationConfig {
  /** Interval (ms) between "still working..." summaries during long tool runs */
  summaryIntervalMs: number;
}

/**
 * Configuration for the TTS (text-to-speech) module.
 */
export interface TtsConfig {
  /** mlx-audio model ID (e.g. "prince-canuma/Kokoro-82M") */
  model: string;
  /** Voice ID (e.g. "af_heart" for Kokoro) */
  voice: string;
  /** Writable stream for PCM audio output (VPIO process stdin) */
  speakerInput: import("stream").Writable;
  /** Callback to clear the VPIO playback buffer on interruption */
  interruptPlayback: () => void;
  /** Callback to resume VPIO stdin processing after an interrupt */
  resumePlayback: () => void;
  /** Override server command for testing (default: Python TTS server) */
  serverCommand?: string[];
}
|
|
95
|
+
|
|
96
|
+
// ============================================================================
|
|
97
|
+
// AUDIO TYPES
|
|
98
|
+
// ============================================================================
|
|
99
|
+
|
|
100
|
+
/**
 * A single frame of audio data from the microphone.
 */
export interface AudioFrame {
  /** PCM audio samples normalized to -1.0 to 1.0 range */
  pcm: Float32Array;
  /** Sample rate in Hz */
  sampleRate: number;
  /** Timestamp in milliseconds when this frame was captured */
  timestamp: number;
}

// ============================================================================
// VAD TYPES
// ============================================================================

/** Possible VAD event types indicating speech activity state */
export type VadEventType = "SPEECH_START" | "SPEECH_CONTINUE" | "SPEECH_END" | "SILENCE";

/**
 * Event emitted by the VAD processor after analyzing an audio frame.
 */
export interface VadEvent {
  /** The detected speech activity state */
  type: VadEventType;
  /** Speech probability from the VAD model (0.0 to 1.0) */
  probability: number;
  /** Timestamp in milliseconds */
  timestamp: number;
}

// ============================================================================
// STT TYPES
// ============================================================================

/**
 * Result from the speech-to-text transcription.
 */
export interface TranscriptionResult {
  /** The transcribed text */
  text: string;
  /** Whether this is a final transcription (always true for batch/offline mode) */
  isFinal: boolean;
  /** Timestamp in milliseconds when transcription completed */
  timestamp: number;
}
|
|
146
|
+
|
|
147
|
+
// ============================================================================
|
|
148
|
+
// ENDPOINTING TYPES
|
|
149
|
+
// ============================================================================
|
|
150
|
+
|
|
151
|
+
/** Method used to determine that the user finished speaking */
export type EndpointMethod = "vad_fast" | "haiku_semantic" | "timeout";

/**
 * Decision from the endpointing module on whether the user has finished speaking.
 */
export interface EndpointDecision {
  /** Whether the user's turn is considered complete */
  isComplete: boolean;
  /** The current accumulated transcript */
  transcript: string;
  /** Which method was used to make the decision */
  method: EndpointMethod;
}

// ============================================================================
// CLAUDE SESSION TYPES
// ============================================================================

/** Possible event types from the Claude streaming response */
export type ClaudeStreamEventType = "text_delta" | "tool_start" | "tool_end" | "result" | "error";

/**
 * Simplified streaming event from the Claude Agent SDK session.
 * Mapped from the raw SDKMessage types for downstream consumption.
 */
export interface ClaudeStreamEvent {
  /** The type of streaming event */
  type: ClaudeStreamEventType;
  /** Text content (for text_delta events) or error message (for error events) */
  content: string;
  /** Tool name (only present for tool_start events) */
  toolName?: string;
}

// ============================================================================
// TTS TEXT CHUNK TYPES
// ============================================================================

/** A text chunk for TTS. Plain string = streaming fragment (buffer it).
 * Object with flush = complete sentence (speak immediately). */
export type TextChunk = string | { text: string; flush: true };

// ============================================================================
// VOICE LOOP STATE
// ============================================================================

/** Possible states of the voice loop state machine */
export type VoiceLoopStatus = "idle" | "listening" | "processing" | "speaking";

/**
 * Current state of the voice loop.
 * Used by the state machine in index.ts.
 */
export interface VoiceLoopState {
  /** Current state of the voice loop */
  status: VoiceLoopStatus;
  /** Active Claude session ID, or null if no session is active */
  sessionId: string | null;
}
|