voicecc 1.1.36 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/bin/voicecc.js +94 -1
  2. package/dashboard/dist/assets/index-DCeOdulF.js +28 -0
  3. package/dashboard/dist/index.html +1 -1
  4. package/dashboard/routes/agents.ts +28 -8
  5. package/dashboard/routes/browser-call.ts +3 -2
  6. package/dashboard/routes/chat.ts +75 -55
  7. package/dashboard/routes/providers.ts +5 -74
  8. package/dashboard/routes/twilio.ts +104 -5
  9. package/dashboard/routes/voice.ts +98 -0
  10. package/dashboard/server.ts +48 -1
  11. package/package.json +2 -3
  12. package/server/index.ts +96 -8
  13. package/server/services/twilio-manager.ts +29 -10
  14. package/dashboard/dist/assets/index-C62C9Gp0.js +0 -28
  15. package/dashboard/dist/audio-processor.js +0 -126
  16. package/server/services/heartbeat.ts +0 -403
  17. package/server/voice/assets/chime.wav +0 -0
  18. package/server/voice/assets/startup.pcm +0 -0
  19. package/server/voice/audio-adapter.ts +0 -60
  20. package/server/voice/audio-inactivity.test.ts +0 -108
  21. package/server/voice/audio-inactivity.ts +0 -91
  22. package/server/voice/browser-audio-playback.test.ts +0 -149
  23. package/server/voice/browser-audio.ts +0 -147
  24. package/server/voice/browser-server.ts +0 -311
  25. package/server/voice/chat-server.ts +0 -236
  26. package/server/voice/chime.test.ts +0 -69
  27. package/server/voice/chime.ts +0 -36
  28. package/server/voice/claude-session.ts +0 -293
  29. package/server/voice/endpointing.ts +0 -163
  30. package/server/voice/mic-vpio +0 -0
  31. package/server/voice/narration.ts +0 -204
  32. package/server/voice/prompt-builder.ts +0 -108
  33. package/server/voice/session-lock.ts +0 -123
  34. package/server/voice/stt-elevenlabs.ts +0 -210
  35. package/server/voice/stt-provider.ts +0 -106
  36. package/server/voice/tts-elevenlabs-hiss.test.ts +0 -183
  37. package/server/voice/tts-elevenlabs.ts +0 -397
  38. package/server/voice/tts-provider.ts +0 -155
  39. package/server/voice/twilio-audio.ts +0 -338
  40. package/server/voice/twilio-server.ts +0 -540
  41. package/server/voice/types.ts +0 -282
  42. package/server/voice/vad.ts +0 -101
  43. package/server/voice/voice-loop-bugs.test.ts +0 -348
  44. package/server/voice/voice-server.ts +0 -129
  45. package/server/voice/voice-session.ts +0 -539
@@ -1,540 +0,0 @@
1
- /**
2
- * Twilio voice call handlers for the unified voice server.
3
- *
4
- * Provides HTTP request handlers and WebSocket upgrade logic for Twilio
5
- * phone calls. Used by voice-server.ts which owns the HTTP server.
6
- *
7
- * Responsibilities:
8
- * - Handle incoming call webhooks via Twilio signature verification
9
- * - Generate per-call UUID tokens for secure WebSocket upgrade
10
- * - Accept Twilio media stream WebSocket connections
11
- * - Create a TwilioAudioAdapter + VoiceSession per call
12
- * - Enforce global session limit via session locks
13
- * - Tear down sessions on hangup, stop phrase, or error
14
- */
15
-
16
- import { randomUUID } from "crypto";
17
- import { join } from "path";
18
-
19
- import twilio from "twilio";
20
- import { WebSocketServer } from "ws";
21
-
22
- import { createTwilioAudioAdapter } from "./twilio-audio.js";
23
- import { createVoiceSession } from "./voice-session.js";
24
- import { createAudioInactivityWatchdog } from "./audio-inactivity.js";
25
- import { buildAgentPrompt, buildDefaultPrompt } from "./prompt-builder.js";
26
- import { getAgent, AGENTS_DIR } from "../services/agent-store.js";
27
- import { getTunnelUrl } from "../services/tunnel.js";
28
- import { readEnv } from "../services/env.js";
29
-
30
- import type { IncomingMessage, ServerResponse } from "http";
31
- import type { Duplex } from "stream";
32
- import type { WebSocket } from "ws";
33
- import type { VoiceSession } from "./voice-session.js";
34
- import type { TtsProviderConfig, SttProviderConfig } from "./types.js";
35
-
36
- // ============================================================================
37
- // CONSTANTS
38
- // ============================================================================
39
-
40
/**
 * Interruption threshold used for phone calls, in milliseconds.
 * Set higher than the local-mic value because phone audio has no VPIO echo cancellation.
 */
const PHONE_INTERRUPTION_THRESHOLD_MS = 2_000;

/** Maximum time (ms) without any Twilio audio frame before the WebSocket is closed. */
const AUDIO_INACTIVITY_TIMEOUT_MS = 5_000;

/** Polling period (ms) of the audio-inactivity check. */
const AUDIO_INACTIVITY_CHECK_INTERVAL_MS = 2_000;

/** ElevenLabs voice ID used when .env does not provide one. */
const DEFAULT_ELEVENLABS_VOICE_ID = "WrjxnKxK0m1uiaH0uteU";

/** ElevenLabs TTS model ID used when .env does not provide one. */
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_turbo_v2_5";

/** ElevenLabs STT model ID used when .env does not provide one. */
const DEFAULT_ELEVENLABS_STT_MODEL_ID = "scribe_v1";
57
-
58
- // ============================================================================
59
- // TYPES
60
- // ============================================================================
61
-
62
/**
 * Bookkeeping record for one phone call, kept from the moment its token is
 * registered (webhook or /register-call) until the media-stream WebSocket
 * session is cleaned up.
 */
interface ActiveCall {
  /** Twilio call SID; starts as an empty string and is filled in by the WebSocket "start" event. */
  callSid: string;

  /** Voice session handle; stays null until the "start" event has created the session. */
  session: VoiceSession | null;

  /** Agent identifier for agent-initiated calls; left undefined for default inbound calls. */
  agentId?: string;

  /** Prompt the agent should speak first (e.g. "Call Me" or a heartbeat reason). */
  initialPrompt?: string;

  /** Claude session handed over by the heartbeat, reused by the voice session instead of creating a new one. */
  claudeSession?: import("./claude-session.js").ClaudeSession;
}
75
-
76
- // ============================================================================
77
- // STATE
78
- // ============================================================================
79
-
80
/** Registry of in-flight calls; the key is the per-call UUID token. */
const activeCalls: Map<string, ActiveCall> = new Map();
82
-
83
- // ============================================================================
84
- // EXPORTED HANDLERS
85
- // ============================================================================
86
-
87
/**
 * Associate an already-running Claude session with a registered call token.
 *
 * Intended for the heartbeat scheduler: after it has registered a token it
 * hands over its Claude session so the voice session can continue it rather
 * than start a fresh one. Unknown tokens are ignored silently.
 *
 * @param token - Call token previously registered via /register-call
 * @param session - Live Claude session produced by the heartbeat check
 */
export function setCallClaudeSession(token: string, session: import("./claude-session.js").ClaudeSession): void {
  const entry = activeCalls.get(token);
  if (!entry) {
    return;
  }

  entry.claudeSession = session;
}
101
-
102
/**
 * Dispatch Twilio-related HTTP requests.
 *
 * Only two routes are recognised, both POST: /twilio/incoming-call and
 * /register-call. The boolean result tells the caller whether the request
 * was consumed here, so unhandled requests can fall through to other
 * handlers (such as the dashboard proxy).
 *
 * @param req - HTTP request
 * @param res - HTTP response
 * @returns true when one of the Twilio routes handled the request, false otherwise
 */
export function handleTwilioHttpRequest(req: IncomingMessage, res: ServerResponse): boolean {
  if (req.method !== "POST") {
    return false;
  }

  switch (req.url) {
    case "/twilio/incoming-call":
      handleIncomingCall(req, res);
      return true;

    case "/register-call":
      handleRegisterCall(req, res);
      return true;

    default:
      return false;
  }
}
126
-
127
/**
 * Public entry point for Twilio media-stream WebSocket upgrades.
 *
 * Forwards all arguments unchanged to the module-internal
 * handleWebSocketUpgrade, using the WebSocketServer supplied by the caller.
 *
 * @param req - HTTP upgrade request
 * @param socket - Underlying TCP socket
 * @param head - First packet of the upgraded stream
 * @param wss - WebSocketServer instance that will accept the upgrade
 */
export function handleTwilioUpgrade(req: IncomingMessage, socket: Duplex, head: Buffer, wss: WebSocketServer): void {
  handleWebSocketUpgrade(req, socket, head, wss);
}
145
-
146
- // ============================================================================
147
- // MAIN HANDLERS
148
- // ============================================================================
149
-
150
/**
 * Handle an incoming call webhook from Twilio (POST /twilio/incoming-call).
 *
 * Buffers the POST body, then validates the Twilio request signature,
 * generates a per-call token, and responds with TwiML that tells Twilio to
 * connect a media stream WebSocket.
 *
 * The body is collected as raw Buffers and decoded once, so a multi-byte
 * UTF-8 character split across two chunks is not corrupted. Any unexpected
 * failure while processing (e.g. reading .env or parsing the tunnel URL) is
 * logged and answered with a 500 instead of leaving the request hanging with
 * an unhandled promise rejection.
 *
 * @param req - HTTP request from Twilio
 * @param res - HTTP response to send TwiML back
 */
function handleIncomingCall(
  req: IncomingMessage,
  res: ServerResponse,
): void {
  // Collect the POST body for signature validation
  const chunks: Buffer[] = [];
  req.on("data", (chunk: Buffer) => {
    chunks.push(chunk);
  });

  req.on("end", () => {
    const body = Buffer.concat(chunks).toString("utf-8");

    respondToIncomingCall(req, res, body).catch((err: unknown) => {
      console.error("Failed to handle incoming call:", err);

      // Only answer if the normal path has not already finished the response
      if (res.writableEnded) return;
      if (!res.headersSent) {
        res.writeHead(500, { "Content-Type": "text/plain" });
      }
      res.end("Internal server error");
    });
  });
}

/**
 * Validate a fully-received incoming-call webhook and write the response.
 *
 * Reads the auth token and tunnel URL lazily per-request so values are
 * always current.
 *
 * @param req - HTTP request from Twilio (headers and URL are read)
 * @param res - HTTP response to write to
 * @param body - Complete URL-encoded POST body
 */
async function respondToIncomingCall(
  req: IncomingMessage,
  res: ServerResponse,
  body: string,
): Promise<void> {
  const env = await readEnv();
  const authToken = env.TWILIO_AUTH_TOKEN;
  const tunnelUrl = getTunnelUrl();

  if (!authToken) {
    console.log("Rejected incoming call: TWILIO_AUTH_TOKEN not set");
    res.writeHead(500, { "Content-Type": "text/plain" });
    res.end("Server misconfigured");
    return;
  }

  if (!tunnelUrl) {
    console.log("Rejected incoming call: no tunnel URL available");
    res.writeHead(500, { "Content-Type": "text/plain" });
    res.end("Server misconfigured");
    return;
  }

  const webhookHost = new URL(tunnelUrl).host;

  // Parse URL-encoded POST body into key-value params
  const params = parseUrlEncodedBody(body);

  // Validate Twilio signature (use full URL -- Twilio signs against the complete endpoint URL)
  const webhookUrl = tunnelUrl.replace(/\/$/, "");
  const validationUrl = webhookUrl + (req.url ?? "");
  const signatureHeader = req.headers["x-twilio-signature"];
  const signature = typeof signatureHeader === "string" ? signatureHeader : undefined;
  if (!signature || !twilio.validateRequest(authToken, signature, validationUrl, params)) {
    console.log("Rejected incoming call: invalid Twilio signature");
    console.log(" validationUrl:", validationUrl);
    console.log(" signature:", signature);
    res.writeHead(403, { "Content-Type": "text/plain" });
    res.end("Forbidden");
    return;
  }

  // Generate per-call token and register in active calls
  const token = randomUUID();
  activeCalls.set(token, { callSid: "", session: null });

  console.log(`Incoming call accepted, token: ${token}`);

  // Respond with TwiML to connect a media stream
  const twiml = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    "<Response>",
    " <Connect>",
    ` <Stream url="wss://${webhookHost}/media/${token}" />`,
    " </Connect>",
    "</Response>",
  ].join("\n");

  res.writeHead(200, { "Content-Type": "text/xml" });
  res.end(twiml);
}
228
-
229
/**
 * Handle a POST /register-call request to pre-register an outbound call token.
 *
 * Called by the heartbeat scheduler or API before placing an outbound Twilio call.
 * Registers the token in activeCalls so the subsequent WebSocket upgrade is accepted.
 *
 * The body must be a JSON object with a non-empty string `token`. A body that
 * is not valid JSON, is not an object, or lacks a usable token is answered
 * with 400 instead of throwing inside the event callback. `agentId` and
 * `initialPrompt` are optional; values that are not strings are treated as
 * absent.
 *
 * @param req - HTTP request with JSON body { token, agentId, initialPrompt? }
 * @param res - HTTP response
 */
function handleRegisterCall(req: IncomingMessage, res: ServerResponse): void {
  // Buffer raw chunks and decode once so multi-byte characters split across chunks survive
  const chunks: Buffer[] = [];
  req.on("data", (chunk: Buffer) => {
    chunks.push(chunk);
  });

  req.on("end", () => {
    const sendJson = (status: number, payload: Record<string, unknown>): void => {
      res.writeHead(status, { "Content-Type": "application/json" });
      res.end(JSON.stringify(payload));
    };

    let parsed: unknown;
    try {
      parsed = JSON.parse(Buffer.concat(chunks).toString("utf-8"));
    } catch {
      console.log("Rejected /register-call: body is not valid JSON");
      sendJson(400, { success: false, error: "Invalid JSON body" });
      return;
    }

    if (typeof parsed !== "object" || parsed === null) {
      console.log("Rejected /register-call: body is not a JSON object");
      sendJson(400, { success: false, error: "Body must be a JSON object" });
      return;
    }

    const fields = parsed as { token?: unknown; agentId?: unknown; initialPrompt?: unknown };
    if (typeof fields.token !== "string" || fields.token === "") {
      console.log("Rejected /register-call: missing token");
      sendJson(400, { success: false, error: "Missing token" });
      return;
    }

    const token = fields.token;
    const agentId = typeof fields.agentId === "string" ? fields.agentId : undefined;
    const initialPrompt = typeof fields.initialPrompt === "string" ? fields.initialPrompt : undefined;

    activeCalls.set(token, { callSid: "", session: null, agentId, initialPrompt });

    console.log(`Registered outbound call token: ${token}, agentId: ${agentId}`);

    sendJson(200, { success: true });
  });
}
254
-
255
/**
 * Decide whether a WebSocket upgrade request for a Twilio media stream is accepted.
 *
 * The per-call token is taken from the URL path (/media/:token, optional
 * query string allowed) and must be present in activeCalls. Requests with a
 * malformed path or an unknown token have their socket destroyed. An
 * `agentId` query parameter, when present, is stored on the call entry
 * (used for outbound agent calls).
 *
 * @param req - HTTP upgrade request
 * @param socket - Underlying TCP socket
 * @param head - First packet of the upgraded stream
 * @param wss - WebSocketServer instance to accept the upgrade
 */
function handleWebSocketUpgrade(
  req: IncomingMessage,
  socket: Duplex,
  head: Buffer,
  wss: WebSocketServer,
): void {
  const requestUrl = req.url ?? "";

  // Path must look like /media/:token, optionally followed by a query string
  const token = /^\/media\/([a-f0-9-]+)(?:\?.*)?$/.exec(requestUrl)?.[1];
  if (token === undefined) {
    console.log(`Rejected WebSocket upgrade: invalid path ${requestUrl}`);
    socket.destroy();
    return;
  }

  const call = activeCalls.get(token);
  if (call === undefined) {
    console.log(`Rejected WebSocket upgrade: unknown token ${token}`);
    socket.destroy();
    return;
  }

  // An agentId in the query string overrides whatever was registered for this token
  const agentFromQuery = new URL(requestUrl, "http://localhost").searchParams.get("agentId");
  if (agentFromQuery) {
    call.agentId = agentFromQuery;
  }

  // Complete the upgrade and hand the socket to the per-call session handler
  wss.handleUpgrade(req, socket, head, (ws: WebSocket) => {
    wss.emit("connection", ws, req);
    handleCallSession(ws, token);
  });
}
303
-
304
/**
 * Handle a connected Twilio media stream WebSocket session.
 *
 * Listens for Twilio WebSocket events (start, media, stop) and manages
 * the voice session lifecycle. On the "start" event, delegates to
 * handleStreamStart to create the session. On "stop" or WebSocket close,
 * tears down the session and cleans up.
 *
 * Frames that are not valid JSON objects are logged and ignored, so a
 * malformed message cannot throw out of the "message" event callback.
 *
 * @param ws - Connected WebSocket for the Twilio media stream
 * @param token - Per-call UUID token identifying this call
 */
function handleCallSession(ws: WebSocket, token: string): void {
  let cleaned = false;

  // Detect stale calls: if Twilio stops sending audio frames (caller hung up
  // but WebSocket didn't close cleanly), close the WebSocket to trigger cleanup.
  const watchdog = createAudioInactivityWatchdog({
    timeoutMs: AUDIO_INACTIVITY_TIMEOUT_MS,
    checkIntervalMs: AUDIO_INACTIVITY_CHECK_INTERVAL_MS,
    onTimeout: () => {
      console.log(`[twilio-server] No audio received, closing stale call (token: ${token})`);
      ws.close();
    },
  });

  /**
   * Clean up the call session. Disposes the watchdog, stops the voice session
   * (if one was created) and removes the token from the activeCalls map.
   * Uses the cleaned flag to prevent double-cleanup.
   */
  async function cleanup(): Promise<void> {
    if (cleaned) return;
    cleaned = true;

    watchdog.dispose();

    const call = activeCalls.get(token);
    if (call?.session) {
      await call.session.stop();
    }

    activeCalls.delete(token);
    console.log(`Call session cleaned up, token: ${token}`);
  }

  // WebSocket close handler -- always runs cleanup regardless of cause
  ws.on("close", () => {
    cleanup().catch((err) => {
      console.error(`Error during call cleanup: ${err}`);
    });
  });

  ws.on("error", (err) => {
    console.error(`WebSocket error for token ${token}: ${err}`);
    ws.close();
  });

  // Listen for Twilio media stream events
  ws.on("message", (data: Buffer | string) => {
    let parsed: unknown;
    try {
      parsed = JSON.parse(typeof data === "string" ? data : data.toString("utf-8"));
    } catch (err) {
      console.error(`Ignoring malformed WebSocket message for token ${token}: ${err}`);
      return;
    }

    // JSON such as `null` or `42` parses fine but has no `event` to inspect
    if (typeof parsed !== "object" || parsed === null) {
      console.error(`Ignoring non-object WebSocket message for token ${token}`);
      return;
    }

    const msg = parsed as { event?: unknown; start: { streamSid: string; callSid: string } };

    if (msg.event === "media") {
      watchdog.ping();
      // Don't return -- TwilioAudioAdapter's onAudio listener also handles media events
    }

    if (msg.event === "start") {
      watchdog.ping();
      handleStreamStart(ws, token, msg).catch((err) => {
        console.error(`Error handling stream start: ${err}`);
      });
      return;
    }

    if (msg.event === "stop") {
      console.log(`Twilio stream stopped for token: ${token}`);
      ws.close();
      return;
    }
  });
}
384
-
385
- // ============================================================================
386
- // HELPER FUNCTIONS
387
- // ============================================================================
388
-
389
/**
 * Assemble the TTS and STT provider configuration from the current .env contents.
 *
 * The environment is re-read on every call, so edits to the ElevenLabs API
 * key, voice ID, or model IDs apply to the next session without restarting
 * the server. Missing voice/model values fall back to the module defaults;
 * a missing API key becomes an empty string.
 *
 * @returns TTS and STT provider configs built from the current .env values
 */
async function buildProviderConfig(): Promise<{ ttsProvider: TtsProviderConfig; sttProvider: SttProviderConfig }> {
  const env = await readEnv();
  const apiKey = env.ELEVENLABS_API_KEY ?? "";

  const ttsProvider: TtsProviderConfig = {
    provider: "elevenlabs",
    elevenlabs: {
      apiKey,
      voiceId: env.ELEVENLABS_VOICE_ID ?? DEFAULT_ELEVENLABS_VOICE_ID,
      modelId: env.ELEVENLABS_MODEL_ID ?? DEFAULT_ELEVENLABS_MODEL_ID,
    },
  };

  const sttProvider: SttProviderConfig = {
    provider: "elevenlabs",
    elevenlabs: {
      apiKey,
      modelId: env.ELEVENLABS_STT_MODEL_ID ?? DEFAULT_ELEVENLABS_STT_MODEL_ID,
    },
  };

  return { ttsProvider, sttProvider };
}
409
-
410
/**
 * Handle the Twilio "start" event on a media stream WebSocket.
 *
 * Extracts the streamSid and callSid, creates a TwilioAudioAdapter and
 * VoiceSession. If session creation fails (e.g. limit reached), logs the
 * error and closes the WebSocket.
 *
 * Two failure paths are handled explicitly:
 * - If loading the agent fails at any step, the session falls back to the
 *   complete default config (a partially applied agent config is never used).
 * - If the call was cleaned up while the session was being created (the
 *   WebSocket closed during the await), the freshly created session is
 *   stopped immediately instead of being attached to a discarded call entry.
 *
 * @param ws - Connected WebSocket for the Twilio media stream
 * @param token - Per-call UUID token
 * @param msg - Parsed Twilio "start" event message
 */
async function handleStreamStart(
  ws: WebSocket,
  token: string,
  msg: { start: { streamSid: string; callSid: string } },
): Promise<void> {
  const { streamSid, callSid } = msg.start;
  console.log(`Stream started -- callSid: ${callSid}, streamSid: ${streamSid}`);

  // Update the active call entry with the callSid
  const call = activeCalls.get(token);
  if (!call) return;
  call.callSid = callSid;

  // Read provider config fresh from .env so key/model/voice changes take effect without restart
  const { ttsProvider, sttProvider } = await buildProviderConfig();

  const defaultConfig = {
    stopPhrase: "stop listening",
    ttsProvider,
    sttProvider,
    interruptionThresholdMs: PHONE_INTERRUPTION_THRESHOLD_MS,
    endpointing: {
      silenceThresholdMs: 700,
      maxSilenceBeforeTimeoutMs: 1200,
      minWordCountForFastPath: 2,
      enableHaikuFallback: false,
    },
    narration: {
      summaryIntervalMs: 12000,
    },
    claudeSession: {
      allowedTools: [] as string[],
      permissionMode: "bypassPermissions",
      systemPrompt: buildDefaultPrompt("voice"),
    } as import("./types.js").ClaudeSessionConfig,
  };

  // Build session config -- use agent personality if agentId is set, otherwise default
  const agentId = call.agentId;
  let sessionConfig: Parameters<typeof createVoiceSession>[1] = { ...defaultConfig, onSessionEnd: () => ws.close() };

  if (agentId) {
    try {
      const agentPrompt = await buildAgentPrompt(agentId, "voice");
      const agentDir = join(AGENTS_DIR, agentId);

      // Assemble the agent config in a local and commit it only once every
      // step has succeeded, so a failure below leaves the default config intact.
      let agentConfig: Parameters<typeof createVoiceSession>[1] = {
        ...defaultConfig,
        claudeSession: {
          ...defaultConfig.claudeSession,
          customSystemPrompt: agentPrompt,
          cwd: agentDir,
        },
        onSessionEnd: () => ws.close(),
      };

      // Override TTS voice if the agent has a preference
      const agent = await getAgent(agentId);
      if (agent.config.voice?.elevenlabs) {
        const voicePref = agent.config.voice.elevenlabs;
        const overriddenTts: TtsProviderConfig = {
          ...ttsProvider,
          elevenlabs: { ...ttsProvider.elevenlabs, voiceId: voicePref.id },
        };
        agentConfig = { ...agentConfig, ttsProvider: overriddenTts };
        console.log(`Using voice "${voicePref.name}" (${voicePref.id}) for agent "${agentId}"`);
      }

      // If heartbeat attached a live Claude session, pass it through
      if (call.claudeSession) {
        agentConfig.existingClaudeSession = call.claudeSession;
        agentConfig.initialPrompt = "The user just answered your call. Greet them and briefly explain why you're calling.";
        console.log(`Using existing heartbeat Claude session for agent "${agentId}" call ${callSid}`);
      } else if (call.initialPrompt) {
        agentConfig.initialPrompt = call.initialPrompt;
        console.log(`Using agent "${agentId}" with initial prompt for call ${callSid}`);
      } else {
        console.log(`Using agent "${agentId}" personality for call ${callSid}`);
      }

      sessionConfig = agentConfig;
    } catch (err) {
      console.error(`Failed to load agent "${agentId}", using default config:`, err);
    }
  }

  try {
    // Create the Twilio audio adapter
    const adapter = createTwilioAudioAdapter({ ws, streamSid });

    // Create the voice session (acquires a session lock -- may throw if limit reached)
    const session = await createVoiceSession(adapter, sessionConfig);

    // The close handler removes the token from activeCalls. If that happened
    // while we were awaiting, nobody will ever stop this session through the
    // call entry, so stop it here.
    if (activeCalls.get(token) !== call) {
      console.log(`Call ${callSid} ended before its voice session was ready, stopping session`);
      try {
        await session.stop();
      } catch (stopErr) {
        console.error(`Failed to stop orphaned voice session for call ${callSid}: ${stopErr}`);
      }
      return;
    }

    call.session = session;
  } catch (err) {
    console.error(`Failed to create voice session for call ${callSid}: ${err}`);

    // Send a TwiML-style rejection message over the WebSocket is not possible,
    // so just close the WebSocket. The caller will hear silence and Twilio will
    // eventually disconnect.
    ws.close();
  }
}
520
-
521
/**
 * Parse a URL-encoded (application/x-www-form-urlencoded) POST body into a
 * key-value record.
 *
 * Decoding follows the form-encoding rules via URLSearchParams:
 * - "+" is decoded as a space (as well as "%20"),
 * - a value may itself contain "=" characters,
 * - malformed percent-escapes do not throw.
 * Pairs with an empty key are skipped, and when a key is repeated the last
 * occurrence wins.
 *
 * @param body - URL-encoded string (e.g. "key1=value1&key2=value2")
 * @returns Record of decoded key-value pairs (empty when body is empty)
 */
function parseUrlEncodedBody(body: string): Record<string, string> {
  const params: Record<string, string> = {};

  if (!body) return params;

  new URLSearchParams(body).forEach((value, key) => {
    if (key) {
      params[key] = value;
    }
  });

  return params;
}