voicecc 1.1.35 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46):
  1. package/bin/voicecc.js +94 -1
  2. package/dashboard/dist/assets/index-DCeOdulF.js +28 -0
  3. package/dashboard/dist/index.html +1 -1
  4. package/dashboard/routes/agents.ts +28 -8
  5. package/dashboard/routes/browser-call.ts +3 -2
  6. package/dashboard/routes/chat.ts +75 -55
  7. package/dashboard/routes/providers.ts +5 -74
  8. package/dashboard/routes/twilio.ts +104 -5
  9. package/dashboard/routes/voice.ts +98 -0
  10. package/dashboard/server.ts +58 -2
  11. package/package.json +2 -3
  12. package/server/index.ts +96 -8
  13. package/server/services/device-pairing.ts +18 -2
  14. package/server/services/twilio-manager.ts +29 -10
  15. package/dashboard/dist/assets/index-C62C9Gp0.js +0 -28
  16. package/dashboard/dist/audio-processor.js +0 -126
  17. package/server/services/heartbeat.ts +0 -403
  18. package/server/voice/assets/chime.wav +0 -0
  19. package/server/voice/assets/startup.pcm +0 -0
  20. package/server/voice/audio-adapter.ts +0 -60
  21. package/server/voice/audio-inactivity.test.ts +0 -108
  22. package/server/voice/audio-inactivity.ts +0 -91
  23. package/server/voice/browser-audio-playback.test.ts +0 -149
  24. package/server/voice/browser-audio.ts +0 -147
  25. package/server/voice/browser-server.ts +0 -311
  26. package/server/voice/chat-server.ts +0 -236
  27. package/server/voice/chime.test.ts +0 -69
  28. package/server/voice/chime.ts +0 -36
  29. package/server/voice/claude-session.ts +0 -293
  30. package/server/voice/endpointing.ts +0 -163
  31. package/server/voice/mic-vpio +0 -0
  32. package/server/voice/narration.ts +0 -204
  33. package/server/voice/prompt-builder.ts +0 -108
  34. package/server/voice/session-lock.ts +0 -123
  35. package/server/voice/stt-elevenlabs.ts +0 -210
  36. package/server/voice/stt-provider.ts +0 -106
  37. package/server/voice/tts-elevenlabs-hiss.test.ts +0 -183
  38. package/server/voice/tts-elevenlabs.ts +0 -397
  39. package/server/voice/tts-provider.ts +0 -155
  40. package/server/voice/twilio-audio.ts +0 -338
  41. package/server/voice/twilio-server.ts +0 -540
  42. package/server/voice/types.ts +0 -282
  43. package/server/voice/vad.ts +0 -101
  44. package/server/voice/voice-loop-bugs.test.ts +0 -348
  45. package/server/voice/voice-server.ts +0 -129
  46. package/server/voice/voice-session.ts +0 -539
@@ -1,91 +0,0 @@
1
- /**
2
- * Audio inactivity watchdog for streaming connections.
3
- *
4
- * Detects when a caller hangs up but the WebSocket doesn't close cleanly.
5
- * Twilio sends audio frames continuously during an active call (even silence).
6
- * When frames stop arriving, the call is dead and the onTimeout callback fires
7
- * so the caller can tear down the session.
8
- *
9
- * Responsibilities:
10
- * - Track timestamps of incoming audio frames via ping()
11
- * - Periodically check whether audio has gone silent beyond a threshold
12
- * - Fire a callback when the timeout is exceeded
13
- * - Clean up the interval timer on dispose
14
- */
15
-
16
- // ============================================================================
17
- // CONSTANTS
18
- // ============================================================================
19
-
20
/** Fallback silence window (ms): used when the caller does not supply `timeoutMs`. */
const DEFAULT_TIMEOUT_MS = 5_000;

/** Fallback polling period (ms): used when the caller does not supply `checkIntervalMs`. */
const DEFAULT_CHECK_INTERVAL_MS = 2_000;
25
-
26
- // ============================================================================
27
- // INTERFACES
28
- // ============================================================================
29
-
30
/** Options accepted by createAudioInactivityWatchdog. */
export interface AudioInactivityConfig {
  /** How long (ms) audio may be absent before `onTimeout` is invoked. Defaults to 5000. */
  timeoutMs?: number;
  /** Period (ms) between inactivity checks. Defaults to 2000. */
  checkIntervalMs?: number;
  /** Invoked once the silence window has been exceeded. */
  onTimeout: () => void;
}

/** Control handle produced by createAudioInactivityWatchdog. */
export interface AudioInactivityWatchdog {
  /** Record that an audio frame just arrived, restarting the silence window. */
  ping: () => void;
  /** Shut the watchdog down and release its interval timer. */
  dispose: () => void;
}
47
-
48
- // ============================================================================
49
- // MAIN ENTRYPOINT
50
- // ============================================================================
51
-
52
/**
 * Create an audio inactivity watchdog that fires a callback when no audio
 * frames have arrived within the configured timeout.
 *
 * The watchdog is one-shot: `onTimeout` is invoked at most once. As soon as it
 * fires, the polling interval is released, so a caller that forgets to call
 * dispose() does not leave a live timer behind.
 *
 * @param config - Timeout thresholds and callback
 * @returns A watchdog handle with ping() and dispose() methods
 */
export function createAudioInactivityWatchdog(config: AudioInactivityConfig): AudioInactivityWatchdog {
  const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const checkIntervalMs = config.checkIntervalMs ?? DEFAULT_CHECK_INTERVAL_MS;

  let lastAudioAt = Date.now();
  let timer: ReturnType<typeof setInterval> | null = null;

  /** Release the polling interval. Safe to call more than once. */
  function stopTimer(): void {
    if (timer === null) return;
    clearInterval(timer);
    timer = null;
  }

  /** Periodic check: fire the callback once the silence window is exceeded. */
  function check(): void {
    // A null timer means we already fired or were disposed; ignore stale ticks.
    if (timer === null) return;

    const silentMs = Date.now() - lastAudioAt;
    if (silentMs < timeoutMs) return;

    // Stop polling BEFORE invoking the callback so a throwing callback
    // cannot cause a second invocation on the next tick.
    stopTimer();
    config.onTimeout();
  }

  timer = setInterval(check, checkIntervalMs);

  /**
   * Signal that an audio frame was received. Resets the inactivity clock.
   */
  function ping(): void {
    lastAudioAt = Date.now();
  }

  /**
   * Stop the watchdog and clean up resources. Idempotent.
   */
  function dispose(): void {
    stopTimer();
  }

  return { ping, dispose };
}
@@ -1,149 +0,0 @@
1
- /**
2
- * Tests that the AudioWorklet processor plays back all TTS audio without
3
- * dropping samples, regardless of chunk size or arrival timing.
4
- *
5
- * Loads the actual audio-processor.js and exercises it through the same
6
- * postMessage/process interface the browser uses. Tests outcomes only --
7
- * no assumptions about internal buffering strategy.
8
- *
9
- * Run: npx tsx --test server/voice/browser-audio-playback.test.ts
10
- */
11
-
12
- import { test } from "node:test";
13
- import { strict as assert } from "node:assert";
14
- import { readFileSync } from "fs";
15
- import { join, dirname } from "path";
16
- import { fileURLToPath } from "url";
17
-
18
- const __dirname = dirname(fileURLToPath(import.meta.url));
19
-
20
- // ============================================================================
21
- // HARNESS -- stub browser AudioWorklet APIs so we can load audio-processor.js
22
- // ============================================================================
23
-
24
/**
 * Load audio-processor.js with stubbed AudioWorklet globals and return a thin
 * driver around the processor instance it registers.
 *
 * The script is evaluated via `new Function`, with `AudioWorkletProcessor` and
 * `registerProcessor` passed in as parameters. Whatever class the script hands
 * to `registerProcessor` is instantiated once.
 *
 * @returns `postMessage` (delivers a message to the processor's port handler)
 *          and `process` (runs one render call with empty parameters)
 * @throws Error if the script never calls `registerProcessor`
 */
function loadProcessor(): {
  postMessage: (data: Record<string, unknown>) => void;
  process: (inputs: Float32Array[][], outputs: Float32Array[][]) => boolean;
} {
  const processorPath = join(__dirname, "../../dashboard/public/audio-processor.js");
  const source = readFileSync(processorPath, "utf-8");

  type PortMessage = { data: Record<string, unknown> };
  interface ProcessorInstance {
    port: { onmessage: ((event: PortMessage) => void) | null };
    process(inputs: Float32Array[][], outputs: Float32Array[][], parameters: Record<string, unknown>): boolean;
  }
  type ProcessorCtor = new () => ProcessorInstance;

  // Holder object (rather than a bare `let`) so the assignment made inside
  // registerProcessor is visible to the type checker after the script runs.
  const registration: { cls: ProcessorCtor | undefined } = { cls: undefined };

  // Stub globals that audio-processor.js expects
  const globals = {
    AudioWorkletProcessor: class {
      port = {
        onmessage: null as ((event: PortMessage) => void) | null,
        postMessage(_data: unknown) {},
      };
    },
    registerProcessor(_name: string, cls: ProcessorCtor) {
      registration.cls = cls;
    },
  };

  const run = new Function(...Object.keys(globals), source);
  run(...Object.values(globals));

  const ProcessorClass = registration.cls;
  if (ProcessorClass === undefined) {
    throw new Error(`${processorPath} did not call registerProcessor()`);
  }

  const instance = new ProcessorClass();

  return {
    postMessage(data: Record<string, unknown>) {
      instance.port.onmessage?.({ data });
    },
    process(inputs: Float32Array[][], outputs: Float32Array[][]) {
      return instance.process(inputs, outputs, {});
    },
  };
}
59
-
60
- // ============================================================================
61
- // TESTS
62
- // ============================================================================
63
-
64
/**
 * Simulates the exact scenario from the logs:
 *   chunk 0: 2.0s audio at 24kHz -> 96,000 samples at 48kHz
 *   chunk 1: 3.0s audio at 24kHz -> 144,000 samples at 48kHz
 *
 * Both chunks arrive within ~500ms. The process() callback drains 128
 * samples per frame. Between the two chunk arrivals, only ~24,000 samples
 * drain -- far less than the total audio.
 *
 * All 240,000 samples should be played back with no drops.
 *
 * The final drain is capped at twice the number of frames the audio needs, so
 * a processor that never falls silent fails the test instead of hanging it.
 */
test("all TTS audio plays back without drops across multi-second chunks", () => {
  const proc = loadProcessor();
  const BROWSER_RATE = 48_000;
  const FRAME_SIZE = 128;

  /** Run one process() call and return how many output samples were non-zero. */
  const drainFrame = (): number => {
    const output = new Float32Array(FRAME_SIZE);
    proc.process([[new Float32Array(FRAME_SIZE)]], [[output]]);

    let nonSilent = 0;
    for (const sample of output) {
      if (sample !== 0) nonSilent++;
    }
    return nonSilent;
  };

  // Chunk 0: 2s at 48kHz, filled with 0.5
  const chunk0 = new Float32Array(2.0 * BROWSER_RATE);
  chunk0.fill(0.5);

  // Chunk 1: 3s at 48kHz, filled with 0.3
  const chunk1 = new Float32Array(3.0 * BROWSER_RATE);
  chunk1.fill(0.3);

  const totalSamples = chunk0.length + chunk1.length; // 240,000

  // Post chunk 0
  proc.postMessage({ type: "playback", samples: chunk0 });

  // Simulate ~500ms of process() draining between chunk arrivals
  const framesBetweenChunks = Math.floor((0.5 * BROWSER_RATE) / FRAME_SIZE);
  let totalNonSilent = 0;

  for (let i = 0; i < framesBetweenChunks; i++) {
    totalNonSilent += drainFrame();
  }

  // Post chunk 1
  proc.postMessage({ type: "playback", samples: chunk1 });

  // Drain until three consecutive fully silent frames (queue exhausted),
  // but never more than twice the frames the queued audio could occupy.
  const maxDrainFrames = 2 * Math.ceil(totalSamples / FRAME_SIZE);
  let drainedFrames = 0;
  let silentFrames = 0;
  while (silentFrames < 3) {
    assert.ok(
      drainedFrames < maxDrainFrames,
      `Processor still producing audio after ${maxDrainFrames} frames; expected silence once the queue drained`
    );
    drainedFrames++;

    const nonSilent = drainFrame();
    totalNonSilent += nonSilent;
    silentFrames = nonSilent === 0 ? silentFrames + 1 : 0;
  }

  assert.equal(
    totalNonSilent, totalSamples,
    `Expected all ${totalSamples} samples (${(totalSamples / BROWSER_RATE).toFixed(1)}s) to play back, ` +
    `but only ${totalNonSilent} (${(totalNonSilent / BROWSER_RATE).toFixed(1)}s) were non-silent. ` +
    `${totalSamples - totalNonSilent} samples were dropped.`
  );
});
131
-
132
/**
 * A "clear" message must drop everything that was queued: the very next
 * process() call is expected to produce an all-zero frame.
 */
test("clear discards all pending audio", () => {
  const FRAME_SIZE = 128;
  const proc = loadProcessor();

  const queued = new Float32Array(100_000).fill(0.5);
  proc.postMessage({ type: "playback", samples: queued });
  proc.postMessage({ type: "clear" });

  const input = new Float32Array(FRAME_SIZE);
  const output = new Float32Array(FRAME_SIZE);
  proc.process([[input]], [[output]]);

  output.forEach((sample, i) => {
    assert.equal(sample, 0, `Expected silence at index ${i} after clear`);
  });
});
@@ -1,147 +0,0 @@
1
- /**
2
- * Browser audio adapter for direct WebSocket connections.
3
- *
4
- * Implements the AudioAdapter interface for browser-based voice calls by
5
- * exchanging raw PCM audio over a WebSocket. Simpler than TwilioAudioAdapter --
6
- * no mulaw codec, no Twilio-specific protocol framing.
7
- *
8
- * Responsibilities:
9
- * - Receive Float32Array PCM at 16kHz from the browser via binary WebSocket messages
10
- * - Send int16 24kHz PCM as binary WebSocket messages to the browser
11
- * - Handle backpressure on writeSpeaker via ws.send callback
12
- * - Send JSON control messages (e.g. "clear" for interruption)
13
- * - Cache the ready chime as 24kHz PCM for playback
14
- */
15
-
16
- import type { WebSocket } from "ws";
17
- import type { AudioAdapter } from "./audio-adapter.js";
18
-
19
- import { decodeChimeToPcm } from "./chime.js";
20
-
21
- // ============================================================================
22
- // TYPES
23
- // ============================================================================
24
-
25
/** Options accepted by createBrowserAudioAdapter. */
export interface BrowserAudioAdapterConfig {
  /** The browser's WebSocket connection that audio is read from and written to. */
  ws: WebSocket;
}
30
-
31
- // ============================================================================
32
- // MAIN ENTRYPOINT
33
- // ============================================================================
34
-
35
/**
 * Create an AudioAdapter that reads/writes audio over a browser WebSocket connection.
 *
 * Decodes the ready chime once via decodeChimeToPcm() and caches the buffer
 * for playChime(). Incoming binary messages are interpreted as Float32 PCM;
 * outgoing PCM buffers are sent as binary messages.
 *
 * @param config - Browser WebSocket connection
 * @returns An AudioAdapter for browser audio I/O
 */
export function createBrowserAudioAdapter(config: BrowserAudioAdapterConfig): AudioAdapter {
  const { ws } = config;

  let wsClosed = false;

  // Track WebSocket close state
  ws.on("close", () => {
    wsClosed = true;
  });

  // Decode chime to raw PCM and cache it
  const chimePcm = decodeChimeToPcm();

  // --------------------------------------------------------------------------
  // AudioAdapter methods
  // --------------------------------------------------------------------------

  /**
   * Subscribe to incoming audio chunks from the browser.
   * Registers a WebSocket message handler that converts each binary message
   * to a Float32Array and invokes the callback. Text (JSON) messages are ignored.
   *
   * The payload comes from the network, so its length is not trusted: trailing
   * bytes that do not form a whole 4-byte sample are dropped. Without this,
   * `new Float32Array(buffer)` throws a RangeError inside the event listener.
   *
   * @param callback - Called with each audio chunk as Float32Array
   */
  function onAudio(callback: (samples: Float32Array) => void): void {
    ws.on("message", (data: Buffer | ArrayBuffer | Buffer[] | string, isBinary: boolean) => {
      if (wsClosed) return;

      // Only process binary messages (audio data)
      if (!isBinary || typeof data === "string") return;

      // Normalise the payload shapes ws can deliver to one Buffer.
      let bytes: Buffer;
      if (Buffer.isBuffer(data)) {
        bytes = data;
      } else if (Array.isArray(data)) {
        bytes = Buffer.concat(data);
      } else {
        bytes = Buffer.from(data);
      }

      // Copy into a fresh ArrayBuffer (guarantees 4-byte alignment), keeping
      // only whole samples.
      const usableBytes = bytes.byteLength - (bytes.byteLength % Float32Array.BYTES_PER_ELEMENT);
      const aligned = new ArrayBuffer(usableBytes);
      new Uint8Array(aligned).set(bytes.subarray(0, usableBytes));
      callback(new Float32Array(aligned));
    });
  }

  /**
   * Write PCM audio to the browser via WebSocket.
   * Sends the buffer as a binary WebSocket message and resolves when the
   * send callback runs. Never rejects: send errors (reported via callback or
   * thrown synchronously) mean the connection is going away, and callers
   * should not need to handle that. Resolves immediately if the socket has closed.
   *
   * @param pcm - Raw PCM buffer
   * @returns Resolves when the write completes or fails
   */
  function writeSpeaker(pcm: Buffer): Promise<void> {
    if (wsClosed) return Promise.resolve();

    return new Promise<void>((resolve) => {
      try {
        ws.send(pcm, { binary: true }, () => {
          // Resolve on both success and error
          resolve();
        });
      } catch {
        // A synchronous throw from send() is treated like a write error.
        resolve();
      }
    });
  }

  /**
   * Clear the browser's playback buffer immediately (user interruption).
   * Sends a JSON "clear" message over the WebSocket.
   */
  function interrupt(): void {
    if (wsClosed) return;

    ws.send(JSON.stringify({ type: "clear" }));
  }

  /**
   * Resume output after an interrupt. Intentionally a no-op for this adapter.
   */
  function resume(): void {
    // No-op: browser AudioWorklet resumes automatically
  }

  /**
   * Play the ready chime by sending the cached PCM through writeSpeaker.
   */
  function playChime(): void {
    // writeSpeaker never rejects; `void` marks the fire-and-forget explicitly.
    void writeSpeaker(chimePcm);
  }

  /**
   * Clean up resources. Intentionally a no-op: this adapter does not close
   * the WebSocket it was given.
   */
  function destroy(): void {
    // No-op: WebSocket lifecycle managed by browser-server.ts
  }

  return {
    onAudio,
    writeSpeaker,
    interrupt,
    resume,
    playChime,
    destroy,
  };
}
@@ -1,311 +0,0 @@
1
- /**
2
- * Browser audio WebSocket handlers for the unified voice server.
3
- *
4
- * Provides WebSocket upgrade logic for browser-based audio sessions.
5
- * Used by voice-server.ts which owns the HTTP server.
6
- *
7
- * Responsibilities:
8
- * - Accept WebSocket upgrades on /audio?token=<deviceToken>
9
- * - Validate device tokens via isValidDeviceToken() (localhost bypasses validation)
10
- * - Reject duplicate connections for the same device token
11
- * - Create BrowserAudioAdapter + VoiceSession per connection
12
- */
13
-
14
- import { join } from "path";
15
-
16
- import { WebSocketServer } from "ws";
17
-
18
- import { createBrowserAudioAdapter } from "./browser-audio.js";
19
- import { createVoiceSession } from "./voice-session.js";
20
- import { buildAgentPrompt } from "./prompt-builder.js";
21
- import { isValidDeviceToken } from "../services/device-pairing.js";
22
- import { getAgent, AGENTS_DIR } from "../services/agent-store.js";
23
- import { readEnv } from "../services/env.js";
24
-
25
- import type { IncomingMessage } from "http";
26
- import type { Duplex } from "stream";
27
- import type { WebSocket } from "ws";
28
- import type { VoiceSession } from "./voice-session.js";
29
- import type { TtsProviderConfig, SttProviderConfig } from "./types.js";
30
-
31
- // ============================================================================
32
- // CONSTANTS
33
- // ============================================================================
34
-
35
/**
 * Interruption threshold (ms) applied to browser calls. Lower than Twilio's
 * 2000ms because browser getUserMedia includes AEC.
 */
const BROWSER_INTERRUPTION_THRESHOLD_MS = 1_500;

/** ElevenLabs voice ID used when .env does not provide one. */
const DEFAULT_ELEVENLABS_VOICE_ID = "WrjxnKxK0m1uiaH0uteU";

/** ElevenLabs TTS model ID used when .env does not provide one. */
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_turbo_v2_5";

/** ElevenLabs STT model ID used when .env does not provide one. */
const DEFAULT_ELEVENLABS_STT_MODEL_ID = "scribe_v1";
46
-
47
- // ============================================================================
48
- // TYPES
49
- // ============================================================================
50
-
51
/** Bookkeeping record for one connected browser audio WebSocket. */
interface ActiveBrowserSession {
  /** Key this connection is registered under in the active-session map. */
  deviceToken: string;
  /** The running voice session, or null while it is still being created. */
  session: VoiceSession | null;
  /** Agent to run the session as, when the client requested one. */
  agentId?: string;
}
60
-
61
- // ============================================================================
62
- // STATE
63
- // ============================================================================
64
-
65
/** Registry of connected browser sessions, indexed by device token. */
const activeSessions: Map<string, ActiveBrowserSession> = new Map();
67
-
68
- // ============================================================================
69
- // EXPORTED HANDLERS
70
- // ============================================================================
71
-
72
/**
 * Handle a WebSocket upgrade request for browser audio.
 *
 * Validates that the path is /audio, extracts the device token from the query
 * string, checks authorization (loopback address or valid device token), and
 * rejects duplicate connections for the same device token. Every rejection
 * destroys the socket; this function never throws on malformed input.
 *
 * @param req - HTTP upgrade request
 * @param socket - Underlying TCP socket
 * @param head - First packet of the upgraded stream
 * @param wss - WebSocketServer instance to accept the upgrade
 */
export function handleBrowserUpgrade(
  req: IncomingMessage,
  socket: Duplex,
  head: Buffer,
  wss: WebSocketServer,
): void {
  // Only the path and query are used, so parse against a fixed base instead of
  // the client-controlled Host header. The URL constructor throws on malformed
  // targets (e.g. "//"); this runs before authentication inside an event
  // listener, so the throw must be caught here.
  let url: URL;
  try {
    url = new URL(req.url ?? "", "http://localhost");
  } catch {
    console.log("Rejected WebSocket upgrade: malformed request URL");
    socket.destroy();
    return;
  }

  // Validate path
  if (url.pathname !== "/audio") {
    console.log(`Rejected WebSocket upgrade: invalid path ${url.pathname}`);
    socket.destroy();
    return;
  }

  // Extract device token from query string
  const token = url.searchParams.get("token") ?? "";

  // Check authorization: loopback connections bypass token validation.
  // NOTE(review): if this server is ever reached through a local reverse proxy
  // or tunnel, every request appears to come from loopback and skips the token
  // check -- confirm the deployment never exposes /audio that way.
  const remoteAddr = req.socket.remoteAddress ?? "";
  const isLocalhost =
    remoteAddr === "127.0.0.1" ||
    remoteAddr === "::1" ||
    remoteAddr === "::ffff:127.0.0.1";

  if (!isLocalhost && !token) {
    console.log("Rejected WebSocket upgrade: missing device token");
    socket.destroy();
    return;
  }

  if (!isLocalhost && !isValidDeviceToken(token)) {
    console.log("Rejected WebSocket upgrade: invalid device token");
    socket.destroy();
    return;
  }

  // Reject duplicate connections for the same device token.
  // The token is a credential, so it is deliberately not written to the log.
  if (token && activeSessions.has(token)) {
    console.log("Rejected WebSocket upgrade: device token already has an active session");
    socket.destroy();
    return;
  }

  // Extract optional agentId from query params
  const agentId = url.searchParams.get("agentId") || undefined;

  // Accept the WebSocket connection
  wss.handleUpgrade(req, socket, head, (ws: WebSocket) => {
    wss.emit("connection", ws, req);
    handleBrowserSession(ws, token || "localhost", agentId);
  });
}
137
-
138
- // ============================================================================
139
- // INTERNAL HANDLERS
140
- // ============================================================================
141
-
142
/**
 * Handle a connected browser audio WebSocket session.
 *
 * Registers the connection in activeSessions, starts session creation, and
 * wires close/error handlers so the voice session is stopped and the
 * registration removed when the socket goes away.
 *
 * @param ws - Connected WebSocket for browser audio
 * @param deviceToken - Device token identifying this connection
 * @param agentId - Optional agent ID for agent-specific sessions
 */
function handleBrowserSession(ws: WebSocket, deviceToken: string, agentId?: string): void {
  let cleaned = false;

  // The token is a credential: log only a short prefix, never the full value.
  const tokenLabel = deviceToken === "localhost" ? deviceToken : `${deviceToken.slice(0, 4)}...`;

  // Register in active sessions
  const entry: ActiveBrowserSession = { deviceToken, session: null, agentId };
  activeSessions.set(deviceToken, entry);

  console.log(`Browser session connected, token: ${tokenLabel}`);

  /**
   * Clean up the browser session. Stops the voice session and removes this
   * connection's registration. Uses the cleaned flag to prevent double-cleanup.
   */
  async function cleanup(): Promise<void> {
    if (cleaned) return;
    cleaned = true;

    try {
      if (entry.session) {
        await entry.session.stop();
      }
    } finally {
      // Always unregister, even if stop() rejected; otherwise this token would
      // be rejected as a duplicate from now on. Only remove our own entry: a
      // later connection may have registered under the same key.
      if (activeSessions.get(deviceToken) === entry) {
        activeSessions.delete(deviceToken);
      }
      console.log(`Browser session cleaned up, token: ${tokenLabel}`);
    }
  }

  // WebSocket close handler
  ws.on("close", () => {
    cleanup().catch((err) => {
      console.error(`Error during browser session cleanup: ${err}`);
    });
  });

  ws.on("error", (err) => {
    console.error(`WebSocket error for token ${tokenLabel}: ${err}`);
  });

  // Create adapter and voice session
  createSession(ws, entry)
    .then(async () => {
      // The socket may have closed while the session was being created. In
      // that case cleanup() already ran with session === null, so the late
      // session has to be stopped here.
      if (cleaned && entry.session) {
        try {
          await entry.session.stop();
        } catch (err) {
          console.error(`Error stopping voice session created after disconnect: ${err}`);
        }
      }
    })
    .catch((err) => {
      console.error(`Failed to create voice session for token ${tokenLabel}: ${err}`);
      ws.close();
    });
}
195
-
196
- // ============================================================================
197
- // HELPER FUNCTIONS
198
- // ============================================================================
199
-
200
/**
 * Build provider config by reading the latest values from .env.
 * Reads the environment on every call, so changes to API keys, voice IDs, or
 * model IDs take effect without a server restart.
 *
 * Missing AND empty values (e.g. a bare `ELEVENLABS_VOICE_ID=` line) fall back
 * to the built-in defaults; an empty ID is never a usable setting.
 *
 * @returns TTS and STT provider configs with current .env values
 */
async function buildProviderConfig(): Promise<{ ttsProvider: TtsProviderConfig; sttProvider: SttProviderConfig }> {
  const env = await readEnv();

  // `||` (not `??`) is deliberate: empty strings must also take the default.
  const apiKey = env.ELEVENLABS_API_KEY || "";
  const voiceId = env.ELEVENLABS_VOICE_ID || DEFAULT_ELEVENLABS_VOICE_ID;
  const modelId = env.ELEVENLABS_MODEL_ID || DEFAULT_ELEVENLABS_MODEL_ID;
  const sttModelId = env.ELEVENLABS_STT_MODEL_ID || DEFAULT_ELEVENLABS_STT_MODEL_ID;

  return {
    ttsProvider: { provider: "elevenlabs", elevenlabs: { apiKey, voiceId, modelId } },
    sttProvider: { provider: "elevenlabs", elevenlabs: { apiKey, modelId: sttModelId } },
  };
}
220
-
221
/**
 * Assemble the baseline voice-session settings used for browser calls.
 * The two provider configs are passed through unchanged; everything else is
 * a fixed browser-tuned default.
 *
 * @param ttsProvider - TTS provider config to embed
 * @param sttProvider - STT provider config to embed
 * @returns A fresh config object on every call
 */
function buildDefaultConfig(ttsProvider: TtsProviderConfig, sttProvider: SttProviderConfig) {
  const endpointing = {
    silenceThresholdMs: 700,
    maxSilenceBeforeTimeoutMs: 1200,
    minWordCountForFastPath: 2,
    enableHaikuFallback: false,
  };

  const narration = { summaryIntervalMs: 12000 };

  const claudeSession = {
    allowedTools: [] as string[],
    permissionMode: "bypassPermissions",
    systemPrompt:
      "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.",
  };

  return {
    stopPhrase: "stop listening",
    ttsProvider,
    sttProvider,
    interruptionThresholdMs: BROWSER_INTERRUPTION_THRESHOLD_MS,
    endpointing,
    narration,
    claudeSession,
  };
}
251
-
252
/**
 * Create the BrowserAudioAdapter and VoiceSession for a connected WebSocket.
 * If the entry has an agentId, the agent's prompt, working directory, and
 * (when configured) ElevenLabs voice replace the defaults.
 *
 * On any failure while loading the agent, the error is logged, the socket is
 * closed, and no session is created. If the socket is no longer open by the
 * time the voice session has been created, the session is stopped immediately
 * rather than attached to the entry.
 *
 * @param ws - Connected WebSocket for browser audio
 * @param entry - Active session entry to populate with the voice session
 */
async function createSession(ws: WebSocket, entry: ActiveBrowserSession): Promise<void> {
  const adapter = createBrowserAudioAdapter({ ws });

  const { ttsProvider, sttProvider } = await buildProviderConfig();
  const defaultConfig = buildDefaultConfig(ttsProvider, sttProvider);

  // Build session config -- use agent personality if agentId is set, otherwise default
  let sessionConfig: Parameters<typeof createVoiceSession>[1] = {
    ...defaultConfig,
    onSessionEnd: () => ws.close(),
  };

  const { agentId } = entry;
  if (agentId) {
    try {
      // agentId comes straight from the request's query string and becomes a
      // directory name under AGENTS_DIR. Refuse anything that could resolve
      // outside that directory.
      if (agentId === "." || agentId === ".." || /[\\/\0]/.test(agentId)) {
        throw new Error("agent id must be a single path segment");
      }

      const agentPrompt = await buildAgentPrompt(agentId, "voice");

      sessionConfig = {
        ...sessionConfig,
        claudeSession: {
          ...defaultConfig.claudeSession,
          customSystemPrompt: agentPrompt,
          cwd: join(AGENTS_DIR, agentId),
        },
      };

      // Override TTS voice if the agent has a preference
      const agent = await getAgent(agentId);
      if (agent.config.voice?.elevenlabs) {
        const voicePref = agent.config.voice.elevenlabs;
        const overriddenTts: TtsProviderConfig = {
          ...ttsProvider,
          elevenlabs: { ...ttsProvider.elevenlabs, voiceId: voicePref.id },
        };
        sessionConfig = { ...sessionConfig, ttsProvider: overriddenTts };
        console.log(`Using voice "${voicePref.name}" (${voicePref.id}) for agent "${agentId}"`);
      }

      // Set initialPrompt so the agent greets with its personality (skips generic startup PCM)
      sessionConfig.initialPrompt = "The user just connected via browser. Greet them briefly.";

      console.log(`Browser session using agent "${agentId}"`);
    } catch (err) {
      console.error(`Failed to load agent "${agentId}" for browser session:`, err);
      ws.close();
      return;
    }
  }

  const session = await createVoiceSession(adapter, sessionConfig);

  // The socket can close while the session is starting. The close handler has
  // then already run without a session to stop, so stop it here.
  if (ws.readyState !== ws.OPEN) {
    await session.stop();
    return;
  }

  entry.session = session;
}