voicecc 1.1.35 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/voicecc.js +94 -1
- package/dashboard/dist/assets/index-DCeOdulF.js +28 -0
- package/dashboard/dist/index.html +1 -1
- package/dashboard/routes/agents.ts +28 -8
- package/dashboard/routes/browser-call.ts +3 -2
- package/dashboard/routes/chat.ts +75 -55
- package/dashboard/routes/providers.ts +5 -74
- package/dashboard/routes/twilio.ts +104 -5
- package/dashboard/routes/voice.ts +98 -0
- package/dashboard/server.ts +58 -2
- package/package.json +2 -3
- package/server/index.ts +96 -8
- package/server/services/device-pairing.ts +18 -2
- package/server/services/twilio-manager.ts +29 -10
- package/dashboard/dist/assets/index-C62C9Gp0.js +0 -28
- package/dashboard/dist/audio-processor.js +0 -126
- package/server/services/heartbeat.ts +0 -403
- package/server/voice/assets/chime.wav +0 -0
- package/server/voice/assets/startup.pcm +0 -0
- package/server/voice/audio-adapter.ts +0 -60
- package/server/voice/audio-inactivity.test.ts +0 -108
- package/server/voice/audio-inactivity.ts +0 -91
- package/server/voice/browser-audio-playback.test.ts +0 -149
- package/server/voice/browser-audio.ts +0 -147
- package/server/voice/browser-server.ts +0 -311
- package/server/voice/chat-server.ts +0 -236
- package/server/voice/chime.test.ts +0 -69
- package/server/voice/chime.ts +0 -36
- package/server/voice/claude-session.ts +0 -293
- package/server/voice/endpointing.ts +0 -163
- package/server/voice/mic-vpio +0 -0
- package/server/voice/narration.ts +0 -204
- package/server/voice/prompt-builder.ts +0 -108
- package/server/voice/session-lock.ts +0 -123
- package/server/voice/stt-elevenlabs.ts +0 -210
- package/server/voice/stt-provider.ts +0 -106
- package/server/voice/tts-elevenlabs-hiss.test.ts +0 -183
- package/server/voice/tts-elevenlabs.ts +0 -397
- package/server/voice/tts-provider.ts +0 -155
- package/server/voice/twilio-audio.ts +0 -338
- package/server/voice/twilio-server.ts +0 -540
- package/server/voice/types.ts +0 -282
- package/server/voice/vad.ts +0 -101
- package/server/voice/voice-loop-bugs.test.ts +0 -348
- package/server/voice/voice-server.ts +0 -129
- package/server/voice/voice-session.ts +0 -539
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Audio inactivity watchdog for streaming connections.
|
|
3
|
-
*
|
|
4
|
-
* Detects when a caller hangs up but the WebSocket doesn't close cleanly.
|
|
5
|
-
* Twilio sends audio frames continuously during an active call (even silence).
|
|
6
|
-
* When frames stop arriving, the call is dead and the onTimeout callback fires
|
|
7
|
-
* so the caller can tear down the session.
|
|
8
|
-
*
|
|
9
|
-
* Responsibilities:
|
|
10
|
-
* - Track timestamps of incoming audio frames via ping()
|
|
11
|
-
* - Periodically check whether audio has gone silent beyond a threshold
|
|
12
|
-
* - Fire a callback when the timeout is exceeded
|
|
13
|
-
* - Clean up the interval timer on dispose
|
|
14
|
-
*/
|
|
15
|
-
|
|
16
|
-
// ============================================================================
|
|
17
|
-
// CONSTANTS
|
|
18
|
-
// ============================================================================
|
|
19
|
-
|
|
20
|
-
/** Default inactivity window (ms): the timeout callback fires once no audio frame has arrived for at least this long */
|
|
21
|
-
const DEFAULT_TIMEOUT_MS = 5000;
|
|
22
|
-
|
|
23
|
-
/** Default polling period (ms) for the inactivity check; a timeout is only noticed on the next poll, so detection can lag the window by up to one period */
|
|
24
|
-
const DEFAULT_CHECK_INTERVAL_MS = 2000;
|
|
25
|
-
|
|
26
|
-
// ============================================================================
|
|
27
|
-
// INTERFACES
|
|
28
|
-
// ============================================================================
|
|
29
|
-
|
|
30
|
-
/** Options accepted by createAudioInactivityWatchdog: two optional timing thresholds plus the required timeout callback */
|
|
31
|
-
export interface AudioInactivityConfig {
|
|
32
|
-
/** Silence duration (ms), measured from the last ping(), after which onTimeout fires. Default: 5000 */
|
|
33
|
-
|
-
timeoutMs?: number;
|
|
34
|
-
/** Period (ms) of the interval timer that compares the silence duration against timeoutMs. Default: 2000 */
|
|
35
|
-
checkIntervalMs?: number;
|
|
36
|
-
/** Invoked from the interval timer the first time a check finds the silence has reached timeoutMs; never invoked a second time */
|
|
37
|
-
onTimeout: () => void;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
/** Handle returned by createAudioInactivityWatchdog; the watchdog's timer starts when the handle is created */
|
|
41
|
-
export interface AudioInactivityWatchdog {
|
|
42
|
-
/** Call on every received audio frame: records the current time as the last-audio timestamp, restarting the silence window */
|
|
43
|
-
ping: () => void;
|
|
44
|
-
/** Stop the watchdog by clearing its interval timer; no further inactivity checks run afterwards */
|
|
45
|
-
dispose: () => void;
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
// ============================================================================
|
|
49
|
-
// MAIN ENTRYPOINT
|
|
50
|
-
// ============================================================================
|
|
51
|
-
|
|
52
|
-
/**
 * Create an audio inactivity watchdog that fires a callback when no audio
 * frames have arrived within the configured timeout.
 *
 * The silence window starts at creation time and restarts on every ping().
 * A periodic check compares the elapsed silence against the timeout; the
 * callback fires at most once. The interval timer is released as soon as the
 * callback has been triggered (or dispose() is called), so a fired watchdog
 * no longer keeps ticking or holds the event loop open.
 *
 * @param config - Timeout thresholds and callback
 * @returns A watchdog handle with ping() and dispose() methods
 */
export function createAudioInactivityWatchdog(config: AudioInactivityConfig): AudioInactivityWatchdog {
  const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const checkIntervalMs = config.checkIntervalMs ?? DEFAULT_CHECK_INTERVAL_MS;

  let lastAudioAt = Date.now();
  let stopped = false;

  const timer = setInterval(() => {
    if (stopped) return;

    const silentMs = Date.now() - lastAudioAt;
    if (silentMs < timeoutMs) return;

    // Release the timer before invoking the callback so it is freed even if
    // the callback throws, and so the check can never fire a second time.
    stop();
    config.onTimeout();
  }, checkIntervalMs);

  /**
   * Mark the watchdog as finished and clear its interval. Safe to call repeatedly.
   */
  function stop(): void {
    stopped = true;
    clearInterval(timer);
  }

  /**
   * Signal that an audio frame was received. Resets the inactivity clock.
   */
  function ping(): void {
    lastAudioAt = Date.now();
  }

  /**
   * Stop the watchdog and clean up resources.
   */
  function dispose(): void {
    stop();
  }

  return { ping, dispose };
}
|
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Tests that the AudioWorklet processor plays back all TTS audio without
|
|
3
|
-
* dropping samples, regardless of chunk size or arrival timing.
|
|
4
|
-
*
|
|
5
|
-
* Loads the actual audio-processor.js and exercises it through the same
|
|
6
|
-
* postMessage/process interface the browser uses. Tests outcomes only --
|
|
7
|
-
* no assumptions about internal buffering strategy.
|
|
8
|
-
*
|
|
9
|
-
* Run: npx tsx --test server/voice/browser-audio-playback.test.ts
|
|
10
|
-
*/
|
|
11
|
-
|
|
12
|
-
import { test } from "node:test";
|
|
13
|
-
import { strict as assert } from "node:assert";
|
|
14
|
-
import { readFileSync } from "fs";
|
|
15
|
-
import { join, dirname } from "path";
|
|
16
|
-
import { fileURLToPath } from "url";
|
|
17
|
-
|
|
18
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
19
|
-
|
|
20
|
-
// ============================================================================
|
|
21
|
-
// HARNESS -- stub browser AudioWorklet APIs so we can load audio-processor.js
|
|
22
|
-
// ============================================================================
|
|
23
|
-
|
|
24
|
-
/**
 * Evaluate dashboard/public/audio-processor.js outside a browser and return a
 * thin driver around the processor instance it registers.
 *
 * The script is run with stand-ins for the two AudioWorklet globals it needs:
 * a base class that only carries a message port, and a registerProcessor()
 * that captures the class the script registers.
 *
 * @returns postMessage() delivers a message to the processor's port handler;
 *          process() renders one frame through the processor
 */
function loadProcessor(): {
  postMessage: (data: Record<string, unknown>) => void;
  process: (inputs: Float32Array[][], outputs: Float32Array[][]) => boolean;
} {
  const scriptPath = join(__dirname, "../../dashboard/public/audio-processor.js");
  const source = readFileSync(scriptPath, "utf-8");

  let ProcessorClass: any;

  // Minimal replacement for the browser base class: just the message port
  class AudioWorkletProcessor {
    port = {
      onmessage: null as ((event: { data: Record<string, unknown> }) => void) | null,
      postMessage(_data: unknown) {},
    };
  }

  // Remember whichever class the script registers
  function registerProcessor(_name: string, cls: any): void {
    ProcessorClass = cls;
  }

  // Run the script with the stubs bound to the global names it expects
  const runScript = new Function("AudioWorkletProcessor", "registerProcessor", source);
  runScript(AudioWorkletProcessor, registerProcessor);

  const instance = new ProcessorClass();

  const postMessage = (data: Record<string, unknown>): void => {
    instance.port.onmessage?.({ data });
  };
  const process = (inputs: Float32Array[][], outputs: Float32Array[][]): boolean =>
    instance.process(inputs, outputs, {});

  return { postMessage, process };
}
|
|
59
|
-
|
|
60
|
-
// ============================================================================
|
|
61
|
-
// TESTS
|
|
62
|
-
// ============================================================================
|
|
63
|
-
|
|
64
|
-
/**
 * Simulates the exact scenario from the logs:
 *   chunk 0: 2.0s audio at 24kHz -> 96,000 samples at 48kHz
 *   chunk 1: 3.0s audio at 24kHz -> 144,000 samples at 48kHz
 *
 * Both chunks arrive within ~500ms. The process() callback drains 128
 * samples per frame. Between the two chunk arrivals, only ~24,000 samples
 * drain -- far less than the total audio.
 *
 * All 240,000 samples should be played back with no drops. The final drain is
 * bounded so a processor that never falls silent fails the test instead of
 * hanging the run.
 */
test("all TTS audio plays back without drops across multi-second chunks", () => {
  const proc = loadProcessor();
  const BROWSER_RATE = 48_000;
  const FRAME_SIZE = 128;

  // Render one frame and report how many of its samples were non-zero
  const renderFrame = (): number => {
    const output = new Float32Array(FRAME_SIZE);
    proc.process([[new Float32Array(FRAME_SIZE)]], [[output]]);

    let nonSilent = 0;
    for (const sample of output) {
      if (sample !== 0) nonSilent++;
    }
    return nonSilent;
  };

  // Chunk 0: 2s at 48kHz, filled with 0.5
  const chunk0 = new Float32Array(2.0 * BROWSER_RATE);
  chunk0.fill(0.5);

  // Chunk 1: 3s at 48kHz, filled with 0.3
  const chunk1 = new Float32Array(3.0 * BROWSER_RATE);
  chunk1.fill(0.3);

  const totalSamples = chunk0.length + chunk1.length; // 240,000

  // Post chunk 0
  proc.postMessage({ type: "playback", samples: chunk0 });

  // Simulate ~500ms of process() draining between chunk arrivals
  const framesBetweenChunks = Math.floor((0.5 * BROWSER_RATE) / FRAME_SIZE);
  let totalNonSilent = 0;

  for (let i = 0; i < framesBetweenChunks; i++) {
    totalNonSilent += renderFrame();
  }

  // Post chunk 1
  proc.postMessage({ type: "playback", samples: chunk1 });

  // Drain until three consecutive frames are fully silent (queue exhausted).
  // Twice the frame count of all posted audio is far more than a correct
  // processor needs, so exceeding it means the output never went silent.
  const maxDrainFrames = 2 * Math.ceil(totalSamples / FRAME_SIZE);
  let drainedFrames = 0;
  let silentFrames = 0;

  while (silentFrames < 3) {
    if (drainedFrames >= maxDrainFrames) {
      assert.fail(
        `Processor was still producing audio after ${maxDrainFrames} drain frames; ` +
          `expected silence once all ${totalSamples} samples had played.`
      );
    }
    drainedFrames++;

    const nonSilent = renderFrame();
    totalNonSilent += nonSilent;
    silentFrames = nonSilent === 0 ? silentFrames + 1 : 0;
  }

  assert.equal(
    totalNonSilent, totalSamples,
    `Expected all ${totalSamples} samples (${(totalSamples / BROWSER_RATE).toFixed(1)}s) to play back, ` +
    `but only ${totalNonSilent} (${(totalNonSilent / BROWSER_RATE).toFixed(1)}s) were non-silent. ` +
    `${totalSamples - totalNonSilent} samples were dropped.`
  );
});
|
|
131
|
-
|
|
132
|
-
/**
 * A "clear" message must drop everything still queued for playback:
 * the very next rendered frame has to be pure silence.
 */
test("clear discards all pending audio", () => {
  const FRAME_SIZE = 128;
  const proc = loadProcessor();

  // Queue plenty of audio, then clear it before any frame is rendered
  const queued = new Float32Array(100_000).fill(0.5);
  proc.postMessage({ type: "playback", samples: queued });
  proc.postMessage({ type: "clear" });

  const rendered = new Float32Array(FRAME_SIZE);
  proc.process([[new Float32Array(FRAME_SIZE)]], [[rendered]]);

  rendered.forEach((sample, index) => {
    assert.equal(sample, 0, `Expected silence at index ${index} after clear`);
  });
});
|
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Browser audio adapter for direct WebSocket connections.
|
|
3
|
-
*
|
|
4
|
-
* Implements the AudioAdapter interface for browser-based voice calls by
|
|
5
|
-
* exchanging raw PCM audio over a WebSocket. Simpler than TwilioAudioAdapter --
|
|
6
|
-
* no mulaw codec, no Twilio-specific protocol framing.
|
|
7
|
-
*
|
|
8
|
-
* Responsibilities:
|
|
9
|
-
* - Receive Float32Array PCM at 16kHz from the browser via binary WebSocket messages
|
|
10
|
-
* - Send int16 24kHz PCM as binary WebSocket messages to the browser
|
|
11
|
-
* - Handle backpressure on writeSpeaker via ws.send callback
|
|
12
|
-
* - Send JSON control messages (e.g. "clear" for interruption)
|
|
13
|
-
* - Cache the ready chime as 24kHz PCM for playback
|
|
14
|
-
*/
|
|
15
|
-
|
|
16
|
-
import type { WebSocket } from "ws";
|
|
17
|
-
import type { AudioAdapter } from "./audio-adapter.js";
|
|
18
|
-
|
|
19
|
-
import { decodeChimeToPcm } from "./chime.js";
|
|
20
|
-
|
|
21
|
-
// ============================================================================
|
|
22
|
-
// TYPES
|
|
23
|
-
// ============================================================================
|
|
24
|
-
|
|
25
|
-
/** Options for createBrowserAudioAdapter */
|
|
26
|
-
export interface BrowserAudioAdapterConfig {
|
|
27
|
-
/** Active WebSocket connection to the browser; the adapter listens for its "close" and "message" events and sends PCM audio and JSON control messages on it */
|
|
28
|
-
ws: WebSocket;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
// ============================================================================
|
|
32
|
-
// MAIN ENTRYPOINT
|
|
33
|
-
// ============================================================================
|
|
34
|
-
|
|
35
|
-
/**
 * Create an AudioAdapter that reads/writes audio over a browser WebSocket connection.
 *
 * Decodes the ready chime to PCM once, up front, and keeps the buffer for
 * playChime(). The browser sends Float32Array PCM at 16kHz as binary messages,
 * and receives int16 24kHz PCM as binary messages.
 *
 * Incoming binary frames come from the client and are not trusted to be
 * well-formed: any trailing bytes that do not make up a whole 32-bit sample
 * are ignored instead of throwing inside the message handler.
 *
 * @param config - Browser WebSocket connection
 * @returns An AudioAdapter for browser audio I/O
 */
export function createBrowserAudioAdapter(config: BrowserAudioAdapterConfig): AudioAdapter {
  const { ws } = config;
  const BYTES_PER_SAMPLE = Float32Array.BYTES_PER_ELEMENT;

  let wsClosed = false;

  // Track WebSocket close state
  ws.on("close", () => {
    wsClosed = true;
  });

  // Decode chime to raw PCM and cache it
  const chimePcm = decodeChimeToPcm();

  // --------------------------------------------------------------------------
  // AudioAdapter methods
  // --------------------------------------------------------------------------

  /**
   * Subscribe to incoming audio chunks from the browser.
   * Registers a WebSocket message handler that converts each binary message
   * to a Float32Array and invokes the callback. Ignores text (JSON) messages
   * and anything that arrives after the socket has closed.
   *
   * @param callback - Called with each audio chunk as Float32Array (16kHz)
   */
  function onAudio(callback: (samples: Float32Array) => void): void {
    ws.on("message", (data: Buffer | string, isBinary: boolean) => {
      if (wsClosed) return;

      // Only process binary messages (audio data)
      if (!isBinary) return;

      const buffer = data as Buffer;

      // A Float32Array view requires a byte length that is a multiple of 4.
      // Keep only the whole samples so a truncated or malformed frame cannot
      // raise a RangeError from inside this event handler.
      const usableBytes = buffer.byteLength - (buffer.byteLength % BYTES_PER_SAMPLE);

      // Copy into a fresh ArrayBuffer to guarantee 4-byte alignment
      const aligned = new ArrayBuffer(usableBytes);
      new Uint8Array(aligned).set(buffer.subarray(0, usableBytes));
      callback(new Float32Array(aligned));
    });
  }

  /**
   * Write PCM audio to the browser via WebSocket.
   * Sends the buffer as a binary WebSocket message and uses the ws.send
   * callback for backpressure -- resolves when the send callback runs.
   * Resolves immediately, without sending, if the WebSocket has closed.
   *
   * @param pcm - Raw PCM buffer (16-bit signed, 24kHz mono)
   * @returns Resolves when the write completes (never rejects)
   */
  function writeSpeaker(pcm: Buffer): Promise<void> {
    if (wsClosed) return Promise.resolve();

    return new Promise<void>((resolve) => {
      ws.send(pcm, { binary: true }, () => {
        // Resolve on both success and error -- write errors mean the
        // connection is closing, and callers should not need to handle that
        resolve();
      });
    });
  }

  /**
   * Clear the browser's playback buffer immediately (user interruption).
   * Sends a JSON "clear" message over the WebSocket.
   */
  function interrupt(): void {
    if (wsClosed) return;

    ws.send(JSON.stringify({ type: "clear" }));
  }

  /**
   * Resume output after an interrupt. No-op for browser --
   * AudioWorklet resumes consuming from ring buffer automatically after clear.
   */
  function resume(): void {
    // No-op: browser AudioWorklet resumes automatically
  }

  /**
   * Play the ready chime by sending the cached PCM through writeSpeaker.
   */
  function playChime(): void {
    // Fire-and-forget: writeSpeaker never rejects
    void writeSpeaker(chimePcm);
  }

  /**
   * Clean up resources. No-op for browser -- WebSocket lifecycle is
   * managed by browser-server.ts.
   */
  function destroy(): void {
    // No-op: WebSocket lifecycle managed by browser-server.ts
  }

  return {
    onAudio,
    writeSpeaker,
    interrupt,
    resume,
    playChime,
    destroy,
  };
}
|
|
@@ -1,311 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Browser audio WebSocket handlers for the unified voice server.
|
|
3
|
-
*
|
|
4
|
-
* Provides WebSocket upgrade logic for browser-based audio sessions.
|
|
5
|
-
* Used by voice-server.ts which owns the HTTP server.
|
|
6
|
-
*
|
|
7
|
-
* Responsibilities:
|
|
8
|
-
* - Accept WebSocket upgrades on /audio?token=<deviceToken>
|
|
9
|
-
* - Validate device tokens via isValidDeviceToken() (localhost bypasses validation)
|
|
10
|
-
* - Reject duplicate connections for the same device token
|
|
11
|
-
* - Create BrowserAudioAdapter + VoiceSession per connection
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
import { join } from "path";
|
|
15
|
-
|
|
16
|
-
import { WebSocketServer } from "ws";
|
|
17
|
-
|
|
18
|
-
import { createBrowserAudioAdapter } from "./browser-audio.js";
|
|
19
|
-
import { createVoiceSession } from "./voice-session.js";
|
|
20
|
-
import { buildAgentPrompt } from "./prompt-builder.js";
|
|
21
|
-
import { isValidDeviceToken } from "../services/device-pairing.js";
|
|
22
|
-
import { getAgent, AGENTS_DIR } from "../services/agent-store.js";
|
|
23
|
-
import { readEnv } from "../services/env.js";
|
|
24
|
-
|
|
25
|
-
import type { IncomingMessage } from "http";
|
|
26
|
-
import type { Duplex } from "stream";
|
|
27
|
-
import type { WebSocket } from "ws";
|
|
28
|
-
import type { VoiceSession } from "./voice-session.js";
|
|
29
|
-
import type { TtsProviderConfig, SttProviderConfig } from "./types.js";
|
|
30
|
-
|
|
31
|
-
// ============================================================================
|
|
32
|
-
// CONSTANTS
|
|
33
|
-
// ============================================================================
|
|
34
|
-
|
|
35
|
-
/** Interruption threshold for browser calls, passed as interruptionThresholdMs in the default session config (lower than Twilio's 2000ms because browser getUserMedia includes AEC) */
|
|
36
|
-
const BROWSER_INTERRUPTION_THRESHOLD_MS = 1500;
|
|
37
|
-
|
|
38
|
-
/** Fallback ElevenLabs voice ID for the TTS provider config when .env does not supply ELEVENLABS_VOICE_ID */
|
|
39
|
-
const DEFAULT_ELEVENLABS_VOICE_ID = "WrjxnKxK0m1uiaH0uteU";
|
|
40
|
-
|
|
41
|
-
/** Fallback ElevenLabs TTS model ID for the TTS provider config when .env does not supply ELEVENLABS_MODEL_ID */
|
|
42
|
-
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_turbo_v2_5";
|
|
43
|
-
|
|
44
|
-
/** Fallback ElevenLabs STT model ID for the STT provider config when .env does not supply ELEVENLABS_STT_MODEL_ID */
|
|
45
|
-
const DEFAULT_ELEVENLABS_STT_MODEL_ID = "scribe_v1";
|
|
46
|
-
|
|
47
|
-
// ============================================================================
|
|
48
|
-
// TYPES
|
|
49
|
-
// ============================================================================
|
|
50
|
-
|
|
51
|
-
/** Bookkeeping record for one connected browser audio WebSocket, stored in activeSessions */
|
|
52
|
-
interface ActiveBrowserSession {
|
|
53
|
-
/** The device token used for this session ("localhost" when a local connection supplied no token) */
|
|
54
|
-
deviceToken: string;
|
|
55
|
-
/** Voice session handle: null until createSession() has finished creating it; stopped during cleanup when set */
|
|
56
|
-
session: VoiceSession | null;
|
|
57
|
-
/** Optional agent ID, taken from the agentId query parameter of the upgrade request */
|
|
58
|
-
agentId?: string;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
// ============================================================================
|
|
62
|
-
// STATE
|
|
63
|
-
// ============================================================================
|
|
64
|
-
|
|
65
|
-
/** Active sessions keyed by device token; token-less localhost connections are all registered under the key "localhost" */
|
|
66
|
-
const activeSessions = new Map<string, ActiveBrowserSession>();
|
|
67
|
-
|
|
68
|
-
// ============================================================================
|
|
69
|
-
// EXPORTED HANDLERS
|
|
70
|
-
// ============================================================================
|
|
71
|
-
|
|
72
|
-
/**
 * Handle a WebSocket upgrade request for browser audio.
 *
 * Validates that the path is /audio, extracts the device token from the query
 * string, checks authorization (localhost or valid device token), and rejects
 * duplicate connections for the same device token. Every rejection destroys
 * the socket without completing the upgrade.
 *
 * The request target and headers are client-controlled, so URL parsing is
 * guarded: a malformed request is rejected rather than throwing out of the
 * upgrade handler.
 *
 * @param req - HTTP upgrade request
 * @param socket - Underlying TCP socket
 * @param head - First packet of the upgraded stream
 * @param wss - WebSocketServer instance to accept the upgrade
 */
export function handleBrowserUpgrade(
  req: IncomingMessage,
  socket: Duplex,
  head: Buffer,
  wss: WebSocketServer,
): void {
  // Only the path and query string are used below, so parse against a fixed
  // base instead of the client-supplied Host header (which can be malformed
  // and would make the URL constructor throw).
  let url: URL;
  try {
    url = new URL(req.url ?? "", "http://localhost");
  } catch {
    console.log("Rejected WebSocket upgrade: malformed request URL");
    socket.destroy();
    return;
  }

  // Validate path
  if (url.pathname !== "/audio") {
    console.log(`Rejected WebSocket upgrade: invalid path ${url.pathname}`);
    socket.destroy();
    return;
  }

  // Extract device token from query string
  const token = url.searchParams.get("token") ?? "";

  // Check authorization: localhost bypasses token validation
  const remoteAddr = req.socket.remoteAddress ?? "";
  const isLocalhost =
    remoteAddr === "127.0.0.1" ||
    remoteAddr === "::1" ||
    remoteAddr === "::ffff:127.0.0.1";

  if (!isLocalhost && !token) {
    console.log("Rejected WebSocket upgrade: missing device token");
    socket.destroy();
    return;
  }

  if (!isLocalhost && !isValidDeviceToken(token)) {
    console.log("Rejected WebSocket upgrade: invalid device token");
    socket.destroy();
    return;
  }

  // Reject duplicate connections for the same device token.
  // The token is a credential, so it is deliberately left out of the log line.
  if (token && activeSessions.has(token)) {
    console.log("Rejected WebSocket upgrade: duplicate device token");
    socket.destroy();
    return;
  }

  // Extract optional agentId from query params
  const agentId = url.searchParams.get("agentId") || undefined;

  // Accept the WebSocket connection
  wss.handleUpgrade(req, socket, head, (ws: WebSocket) => {
    wss.emit("connection", ws, req);
    handleBrowserSession(ws, token || "localhost", agentId);
  });
}
|
|
137
|
-
|
|
138
|
-
// ============================================================================
|
|
139
|
-
// INTERNAL HANDLERS
|
|
140
|
-
// ============================================================================
|
|
141
|
-
|
|
142
|
-
/**
 * Handle a connected browser audio WebSocket session.
 *
 * Registers the connection in activeSessions, wires close/error handlers, and
 * starts creating the BrowserAudioAdapter + VoiceSession. On disconnect the
 * voice session (if any) is stopped and the registration is removed.
 *
 * The registration is always released on cleanup, even when stopping the
 * voice session fails -- otherwise the device token would be rejected as a
 * duplicate forever. It is only removed if it still belongs to this
 * connection, so an older connection sharing the same key (e.g. "localhost")
 * cannot unregister a newer one.
 *
 * @param ws - Connected WebSocket for browser audio
 * @param deviceToken - Device token identifying this connection
 * @param agentId - Optional agent ID for agent-specific sessions
 */
function handleBrowserSession(ws: WebSocket, deviceToken: string, agentId?: string): void {
  let cleaned = false;

  // Register in active sessions
  const entry: ActiveBrowserSession = { deviceToken, session: null, agentId };
  activeSessions.set(deviceToken, entry);

  console.log(`Browser session connected, token: ${deviceToken}`);

  /**
   * Clean up the browser session. Stops the voice session and removes this
   * connection's entry from the activeSessions map. Uses the cleaned flag to
   * prevent double-cleanup. Rejects if stopping the voice session fails, but
   * only after the entry has been released.
   */
  async function cleanup(): Promise<void> {
    if (cleaned) return;
    cleaned = true;

    try {
      if (entry.session) {
        await entry.session.stop();
      }
    } finally {
      // Release the slot only if it is still ours
      if (activeSessions.get(deviceToken) === entry) {
        activeSessions.delete(deviceToken);
      }
      console.log(`Browser session cleaned up, token: ${deviceToken}`);
    }
  }

  // WebSocket close handler
  ws.on("close", () => {
    cleanup().catch((err) => {
      console.error(`Error during browser session cleanup: ${err}`);
    });
  });

  ws.on("error", (err) => {
    console.error(`WebSocket error for token ${deviceToken}: ${err}`);
  });

  // Create adapter and voice session; on failure close the socket, which
  // triggers the close handler above
  createSession(ws, entry).catch((err) => {
    console.error(`Failed to create voice session for token ${deviceToken}: ${err}`);
    ws.close();
  });
}
|
|
195
|
-
|
|
196
|
-
// ============================================================================
|
|
197
|
-
// HELPER FUNCTIONS
|
|
198
|
-
// ============================================================================
|
|
199
|
-
|
|
200
|
-
/**
 * Build provider config by reading the latest values from .env.
 * Called per-session so changes to API keys, voice IDs, or model IDs
 * take effect without a server restart.
 *
 * Voice and model IDs fall back to the built-in defaults when the variable is
 * missing or blank (e.g. `ELEVENLABS_VOICE_ID=`), so an emptied setting does
 * not send an empty ID to the provider. A missing API key becomes "".
 *
 * @returns TTS and STT provider configs with current .env values
 */
async function buildProviderConfig(): Promise<{ ttsProvider: TtsProviderConfig; sttProvider: SttProviderConfig }> {
  const env = await readEnv();

  // Use the configured value unless it is absent or whitespace-only
  const valueOrDefault = (value: string | undefined, fallback: string): string =>
    value !== undefined && value.trim() !== "" ? value : fallback;

  const apiKey = env.ELEVENLABS_API_KEY ?? "";
  const voiceId = valueOrDefault(env.ELEVENLABS_VOICE_ID, DEFAULT_ELEVENLABS_VOICE_ID);
  const modelId = valueOrDefault(env.ELEVENLABS_MODEL_ID, DEFAULT_ELEVENLABS_MODEL_ID);
  const sttModelId = valueOrDefault(env.ELEVENLABS_STT_MODEL_ID, DEFAULT_ELEVENLABS_STT_MODEL_ID);

  return {
    ttsProvider: { provider: "elevenlabs", elevenlabs: { apiKey, voiceId, modelId } },
    sttProvider: { provider: "elevenlabs", elevenlabs: { apiKey, modelId: sttModelId } },
  };
}
|
|
220
|
-
|
|
221
|
-
/**
 * Assemble the baseline voice session settings used for every browser call.
 *
 * The provider configs are passed through untouched; everything else is a
 * fixed browser-tuned default.
 *
 * @param ttsProvider - TTS provider config
 * @param sttProvider - STT provider config
 * @returns Default session config object
 */
function buildDefaultConfig(ttsProvider: TtsProviderConfig, sttProvider: SttProviderConfig) {
  // Turn-detection timing
  const endpointing = {
    silenceThresholdMs: 700,
    maxSilenceBeforeTimeoutMs: 1200,
    minWordCountForFastPath: 2,
    enableHaikuFallback: false,
  };

  // Progress narration cadence
  const narration = {
    summaryIntervalMs: 12000,
  };

  // Claude session defaults: no tool allow-list entries, voice-oriented system prompt
  const claudeSession = {
    allowedTools: [] as string[],
    permissionMode: "bypassPermissions",
    systemPrompt:
      "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.",
  };

  return {
    stopPhrase: "stop listening",
    ttsProvider,
    sttProvider,
    interruptionThresholdMs: BROWSER_INTERRUPTION_THRESHOLD_MS,
    endpointing,
    narration,
    claudeSession,
  };
}
|
|
251
|
-
|
|
252
|
-
/**
 * Create the BrowserAudioAdapter and VoiceSession for a connected WebSocket.
 * If the session has an agentId, loads the agent config for custom system prompt,
 * voice, and working directory. If the agent cannot be loaded, the socket is
 * closed and no session is created.
 *
 * Several awaits happen before the session exists, and the browser can
 * disconnect during any of them. By then the close handler has already run
 * and found no session to stop, so the socket state is re-checked here: no
 * session is created for a socket that is no longer open, and one that
 * finished creating after the socket closed is stopped immediately instead of
 * being left running.
 *
 * @param ws - Connected WebSocket for browser audio
 * @param entry - Active session entry to populate with the voice session
 */
async function createSession(ws: WebSocket, entry: ActiveBrowserSession): Promise<void> {
  const adapter = createBrowserAudioAdapter({ ws });

  const { ttsProvider, sttProvider } = await buildProviderConfig();
  const defaultConfig = buildDefaultConfig(ttsProvider, sttProvider);

  // Build session config -- use agent personality if agentId is set, otherwise default
  let sessionConfig: Parameters<typeof createVoiceSession>[1] = {
    ...defaultConfig,
    onSessionEnd: () => ws.close(),
  };

  if (entry.agentId) {
    try {
      const agentPrompt = await buildAgentPrompt(entry.agentId, "voice");

      sessionConfig = {
        ...defaultConfig,
        claudeSession: {
          ...defaultConfig.claudeSession,
          customSystemPrompt: agentPrompt,
          cwd: join(AGENTS_DIR, entry.agentId),
        },
        onSessionEnd: () => ws.close(),
      };

      // Override TTS voice if the agent has a preference
      const agent = await getAgent(entry.agentId);
      if (agent.config.voice?.elevenlabs) {
        const voicePref = agent.config.voice.elevenlabs;
        const overriddenTts: TtsProviderConfig = {
          ...ttsProvider,
          elevenlabs: { ...ttsProvider.elevenlabs, voiceId: voicePref.id },
        };
        sessionConfig = { ...sessionConfig, ttsProvider: overriddenTts };
        console.log(`Using voice "${voicePref.name}" (${voicePref.id}) for agent "${entry.agentId}"`);
      }

      // Set initialPrompt so the agent greets with its personality (skips generic startup PCM)
      sessionConfig.initialPrompt = "The user just connected via browser. Greet them briefly.";

      console.log(`Browser session using agent "${entry.agentId}" for token ${entry.deviceToken}`);
    } catch (err) {
      console.error(`Failed to load agent "${entry.agentId}" for browser session:`, err);
      ws.close();
      return;
    }
  }

  // The browser may have disconnected while config was loading; do not start
  // a session nobody is connected to
  if (ws.readyState !== ws.OPEN) {
    console.log("Browser socket closed before the voice session was created; skipping session start");
    return;
  }

  const session = await createVoiceSession(adapter, sessionConfig);

  // The socket may also have closed while the session was starting. Cleanup
  // has either already run or will run with no session recorded, so stop the
  // session here and do not hand it to the entry.
  if (ws.readyState !== ws.OPEN) {
    console.log("Browser socket closed while the voice session was starting; stopping session");
    await session.stop();
    return;
  }

  entry.session = session;
}
|