@skillhq/concierge 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -0
- package/dist/cli/program.d.ts +3 -0
- package/dist/cli/program.d.ts.map +1 -0
- package/dist/cli/program.js +46 -0
- package/dist/cli/program.js.map +1 -0
- package/dist/cli/shared.d.ts +18 -0
- package/dist/cli/shared.d.ts.map +1 -0
- package/dist/cli/shared.js +2 -0
- package/dist/cli/shared.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +5 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/call.d.ts +7 -0
- package/dist/commands/call.d.ts.map +1 -0
- package/dist/commands/call.js +409 -0
- package/dist/commands/call.js.map +1 -0
- package/dist/commands/config.d.ts +4 -0
- package/dist/commands/config.d.ts.map +1 -0
- package/dist/commands/config.js +120 -0
- package/dist/commands/config.js.map +1 -0
- package/dist/commands/find-contact.d.ts +4 -0
- package/dist/commands/find-contact.d.ts.map +1 -0
- package/dist/commands/find-contact.js +57 -0
- package/dist/commands/find-contact.js.map +1 -0
- package/dist/commands/server.d.ts +7 -0
- package/dist/commands/server.d.ts.map +1 -0
- package/dist/commands/server.js +212 -0
- package/dist/commands/server.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/call/audio/mulaw.d.ts +35 -0
- package/dist/lib/call/audio/mulaw.d.ts.map +1 -0
- package/dist/lib/call/audio/mulaw.js +109 -0
- package/dist/lib/call/audio/mulaw.js.map +1 -0
- package/dist/lib/call/audio/pcm-utils.d.ts +62 -0
- package/dist/lib/call/audio/pcm-utils.d.ts.map +1 -0
- package/dist/lib/call/audio/pcm-utils.js +149 -0
- package/dist/lib/call/audio/pcm-utils.js.map +1 -0
- package/dist/lib/call/audio/resample.d.ts +34 -0
- package/dist/lib/call/audio/resample.d.ts.map +1 -0
- package/dist/lib/call/audio/resample.js +97 -0
- package/dist/lib/call/audio/resample.js.map +1 -0
- package/dist/lib/call/audio/streaming-decoder.d.ts +45 -0
- package/dist/lib/call/audio/streaming-decoder.d.ts.map +1 -0
- package/dist/lib/call/audio/streaming-decoder.js +110 -0
- package/dist/lib/call/audio/streaming-decoder.js.map +1 -0
- package/dist/lib/call/call-server.d.ts +110 -0
- package/dist/lib/call/call-server.d.ts.map +1 -0
- package/dist/lib/call/call-server.js +681 -0
- package/dist/lib/call/call-server.js.map +1 -0
- package/dist/lib/call/call-session.d.ts +133 -0
- package/dist/lib/call/call-session.d.ts.map +1 -0
- package/dist/lib/call/call-session.js +890 -0
- package/dist/lib/call/call-session.js.map +1 -0
- package/dist/lib/call/call-types.d.ts +133 -0
- package/dist/lib/call/call-types.d.ts.map +1 -0
- package/dist/lib/call/call-types.js +16 -0
- package/dist/lib/call/call-types.js.map +1 -0
- package/dist/lib/call/conversation-ai.d.ts +56 -0
- package/dist/lib/call/conversation-ai.d.ts.map +1 -0
- package/dist/lib/call/conversation-ai.js +276 -0
- package/dist/lib/call/conversation-ai.js.map +1 -0
- package/dist/lib/call/eval/codec-test.d.ts +45 -0
- package/dist/lib/call/eval/codec-test.d.ts.map +1 -0
- package/dist/lib/call/eval/codec-test.js +169 -0
- package/dist/lib/call/eval/codec-test.js.map +1 -0
- package/dist/lib/call/eval/conversation-scripts.d.ts +55 -0
- package/dist/lib/call/eval/conversation-scripts.d.ts.map +1 -0
- package/dist/lib/call/eval/conversation-scripts.js +359 -0
- package/dist/lib/call/eval/conversation-scripts.js.map +1 -0
- package/dist/lib/call/eval/eval-runner.d.ts +64 -0
- package/dist/lib/call/eval/eval-runner.d.ts.map +1 -0
- package/dist/lib/call/eval/eval-runner.js +369 -0
- package/dist/lib/call/eval/eval-runner.js.map +1 -0
- package/dist/lib/call/eval/index.d.ts +9 -0
- package/dist/lib/call/eval/index.d.ts.map +1 -0
- package/dist/lib/call/eval/index.js +9 -0
- package/dist/lib/call/eval/index.js.map +1 -0
- package/dist/lib/call/eval/integration-test-suite.d.ts +71 -0
- package/dist/lib/call/eval/integration-test-suite.d.ts.map +1 -0
- package/dist/lib/call/eval/integration-test-suite.js +519 -0
- package/dist/lib/call/eval/integration-test-suite.js.map +1 -0
- package/dist/lib/call/eval/turn-taking-test.d.ts +84 -0
- package/dist/lib/call/eval/turn-taking-test.d.ts.map +1 -0
- package/dist/lib/call/eval/turn-taking-test.js +260 -0
- package/dist/lib/call/eval/turn-taking-test.js.map +1 -0
- package/dist/lib/call/index.d.ts +12 -0
- package/dist/lib/call/index.d.ts.map +1 -0
- package/dist/lib/call/index.js +17 -0
- package/dist/lib/call/index.js.map +1 -0
- package/dist/lib/call/providers/deepgram.d.ts +81 -0
- package/dist/lib/call/providers/deepgram.d.ts.map +1 -0
- package/dist/lib/call/providers/deepgram.js +279 -0
- package/dist/lib/call/providers/deepgram.js.map +1 -0
- package/dist/lib/call/providers/elevenlabs.d.ts +78 -0
- package/dist/lib/call/providers/elevenlabs.d.ts.map +1 -0
- package/dist/lib/call/providers/elevenlabs.js +272 -0
- package/dist/lib/call/providers/elevenlabs.js.map +1 -0
- package/dist/lib/call/providers/local-deps.d.ts +18 -0
- package/dist/lib/call/providers/local-deps.d.ts.map +1 -0
- package/dist/lib/call/providers/local-deps.js +114 -0
- package/dist/lib/call/providers/local-deps.js.map +1 -0
- package/dist/lib/call/providers/twilio.d.ts +53 -0
- package/dist/lib/call/providers/twilio.d.ts.map +1 -0
- package/dist/lib/call/providers/twilio.js +173 -0
- package/dist/lib/call/providers/twilio.js.map +1 -0
- package/dist/lib/concierge-client-types.d.ts +68 -0
- package/dist/lib/concierge-client-types.d.ts.map +1 -0
- package/dist/lib/concierge-client-types.js +2 -0
- package/dist/lib/concierge-client-types.js.map +1 -0
- package/dist/lib/concierge-client.d.ts +29 -0
- package/dist/lib/concierge-client.d.ts.map +1 -0
- package/dist/lib/concierge-client.js +534 -0
- package/dist/lib/concierge-client.js.map +1 -0
- package/dist/lib/config.d.ts +9 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +66 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/output.d.ts +7 -0
- package/dist/lib/output.d.ts.map +1 -0
- package/dist/lib/output.js +114 -0
- package/dist/lib/output.js.map +1 -0
- package/dist/lib/utils/contact-extractor.d.ts +12 -0
- package/dist/lib/utils/contact-extractor.d.ts.map +1 -0
- package/dist/lib/utils/contact-extractor.js +159 -0
- package/dist/lib/utils/contact-extractor.js.map +1 -0
- package/dist/lib/utils/formatters.d.ts +15 -0
- package/dist/lib/utils/formatters.d.ts.map +1 -0
- package/dist/lib/utils/formatters.js +107 -0
- package/dist/lib/utils/formatters.js.map +1 -0
- package/dist/lib/utils/url-parser.d.ts +11 -0
- package/dist/lib/utils/url-parser.d.ts.map +1 -0
- package/dist/lib/utils/url-parser.js +103 -0
- package/dist/lib/utils/url-parser.js.map +1 -0
- package/package.json +67 -0
|
@@ -0,0 +1,890 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Call session management
|
|
3
|
+
* Handles a single phone call with audio streaming, transcription, and synthesis
|
|
4
|
+
*/
|
|
5
|
+
import { EventEmitter } from 'node:events';
|
|
6
|
+
import { mulawToPcm } from './audio/mulaw.js';
|
|
7
|
+
import { calculateRms } from './audio/pcm-utils.js';
|
|
8
|
+
import { createStreamingDecoder } from './audio/streaming-decoder.js';
|
|
9
|
+
import { ConversationAI, extractMostRecentQuestion, isLikelyShortAcknowledgement } from './conversation-ai.js';
|
|
10
|
+
import { createPhoneCallSTT } from './providers/deepgram.js';
|
|
11
|
+
import { createPhoneCallTTS, ElevenLabsApiError } from './providers/elevenlabs.js';
|
|
12
|
+
import { hangupCall } from './providers/twilio.js';
|
|
13
|
+
// Configurable timing constants
|
|
14
|
+
const GREETING_DELAY_MS = 250;
|
|
15
|
+
const CALL_COMPLETION_DELAY_MS = 3000;
|
|
16
|
+
const POST_TTS_STT_SUPPRESSION_MS = 900;
|
|
17
|
+
const PRE_GREETING_IDLE_MS = 700;
|
|
18
|
+
const MAX_BUFFERED_STT_CHUNKS = 500;
|
|
19
|
+
const PRE_GREETING_VAD_RMS_THRESHOLD = 0.015;
|
|
20
|
+
const PRE_GREETING_VAD_MIN_CONSECUTIVE_CHUNKS = 2;
|
|
21
|
+
const MAX_GREETING_DEFERRAL_MS = 2000;
|
|
22
|
+
const TTS_EMPTY_AUDIO_MAX_RETRIES = 1;
|
|
23
|
+
const TTS_EMPTY_AUDIO_RETRY_DELAY_MS = 200;
|
|
24
|
+
const TTS_DECODER_FLUSH_GRACE_MS = 250;
|
|
25
|
+
export class CallSession extends EventEmitter {
|
|
26
|
+
// --- Identity and configuration (set in constructor) ---
callId;
config;
state;
// --- Provider handles, created in initializeMediaStream() ---
stt = null;
tts = null;
mediaWs = null;
streamSid = null;
// --- Playback / turn-taking state ---
audioQueue = [];
isPlaying = false;
isSpeaking = false;
conversationAI;
isProcessingResponse = false; // Prevent overlapping responses
decoder = null; // ffmpeg MP3 → µ-law
decoderGeneration = 0; // Track decoder generations to avoid race conditions
sessionStartTime = Date.now(); // For timestamps
responseDebounceTimer = null; // Debounce rapid transcripts
pendingTranscript = ''; // Accumulated transcript before responding
hangupTimer = null; // Timer for delayed hangup
greetingTimer = null; // Timer for delayed initial greeting
cleanedUp = false; // Prevent multiple cleanup calls
endedEmitted = false; // Guard so the session-ended signal fires only once
suppressSttUntilMs = 0; // Prevent echo from AI audio being transcribed as human speech
sttTimelineStartMs = 0; // Wall-clock anchor for Deepgram word timestamps
greetingStarted = false; // Set once the initial greeting has been sent (or skipped)
lastInboundTranscriptAtMs = 0; // Last pre-greeting transcript time (used to defer greeting)
lastInboundAudioActivityAtMs = 0; // Last pre-greeting VAD-detected speech time
consecutiveInboundSpeechChunks = 0; // Run length of above-threshold RMS chunks (simple VAD)
callConnectedAtMs = 0; // When Twilio's 'start' event arrived
bufferedSttAudio = []; // PCM chunks held while Deepgram is still connecting
greetingPrefetchPromise = null; // Cached greeting generation started at call connect
// Event handler references for cleanup
sttHandlers = [];
ttsHandlers = [];
mediaWsHandlers = [];
|
|
60
|
+
/** Log with timestamp showing ms since session start */
|
|
61
|
+
log(message) {
|
|
62
|
+
const elapsed = Date.now() - this.sessionStartTime;
|
|
63
|
+
console.log(`[${elapsed.toString().padStart(6)}ms] ${message}`);
|
|
64
|
+
}
|
|
65
|
+
/** Render an unknown thrown value as a human-readable string. */
formatError(error) {
    return error instanceof Error ? error.message : String(error);
}
|
|
71
|
+
/**
 * Detect an ElevenLabs quota-exhaustion failure, either via the typed
 * ElevenLabsApiError flag or by scanning the error text for the API's
 * 'quota_exceeded' code.
 */
isElevenLabsQuotaExceeded(error) {
    if (error instanceof ElevenLabsApiError) {
        return error.isQuotaExceeded;
    }
    return this.formatError(error).toLowerCase().includes('quota_exceeded');
}
|
|
78
|
+
/** Build the operator-facing message for a TTS failure (special-casing quota exhaustion). */
getTTSOperatorMessage(error) {
    return this.isElevenLabsQuotaExceeded(error)
        ? 'ElevenLabs quota exceeded: TTS cannot generate audio. Top up ElevenLabs credits and retry the call.'
        : `TTS failed: ${this.formatError(error)}`;
}
|
|
84
|
+
/** True when the error text indicates TTS returned a stream with no audio in it. */
isEmptyTtsAudioError(error) {
    return this.formatError(error).toLowerCase().includes('tts produced no audio output');
}
|
|
88
|
+
/**
 * Create a new call session.
 *
 * @param callId - Unique identifier for this call
 * @param config - Provider configuration; anthropicApiKey is read here, the
 *   Deepgram/ElevenLabs keys are read later in initializeMediaStream()
 * @param phoneNumber - Number being called (stored in state only)
 * @param goal - What the AI should accomplish on this call
 * @param context - Extra background passed through to the conversation AI
 */
constructor(callId, config, phoneNumber, goal, context) {
    super();
    this.callId = callId;
    this.config = config;
    // Public call state, including the running transcript.
    this.state = {
        callId,
        phoneNumber,
        goal,
        context,
        status: 'initiating',
        startedAt: new Date(),
        transcript: [],
    };
    // Initialize conversation AI
    this.conversationAI = new ConversationAI({
        apiKey: config.anthropicApiKey,
        goal,
        context,
    });
}
|
|
108
|
+
/**
 * Initialize the session when Twilio media stream connects.
 *
 * Setup order is deliberate and load-bearing:
 *   1. Attach WebSocket handlers (so no media frames are dropped),
 *   2. create STT and start connecting it in the background,
 *   3. create TTS and wire its audio → ffmpeg-decoder pipeline,
 *   4. only then process the buffered 'start' message (it schedules the
 *      greeting, which needs TTS to exist).
 *
 * @param ws - WebSocket connection
 * @param startMessage - The 'start' event message from Twilio (already received by server)
 */
async initializeMediaStream(ws, startMessage) {
    this.log(`[Session ${this.callId}] Initializing media stream...`);
    this.mediaWs = ws;
    // IMPORTANT: Attach WebSocket handlers FIRST, before awaiting STT connection.
    // This ensures we don't drop Twilio media frames that arrive during Deepgram startup.
    let mediaMessageCount = 0;
    const wsMessageHandler = (data) => {
        try {
            const msg = JSON.parse(data.toString());
            // Log every message type received (for debugging); media frames are
            // sampled (first 5, then every 100th) to keep logs readable.
            if (msg.event === 'media') {
                mediaMessageCount++;
                if (mediaMessageCount <= 5 || mediaMessageCount % 100 === 0) {
                    this.log(`[Twilio] Media #${mediaMessageCount}, track: ${msg.media?.track}`);
                }
            }
            else {
                this.log(`[Twilio] Event: ${msg.event}`);
            }
            this.handleTwilioMessage(msg);
        }
        catch (err) {
            this.log(`[Twilio] Parse error: ${err}`);
        }
    };
    const wsCloseHandler = () => {
        this.handleMediaStreamClose();
    };
    const wsErrorHandler = (err) => {
        this.log(`[Twilio] WebSocket error: ${err}`);
    };
    ws.on('message', wsMessageHandler);
    ws.on('close', wsCloseHandler);
    ws.on('error', wsErrorHandler);
    // Keep references so cleanup can remove exactly these listeners.
    this.mediaWsHandlers = [
        { event: 'message', handler: wsMessageHandler },
        { event: 'close', handler: wsCloseHandler },
        { event: 'error', handler: wsErrorHandler },
    ];
    // NOTE: Do NOT process startMessage here. It must be processed AFTER TTS is ready,
    // because the 'start' event triggers a delayed greeting that requires TTS.
    // See end of this method.
    // Initialize STT
    this.log('[STT] Connecting to Deepgram...');
    this.stt = createPhoneCallSTT(this.config.deepgramApiKey);
    // Store event handlers for cleanup
    const sttTranscriptHandler = (result) => {
        this.handleTranscript(result);
    };
    const sttErrorHandler = (err) => this.log(`[STT] Error: ${err.message}`);
    const sttOpenHandler = () => this.log('[STT] Deepgram connected');
    const sttCloseHandler = () => this.log('[STT] Deepgram disconnected');
    this.stt.on('transcript', sttTranscriptHandler);
    this.stt.on('error', sttErrorHandler);
    this.stt.on('open', sttOpenHandler);
    this.stt.on('close', sttCloseHandler);
    this.sttHandlers = [
        { event: 'transcript', handler: sttTranscriptHandler },
        { event: 'error', handler: sttErrorHandler },
        { event: 'open', handler: sttOpenHandler },
        { event: 'close', handler: sttCloseHandler },
    ];
    // Connect asynchronously; media arriving meanwhile is buffered for replay.
    this.connectSttInBackground();
    // Initialize TTS with streaming conversion
    this.log('[TTS] Setting up ElevenLabs (streaming mode)...');
    this.tts = createPhoneCallTTS(this.config.elevenLabsApiKey, this.config.elevenLabsVoiceId);
    let ttsChunks = 0;
    let ttsBytes = 0;
    // Store event handlers for cleanup
    const ttsAudioHandler = (chunk, requestId) => {
        // Drop audio from stale requests (decoderGeneration is bumped per speak()).
        if (requestId !== undefined && requestId !== this.decoderGeneration) {
            return;
        }
        ttsChunks++;
        ttsBytes += chunk.length;
        if (ttsChunks === 1) {
            this.log(`[TTS] First audio chunk: ${chunk.length} bytes`);
        }
        // ElevenLabs returns MP3 (regardless of output_format requested!)
        // Stream through ffmpeg decoder to convert to µ-law
        if (this.decoder?.running) {
            this.decoder.write(chunk);
        }
    };
    const ttsDoneHandler = (requestId) => {
        if (requestId !== undefined && requestId !== this.decoderGeneration) {
            return;
        }
        this.log(`[TTS] Stream complete: ${ttsChunks} chunks, ${ttsBytes} bytes total, flushing decoder`);
        // Signal end of input to ffmpeg - it will flush remaining audio
        this.decoder?.end();
    };
    const ttsErrorHandler = (err, requestId) => {
        if (requestId !== undefined && requestId !== this.decoderGeneration) {
            return;
        }
        this.log(`[TTS] Error: ${this.formatError(err)}`);
    };
    this.tts.on('audio', ttsAudioHandler);
    this.tts.on('done', ttsDoneHandler);
    this.tts.on('error', ttsErrorHandler);
    this.ttsHandlers = [
        { event: 'audio', handler: ttsAudioHandler },
        { event: 'done', handler: ttsDoneHandler },
        { event: 'error', handler: ttsErrorHandler },
    ];
    this.log('[TTS] Ready (streaming)');
    // Process the start message AFTER TTS is ready.
    // The 'start' event triggers a delayed greeting that requires TTS to be initialized.
    // WebSocket handlers were attached early to capture media frames during STT connect,
    // but the startMessage must be processed here to ensure greeting works.
    if (startMessage) {
        this.log('[Session] Processing initial start message');
        this.handleTwilioMessage(startMessage);
    }
}
|
|
229
|
+
/**
 * Handle incoming Twilio media stream messages.
 *
 * Dispatches on msg.event:
 *   'connected' — log only;
 *   'start'     — record stream/call SIDs, mark call in-progress, schedule greeting;
 *   'media'     — decode µ-law payload and feed STT (or buffer until STT is up);
 *   'stop'      — tear down via handleMediaStreamClose();
 *   'mark'      — playback-position marker from Twilio.
 *
 * @param msg - Parsed Twilio media-stream JSON message
 */
handleTwilioMessage(msg) {
    switch (msg.event) {
        case 'connected':
            this.log('[Twilio] Media stream connected');
            break;
        case 'start':
            if (msg.start) {
                this.streamSid = msg.start.streamSid;
                this.state.callSid = msg.start.callSid;
                this.callConnectedAtMs = Date.now();
                this.log(`[Session] Stream started - streamSid: ${this.streamSid}`);
                this.updateStatus('in-progress');
                this.emitMessage({ type: 'call_connected', callId: this.callId });
                // Kick off greeting generation now so it is ready when the timer fires.
                this.prefetchGreeting();
                // Send AI-generated greeting after a short delay to ensure audio is ready.
                // If the remote party speaks first (common with IVRs), we delay greeting.
                this.scheduleInitialGreeting(GREETING_DELAY_MS);
            }
            break;
        case 'media':
            if (msg.media?.payload) {
                // Convert mulaw to PCM and send to STT (accept any track for now)
                const mulaw = Buffer.from(msg.media.payload, 'base64');
                const pcm = mulawToPcm(mulaw);
                // Before the greeting plays, run lightweight VAD so the greeting
                // can be deferred if the remote side is already talking.
                if (!this.greetingStarted && !this.isSpeaking) {
                    this.trackInboundSpeechActivity(pcm);
                }
                if (this.stt?.connected) {
                    this.stt.sendAudio(pcm);
                }
                else {
                    // STT not connected yet: buffer with a bounded queue
                    // (oldest chunk dropped once the cap is hit).
                    if (this.bufferedSttAudio.length >= MAX_BUFFERED_STT_CHUNKS) {
                        this.bufferedSttAudio.shift();
                    }
                    this.bufferedSttAudio.push(pcm);
                }
            }
            break;
        case 'stop':
            this.handleMediaStreamClose();
            break;
        case 'mark':
            // Audio playback marker - can be used for timing
            if (msg.mark?.name === 'audio_done') {
                this.isPlaying = false;
                this.flushAudioQueue();
            }
            break;
    }
}
|
|
282
|
+
// How long to wait after final transcript before responding (ms)
// This allows the human to pause mid-sentence without being interrupted
// 1000ms = 1 second of silence before AI responds
// (used as the setTimeout delay in handleTranscript's debounce timer)
static RESPONSE_DEBOUNCE_MS = 1000;
|
|
286
|
+
/**
 * Handle transcription results from Deepgram.
 *
 * Pipeline: drop empty text → drop likely echo of our own TTS (by word
 * timing, then by live-playback state) → emit an interim transcript event
 * → for final segments, accumulate and debounce before generating a reply.
 *
 * @param result - Deepgram transcript result ({ text, isFinal, words? })
 */
handleTranscript(result) {
    const text = result.text.trim();
    if (!text)
        return;
    // IVRs often speak immediately after answer; avoid talking over them.
    if (!this.greetingStarted) {
        this.lastInboundTranscriptAtMs = Date.now();
    }
    // Word-level timestamps let us reject segments whose audio ended while
    // STT suppression was still active, even if the result arrives late.
    const transcriptEndMs = this.getTranscriptEndTimestampMs(result);
    if (transcriptEndMs !== undefined && transcriptEndMs <= this.suppressSttUntilMs) {
        this.log(`[STT] Ignoring likely overlap transcript by word timing: "${result.text}"`);
        return;
    }
    // Prevent AI voice playback/echo from being treated as human speech.
    // This intentionally drops barge-in during playback to avoid self-transcription loops.
    if (this.isSpeaking || Date.now() < this.suppressSttUntilMs) {
        this.log(`[STT] Ignoring likely echo while AI audio is active: "${text}"`);
        return;
    }
    // Emit interim transcript events (for UI feedback)
    // But DON'T add to state.transcript yet - wait for debounce to combine segments
    this.emitMessage({
        type: 'transcript',
        callId: this.callId,
        text,
        role: 'human',
        isFinal: result.isFinal,
    });
    // For final transcripts, use debouncing to combine segments and avoid interrupting
    if (result.isFinal) {
        // Cancel any pending response timer
        if (this.responseDebounceTimer) {
            clearTimeout(this.responseDebounceTimer);
            this.log(`[Turn] More speech detected, extending wait...`);
        }
        // Accumulate transcript segments
        if (this.pendingTranscript) {
            this.pendingTranscript += ` ${text}`;
        }
        else {
            this.pendingTranscript = text;
        }
        this.log(`[Turn] Accumulated: "${this.pendingTranscript}"`);
        // If already processing a response, don't queue another
        // (the accumulated pendingTranscript is kept for later).
        if (this.isProcessingResponse) {
            this.log(`[Turn] AI still speaking, will respond after`);
            return;
        }
        // Start debounce timer - wait for more speech or timeout
        this.log(`[Turn] Waiting ${CallSession.RESPONSE_DEBOUNCE_MS}ms for more speech...`);
        this.responseDebounceTimer = setTimeout(() => {
            this.responseDebounceTimer = null;
            const fullTranscript = this.pendingTranscript.trim();
            this.pendingTranscript = '';
            if (fullTranscript && !this.isProcessingResponse) {
                // NOW add the combined transcript to state
                const entry = {
                    role: 'human',
                    text: fullTranscript,
                    timestamp: new Date(),
                    isFinal: true,
                };
                this.state.transcript.push(entry);
                this.log(`[Turn] Silence confirmed, responding to: "${fullTranscript}"`);
                // Intentionally not awaited: generateAIResponse handles its own errors.
                this.generateAIResponse(fullTranscript);
            }
        }, CallSession.RESPONSE_DEBOUNCE_MS);
    }
}
|
|
358
|
+
trackInboundSpeechActivity(pcm) {
|
|
359
|
+
const rms = calculateRms(pcm);
|
|
360
|
+
if (rms >= PRE_GREETING_VAD_RMS_THRESHOLD) {
|
|
361
|
+
this.consecutiveInboundSpeechChunks++;
|
|
362
|
+
if (this.consecutiveInboundSpeechChunks >= PRE_GREETING_VAD_MIN_CONSECUTIVE_CHUNKS) {
|
|
363
|
+
this.lastInboundAudioActivityAtMs = Date.now();
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
else {
|
|
367
|
+
this.consecutiveInboundSpeechChunks = 0;
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
scheduleInitialGreeting(delayMs) {
|
|
371
|
+
if (this.greetingStarted || this.cleanedUp)
|
|
372
|
+
return;
|
|
373
|
+
if (this.greetingTimer) {
|
|
374
|
+
clearTimeout(this.greetingTimer);
|
|
375
|
+
}
|
|
376
|
+
this.greetingTimer = setTimeout(() => {
|
|
377
|
+
this.greetingTimer = null;
|
|
378
|
+
this.sendInitialGreeting().catch((err) => {
|
|
379
|
+
this.log(`[AI] Greeting error: ${this.formatError(err)}`);
|
|
380
|
+
});
|
|
381
|
+
}, delayMs);
|
|
382
|
+
}
|
|
383
|
+
/**
 * Connect the STT provider without blocking media-stream setup.
 * While the connection is pending, the 'media' handler buffers PCM chunks
 * into bufferedSttAudio; they are replayed here once connected.
 * Runs as a fire-and-forget async IIFE; failures are logged and surfaced
 * as an 'error' message rather than thrown.
 */
connectSttInBackground() {
    void (async () => {
        if (!this.stt)
            return;
        try {
            await this.stt.connect();
            // Session may have been torn down while we were connecting.
            if (this.cleanedUp || !this.stt)
                return;
            this.log('[STT] Connection established');
            // Anchor for mapping Deepgram word offsets (seconds) to wall-clock ms.
            this.sttTimelineStartMs = Date.now();
            if (this.bufferedSttAudio.length > 0) {
                this.log(`[STT] Flushing ${this.bufferedSttAudio.length} buffered audio chunk(s)`);
                for (const pcm of this.bufferedSttAudio) {
                    this.stt.sendAudio(pcm);
                }
                this.bufferedSttAudio = [];
            }
        }
        catch (err) {
            // Ignore failures after cleanup — nothing left to notify.
            if (this.cleanedUp)
                return;
            const message = this.formatError(err);
            this.log(`[STT] Connection failed: ${message}`);
            this.emitMessage({
                type: 'error',
                callId: this.callId,
                message: `STT connection failed: ${message}`,
            });
        }
    })();
}
|
|
414
|
+
prefetchGreeting() {
|
|
415
|
+
if (this.greetingPrefetchPromise)
|
|
416
|
+
return;
|
|
417
|
+
this.greetingPrefetchPromise = this.conversationAI
|
|
418
|
+
.getGreeting()
|
|
419
|
+
.then((greeting) => greeting.trim() || null)
|
|
420
|
+
.catch((err) => {
|
|
421
|
+
this.log(`[AI] Greeting prefetch failed: ${this.formatError(err)}`);
|
|
422
|
+
return null;
|
|
423
|
+
});
|
|
424
|
+
}
|
|
425
|
+
/**
 * Speak the initial greeting, politely yielding if the remote side is talking.
 *
 * Deferral logic: if inbound speech (transcript or VAD) was heard within
 * PRE_GREETING_IDLE_MS, reschedule — unless the call is already older than
 * MAX_GREETING_DEFERRAL_MS, in which case we proceed anyway. If the remote
 * party produced an actual transcript first, the greeting is skipped entirely.
 * On TTS quota exhaustion the call is hung up; other errors fall back to a
 * basic goal-based greeting.
 */
async sendInitialGreeting() {
    if (this.greetingStarted || this.cleanedUp)
        return;
    // Most recent inbound activity from either signal (0 if neither fired).
    const lastInboundActivityAtMs = Math.max(this.lastInboundTranscriptAtMs, this.lastInboundAudioActivityAtMs);
    if (lastInboundActivityAtMs) {
        const elapsed = Date.now() - lastInboundActivityAtMs;
        const callElapsed = this.callConnectedAtMs ? Date.now() - this.callConnectedAtMs : 0;
        if (elapsed < PRE_GREETING_IDLE_MS) {
            if (callElapsed >= MAX_GREETING_DEFERRAL_MS) {
                this.log('[AI] Greeting deferral timeout reached; proceeding');
            }
            else {
                // Wait just long enough for the required idle window to elapse.
                this.log(`[AI] Deferring greeting; remote speech detected ${elapsed}ms ago`);
                this.scheduleInitialGreeting(PRE_GREETING_IDLE_MS - elapsed);
                return;
            }
        }
    }
    // If a human transcript already exists (or a response is in flight),
    // greeting would be redundant — mark it done and bail.
    if (this.pendingTranscript || this.state.transcript.some((entry) => entry.role === 'human') || this.isProcessingResponse) {
        this.log('[AI] Skipping initial greeting because remote party spoke first');
        this.greetingStarted = true;
        return;
    }
    this.greetingStarted = true;
    try {
        this.log('[AI] Generating greeting...');
        // Prefer the prefetched greeting (kicked off at 'start'); fall back to
        // generating one now if the prefetch failed or never ran.
        const prefetchedGreeting = this.greetingPrefetchPromise ? await this.greetingPrefetchPromise : null;
        const greeting = prefetchedGreeting ?? (await this.conversationAI.getGreeting());
        this.log(`[AI] Greeting: "${greeting}"`);
        await this.speak(greeting);
    }
    catch (err) {
        const message = this.formatError(err);
        this.log(`[AI] Greeting error: ${message}`);
        this.emitMessage({
            type: 'error',
            callId: this.callId,
            message: this.isElevenLabsQuotaExceeded(err)
                ? this.getTTSOperatorMessage(err)
                : `Greeting generation failed: ${message}`,
        });
        // Quota exhaustion is unrecoverable for this call — hang up.
        if (this.isElevenLabsQuotaExceeded(err)) {
            await this.hangup();
            return;
        }
        // Fallback to basic greeting.
        this.speak(`Hello! I'm calling about: ${this.state.goal}`).catch((fallbackErr) => {
            const fallbackMessage = this.getTTSOperatorMessage(fallbackErr);
            this.log(`[AI] Fallback error: ${fallbackMessage}`);
            this.emitMessage({
                type: 'error',
                callId: this.callId,
                message: fallbackMessage,
            });
        });
    }
}
|
|
483
|
+
getTranscriptEndTimestampMs(result) {
|
|
484
|
+
if (!this.sttTimelineStartMs || !result.words || result.words.length === 0) {
|
|
485
|
+
return undefined;
|
|
486
|
+
}
|
|
487
|
+
const maxWordEndSeconds = result.words.reduce((max, word) => Math.max(max, word.end), 0);
|
|
488
|
+
return this.sttTimelineStartMs + Math.round(maxWordEndSeconds * 1000);
|
|
489
|
+
}
|
|
490
|
+
/**
 * Generate and speak AI response to human speech.
 *
 * Guards against re-entry via isProcessingResponse (reset in finally).
 * A null response from the conversation AI means the conversation is over
 * and triggers an immediate hangup; if the AI marks itself complete after
 * speaking, hangup is scheduled after CALL_COMPLETION_DELAY_MS instead so
 * the final utterance can finish playing.
 *
 * @param humanSaid - The debounced, combined human transcript to respond to
 */
async generateAIResponse(humanSaid) {
    if (this.conversationAI.complete) {
        this.log('[AI] Conversation already complete, ignoring');
        return;
    }
    this.isProcessingResponse = true;
    const responseStart = Date.now();
    try {
        this.log(`[AI] Generating response to: "${humanSaid}"`);
        // Give the model turn-taking hints: was the human just acknowledging,
        // and what question (if any) did we most recently ask?
        const lastAssistantUtterance = this.getLastAssistantUtterance();
        const shortAcknowledgement = isLikelyShortAcknowledgement(humanSaid);
        const lastAssistantQuestion = lastAssistantUtterance
            ? extractMostRecentQuestion(lastAssistantUtterance)
            : undefined;
        const response = await this.conversationAI.respond(humanSaid, {
            shortAcknowledgement,
            lastAssistantUtterance,
            lastAssistantQuestion,
        });
        this.log(`[AI] Response ready (${Date.now() - responseStart}ms, ${response?.length || 0} chars)`);
        if (response === null) {
            // Conversation is complete
            this.log('[AI] Conversation complete');
            await this.hangup();
            return;
        }
        this.log(`[TTS] Speaking: "${response.substring(0, 50)}..."`);
        await this.speak(response);
        this.log(`[TTS] Speech complete (${Date.now() - responseStart}ms total)`);
        // Check if AI marked conversation complete (handled internally)
        if (this.conversationAI.complete) {
            this.log(`[AI] Marked complete, ending call in ${CALL_COMPLETION_DELAY_MS}ms`);
            // Give a moment for the final response to be spoken
            // Clear any existing hangup timer first
            if (this.hangupTimer) {
                clearTimeout(this.hangupTimer);
            }
            this.hangupTimer = setTimeout(() => {
                this.hangupTimer = null;
                if (!this.cleanedUp) {
                    this.hangup().catch((err) => this.log(`[Hangup] Error: ${err}`));
                }
            }, CALL_COMPLETION_DELAY_MS);
        }
    }
    catch (err) {
        const message = this.formatError(err);
        this.log(`[AI] Response error: ${message}`);
        this.emitMessage({
            type: 'error',
            callId: this.callId,
            message: this.isElevenLabsQuotaExceeded(err)
                ? this.getTTSOperatorMessage(err)
                : `AI response failed: ${message}`,
        });
        // TTS quota exhaustion is unrecoverable for this call — hang up.
        if (this.isElevenLabsQuotaExceeded(err)) {
            await this.hangup();
            return;
        }
        // Try to recover with a fallback
        try {
            await this.speak("I'm sorry, could you repeat that?");
        }
        catch (fallbackErr) {
            const fallbackMessage = this.getTTSOperatorMessage(fallbackErr);
            this.log(`[AI] Fallback response failed: ${fallbackMessage}`);
            this.emitMessage({
                type: 'error',
                callId: this.callId,
                message: fallbackMessage,
            });
        }
    }
    finally {
        this.isProcessingResponse = false;
    }
}
|
|
570
|
+
getLastAssistantUtterance() {
|
|
571
|
+
for (let i = this.state.transcript.length - 1; i >= 0; i--) {
|
|
572
|
+
const entry = this.state.transcript[i];
|
|
573
|
+
if (entry.role === 'assistant' && entry.text.trim()) {
|
|
574
|
+
return entry.text.trim();
|
|
575
|
+
}
|
|
576
|
+
}
|
|
577
|
+
return undefined;
|
|
578
|
+
}
|
|
579
|
+
/**
 * Speak text using TTS.
 *
 * Streams synthesized speech to the caller: spins up a streaming
 * decoder (MP3 → µ-law, via ffmpeg per createStreamingDecoder), pipes
 * each decoded chunk straight to Twilio, and retries synthesis when the
 * decoder emitted no audio at all. On success the utterance is pushed
 * onto the transcript and a 'transcript' message is emitted; any
 * non-retryable error is rethrown to the caller.
 *
 * @param {string} text - Text to synthesize and play to the caller.
 * @throws {Error} When the session is not initialized, or when TTS
 *   fails / keeps producing empty audio after all retry attempts.
 */
async speak(text) {
    this.log(`[TTS] speak() called: "${text.substring(0, 50)}..."`);
    if (!this.tts || !this.mediaWs) {
        this.log(`[TTS] speak() failed: not initialized (tts: ${!!this.tts}, ws: ${!!this.mediaWs})`);
        throw new Error('Session not initialized');
    }
    // Cancel any ongoing speech so two utterances never overlap on the wire.
    if (this.isSpeaking) {
        this.tts.cancel();
        this.decoder?.stop();
    }
    // Reset streaming state
    this.audioQueue = [];
    // Retry loop: only the "empty audio" failure mode is retried (see the
    // catch below); every other error aborts on the first attempt.
    for (let attempt = 0; attempt <= TTS_EMPTY_AUDIO_MAX_RETRIES; attempt++) {
        this.isSpeaking = true;
        // Increment generation to track which decoder is current; callbacks
        // from a superseded decoder compare against this and bail out.
        this.decoderGeneration++;
        const currentGeneration = this.decoderGeneration;
        // Start ffmpeg decoder to convert MP3 → µ-law
        // (ElevenLabs always returns MP3 regardless of output_format requested)
        this.decoder = createStreamingDecoder();
        // Resolved on the first decoded chunk; used after tts.speak() to
        // distinguish "decoder lagging" from "no audio produced at all".
        let resolveFirstChunk = null;
        const firstChunkPromise = new Promise((resolve) => {
            resolveFirstChunk = resolve;
        });
        let decoderChunks = 0;
        let decoderBytes = 0;
        this.decoder.on('data', (mulaw) => {
            // Only process data if this is still the current decoder
            if (currentGeneration !== this.decoderGeneration)
                return;
            decoderChunks++;
            decoderBytes += mulaw.length;
            if (decoderChunks === 1) {
                this.log(`[Decoder] First chunk: ${mulaw.length} bytes, first 4 bytes: ${mulaw.slice(0, 4).toString('hex')}`);
                resolveFirstChunk?.();
                resolveFirstChunk = null;
            }
            // Periodic progress log to keep per-chunk logging cheap.
            if (decoderChunks % 10 === 0) {
                this.log(`[Decoder] ${decoderChunks} chunks, ${decoderBytes} bytes total`);
            }
            // Send µ-law directly to Twilio as it's decoded
            this.sendAudioToTwilio(mulaw);
        });
        this.decoder.on('close', () => {
            // Only update isSpeaking if this is the current decoder
            if (currentGeneration === this.decoderGeneration) {
                this.log('[Decoder] Closed, speech complete');
                this.isSpeaking = false;
                // Briefly suppress STT so the tail of our own audio isn't
                // transcribed as the callee speaking.
                this.suppressSttUntilMs = Date.now() + POST_TTS_STT_SUPPRESSION_MS;
            }
        });
        this.decoder.on('error', (err) => {
            this.log(`[Decoder] Error: ${err}`);
        });
        this.decoder.start();
        try {
            // Start TTS
            await this.tts.speak(text, currentGeneration);
            if (decoderChunks === 0) {
                // The decoder can lag behind the TTS "done" signal slightly; wait briefly before declaring empty output.
                await Promise.race([
                    firstChunkPromise,
                    new Promise((resolve) => setTimeout(resolve, TTS_DECODER_FLUSH_GRACE_MS)),
                ]);
                if (decoderChunks === 0) {
                    throw new Error('TTS produced no audio output (decoder emitted 0 chunks)');
                }
            }
            // Add to transcript only after TTS succeeds.
            const entry = {
                role: 'assistant',
                text,
                timestamp: new Date(),
                isFinal: true,
            };
            this.state.transcript.push(entry);
            // Emit transcript event only when audio was actually produced.
            this.emitMessage({
                type: 'transcript',
                callId: this.callId,
                text,
                role: 'assistant',
                isFinal: true,
            });
            return;
        }
        catch (err) {
            // Ensure we don't leave a stalled decoder when synthesis fails.
            if (currentGeneration === this.decoderGeneration) {
                this.decoder?.stop();
                this.isSpeaking = false;
            }
            const canRetry = this.isEmptyTtsAudioError(err) && attempt < TTS_EMPTY_AUDIO_MAX_RETRIES;
            if (!canRetry) {
                throw err;
            }
            const retryCount = attempt + 1;
            this.log(`[TTS] Empty audio output, retrying synthesis (${retryCount}/${TTS_EMPTY_AUDIO_MAX_RETRIES})`);
            this.tts.cancel();
            await new Promise((resolve) => setTimeout(resolve, TTS_EMPTY_AUDIO_RETRY_DELAY_MS));
        }
    }
    // Defensive: every loop iteration returns or throws, so this should be
    // unreachable; kept as a safety net.
    throw new Error('TTS failed after retry attempts');
}
|
|
687
|
+
/**
|
|
688
|
+
* Send queued audio to Twilio
|
|
689
|
+
*/
|
|
690
|
+
flushAudioQueue() {
|
|
691
|
+
if (!this.mediaWs || !this.streamSid || this.isPlaying || this.audioQueue.length === 0) {
|
|
692
|
+
return;
|
|
693
|
+
}
|
|
694
|
+
this.log(`[Audio] Flushing ${this.audioQueue.length} chunks to Twilio`);
|
|
695
|
+
this.isPlaying = true;
|
|
696
|
+
// Send all queued audio
|
|
697
|
+
let totalBytes = 0;
|
|
698
|
+
while (this.audioQueue.length > 0) {
|
|
699
|
+
const audio = this.audioQueue.shift();
|
|
700
|
+
if (audio) {
|
|
701
|
+
this.sendAudioToTwilio(audio);
|
|
702
|
+
totalBytes += audio.length;
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
this.log(`[Audio] Sent ${totalBytes} bytes total`);
|
|
706
|
+
// Send mark to know when audio is done
|
|
707
|
+
this.sendMarkToTwilio('audio_done');
|
|
708
|
+
}
|
|
709
|
+
/**
|
|
710
|
+
* Send audio data to Twilio media stream
|
|
711
|
+
*/
|
|
712
|
+
sendAudioToTwilio(mulaw) {
|
|
713
|
+
if (!this.mediaWs || !this.streamSid) {
|
|
714
|
+
return;
|
|
715
|
+
}
|
|
716
|
+
const msg = {
|
|
717
|
+
event: 'media',
|
|
718
|
+
streamSid: this.streamSid,
|
|
719
|
+
media: {
|
|
720
|
+
payload: mulaw.toString('base64'),
|
|
721
|
+
},
|
|
722
|
+
};
|
|
723
|
+
try {
|
|
724
|
+
this.mediaWs.send(JSON.stringify(msg));
|
|
725
|
+
}
|
|
726
|
+
catch (err) {
|
|
727
|
+
this.log(`[Audio] Send error: ${err}`);
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
/**
|
|
731
|
+
* Send a mark event to track audio playback
|
|
732
|
+
*/
|
|
733
|
+
sendMarkToTwilio(name) {
|
|
734
|
+
if (!this.mediaWs || !this.streamSid)
|
|
735
|
+
return;
|
|
736
|
+
const msg = {
|
|
737
|
+
event: 'mark',
|
|
738
|
+
streamSid: this.streamSid,
|
|
739
|
+
mark: { name },
|
|
740
|
+
};
|
|
741
|
+
try {
|
|
742
|
+
this.mediaWs.send(JSON.stringify(msg));
|
|
743
|
+
}
|
|
744
|
+
catch (err) {
|
|
745
|
+
console.error('[Media WS] Mark error:', err);
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
/**
|
|
749
|
+
* Hang up the call
|
|
750
|
+
*/
|
|
751
|
+
async hangup() {
|
|
752
|
+
this.log('[Session] Hanging up...');
|
|
753
|
+
if (this.state.callSid) {
|
|
754
|
+
try {
|
|
755
|
+
await hangupCall(this.config, this.state.callSid);
|
|
756
|
+
}
|
|
757
|
+
catch (err) {
|
|
758
|
+
this.log(`[Hangup] Error: ${err}`);
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
this.cleanup();
|
|
762
|
+
this.updateStatus('completed');
|
|
763
|
+
this.emitEnded();
|
|
764
|
+
}
|
|
765
|
+
endFromProviderStatus(status) {
|
|
766
|
+
if (this.endedEmitted)
|
|
767
|
+
return;
|
|
768
|
+
this.log(`[Session] Ending from provider status: ${status}`);
|
|
769
|
+
this.updateStatus(status);
|
|
770
|
+
this.cleanup();
|
|
771
|
+
this.emitEnded();
|
|
772
|
+
}
|
|
773
|
+
/**
|
|
774
|
+
* Handle media stream close
|
|
775
|
+
*/
|
|
776
|
+
handleMediaStreamClose() {
|
|
777
|
+
this.cleanup();
|
|
778
|
+
if (this.state.status === 'in-progress') {
|
|
779
|
+
this.updateStatus('completed');
|
|
780
|
+
}
|
|
781
|
+
this.emitEnded();
|
|
782
|
+
}
|
|
783
|
+
/**
 * Clean up resources.
 *
 * Idempotent teardown of everything the session owns: pending timers,
 * the audio decoder, the STT and TTS engines (including their event
 * listeners), and the Twilio media WebSocket listeners. Listeners are
 * removed before close/cancel so shutdown-triggered events cannot fire
 * back into the session. Safe to call multiple times; only the first
 * call does any work.
 */
cleanup() {
    if (this.cleanedUp)
        return;
    this.cleanedUp = true;
    // Clear timers
    if (this.responseDebounceTimer) {
        clearTimeout(this.responseDebounceTimer);
        this.responseDebounceTimer = null;
    }
    if (this.hangupTimer) {
        clearTimeout(this.hangupTimer);
        this.hangupTimer = null;
    }
    if (this.greetingTimer) {
        clearTimeout(this.greetingTimer);
        this.greetingTimer = null;
    }
    // Stop decoder
    if (this.decoder) {
        this.decoder.stop();
        this.decoder = null;
    }
    // Remove STT event listeners and close
    if (this.stt) {
        for (const { event, handler } of this.sttHandlers) {
            this.stt.removeListener(event, handler);
        }
        this.stt.close();
        this.stt = null;
    }
    this.sttHandlers = [];
    // Remove TTS event listeners and cancel
    if (this.tts) {
        for (const { event, handler } of this.ttsHandlers) {
            this.tts.removeListener(event, handler);
        }
        this.tts.cancel();
        this.tts = null;
    }
    this.ttsHandlers = [];
    // Remove WebSocket event listeners
    // NOTE(review): the socket is only dereferenced here, not closed —
    // presumably the provider/server side owns closing it; confirm.
    if (this.mediaWs) {
        for (const { event, handler } of this.mediaWsHandlers) {
            this.mediaWs.removeListener(event, handler);
        }
        this.mediaWs = null;
    }
    this.mediaWsHandlers = [];
    this.streamSid = null;
    this.bufferedSttAudio = [];
    // Record when the session actually wound down.
    this.state.endedAt = new Date();
}
|
|
838
|
+
/**
|
|
839
|
+
* Update call status
|
|
840
|
+
*/
|
|
841
|
+
updateStatus(status) {
|
|
842
|
+
this.state.status = status;
|
|
843
|
+
}
|
|
844
|
+
/**
|
|
845
|
+
* Set the Twilio call SID
|
|
846
|
+
*/
|
|
847
|
+
setCallSid(callSid) {
|
|
848
|
+
this.state.callSid = callSid;
|
|
849
|
+
}
|
|
850
|
+
/**
|
|
851
|
+
* Emit a server message
|
|
852
|
+
*/
|
|
853
|
+
emitMessage(msg) {
|
|
854
|
+
this.emit('message', msg);
|
|
855
|
+
}
|
|
856
|
+
/**
|
|
857
|
+
* Emit ended event
|
|
858
|
+
*/
|
|
859
|
+
emitEnded() {
|
|
860
|
+
if (this.endedEmitted)
|
|
861
|
+
return;
|
|
862
|
+
this.endedEmitted = true;
|
|
863
|
+
const summary = this.generateSummary();
|
|
864
|
+
this.state.summary = summary;
|
|
865
|
+
this.emitMessage({
|
|
866
|
+
type: 'call_ended',
|
|
867
|
+
callId: this.callId,
|
|
868
|
+
summary,
|
|
869
|
+
status: this.state.status,
|
|
870
|
+
});
|
|
871
|
+
this.emit('ended', this.state);
|
|
872
|
+
}
|
|
873
|
+
/**
|
|
874
|
+
* Generate a conversation summary
|
|
875
|
+
*/
|
|
876
|
+
generateSummary() {
|
|
877
|
+
if (this.state.transcript.length === 0) {
|
|
878
|
+
return 'No conversation recorded.';
|
|
879
|
+
}
|
|
880
|
+
const lines = this.state.transcript.map((t) => `${t.role === 'assistant' ? 'AI' : 'Human'}: ${t.text}`);
|
|
881
|
+
return lines.join('\n');
|
|
882
|
+
}
|
|
883
|
+
/**
|
|
884
|
+
* Get current state
|
|
885
|
+
*/
|
|
886
|
+
getState() {
|
|
887
|
+
return { ...this.state };
|
|
888
|
+
}
|
|
889
|
+
}
|
|
890
|
+
//# sourceMappingURL=call-session.js.map
|