@skillhq/concierge 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/README.md +91 -0
  2. package/dist/cli/program.d.ts +3 -0
  3. package/dist/cli/program.d.ts.map +1 -0
  4. package/dist/cli/program.js +46 -0
  5. package/dist/cli/program.js.map +1 -0
  6. package/dist/cli/shared.d.ts +18 -0
  7. package/dist/cli/shared.d.ts.map +1 -0
  8. package/dist/cli/shared.js +2 -0
  9. package/dist/cli/shared.js.map +1 -0
  10. package/dist/cli.d.ts +3 -0
  11. package/dist/cli.d.ts.map +1 -0
  12. package/dist/cli.js +5 -0
  13. package/dist/cli.js.map +1 -0
  14. package/dist/commands/call.d.ts +7 -0
  15. package/dist/commands/call.d.ts.map +1 -0
  16. package/dist/commands/call.js +409 -0
  17. package/dist/commands/call.js.map +1 -0
  18. package/dist/commands/config.d.ts +4 -0
  19. package/dist/commands/config.d.ts.map +1 -0
  20. package/dist/commands/config.js +120 -0
  21. package/dist/commands/config.js.map +1 -0
  22. package/dist/commands/find-contact.d.ts +4 -0
  23. package/dist/commands/find-contact.d.ts.map +1 -0
  24. package/dist/commands/find-contact.js +57 -0
  25. package/dist/commands/find-contact.js.map +1 -0
  26. package/dist/commands/server.d.ts +7 -0
  27. package/dist/commands/server.d.ts.map +1 -0
  28. package/dist/commands/server.js +212 -0
  29. package/dist/commands/server.js.map +1 -0
  30. package/dist/index.d.ts +4 -0
  31. package/dist/index.d.ts.map +1 -0
  32. package/dist/index.js +3 -0
  33. package/dist/index.js.map +1 -0
  34. package/dist/lib/call/audio/mulaw.d.ts +35 -0
  35. package/dist/lib/call/audio/mulaw.d.ts.map +1 -0
  36. package/dist/lib/call/audio/mulaw.js +109 -0
  37. package/dist/lib/call/audio/mulaw.js.map +1 -0
  38. package/dist/lib/call/audio/pcm-utils.d.ts +62 -0
  39. package/dist/lib/call/audio/pcm-utils.d.ts.map +1 -0
  40. package/dist/lib/call/audio/pcm-utils.js +149 -0
  41. package/dist/lib/call/audio/pcm-utils.js.map +1 -0
  42. package/dist/lib/call/audio/resample.d.ts +34 -0
  43. package/dist/lib/call/audio/resample.d.ts.map +1 -0
  44. package/dist/lib/call/audio/resample.js +97 -0
  45. package/dist/lib/call/audio/resample.js.map +1 -0
  46. package/dist/lib/call/audio/streaming-decoder.d.ts +45 -0
  47. package/dist/lib/call/audio/streaming-decoder.d.ts.map +1 -0
  48. package/dist/lib/call/audio/streaming-decoder.js +110 -0
  49. package/dist/lib/call/audio/streaming-decoder.js.map +1 -0
  50. package/dist/lib/call/call-server.d.ts +110 -0
  51. package/dist/lib/call/call-server.d.ts.map +1 -0
  52. package/dist/lib/call/call-server.js +681 -0
  53. package/dist/lib/call/call-server.js.map +1 -0
  54. package/dist/lib/call/call-session.d.ts +133 -0
  55. package/dist/lib/call/call-session.d.ts.map +1 -0
  56. package/dist/lib/call/call-session.js +890 -0
  57. package/dist/lib/call/call-session.js.map +1 -0
  58. package/dist/lib/call/call-types.d.ts +133 -0
  59. package/dist/lib/call/call-types.d.ts.map +1 -0
  60. package/dist/lib/call/call-types.js +16 -0
  61. package/dist/lib/call/call-types.js.map +1 -0
  62. package/dist/lib/call/conversation-ai.d.ts +56 -0
  63. package/dist/lib/call/conversation-ai.d.ts.map +1 -0
  64. package/dist/lib/call/conversation-ai.js +276 -0
  65. package/dist/lib/call/conversation-ai.js.map +1 -0
  66. package/dist/lib/call/eval/codec-test.d.ts +45 -0
  67. package/dist/lib/call/eval/codec-test.d.ts.map +1 -0
  68. package/dist/lib/call/eval/codec-test.js +169 -0
  69. package/dist/lib/call/eval/codec-test.js.map +1 -0
  70. package/dist/lib/call/eval/conversation-scripts.d.ts +55 -0
  71. package/dist/lib/call/eval/conversation-scripts.d.ts.map +1 -0
  72. package/dist/lib/call/eval/conversation-scripts.js +359 -0
  73. package/dist/lib/call/eval/conversation-scripts.js.map +1 -0
  74. package/dist/lib/call/eval/eval-runner.d.ts +64 -0
  75. package/dist/lib/call/eval/eval-runner.d.ts.map +1 -0
  76. package/dist/lib/call/eval/eval-runner.js +369 -0
  77. package/dist/lib/call/eval/eval-runner.js.map +1 -0
  78. package/dist/lib/call/eval/index.d.ts +9 -0
  79. package/dist/lib/call/eval/index.d.ts.map +1 -0
  80. package/dist/lib/call/eval/index.js +9 -0
  81. package/dist/lib/call/eval/index.js.map +1 -0
  82. package/dist/lib/call/eval/integration-test-suite.d.ts +71 -0
  83. package/dist/lib/call/eval/integration-test-suite.d.ts.map +1 -0
  84. package/dist/lib/call/eval/integration-test-suite.js +519 -0
  85. package/dist/lib/call/eval/integration-test-suite.js.map +1 -0
  86. package/dist/lib/call/eval/turn-taking-test.d.ts +84 -0
  87. package/dist/lib/call/eval/turn-taking-test.d.ts.map +1 -0
  88. package/dist/lib/call/eval/turn-taking-test.js +260 -0
  89. package/dist/lib/call/eval/turn-taking-test.js.map +1 -0
  90. package/dist/lib/call/index.d.ts +12 -0
  91. package/dist/lib/call/index.d.ts.map +1 -0
  92. package/dist/lib/call/index.js +17 -0
  93. package/dist/lib/call/index.js.map +1 -0
  94. package/dist/lib/call/providers/deepgram.d.ts +81 -0
  95. package/dist/lib/call/providers/deepgram.d.ts.map +1 -0
  96. package/dist/lib/call/providers/deepgram.js +279 -0
  97. package/dist/lib/call/providers/deepgram.js.map +1 -0
  98. package/dist/lib/call/providers/elevenlabs.d.ts +78 -0
  99. package/dist/lib/call/providers/elevenlabs.d.ts.map +1 -0
  100. package/dist/lib/call/providers/elevenlabs.js +272 -0
  101. package/dist/lib/call/providers/elevenlabs.js.map +1 -0
  102. package/dist/lib/call/providers/local-deps.d.ts +18 -0
  103. package/dist/lib/call/providers/local-deps.d.ts.map +1 -0
  104. package/dist/lib/call/providers/local-deps.js +114 -0
  105. package/dist/lib/call/providers/local-deps.js.map +1 -0
  106. package/dist/lib/call/providers/twilio.d.ts +53 -0
  107. package/dist/lib/call/providers/twilio.d.ts.map +1 -0
  108. package/dist/lib/call/providers/twilio.js +173 -0
  109. package/dist/lib/call/providers/twilio.js.map +1 -0
  110. package/dist/lib/concierge-client-types.d.ts +68 -0
  111. package/dist/lib/concierge-client-types.d.ts.map +1 -0
  112. package/dist/lib/concierge-client-types.js +2 -0
  113. package/dist/lib/concierge-client-types.js.map +1 -0
  114. package/dist/lib/concierge-client.d.ts +29 -0
  115. package/dist/lib/concierge-client.d.ts.map +1 -0
  116. package/dist/lib/concierge-client.js +534 -0
  117. package/dist/lib/concierge-client.js.map +1 -0
  118. package/dist/lib/config.d.ts +9 -0
  119. package/dist/lib/config.d.ts.map +1 -0
  120. package/dist/lib/config.js +66 -0
  121. package/dist/lib/config.js.map +1 -0
  122. package/dist/lib/output.d.ts +7 -0
  123. package/dist/lib/output.d.ts.map +1 -0
  124. package/dist/lib/output.js +114 -0
  125. package/dist/lib/output.js.map +1 -0
  126. package/dist/lib/utils/contact-extractor.d.ts +12 -0
  127. package/dist/lib/utils/contact-extractor.d.ts.map +1 -0
  128. package/dist/lib/utils/contact-extractor.js +159 -0
  129. package/dist/lib/utils/contact-extractor.js.map +1 -0
  130. package/dist/lib/utils/formatters.d.ts +15 -0
  131. package/dist/lib/utils/formatters.d.ts.map +1 -0
  132. package/dist/lib/utils/formatters.js +107 -0
  133. package/dist/lib/utils/formatters.js.map +1 -0
  134. package/dist/lib/utils/url-parser.d.ts +11 -0
  135. package/dist/lib/utils/url-parser.d.ts.map +1 -0
  136. package/dist/lib/utils/url-parser.js +103 -0
  137. package/dist/lib/utils/url-parser.js.map +1 -0
  138. package/package.json +67 -0
@@ -0,0 +1,890 @@
1
+ /**
2
+ * Call session management
3
+ * Handles a single phone call with audio streaming, transcription, and synthesis
4
+ */
5
+ import { EventEmitter } from 'node:events';
6
+ import { mulawToPcm } from './audio/mulaw.js';
7
+ import { calculateRms } from './audio/pcm-utils.js';
8
+ import { createStreamingDecoder } from './audio/streaming-decoder.js';
9
+ import { ConversationAI, extractMostRecentQuestion, isLikelyShortAcknowledgement } from './conversation-ai.js';
10
+ import { createPhoneCallSTT } from './providers/deepgram.js';
11
+ import { createPhoneCallTTS, ElevenLabsApiError } from './providers/elevenlabs.js';
12
+ import { hangupCall } from './providers/twilio.js';
13
// Configurable timing constants (all durations in milliseconds unless noted)
const GREETING_DELAY_MS = 250; // initial pause before the AI greets
const CALL_COMPLETION_DELAY_MS = 3000; // grace period before hangup after completion
const POST_TTS_STT_SUPPRESSION_MS = 900; // echo window ignored after AI audio ends
const PRE_GREETING_IDLE_MS = 700; // required remote silence before greeting
const MAX_BUFFERED_STT_CHUNKS = 500; // cap on PCM chunks held while STT connects
const PRE_GREETING_VAD_RMS_THRESHOLD = 0.015; // RMS level treated as speech pre-greeting
const PRE_GREETING_VAD_MIN_CONSECUTIVE_CHUNKS = 2; // chunks above threshold to count as speech
const MAX_GREETING_DEFERRAL_MS = 2000; // hard cap on deferring the greeting
const TTS_EMPTY_AUDIO_MAX_RETRIES = 1; // retries when synthesis yields no audio
const TTS_EMPTY_AUDIO_RETRY_DELAY_MS = 200; // pause between empty-audio retries
const TTS_DECODER_FLUSH_GRACE_MS = 250; // wait for decoder lag before declaring empty output
25
+ export class CallSession extends EventEmitter {
26
+ callId;
27
+ config;
28
+ state;
29
+ stt = null;
30
+ tts = null;
31
+ mediaWs = null;
32
+ streamSid = null;
33
+ audioQueue = [];
34
+ isPlaying = false;
35
+ isSpeaking = false;
36
+ conversationAI;
37
+ isProcessingResponse = false; // Prevent overlapping responses
38
+ decoder = null; // ffmpeg MP3 → µ-law
39
+ decoderGeneration = 0; // Track decoder generations to avoid race conditions
40
+ sessionStartTime = Date.now(); // For timestamps
41
+ responseDebounceTimer = null; // Debounce rapid transcripts
42
+ pendingTranscript = ''; // Accumulated transcript before responding
43
+ hangupTimer = null; // Timer for delayed hangup
44
+ greetingTimer = null; // Timer for delayed initial greeting
45
+ cleanedUp = false; // Prevent multiple cleanup calls
46
+ endedEmitted = false;
47
+ suppressSttUntilMs = 0; // Prevent echo from AI audio being transcribed as human speech
48
+ sttTimelineStartMs = 0; // Wall-clock anchor for Deepgram word timestamps
49
+ greetingStarted = false;
50
+ lastInboundTranscriptAtMs = 0;
51
+ lastInboundAudioActivityAtMs = 0;
52
+ consecutiveInboundSpeechChunks = 0;
53
+ callConnectedAtMs = 0;
54
+ bufferedSttAudio = [];
55
+ greetingPrefetchPromise = null;
56
+ // Event handler references for cleanup
57
+ sttHandlers = [];
58
+ ttsHandlers = [];
59
+ mediaWsHandlers = [];
60
+ /** Log with timestamp showing ms since session start */
61
+ log(message) {
62
+ const elapsed = Date.now() - this.sessionStartTime;
63
+ console.log(`[${elapsed.toString().padStart(6)}ms] ${message}`);
64
+ }
65
+ formatError(error) {
66
+ if (error instanceof Error) {
67
+ return error.message;
68
+ }
69
+ return String(error);
70
+ }
71
+ isElevenLabsQuotaExceeded(error) {
72
+ if (error instanceof ElevenLabsApiError) {
73
+ return error.isQuotaExceeded;
74
+ }
75
+ const text = this.formatError(error).toLowerCase();
76
+ return text.includes('quota_exceeded');
77
+ }
78
+ getTTSOperatorMessage(error) {
79
+ if (this.isElevenLabsQuotaExceeded(error)) {
80
+ return 'ElevenLabs quota exceeded: TTS cannot generate audio. Top up ElevenLabs credits and retry the call.';
81
+ }
82
+ return `TTS failed: ${this.formatError(error)}`;
83
+ }
84
+ isEmptyTtsAudioError(error) {
85
+ const message = this.formatError(error).toLowerCase();
86
+ return message.includes('tts produced no audio output');
87
+ }
88
+ constructor(callId, config, phoneNumber, goal, context) {
89
+ super();
90
+ this.callId = callId;
91
+ this.config = config;
92
+ this.state = {
93
+ callId,
94
+ phoneNumber,
95
+ goal,
96
+ context,
97
+ status: 'initiating',
98
+ startedAt: new Date(),
99
+ transcript: [],
100
+ };
101
+ // Initialize conversation AI
102
+ this.conversationAI = new ConversationAI({
103
+ apiKey: config.anthropicApiKey,
104
+ goal,
105
+ context,
106
+ });
107
+ }
108
+ /**
109
+ * Initialize the session when Twilio media stream connects
110
+ * @param ws - WebSocket connection
111
+ * @param startMessage - The 'start' event message from Twilio (already received by server)
112
+ */
113
+ async initializeMediaStream(ws, startMessage) {
114
+ this.log(`[Session ${this.callId}] Initializing media stream...`);
115
+ this.mediaWs = ws;
116
+ // IMPORTANT: Attach WebSocket handlers FIRST, before awaiting STT connection.
117
+ // This ensures we don't drop Twilio media frames that arrive during Deepgram startup.
118
+ let mediaMessageCount = 0;
119
+ const wsMessageHandler = (data) => {
120
+ try {
121
+ const msg = JSON.parse(data.toString());
122
+ // Log every message type received (for debugging)
123
+ if (msg.event === 'media') {
124
+ mediaMessageCount++;
125
+ if (mediaMessageCount <= 5 || mediaMessageCount % 100 === 0) {
126
+ this.log(`[Twilio] Media #${mediaMessageCount}, track: ${msg.media?.track}`);
127
+ }
128
+ }
129
+ else {
130
+ this.log(`[Twilio] Event: ${msg.event}`);
131
+ }
132
+ this.handleTwilioMessage(msg);
133
+ }
134
+ catch (err) {
135
+ this.log(`[Twilio] Parse error: ${err}`);
136
+ }
137
+ };
138
+ const wsCloseHandler = () => {
139
+ this.handleMediaStreamClose();
140
+ };
141
+ const wsErrorHandler = (err) => {
142
+ this.log(`[Twilio] WebSocket error: ${err}`);
143
+ };
144
+ ws.on('message', wsMessageHandler);
145
+ ws.on('close', wsCloseHandler);
146
+ ws.on('error', wsErrorHandler);
147
+ this.mediaWsHandlers = [
148
+ { event: 'message', handler: wsMessageHandler },
149
+ { event: 'close', handler: wsCloseHandler },
150
+ { event: 'error', handler: wsErrorHandler },
151
+ ];
152
+ // NOTE: Do NOT process startMessage here. It must be processed AFTER TTS is ready,
153
+ // because the 'start' event triggers a delayed greeting that requires TTS.
154
+ // See end of this method.
155
+ // Initialize STT
156
+ this.log('[STT] Connecting to Deepgram...');
157
+ this.stt = createPhoneCallSTT(this.config.deepgramApiKey);
158
+ // Store event handlers for cleanup
159
+ const sttTranscriptHandler = (result) => {
160
+ this.handleTranscript(result);
161
+ };
162
+ const sttErrorHandler = (err) => this.log(`[STT] Error: ${err.message}`);
163
+ const sttOpenHandler = () => this.log('[STT] Deepgram connected');
164
+ const sttCloseHandler = () => this.log('[STT] Deepgram disconnected');
165
+ this.stt.on('transcript', sttTranscriptHandler);
166
+ this.stt.on('error', sttErrorHandler);
167
+ this.stt.on('open', sttOpenHandler);
168
+ this.stt.on('close', sttCloseHandler);
169
+ this.sttHandlers = [
170
+ { event: 'transcript', handler: sttTranscriptHandler },
171
+ { event: 'error', handler: sttErrorHandler },
172
+ { event: 'open', handler: sttOpenHandler },
173
+ { event: 'close', handler: sttCloseHandler },
174
+ ];
175
+ this.connectSttInBackground();
176
+ // Initialize TTS with streaming conversion
177
+ this.log('[TTS] Setting up ElevenLabs (streaming mode)...');
178
+ this.tts = createPhoneCallTTS(this.config.elevenLabsApiKey, this.config.elevenLabsVoiceId);
179
+ let ttsChunks = 0;
180
+ let ttsBytes = 0;
181
+ // Store event handlers for cleanup
182
+ const ttsAudioHandler = (chunk, requestId) => {
183
+ if (requestId !== undefined && requestId !== this.decoderGeneration) {
184
+ return;
185
+ }
186
+ ttsChunks++;
187
+ ttsBytes += chunk.length;
188
+ if (ttsChunks === 1) {
189
+ this.log(`[TTS] First audio chunk: ${chunk.length} bytes`);
190
+ }
191
+ // ElevenLabs returns MP3 (regardless of output_format requested!)
192
+ // Stream through ffmpeg decoder to convert to µ-law
193
+ if (this.decoder?.running) {
194
+ this.decoder.write(chunk);
195
+ }
196
+ };
197
+ const ttsDoneHandler = (requestId) => {
198
+ if (requestId !== undefined && requestId !== this.decoderGeneration) {
199
+ return;
200
+ }
201
+ this.log(`[TTS] Stream complete: ${ttsChunks} chunks, ${ttsBytes} bytes total, flushing decoder`);
202
+ // Signal end of input to ffmpeg - it will flush remaining audio
203
+ this.decoder?.end();
204
+ };
205
+ const ttsErrorHandler = (err, requestId) => {
206
+ if (requestId !== undefined && requestId !== this.decoderGeneration) {
207
+ return;
208
+ }
209
+ this.log(`[TTS] Error: ${this.formatError(err)}`);
210
+ };
211
+ this.tts.on('audio', ttsAudioHandler);
212
+ this.tts.on('done', ttsDoneHandler);
213
+ this.tts.on('error', ttsErrorHandler);
214
+ this.ttsHandlers = [
215
+ { event: 'audio', handler: ttsAudioHandler },
216
+ { event: 'done', handler: ttsDoneHandler },
217
+ { event: 'error', handler: ttsErrorHandler },
218
+ ];
219
+ this.log('[TTS] Ready (streaming)');
220
+ // Process the start message AFTER TTS is ready.
221
+ // The 'start' event triggers a delayed greeting that requires TTS to be initialized.
222
+ // WebSocket handlers were attached early to capture media frames during STT connect,
223
+ // but the startMessage must be processed here to ensure greeting works.
224
+ if (startMessage) {
225
+ this.log('[Session] Processing initial start message');
226
+ this.handleTwilioMessage(startMessage);
227
+ }
228
+ }
229
+ /**
230
+ * Handle incoming Twilio media stream messages
231
+ */
232
+ handleTwilioMessage(msg) {
233
+ switch (msg.event) {
234
+ case 'connected':
235
+ this.log('[Twilio] Media stream connected');
236
+ break;
237
+ case 'start':
238
+ if (msg.start) {
239
+ this.streamSid = msg.start.streamSid;
240
+ this.state.callSid = msg.start.callSid;
241
+ this.callConnectedAtMs = Date.now();
242
+ this.log(`[Session] Stream started - streamSid: ${this.streamSid}`);
243
+ this.updateStatus('in-progress');
244
+ this.emitMessage({ type: 'call_connected', callId: this.callId });
245
+ this.prefetchGreeting();
246
+ // Send AI-generated greeting after a short delay to ensure audio is ready.
247
+ // If the remote party speaks first (common with IVRs), we delay greeting.
248
+ this.scheduleInitialGreeting(GREETING_DELAY_MS);
249
+ }
250
+ break;
251
+ case 'media':
252
+ if (msg.media?.payload) {
253
+ // Convert mulaw to PCM and send to STT (accept any track for now)
254
+ const mulaw = Buffer.from(msg.media.payload, 'base64');
255
+ const pcm = mulawToPcm(mulaw);
256
+ if (!this.greetingStarted && !this.isSpeaking) {
257
+ this.trackInboundSpeechActivity(pcm);
258
+ }
259
+ if (this.stt?.connected) {
260
+ this.stt.sendAudio(pcm);
261
+ }
262
+ else {
263
+ if (this.bufferedSttAudio.length >= MAX_BUFFERED_STT_CHUNKS) {
264
+ this.bufferedSttAudio.shift();
265
+ }
266
+ this.bufferedSttAudio.push(pcm);
267
+ }
268
+ }
269
+ break;
270
+ case 'stop':
271
+ this.handleMediaStreamClose();
272
+ break;
273
+ case 'mark':
274
+ // Audio playback marker - can be used for timing
275
+ if (msg.mark?.name === 'audio_done') {
276
+ this.isPlaying = false;
277
+ this.flushAudioQueue();
278
+ }
279
+ break;
280
+ }
281
+ }
282
+ // How long to wait after final transcript before responding (ms)
283
+ // This allows the human to pause mid-sentence without being interrupted
284
+ // 1000ms = 1 second of silence before AI responds
285
+ static RESPONSE_DEBOUNCE_MS = 1000;
286
+ /**
287
+ * Handle transcription results from Deepgram
288
+ */
289
+ handleTranscript(result) {
290
+ const text = result.text.trim();
291
+ if (!text)
292
+ return;
293
+ // IVRs often speak immediately after answer; avoid talking over them.
294
+ if (!this.greetingStarted) {
295
+ this.lastInboundTranscriptAtMs = Date.now();
296
+ }
297
+ const transcriptEndMs = this.getTranscriptEndTimestampMs(result);
298
+ if (transcriptEndMs !== undefined && transcriptEndMs <= this.suppressSttUntilMs) {
299
+ this.log(`[STT] Ignoring likely overlap transcript by word timing: "${result.text}"`);
300
+ return;
301
+ }
302
+ // Prevent AI voice playback/echo from being treated as human speech.
303
+ // This intentionally drops barge-in during playback to avoid self-transcription loops.
304
+ if (this.isSpeaking || Date.now() < this.suppressSttUntilMs) {
305
+ this.log(`[STT] Ignoring likely echo while AI audio is active: "${text}"`);
306
+ return;
307
+ }
308
+ // Emit interim transcript events (for UI feedback)
309
+ // But DON'T add to state.transcript yet - wait for debounce to combine segments
310
+ this.emitMessage({
311
+ type: 'transcript',
312
+ callId: this.callId,
313
+ text,
314
+ role: 'human',
315
+ isFinal: result.isFinal,
316
+ });
317
+ // For final transcripts, use debouncing to combine segments and avoid interrupting
318
+ if (result.isFinal) {
319
+ // Cancel any pending response timer
320
+ if (this.responseDebounceTimer) {
321
+ clearTimeout(this.responseDebounceTimer);
322
+ this.log(`[Turn] More speech detected, extending wait...`);
323
+ }
324
+ // Accumulate transcript segments
325
+ if (this.pendingTranscript) {
326
+ this.pendingTranscript += ` ${text}`;
327
+ }
328
+ else {
329
+ this.pendingTranscript = text;
330
+ }
331
+ this.log(`[Turn] Accumulated: "${this.pendingTranscript}"`);
332
+ // If already processing a response, don't queue another
333
+ if (this.isProcessingResponse) {
334
+ this.log(`[Turn] AI still speaking, will respond after`);
335
+ return;
336
+ }
337
+ // Start debounce timer - wait for more speech or timeout
338
+ this.log(`[Turn] Waiting ${CallSession.RESPONSE_DEBOUNCE_MS}ms for more speech...`);
339
+ this.responseDebounceTimer = setTimeout(() => {
340
+ this.responseDebounceTimer = null;
341
+ const fullTranscript = this.pendingTranscript.trim();
342
+ this.pendingTranscript = '';
343
+ if (fullTranscript && !this.isProcessingResponse) {
344
+ // NOW add the combined transcript to state
345
+ const entry = {
346
+ role: 'human',
347
+ text: fullTranscript,
348
+ timestamp: new Date(),
349
+ isFinal: true,
350
+ };
351
+ this.state.transcript.push(entry);
352
+ this.log(`[Turn] Silence confirmed, responding to: "${fullTranscript}"`);
353
+ this.generateAIResponse(fullTranscript);
354
+ }
355
+ }, CallSession.RESPONSE_DEBOUNCE_MS);
356
+ }
357
+ }
358
+ trackInboundSpeechActivity(pcm) {
359
+ const rms = calculateRms(pcm);
360
+ if (rms >= PRE_GREETING_VAD_RMS_THRESHOLD) {
361
+ this.consecutiveInboundSpeechChunks++;
362
+ if (this.consecutiveInboundSpeechChunks >= PRE_GREETING_VAD_MIN_CONSECUTIVE_CHUNKS) {
363
+ this.lastInboundAudioActivityAtMs = Date.now();
364
+ }
365
+ }
366
+ else {
367
+ this.consecutiveInboundSpeechChunks = 0;
368
+ }
369
+ }
370
+ scheduleInitialGreeting(delayMs) {
371
+ if (this.greetingStarted || this.cleanedUp)
372
+ return;
373
+ if (this.greetingTimer) {
374
+ clearTimeout(this.greetingTimer);
375
+ }
376
+ this.greetingTimer = setTimeout(() => {
377
+ this.greetingTimer = null;
378
+ this.sendInitialGreeting().catch((err) => {
379
+ this.log(`[AI] Greeting error: ${this.formatError(err)}`);
380
+ });
381
+ }, delayMs);
382
+ }
383
+ connectSttInBackground() {
384
+ void (async () => {
385
+ if (!this.stt)
386
+ return;
387
+ try {
388
+ await this.stt.connect();
389
+ if (this.cleanedUp || !this.stt)
390
+ return;
391
+ this.log('[STT] Connection established');
392
+ this.sttTimelineStartMs = Date.now();
393
+ if (this.bufferedSttAudio.length > 0) {
394
+ this.log(`[STT] Flushing ${this.bufferedSttAudio.length} buffered audio chunk(s)`);
395
+ for (const pcm of this.bufferedSttAudio) {
396
+ this.stt.sendAudio(pcm);
397
+ }
398
+ this.bufferedSttAudio = [];
399
+ }
400
+ }
401
+ catch (err) {
402
+ if (this.cleanedUp)
403
+ return;
404
+ const message = this.formatError(err);
405
+ this.log(`[STT] Connection failed: ${message}`);
406
+ this.emitMessage({
407
+ type: 'error',
408
+ callId: this.callId,
409
+ message: `STT connection failed: ${message}`,
410
+ });
411
+ }
412
+ })();
413
+ }
414
+ prefetchGreeting() {
415
+ if (this.greetingPrefetchPromise)
416
+ return;
417
+ this.greetingPrefetchPromise = this.conversationAI
418
+ .getGreeting()
419
+ .then((greeting) => greeting.trim() || null)
420
+ .catch((err) => {
421
+ this.log(`[AI] Greeting prefetch failed: ${this.formatError(err)}`);
422
+ return null;
423
+ });
424
+ }
425
+ async sendInitialGreeting() {
426
+ if (this.greetingStarted || this.cleanedUp)
427
+ return;
428
+ const lastInboundActivityAtMs = Math.max(this.lastInboundTranscriptAtMs, this.lastInboundAudioActivityAtMs);
429
+ if (lastInboundActivityAtMs) {
430
+ const elapsed = Date.now() - lastInboundActivityAtMs;
431
+ const callElapsed = this.callConnectedAtMs ? Date.now() - this.callConnectedAtMs : 0;
432
+ if (elapsed < PRE_GREETING_IDLE_MS) {
433
+ if (callElapsed >= MAX_GREETING_DEFERRAL_MS) {
434
+ this.log('[AI] Greeting deferral timeout reached; proceeding');
435
+ }
436
+ else {
437
+ this.log(`[AI] Deferring greeting; remote speech detected ${elapsed}ms ago`);
438
+ this.scheduleInitialGreeting(PRE_GREETING_IDLE_MS - elapsed);
439
+ return;
440
+ }
441
+ }
442
+ }
443
+ if (this.pendingTranscript || this.state.transcript.some((entry) => entry.role === 'human') || this.isProcessingResponse) {
444
+ this.log('[AI] Skipping initial greeting because remote party spoke first');
445
+ this.greetingStarted = true;
446
+ return;
447
+ }
448
+ this.greetingStarted = true;
449
+ try {
450
+ this.log('[AI] Generating greeting...');
451
+ const prefetchedGreeting = this.greetingPrefetchPromise ? await this.greetingPrefetchPromise : null;
452
+ const greeting = prefetchedGreeting ?? (await this.conversationAI.getGreeting());
453
+ this.log(`[AI] Greeting: "${greeting}"`);
454
+ await this.speak(greeting);
455
+ }
456
+ catch (err) {
457
+ const message = this.formatError(err);
458
+ this.log(`[AI] Greeting error: ${message}`);
459
+ this.emitMessage({
460
+ type: 'error',
461
+ callId: this.callId,
462
+ message: this.isElevenLabsQuotaExceeded(err)
463
+ ? this.getTTSOperatorMessage(err)
464
+ : `Greeting generation failed: ${message}`,
465
+ });
466
+ if (this.isElevenLabsQuotaExceeded(err)) {
467
+ await this.hangup();
468
+ return;
469
+ }
470
+ // Fallback to basic greeting.
471
+ const fallback = `Hello! I'm calling about: ${this.state.goal}`;
472
+ this.speak(fallback).catch((fallbackErr) => {
473
+ const fallbackMessage = this.getTTSOperatorMessage(fallbackErr);
474
+ this.log(`[AI] Fallback error: ${fallbackMessage}`);
475
+ this.emitMessage({
476
+ type: 'error',
477
+ callId: this.callId,
478
+ message: fallbackMessage,
479
+ });
480
+ });
481
+ }
482
+ }
483
+ getTranscriptEndTimestampMs(result) {
484
+ if (!this.sttTimelineStartMs || !result.words || result.words.length === 0) {
485
+ return undefined;
486
+ }
487
+ const maxWordEndSeconds = result.words.reduce((max, word) => Math.max(max, word.end), 0);
488
+ return this.sttTimelineStartMs + Math.round(maxWordEndSeconds * 1000);
489
+ }
490
+ /**
491
+ * Generate and speak AI response to human speech
492
+ */
493
+ async generateAIResponse(humanSaid) {
494
+ if (this.conversationAI.complete) {
495
+ this.log('[AI] Conversation already complete, ignoring');
496
+ return;
497
+ }
498
+ this.isProcessingResponse = true;
499
+ const responseStart = Date.now();
500
+ try {
501
+ this.log(`[AI] Generating response to: "${humanSaid}"`);
502
+ const lastAssistantUtterance = this.getLastAssistantUtterance();
503
+ const shortAcknowledgement = isLikelyShortAcknowledgement(humanSaid);
504
+ const lastAssistantQuestion = lastAssistantUtterance
505
+ ? extractMostRecentQuestion(lastAssistantUtterance)
506
+ : undefined;
507
+ const response = await this.conversationAI.respond(humanSaid, {
508
+ shortAcknowledgement,
509
+ lastAssistantUtterance,
510
+ lastAssistantQuestion,
511
+ });
512
+ this.log(`[AI] Response ready (${Date.now() - responseStart}ms, ${response?.length || 0} chars)`);
513
+ if (response === null) {
514
+ // Conversation is complete
515
+ this.log('[AI] Conversation complete');
516
+ await this.hangup();
517
+ return;
518
+ }
519
+ this.log(`[TTS] Speaking: "${response.substring(0, 50)}..."`);
520
+ await this.speak(response);
521
+ this.log(`[TTS] Speech complete (${Date.now() - responseStart}ms total)`);
522
+ // Check if AI marked conversation complete (handled internally)
523
+ if (this.conversationAI.complete) {
524
+ this.log(`[AI] Marked complete, ending call in ${CALL_COMPLETION_DELAY_MS}ms`);
525
+ // Give a moment for the final response to be spoken
526
+ // Clear any existing hangup timer first
527
+ if (this.hangupTimer) {
528
+ clearTimeout(this.hangupTimer);
529
+ }
530
+ this.hangupTimer = setTimeout(() => {
531
+ this.hangupTimer = null;
532
+ if (!this.cleanedUp) {
533
+ this.hangup().catch((err) => this.log(`[Hangup] Error: ${err}`));
534
+ }
535
+ }, CALL_COMPLETION_DELAY_MS);
536
+ }
537
+ }
538
+ catch (err) {
539
+ const message = this.formatError(err);
540
+ this.log(`[AI] Response error: ${message}`);
541
+ this.emitMessage({
542
+ type: 'error',
543
+ callId: this.callId,
544
+ message: this.isElevenLabsQuotaExceeded(err)
545
+ ? this.getTTSOperatorMessage(err)
546
+ : `AI response failed: ${message}`,
547
+ });
548
+ if (this.isElevenLabsQuotaExceeded(err)) {
549
+ await this.hangup();
550
+ return;
551
+ }
552
+ // Try to recover with a fallback
553
+ try {
554
+ await this.speak("I'm sorry, could you repeat that?");
555
+ }
556
+ catch (fallbackErr) {
557
+ const fallbackMessage = this.getTTSOperatorMessage(fallbackErr);
558
+ this.log(`[AI] Fallback response failed: ${fallbackMessage}`);
559
+ this.emitMessage({
560
+ type: 'error',
561
+ callId: this.callId,
562
+ message: fallbackMessage,
563
+ });
564
+ }
565
+ }
566
+ finally {
567
+ this.isProcessingResponse = false;
568
+ }
569
+ }
570
+ getLastAssistantUtterance() {
571
+ for (let i = this.state.transcript.length - 1; i >= 0; i--) {
572
+ const entry = this.state.transcript[i];
573
+ if (entry.role === 'assistant' && entry.text.trim()) {
574
+ return entry.text.trim();
575
+ }
576
+ }
577
+ return undefined;
578
+ }
579
+ /**
580
+ * Speak text using TTS
581
+ */
582
+ async speak(text) {
583
+ this.log(`[TTS] speak() called: "${text.substring(0, 50)}..."`);
584
+ if (!this.tts || !this.mediaWs) {
585
+ this.log(`[TTS] speak() failed: not initialized (tts: ${!!this.tts}, ws: ${!!this.mediaWs})`);
586
+ throw new Error('Session not initialized');
587
+ }
588
+ // Cancel any ongoing speech
589
+ if (this.isSpeaking) {
590
+ this.tts.cancel();
591
+ this.decoder?.stop();
592
+ }
593
+ // Reset streaming state
594
+ this.audioQueue = [];
595
+ for (let attempt = 0; attempt <= TTS_EMPTY_AUDIO_MAX_RETRIES; attempt++) {
596
+ this.isSpeaking = true;
597
+ // Increment generation to track which decoder is current
598
+ this.decoderGeneration++;
599
+ const currentGeneration = this.decoderGeneration;
600
+ // Start ffmpeg decoder to convert MP3 → µ-law
601
+ // (ElevenLabs always returns MP3 regardless of output_format requested)
602
+ this.decoder = createStreamingDecoder();
603
+ let resolveFirstChunk = null;
604
+ const firstChunkPromise = new Promise((resolve) => {
605
+ resolveFirstChunk = resolve;
606
+ });
607
+ let decoderChunks = 0;
608
+ let decoderBytes = 0;
609
+ this.decoder.on('data', (mulaw) => {
610
+ // Only process data if this is still the current decoder
611
+ if (currentGeneration !== this.decoderGeneration)
612
+ return;
613
+ decoderChunks++;
614
+ decoderBytes += mulaw.length;
615
+ if (decoderChunks === 1) {
616
+ this.log(`[Decoder] First chunk: ${mulaw.length} bytes, first 4 bytes: ${mulaw.slice(0, 4).toString('hex')}`);
617
+ resolveFirstChunk?.();
618
+ resolveFirstChunk = null;
619
+ }
620
+ if (decoderChunks % 10 === 0) {
621
+ this.log(`[Decoder] ${decoderChunks} chunks, ${decoderBytes} bytes total`);
622
+ }
623
+ // Send µ-law directly to Twilio as it's decoded
624
+ this.sendAudioToTwilio(mulaw);
625
+ });
626
+ this.decoder.on('close', () => {
627
+ // Only update isSpeaking if this is the current decoder
628
+ if (currentGeneration === this.decoderGeneration) {
629
+ this.log('[Decoder] Closed, speech complete');
630
+ this.isSpeaking = false;
631
+ this.suppressSttUntilMs = Date.now() + POST_TTS_STT_SUPPRESSION_MS;
632
+ }
633
+ });
634
+ this.decoder.on('error', (err) => {
635
+ this.log(`[Decoder] Error: ${err}`);
636
+ });
637
+ this.decoder.start();
638
+ try {
639
+ // Start TTS
640
+ await this.tts.speak(text, currentGeneration);
641
+ if (decoderChunks === 0) {
642
+ // The decoder can lag behind the TTS "done" signal slightly; wait briefly before declaring empty output.
643
+ await Promise.race([
644
+ firstChunkPromise,
645
+ new Promise((resolve) => setTimeout(resolve, TTS_DECODER_FLUSH_GRACE_MS)),
646
+ ]);
647
+ if (decoderChunks === 0) {
648
+ throw new Error('TTS produced no audio output (decoder emitted 0 chunks)');
649
+ }
650
+ }
651
+ // Add to transcript only after TTS succeeds.
652
+ const entry = {
653
+ role: 'assistant',
654
+ text,
655
+ timestamp: new Date(),
656
+ isFinal: true,
657
+ };
658
+ this.state.transcript.push(entry);
659
+ // Emit transcript event only when audio was actually produced.
660
+ this.emitMessage({
661
+ type: 'transcript',
662
+ callId: this.callId,
663
+ text,
664
+ role: 'assistant',
665
+ isFinal: true,
666
+ });
667
+ return;
668
+ }
669
+ catch (err) {
670
+ // Ensure we don't leave a stalled decoder when synthesis fails.
671
+ if (currentGeneration === this.decoderGeneration) {
672
+ this.decoder?.stop();
673
+ this.isSpeaking = false;
674
+ }
675
+ const canRetry = this.isEmptyTtsAudioError(err) && attempt < TTS_EMPTY_AUDIO_MAX_RETRIES;
676
+ if (!canRetry) {
677
+ throw err;
678
+ }
679
+ const retryCount = attempt + 1;
680
+ this.log(`[TTS] Empty audio output, retrying synthesis (${retryCount}/${TTS_EMPTY_AUDIO_MAX_RETRIES})`);
681
+ this.tts.cancel();
682
+ await new Promise((resolve) => setTimeout(resolve, TTS_EMPTY_AUDIO_RETRY_DELAY_MS));
683
+ }
684
+ }
685
+ throw new Error('TTS failed after retry attempts');
686
+ }
687
+ /**
688
+ * Send queued audio to Twilio
689
+ */
690
+ flushAudioQueue() {
691
+ if (!this.mediaWs || !this.streamSid || this.isPlaying || this.audioQueue.length === 0) {
692
+ return;
693
+ }
694
+ this.log(`[Audio] Flushing ${this.audioQueue.length} chunks to Twilio`);
695
+ this.isPlaying = true;
696
+ // Send all queued audio
697
+ let totalBytes = 0;
698
+ while (this.audioQueue.length > 0) {
699
+ const audio = this.audioQueue.shift();
700
+ if (audio) {
701
+ this.sendAudioToTwilio(audio);
702
+ totalBytes += audio.length;
703
+ }
704
+ }
705
+ this.log(`[Audio] Sent ${totalBytes} bytes total`);
706
+ // Send mark to know when audio is done
707
+ this.sendMarkToTwilio('audio_done');
708
+ }
709
+ /**
710
+ * Send audio data to Twilio media stream
711
+ */
712
+ sendAudioToTwilio(mulaw) {
713
+ if (!this.mediaWs || !this.streamSid) {
714
+ return;
715
+ }
716
+ const msg = {
717
+ event: 'media',
718
+ streamSid: this.streamSid,
719
+ media: {
720
+ payload: mulaw.toString('base64'),
721
+ },
722
+ };
723
+ try {
724
+ this.mediaWs.send(JSON.stringify(msg));
725
+ }
726
+ catch (err) {
727
+ this.log(`[Audio] Send error: ${err}`);
728
+ }
729
+ }
730
+ /**
731
+ * Send a mark event to track audio playback
732
+ */
733
+ sendMarkToTwilio(name) {
734
+ if (!this.mediaWs || !this.streamSid)
735
+ return;
736
+ const msg = {
737
+ event: 'mark',
738
+ streamSid: this.streamSid,
739
+ mark: { name },
740
+ };
741
+ try {
742
+ this.mediaWs.send(JSON.stringify(msg));
743
+ }
744
+ catch (err) {
745
+ console.error('[Media WS] Mark error:', err);
746
+ }
747
+ }
748
+ /**
749
+ * Hang up the call
750
+ */
751
+ async hangup() {
752
+ this.log('[Session] Hanging up...');
753
+ if (this.state.callSid) {
754
+ try {
755
+ await hangupCall(this.config, this.state.callSid);
756
+ }
757
+ catch (err) {
758
+ this.log(`[Hangup] Error: ${err}`);
759
+ }
760
+ }
761
+ this.cleanup();
762
+ this.updateStatus('completed');
763
+ this.emitEnded();
764
+ }
765
+ endFromProviderStatus(status) {
766
+ if (this.endedEmitted)
767
+ return;
768
+ this.log(`[Session] Ending from provider status: ${status}`);
769
+ this.updateStatus(status);
770
+ this.cleanup();
771
+ this.emitEnded();
772
+ }
773
+ /**
774
+ * Handle media stream close
775
+ */
776
+ handleMediaStreamClose() {
777
+ this.cleanup();
778
+ if (this.state.status === 'in-progress') {
779
+ this.updateStatus('completed');
780
+ }
781
+ this.emitEnded();
782
+ }
783
+ /**
784
+ * Clean up resources
785
+ */
786
+ cleanup() {
787
+ if (this.cleanedUp)
788
+ return;
789
+ this.cleanedUp = true;
790
+ // Clear timers
791
+ if (this.responseDebounceTimer) {
792
+ clearTimeout(this.responseDebounceTimer);
793
+ this.responseDebounceTimer = null;
794
+ }
795
+ if (this.hangupTimer) {
796
+ clearTimeout(this.hangupTimer);
797
+ this.hangupTimer = null;
798
+ }
799
+ if (this.greetingTimer) {
800
+ clearTimeout(this.greetingTimer);
801
+ this.greetingTimer = null;
802
+ }
803
+ // Stop decoder
804
+ if (this.decoder) {
805
+ this.decoder.stop();
806
+ this.decoder = null;
807
+ }
808
+ // Remove STT event listeners and close
809
+ if (this.stt) {
810
+ for (const { event, handler } of this.sttHandlers) {
811
+ this.stt.removeListener(event, handler);
812
+ }
813
+ this.stt.close();
814
+ this.stt = null;
815
+ }
816
+ this.sttHandlers = [];
817
+ // Remove TTS event listeners and cancel
818
+ if (this.tts) {
819
+ for (const { event, handler } of this.ttsHandlers) {
820
+ this.tts.removeListener(event, handler);
821
+ }
822
+ this.tts.cancel();
823
+ this.tts = null;
824
+ }
825
+ this.ttsHandlers = [];
826
+ // Remove WebSocket event listeners
827
+ if (this.mediaWs) {
828
+ for (const { event, handler } of this.mediaWsHandlers) {
829
+ this.mediaWs.removeListener(event, handler);
830
+ }
831
+ this.mediaWs = null;
832
+ }
833
+ this.mediaWsHandlers = [];
834
+ this.streamSid = null;
835
+ this.bufferedSttAudio = [];
836
+ this.state.endedAt = new Date();
837
+ }
838
+ /**
839
+ * Update call status
840
+ */
841
+ updateStatus(status) {
842
+ this.state.status = status;
843
+ }
844
+ /**
845
+ * Set the Twilio call SID
846
+ */
847
+ setCallSid(callSid) {
848
+ this.state.callSid = callSid;
849
+ }
850
+ /**
851
+ * Emit a server message
852
+ */
853
+ emitMessage(msg) {
854
+ this.emit('message', msg);
855
+ }
856
+ /**
857
+ * Emit ended event
858
+ */
859
+ emitEnded() {
860
+ if (this.endedEmitted)
861
+ return;
862
+ this.endedEmitted = true;
863
+ const summary = this.generateSummary();
864
+ this.state.summary = summary;
865
+ this.emitMessage({
866
+ type: 'call_ended',
867
+ callId: this.callId,
868
+ summary,
869
+ status: this.state.status,
870
+ });
871
+ this.emit('ended', this.state);
872
+ }
873
+ /**
874
+ * Generate a conversation summary
875
+ */
876
+ generateSummary() {
877
+ if (this.state.transcript.length === 0) {
878
+ return 'No conversation recorded.';
879
+ }
880
+ const lines = this.state.transcript.map((t) => `${t.role === 'assistant' ? 'AI' : 'Human'}: ${t.text}`);
881
+ return lines.join('\n');
882
+ }
883
+ /**
884
+ * Get current state
885
+ */
886
+ getState() {
887
+ return { ...this.state };
888
+ }
889
+ }
890
+ //# sourceMappingURL=call-session.js.map