@kognitivedev/backend-cloud 0.2.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.turbo/turbo-build.log +2 -0
  2. package/.turbo/turbo-test.log +14 -0
  3. package/CHANGELOG.md +11 -0
  4. package/README.md +88 -0
  5. package/dist/cloud-voice-parameters.d.ts +11 -0
  6. package/dist/cloud-voice-parameters.js +219 -0
  7. package/dist/cloud-voice-prompt-service.d.ts +24 -0
  8. package/dist/cloud-voice-prompt-service.js +382 -0
  9. package/dist/cloud-voice-runtime-service.d.ts +73 -0
  10. package/dist/cloud-voice-runtime-service.js +443 -0
  11. package/dist/cloud-voice.d.ts +36 -0
  12. package/dist/cloud-voice.js +683 -0
  13. package/dist/index.d.ts +10 -0
  14. package/dist/index.js +26 -0
  15. package/dist/phone-control.d.ts +50 -0
  16. package/dist/phone-control.js +97 -0
  17. package/dist/phone-runtime/audio-playout-tracker.d.ts +51 -0
  18. package/dist/phone-runtime/audio-playout-tracker.js +93 -0
  19. package/dist/phone-runtime/openai-twilio-realtime.d.ts +95 -0
  20. package/dist/phone-runtime/openai-twilio-realtime.js +1074 -0
  21. package/dist/tools.d.ts +2 -0
  22. package/dist/tools.js +216 -0
  23. package/dist/types.d.ts +468 -0
  24. package/dist/types.js +2 -0
  25. package/dist/utils.d.ts +3 -0
  26. package/dist/utils.js +14 -0
  27. package/package.json +47 -0
  28. package/src/__tests__/audio-playout-tracker.test.ts +46 -0
  29. package/src/__tests__/cloud-voice.test.ts +1006 -0
  30. package/src/__tests__/openai-twilio-realtime.test.ts +1193 -0
  31. package/src/__tests__/phone-control.test.ts +105 -0
  32. package/src/cloud-voice-parameters.ts +236 -0
  33. package/src/cloud-voice-prompt-service.ts +493 -0
  34. package/src/cloud-voice-runtime-service.ts +465 -0
  35. package/src/cloud-voice.ts +831 -0
  36. package/src/index.ts +10 -0
  37. package/src/phone-control.ts +156 -0
  38. package/src/phone-runtime/audio-playout-tracker.ts +132 -0
  39. package/src/phone-runtime/openai-twilio-realtime.ts +1250 -0
  40. package/src/tools.ts +227 -0
  41. package/src/types.ts +529 -0
  42. package/src/utils.ts +11 -0
  43. package/tsconfig.json +13 -0
@@ -0,0 +1,1074 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.summarizeTwilioPcmuSignal = summarizeTwilioPcmuSignal;
7
+ exports.isOpenAITwilioRealtimeReady = isOpenAITwilioRealtimeReady;
8
+ exports.createOpenAITwilioRealtimeTools = createOpenAITwilioRealtimeTools;
9
+ exports.createOpenAITwilioRealtimeSessionUpdate = createOpenAITwilioRealtimeSessionUpdate;
10
+ exports.runOpenAITwilioRealtimeBridge = runOpenAITwilioRealtimeBridge;
11
+ exports.runXAITwilioRealtimeBridge = runXAITwilioRealtimeBridge;
12
+ const crypto_1 = require("crypto");
13
+ const ws_1 = __importDefault(require("ws"));
14
+ const cloud_voice_runtime_service_1 = require("../cloud-voice-runtime-service");
15
+ const cloud_voice_prompt_service_1 = require("../cloud-voice-prompt-service");
16
+ const audio_playout_tracker_1 = require("./audio-playout-tracker");
17
+ const telephony_1 = require("@kognitivedev/telephony");
18
+ const voice_media_bridge_1 = require("@kognitivedev/voice-media-bridge");
19
// --- Pre-ready buffering -----------------------------------------------------
/** Cap on caller audio frames held back while the provider session is not yet ready. */
const MAX_PRE_READY_AUDIO_FRAMES = 250;

// --- Speech heuristics -------------------------------------------------------
/** RMS threshold passed to the "likely speech" payload check. */
const LIKELY_SPEECH_RMS = 400;
/** Peak absolute amplitude threshold passed to the "likely speech" payload check. */
const LIKELY_SPEECH_PEAK_ABS = 2500;

// --- Barge-in (interruption) gating -------------------------------------------
/** Minimum assistant audio (ms) already sent for the current item before an interruption is honored. */
const MIN_INTERRUPT_ASSISTANT_AUDIO_MS = 240;
/** Minimum carrier-clock time (ms) since the response started before an interruption is honored. */
const MIN_INTERRUPT_ELAPSED_MS = 120;
/** Inbound RMS below which (together with the peak check) an interruption is treated as weak input. */
const MIN_INTERRUPT_SPEECH_RMS = 220;
/** Inbound peak amplitude below which (together with the RMS check) an interruption is treated as weak input. */
const MIN_INTERRUPT_SPEECH_PEAK_ABS = 1400;

// --- Default turn-detection payloads -----------------------------------------
/** Turn detection used for the non-xAI provider when the voice config yields `undefined`. */
const DEFAULT_PHONE_TURN_DETECTION = {
    type: "server_vad",
    create_response: true,
    interrupt_response: true,
};

/** Baseline server-VAD settings for xAI; individual numeric fields may be overridden per call. */
const DEFAULT_XAI_PHONE_TURN_DETECTION = {
    type: "server_vad",
    threshold: 0.55,
    prefix_padding_ms: 160,
    silence_duration_ms: 450,
};
37
/**
 * Reduce a turn-detection config to the fields this bridge forwards for OpenAI phone calls.
 *
 * The value is first normalized by `toOpenAITurnDetection`; a falsy result is
 * returned unchanged. Otherwise only `type`, boolean `create_response` /
 * `interrupt_response`, and (for `semantic_vad` only) a string `eagerness` are kept.
 *
 * @param {unknown} value Raw turn-detection setting from the voice config.
 * @returns {object|null|undefined} Filtered config, or the falsy normalized value.
 */
function toOpenAIPhoneTurnDetection(value) {
    const normalized = (0, cloud_voice_runtime_service_1.toOpenAITurnDetection)(value);
    if (!normalized) {
        return normalized;
    }
    const type = getString(normalized.type);
    const filtered = { type };
    if (typeof normalized.create_response === "boolean") {
        filtered.create_response = normalized.create_response;
    }
    if (typeof normalized.interrupt_response === "boolean") {
        filtered.interrupt_response = normalized.interrupt_response;
    }
    // Eagerness is only forwarded for semantic VAD.
    if (type === "semantic_vad" && typeof normalized.eagerness === "string") {
        filtered.eagerness = normalized.eagerness;
    }
    return filtered;
}
44
/**
 * Build the xAI turn-detection payload from a voice-config value.
 *
 * `null` is passed through as `null`. Any type other than `server_vad`
 * yields a copy of the xAI defaults. For `server_vad`, numeric `threshold`,
 * `prefix_padding_ms` and `silence_duration_ms` override the defaults; the
 * camelCase spellings are accepted when the snake_case one is not a number.
 *
 * @param {unknown} value Raw turn-detection setting from the voice config.
 * @returns {object|null} A fresh config object, or `null`.
 */
function toXAIPhoneTurnDetection(value) {
    if (value === null) {
        return null;
    }
    const source = getRecord(value);
    const config = Object.assign({}, DEFAULT_XAI_PHONE_TURN_DETECTION);
    if (getString(source.type, "server_vad") !== "server_vad") {
        return config;
    }
    // Each row: target key followed by candidate values in priority order.
    const overrides = [
        ["threshold", source.threshold],
        ["prefix_padding_ms", source.prefix_padding_ms, source.prefixPaddingMs],
        ["silence_duration_ms", source.silence_duration_ms, source.silenceDurationMs],
    ];
    for (const [key, ...candidates] of overrides) {
        const chosen = candidates.find((candidate) => typeof candidate === "number");
        if (chosen !== undefined) {
            config[key] = chosen;
        }
    }
    return config;
}
69
/**
 * Coerce a value to a plain record.
 *
 * @param {unknown} value Candidate value.
 * @returns {object} `value` itself when it is a truthy non-array object, otherwise a new empty object.
 */
function getRecord(value) {
    if (!value || typeof value !== "object" || Array.isArray(value)) {
        return {};
    }
    return value;
}
72
/**
 * Read a trimmed, non-empty string.
 *
 * @param {unknown} value Candidate value.
 * @param {string} [fallback=""] Returned when `value` is not a string or is blank after trimming.
 * @returns {string} The trimmed string, or `fallback`.
 */
function getString(value, fallback = "") {
    if (typeof value !== "string") {
        return fallback;
    }
    const trimmed = value.trim();
    return trimmed ? trimmed : fallback;
}
75
/**
 * Read a string without trimming.
 *
 * @param {unknown} value Candidate value.
 * @returns {string} `value` when it is a string, otherwise the empty string.
 */
function getRawString(value) {
    if (typeof value === "string") {
        return value;
    }
    return "";
}
78
/**
 * Decode tool arguments that may arrive as a JSON string.
 *
 * @param {unknown} value Tool arguments.
 * @returns {unknown} The parsed JSON when `value` is a parseable string; otherwise `value` unchanged.
 */
function normalizeToolInput(value) {
    if (typeof value === "string") {
        try {
            return JSON.parse(value);
        }
        catch (_parseError) {
            // Not JSON: fall through and hand back the original string.
        }
    }
    return value;
}
88
/**
 * Serialize a tool result into the string sent back as `function_call_output.output`.
 *
 * Strings pass through untouched; everything else is JSON-encoded.
 * `JSON.stringify` yields `undefined` (not a string) for `undefined`,
 * functions and symbols; in that case `"null"` is returned so the caller
 * never emits an event whose `output` key is silently dropped.
 *
 * @param {unknown} value Tool result.
 * @returns {string} Always a string.
 * @throws {TypeError} Propagated from `JSON.stringify` (e.g. circular structures, BigInt).
 */
function serializeToolResult(value) {
    if (typeof value === "string") {
        return value;
    }
    const encoded = JSON.stringify(value);
    return typeof encoded === "string" ? encoded : "null";
}
91
/**
 * Convert an `Error` into a plain object suitable for logging.
 *
 * @param {unknown} value Thrown value.
 * @returns {unknown} `{ name, message, stack }` for `Error` instances; any other value unchanged.
 */
function serializeError(value) {
    if (!(value instanceof Error)) {
        return value;
    }
    const { name, message, stack } = value;
    return { name, message, stack };
}
101
/**
 * JSON-encode a message and send it over a WebSocket if the socket is open.
 *
 * @param {object} socket WebSocket-like object exposing `readyState` and `send`.
 * @param {unknown} message Payload to serialize.
 * @returns {boolean} `true` when the message was handed to `send`, `false` when the socket was not open.
 */
function sendJson(socket, message) {
    const isOpen = socket.readyState === ws_1.default.OPEN;
    if (isOpen) {
        socket.send(JSON.stringify(message));
    }
    return isOpen;
}
107
/**
 * Send a realtime-provider event; a named alias of `sendJson` for provider-bound traffic.
 *
 * @param {object} socket Provider WebSocket.
 * @param {object} event Event payload.
 * @returns {boolean} Result of `sendJson`.
 */
function sendOpenAIEvent(socket, event) {
    const delivered = sendJson(socket, event);
    return delivered;
}
110
/**
 * Compute basic level statistics for a sequence of numeric samples.
 *
 * @param {ArrayLike<number> & Iterable<number>} samples Sample values.
 * @returns {{samples: number, peakAbs: number, rms: number}} Sample count, largest
 *   absolute value, and root-mean-square rounded to the nearest integer.
 */
function summarizePcm16Signal(samples) {
    const count = samples.length;
    if (count === 0) {
        return { samples: 0, peakAbs: 0, rms: 0 };
    }
    let loudest = 0;
    let energy = 0;
    for (const value of samples) {
        const magnitude = Math.abs(value);
        if (magnitude > loudest) {
            loudest = magnitude;
        }
        energy += value * value;
    }
    const rms = Math.round(Math.sqrt(energy / count));
    return { samples: count, peakAbs: loudest, rms };
}
127
/**
 * Summarize a base64 carrier payload using the "twilio-pcmu-8k" media profile.
 *
 * @param {string} base64Audio Base64-encoded carrier audio.
 * @returns {*} Whatever `summarizeCarrierPayload` returns for that profile.
 */
function summarizeTwilioPcmuSignal(base64Audio) {
    const twilioProfile = (0, voice_media_bridge_1.getPhoneMediaProfile)("twilio-pcmu-8k");
    return (0, voice_media_bridge_1.summarizeCarrierPayload)(twilioProfile, base64Audio);
}
130
/**
 * Apply the module's "likely speech" thresholds to a payload under the "twilio-pcmu-8k" profile.
 *
 * @param {string} base64Audio Base64-encoded carrier audio.
 * @returns {*} Result of `isLikelySpeechPayload`.
 */
function isLikelySpeechTwilioPcmu(base64Audio) {
    const twilioProfile = (0, voice_media_bridge_1.getPhoneMediaProfile)("twilio-pcmu-8k");
    const thresholds = {
        rms: LIKELY_SPEECH_RMS,
        peakAbs: LIKELY_SPEECH_PEAK_ABS,
    };
    return (0, voice_media_bridge_1.isLikelySpeechPayload)(twilioProfile, base64Audio, thresholds);
}
136
/**
 * For L16 profiles, summarize the payload decoded both big- and little-endian.
 *
 * @param {object} profile Media profile; only `codec` and `carrierByteOrder` are read.
 * @param {string} payload Base64-encoded audio.
 * @returns {{configured: string, be: *, le: *}|null} `null` unless `profile.codec` is "l16".
 *   `configured` is the profile's byte order, defaulting to "be" when unset.
 */
function summarizeL16EndianCandidates(profile, payload) {
    if (profile.codec !== "l16") {
        return null;
    }
    const byteCount = Buffer.byteLength(payload, "base64");
    const byteOrder = profile.carrierByteOrder;
    const configured = byteOrder !== null && byteOrder !== undefined ? byteOrder : "be";
    const be = (0, voice_media_bridge_1.summarizePcm16Signal)((0, voice_media_bridge_1.base64ToPcm16Be)(payload), byteCount);
    const le = (0, voice_media_bridge_1.summarizePcm16Signal)((0, voice_media_bridge_1.base64ToPcm16Le)(payload), byteCount);
    return { configured, be, le };
}
147
/**
 * Produce a compact, loggable description of a realtime audio format.
 *
 * @param {unknown} value A format string or a format object with `type` / `rate`.
 * @returns {string|{type: string, rate?: number}|null} The string as-is; `{ type }` plus
 *   `rate` when it is a truthy number; `null` when no usable `type` is present.
 */
function summarizeOpenAIAudioFormat(value) {
    if (typeof value === "string") {
        return value;
    }
    const format = getRecord(value);
    const type = getString(format.type, "");
    if (!type) {
        return null;
    }
    const summary = { type };
    if (typeof format.rate === "number" && format.rate) {
        summary.rate = format.rate;
    }
    return summary;
}
155
/**
 * Resolve the `reasoning` session option for models named `gpt-realtime-2` or `gpt-realtime-2-*`.
 *
 * @param {object} snapshot Session snapshot; `runtime.providerOptions` is consulted.
 * @param {string} model Model identifier.
 * @returns {object|null} `null` for other models; otherwise the non-empty
 *   `providerOptions.reasoning` record, or `{ effort }` taken from
 *   `providerOptions.reasoningEffort` with "low" as the default.
 */
function createOpenAIRealtimeReasoningConfig(snapshot, model) {
    const supportsReasoning = /^gpt-realtime-2(?:$|-)/.test(model);
    if (!supportsReasoning) {
        return null;
    }
    const runtime = snapshot.runtime;
    const providerOptions = getRecord(runtime === null || runtime === undefined ? undefined : runtime.providerOptions);
    const explicitReasoning = getRecord(providerOptions.reasoning);
    if (Object.keys(explicitReasoning).length > 0) {
        return explicitReasoning;
    }
    return { effort: getString(providerOptions.reasoningEffort, "low") };
}
166
/**
 * Check whether a provider event confirms the audio setup for the given media profile.
 *
 * @param {object} event Provider event.
 * @param {object} [mediaProfile] Media profile; defaults to the "twilio-pcmu-8k" profile.
 * @returns {*} Result of `isOpenAIRealtimeAudioReady`.
 */
function isOpenAITwilioRealtimeReady(event, mediaProfile = (0, voice_media_bridge_1.getPhoneMediaProfile)("twilio-pcmu-8k")) {
    const ready = (0, voice_media_bridge_1.isOpenAIRealtimeAudioReady)(event, mediaProfile);
    return ready;
}
169
/**
 * Translate the snapshot's tool manifest into realtime `function` tool definitions.
 *
 * Entries without a usable name are skipped. The description falls back to
 * the tool name, and missing parameters fall back to an open object schema.
 *
 * @param {object} snapshot Session snapshot; `toolManifest` is read.
 * @returns {Array<{type: string, name: string, description: string, parameters: object}>}
 */
function createOpenAITwilioRealtimeTools(snapshot) {
    const manifests = snapshot.toolManifest !== null && snapshot.toolManifest !== undefined
        ? snapshot.toolManifest
        : [];
    return manifests.flatMap((manifest) => {
        const name = getString(manifest.name, "");
        if (!name) {
            return [];
        }
        const declared = manifest.parameters;
        const parameters = declared !== null && declared !== undefined
            ? declared
            : { type: "object", additionalProperties: true };
        return [{
                type: "function",
                name,
                description: getString(manifest.description, name),
                parameters,
            }];
    });
}
183
/**
 * Build the `session.update` event that configures the realtime provider for a phone call.
 *
 * Two payload shapes are produced:
 *  - "xai-realtime": `instructions`, `voice`, `audio`, and a top-level `turn_detection`.
 *  - anything else: `type`, `model`, `instructions`, `output_modalities`, optional
 *    `reasoning`, and `audio` with `turn_detection` nested under `audio.input` and
 *    `voice` under `audio.output`.
 * Tools are attached when the manifest yields any; `tool_choice: "auto"` is added
 * only for the non-xAI shape.
 *
 * @param {object} snapshot Session snapshot (`voiceConfig`, `runtime`, `config`, `toolManifest`).
 * @param {string} model Model identifier.
 * @param {string} [defaultVoice="alloy"] Voice used when none is configured.
 * @param {string} [provider="openai-realtime"] Provider id.
 * @param {object} [mediaProfile] Carrier media profile; defaults to "twilio-pcmu-8k".
 * @param {object} [audioRoute] Audio route; defaults to one derived from profile and provider.
 * @returns {{type: "session.update", session: object}}
 */
function createOpenAITwilioRealtimeSessionUpdate(snapshot, model, defaultVoice = "alloy", provider = "openai-realtime", mediaProfile = (0, voice_media_bridge_1.getPhoneMediaProfile)("twilio-pcmu-8k"), audioRoute = (0, voice_media_bridge_1.createPhoneAudioRoute)(mediaProfile, provider)) {
    const isXAI = provider === "xai-realtime";
    const voiceConfig = snapshot.voiceConfig !== null && snapshot.voiceConfig !== undefined
        ? snapshot.voiceConfig
        : {};

    // --- audio.input ---------------------------------------------------------
    const inputConfig = {
        format: (0, voice_media_bridge_1.realtimeAudioFormatForRoute)(audioRoute),
    };
    const turnDetection = isXAI
        ? toXAIPhoneTurnDetection(voiceConfig.turnDetection)
        : toOpenAIPhoneTurnDetection(voiceConfig.turnDetection);
    if (!isXAI) {
        // `undefined` means "not configured"; an explicit `null` is forwarded as-is.
        inputConfig.turn_detection = turnDetection === undefined
            ? DEFAULT_PHONE_TURN_DETECTION
            : turnDetection;
    }
    if (voiceConfig.transcription && typeof voiceConfig.transcription === "object") {
        inputConfig.transcription = voiceConfig.transcription;
    }
    if (voiceConfig.inputNoiseReduction && typeof voiceConfig.inputNoiseReduction === "object") {
        inputConfig.noise_reduction = voiceConfig.inputNoiseReduction;
    }

    // --- tools, voice, reasoning, instructions -------------------------------
    const tools = createOpenAITwilioRealtimeTools(snapshot);
    let configuredVoice = voiceConfig.voice;
    if (configuredVoice === null || configuredVoice === undefined) {
        const runtime = snapshot.runtime;
        configuredVoice = runtime === null || runtime === undefined ? undefined : runtime.voice;
    }
    const voice = getString(configuredVoice, defaultVoice);
    const reasoning = provider === "openai-realtime"
        ? createOpenAIRealtimeReasoningConfig(snapshot, model)
        : null;
    const instructions = (0, cloud_voice_prompt_service_1.resolveCloudVoiceProviderSystemPrompt)({
        voiceConfig,
        config: snapshot.config,
    });

    // --- session -------------------------------------------------------------
    const outputFormat = (0, voice_media_bridge_1.realtimeOutputAudioFormatForRoute)(audioRoute);
    let session;
    if (isXAI) {
        session = {
            instructions,
            voice,
            audio: {
                input: inputConfig,
                output: { format: outputFormat },
            },
        };
        session.turn_detection = turnDetection === undefined
            ? { type: "server_vad" }
            : turnDetection;
    }
    else {
        session = {
            type: "realtime",
            model,
            instructions,
            output_modalities: ["audio"],
        };
        if (reasoning) {
            session.reasoning = reasoning;
        }
        session.audio = {
            input: inputConfig,
            output: { format: outputFormat, voice },
        };
    }
    if (tools.length > 0) {
        session.tools = tools;
        if (!isXAI) {
            session.tool_choice = "auto";
        }
    }
    return { type: "session.update", session };
}
243
/**
 * Inject a user text message into the conversation and request a response.
 *
 * Two events are sent in order: `conversation.item.create` carrying the
 * text, then `response.create`.
 *
 * @param {object} socket Provider WebSocket.
 * @param {string} text User message text.
 * @returns {void}
 */
function sendOpenAIUserTextTurn(socket, text) {
    const userMessage = {
        type: "message",
        role: "user",
        content: [{ type: "input_text", text }],
    };
    const events = [
        { type: "conversation.item.create", item: userMessage },
        { type: "response.create" },
    ];
    for (const event of events) {
        sendOpenAIEvent(socket, event);
    }
}
254
/**
 * List the `type` of each output item in a provider response.
 *
 * @param {unknown} response Provider response object.
 * @returns {string[]} One entry per output item ("unknown" when the type is missing);
 *   empty when `output` is not an array.
 */
function summarizeOpenAIResponseOutputTypes(response) {
    const { output } = getRecord(response);
    if (!Array.isArray(output)) {
        return [];
    }
    const typeOf = (item) => getString(getRecord(item).type, "unknown");
    return output.map(typeOf);
}
260
/**
 * Join the textual parts of a message `content` array.
 *
 * For each part the trimmed `transcript` is preferred, then the trimmed
 * `text`; parts with neither are skipped. Parts are joined with newlines.
 *
 * @param {unknown} content Message content.
 * @returns {string} Joined, trimmed text; empty when `content` is not an array.
 */
function extractOpenAIContentText(content) {
    if (!Array.isArray(content)) {
        return "";
    }
    const pieces = content
        .map((part) => {
        const fields = getRecord(part);
        return getRawString(fields.transcript).trim() || getRawString(fields.text).trim();
    })
        .filter(Boolean);
    return pieces.join("\n").trim();
}
277
/**
 * Extract text from an output item when it is an assistant message.
 *
 * @param {object} item Output item record.
 * @returns {string} Content text for `message` items whose role is absent or "assistant"; otherwise "".
 */
function extractOpenAIAssistantItemText(item) {
    if (getString(item.type, "") !== "message") {
        return "";
    }
    const role = getString(item.role, "");
    const spokenByAssistant = role === "" || role === "assistant";
    return spokenByAssistant ? extractOpenAIContentText(item.content) : "";
}
285
/**
 * Collect the assistant text from every output item of a response.
 *
 * @param {object} response Response record; `output` is read.
 * @returns {string} Non-empty item texts joined with newlines and trimmed; "" when `output` is not an array.
 */
function extractOpenAIResponseText(response) {
    const { output } = response;
    if (!Array.isArray(output)) {
        return "";
    }
    const texts = output
        .map((item) => extractOpenAIAssistantItemText(getRecord(item)))
        .filter(Boolean);
    return texts.join("\n").trim();
}
297
/**
 * Reshape a decoded Telnyx media-stream frame into the message shape the bridge
 * consumes for the non-Telnyx carrier path (`streamSid`, `start`, `media`, `mark`, `dtmf`, `stop`).
 *
 * Field mapping performed here:
 *  - `stream_id`              -> `streamSid` and `start.streamSid`
 *  - `start.call_control_id`  -> `start.callSid`
 *  - `start.media_format`     -> `start.mediaFormat`
 *  - `start.call_session_id` / `start.client_state` -> `start.customParameters`
 *  - `stop.call_control_id`   -> `stop.callSid`
 * `accountSid` is always `null` and `tracks` is always empty.
 *
 * The root value is coerced with `getRecord`, like every nested section, so a
 * frame that decodes to `null` (or any non-object) produces an all-empty
 * message instead of throwing a `TypeError` on the first property read.
 *
 * @param {unknown} raw Decoded JSON frame.
 * @returns {object} Normalized media-stream message; missing string fields are "".
 */
function normalizeTelnyxMediaStreamMessage(raw) {
    const message = getRecord(raw);
    const event = getString(message.event);
    const streamId = getString(message.stream_id);
    const start = getRecord(message.start);
    const media = getRecord(message.media);
    const stop = getRecord(message.stop);
    const dtmf = getRecord(message.dtmf);
    return {
        event,
        protocol: getString(message.protocol),
        version: getString(message.version),
        streamSid: streamId,
        start: {
            streamSid: streamId,
            accountSid: null,
            callSid: getString(start.call_control_id),
            tracks: [],
            mediaFormat: getRecord(start.media_format),
            customParameters: {
                callSessionId: getString(start.call_session_id),
                clientState: getString(start.client_state),
            },
        },
        media: {
            track: getString(media.track),
            chunk: getString(media.chunk),
            timestamp: getString(media.timestamp),
            payload: getString(media.payload),
        },
        mark: getRecord(message.mark),
        dtmf: {
            digit: getString(dtmf.digit),
        },
        stop: {
            accountSid: null,
            callSid: getString(stop.call_control_id),
        },
    };
}
336
/**
 * Decode a raw carrier WebSocket frame and normalize it for the bridge.
 *
 * @param {object} profile Media profile; `provider === "telnyx"` selects the Telnyx normalizer,
 *   anything else goes through `parseTwilioMediaStreamMessage`.
 * @param {string} raw JSON text of the frame.
 * @returns {object} Normalized media-stream message.
 * @throws {SyntaxError} When `raw` is not valid JSON.
 */
function parseCarrierMediaStreamMessage(profile, raw) {
    const decoded = JSON.parse(raw);
    if (profile.provider === "telnyx") {
        return normalizeTelnyxMediaStreamMessage(decoded);
    }
    return (0, telephony_1.parseTwilioMediaStreamMessage)(decoded);
}
342
+ async function runOpenAITwilioRealtimeBridge(input) {
343
+ var _a, _b, _c, _d, _e, _f, _g, _h, _j;
344
+ const provider = (_a = input.provider) !== null && _a !== void 0 ? _a : "openai-realtime";
345
+ const providerLogPrefix = provider === "xai-realtime" ? "xai" : "openai";
346
+ const bridgeMode = (_b = input.bridgeMode) !== null && _b !== void 0 ? _b : (provider === "xai-realtime" ? "xai-twilio-realtime" : "openai-twilio-realtime-ga");
347
+ const supportsConversationItemTruncate = (_c = input.supportsConversationItemTruncate) !== null && _c !== void 0 ? _c : provider === "openai-realtime";
348
+ const model = getString((_d = input.snapshot.runtime) === null || _d === void 0 ? void 0 : _d.model, (_e = input.defaultModel) !== null && _e !== void 0 ? _e : "gpt-realtime");
349
+ const realtimeUrl = (_f = input.realtimeUrl) !== null && _f !== void 0 ? _f : `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(model)}`;
350
+ const mediaProfile = (_g = input.mediaProfile) !== null && _g !== void 0 ? _g : (0, voice_media_bridge_1.getPhoneMediaProfile)("twilio-pcmu-8k");
351
+ const audioRoute = (_h = input.audioRoute) !== null && _h !== void 0 ? _h : (0, voice_media_bridge_1.createPhoneAudioRoute)(mediaProfile, provider);
352
+ const tools = createOpenAITwilioRealtimeTools(input.snapshot);
353
+ const humanization = (0, cloud_voice_runtime_service_1.normalizeCloudVoiceHumanizationConfig)(getRecord(input.snapshot.config).humanization);
354
+ const openAI = new ws_1.default(realtimeUrl, {
355
+ headers: {
356
+ Authorization: `Bearer ${input.apiKey}`,
357
+ },
358
+ });
359
+ let twilioStarted = false;
360
+ let providerConnected = false;
361
+ let phoneAudioReady = false;
362
+ let greeted = false;
363
+ let streamSid = null;
364
+ let latestMediaTimestamp = 0;
365
+ let responseStartTimestampTwilio = null;
366
+ let lastAssistantItemId = null;
367
+ let lastAssistantItemAudioMs = 0;
368
+ let responseOutboundFramesAtStart = 0;
369
+ let currentAssistantTranscript = "";
370
+ let currentAssistantFinalTranscript = "";
371
+ let missingStreamSidAudioWarningEmitted = false;
372
+ let preReadyAudioHadSpeech = false;
373
+ let latestInboundSignal = { bytes: 0, samples: 0, peakAbs: 0, rms: 0 };
374
+ let ignoredCarrierMediaFrames = 0;
375
+ const carrierMediaTracks = {};
376
+ const preReadyAudioFrames = [];
377
+ const markQueue = [];
378
+ const playoutTracker = new audio_playout_tracker_1.PhonePlayoutTracker();
379
+ const unregisterPlayoutIdleWaiter = (_j = input.registerPlayoutIdleWaiter) === null || _j === void 0 ? void 0 : _j.call(input, (waitInput) => playoutTracker.waitForIdle(waitInput));
380
+ const counters = {
381
+ inboundMediaFrames: 0,
382
+ providerInputFrames: 0,
383
+ providerInputDroppedBeforeReady: 0,
384
+ outboundMediaFrames: 0,
385
+ outboundAudioDroppedMissingStreamSid: 0,
386
+ outboundAudioBytes: 0,
387
+ outboundAudioPeakAbs: 0,
388
+ outboundAudioLastRms: 0,
389
+ clears: 0,
390
+ truncates: 0,
391
+ marks: 0,
392
+ dtmf: 0,
393
+ interruptions: 0,
394
+ ignoredInterruptions: 0,
395
+ toolCalls: 0,
396
+ providerMessages: 0,
397
+ };
398
+ const bindStreamSid = (nextStreamSid, source) => {
399
+ const next = getString(nextStreamSid, "");
400
+ if (!next || next === streamSid)
401
+ return;
402
+ streamSid = next;
403
+ input.updateCallLegActive({ streamSid });
404
+ input.log("twilio.stream_sid.bound", {
405
+ projectId: input.projectId,
406
+ sessionId: input.sessionId,
407
+ streamSid,
408
+ source,
409
+ });
410
+ };
411
+ const maybeGreet = () => {
412
+ var _a, _b, _c, _d;
413
+ if (greeted || !twilioStarted || !providerConnected || !phoneAudioReady)
414
+ return;
415
+ greeted = true;
416
+ if (preReadyAudioHadSpeech) {
417
+ input.log(`${providerLogPrefix}.initial_prompt.skipped`, {
418
+ projectId: input.projectId,
419
+ sessionId: input.sessionId,
420
+ providerCallId: (_a = input.providerCallId) !== null && _a !== void 0 ? _a : null,
421
+ reason: "caller_audio_before_ready",
422
+ });
423
+ return;
424
+ }
425
+ if (input.initialPrompt === null) {
426
+ input.log(`${providerLogPrefix}.initial_prompt.skipped`, {
427
+ projectId: input.projectId,
428
+ sessionId: input.sessionId,
429
+ providerCallId: (_b = input.providerCallId) !== null && _b !== void 0 ? _b : null,
430
+ reason: "openingMode.wait",
431
+ });
432
+ return;
433
+ }
434
+ input.log(`${providerLogPrefix}.initial_prompt.sent`, {
435
+ projectId: input.projectId,
436
+ sessionId: input.sessionId,
437
+ providerCallId: (_c = input.providerCallId) !== null && _c !== void 0 ? _c : null,
438
+ method: "conversation.item.create + response.create",
439
+ });
440
+ sendOpenAIUserTextTurn(openAI, (_d = input.initialPrompt) !== null && _d !== void 0 ? _d : "Start the phone call now. Greet the caller briefly and ask how you can help.");
441
+ };
442
+ const appendOpenAIAudio = (audio) => {
443
+ sendOpenAIEvent(openAI, {
444
+ type: "input_audio_buffer.append",
445
+ audio: (0, voice_media_bridge_1.toAiInputAudio)(audioRoute, audio),
446
+ });
447
+ counters.providerInputFrames += 1;
448
+ };
449
+ const flushPreReadyAudio = () => {
450
+ var _a;
451
+ if (!phoneAudioReady || openAI.readyState !== ws_1.default.OPEN || preReadyAudioFrames.length === 0)
452
+ return;
453
+ const frameCount = preReadyAudioFrames.length;
454
+ for (const audio of preReadyAudioFrames.splice(0)) {
455
+ appendOpenAIAudio(audio);
456
+ }
457
+ input.log(`${providerLogPrefix}.audio.input.pre_ready_flushed`, {
458
+ projectId: input.projectId,
459
+ sessionId: input.sessionId,
460
+ providerCallId: (_a = input.providerCallId) !== null && _a !== void 0 ? _a : null,
461
+ frameCount,
462
+ hadSpeech: preReadyAudioHadSpeech,
463
+ });
464
+ };
465
+ const bufferPreReadyAudio = (audio) => {
466
+ if (preReadyAudioFrames.length >= MAX_PRE_READY_AUDIO_FRAMES) {
467
+ preReadyAudioFrames.shift();
468
+ counters.providerInputDroppedBeforeReady += 1;
469
+ }
470
+ preReadyAudioFrames.push(audio);
471
+ preReadyAudioHadSpeech || (preReadyAudioHadSpeech = (0, voice_media_bridge_1.isLikelySpeechPayload)(mediaProfile, audio, {
472
+ rms: LIKELY_SPEECH_RMS,
473
+ peakAbs: LIKELY_SPEECH_PEAK_ABS,
474
+ }));
475
+ };
476
+ const dropOutboundAudioForMissingStreamSid = (eventType, delta) => {
477
+ counters.outboundAudioDroppedMissingStreamSid += 1;
478
+ const output = (0, voice_media_bridge_1.fromAiOutputAudio)(audioRoute, delta);
479
+ const signal = output.signal;
480
+ const payload = {
481
+ provider,
482
+ reason: "missing_twilio_stream_sid",
483
+ eventType,
484
+ droppedFrames: counters.outboundAudioDroppedMissingStreamSid,
485
+ droppedAudioBytes: signal.bytes,
486
+ inboundMediaFrames: counters.inboundMediaFrames,
487
+ providerInputFrames: counters.providerInputFrames,
488
+ providerMessages: counters.providerMessages,
489
+ };
490
+ input.log(`${providerLogPrefix}.audio.output.dropped`, Object.assign({ projectId: input.projectId, sessionId: input.sessionId }, payload));
491
+ if (!missingStreamSidAudioWarningEmitted) {
492
+ missingStreamSidAudioWarningEmitted = true;
493
+ input.appendEvent("voice.provider.warning", Object.assign(Object.assign({}, payload), { message: `${provider === "xai-realtime" ? "xAI" : "OpenAI"} returned assistant audio, but the bridge has no Twilio streamSid to send it to.` }), "Assistant audio could not be sent to Twilio");
494
+ }
495
+ if (counters.outboundAudioDroppedMissingStreamSid === 1 || counters.outboundAudioDroppedMissingStreamSid % 50 === 0) {
496
+ input.appendEvent("voice.call.audio.dropped", payload, "Assistant audio dropped before Twilio stream was bound");
497
+ }
498
+ };
499
+ const ignoreAssistantInterruption = (reason, details = {}) => {
500
+ counters.ignoredInterruptions += 1;
501
+ input.log(`${providerLogPrefix}.audio.interruption_ignored`, Object.assign({ projectId: input.projectId, sessionId: input.sessionId, reason,
502
+ counters }, details));
503
+ if (counters.ignoredInterruptions === 1 || counters.ignoredInterruptions % 10 === 0) {
504
+ input.appendEvent("voice.assistant.interruption_ignored", Object.assign({ provider,
505
+ reason,
506
+ counters }, details));
507
+ }
508
+ };
509
+ const interruptAssistantAudio = () => {
510
+ if (!streamSid || markQueue.length === 0 || responseStartTimestampTwilio === null)
511
+ return;
512
+ const elapsedMs = Math.max(0, latestMediaTimestamp - responseStartTimestampTwilio);
513
+ if (elapsedMs < MIN_INTERRUPT_ELAPSED_MS || lastAssistantItemAudioMs < MIN_INTERRUPT_ASSISTANT_AUDIO_MS) {
514
+ ignoreAssistantInterruption("assistant_audio_too_short", {
515
+ elapsedMs,
516
+ assistantAudioMs: lastAssistantItemAudioMs,
517
+ pendingMarks: markQueue.length,
518
+ });
519
+ return;
520
+ }
521
+ if (latestInboundSignal.samples > 0
522
+ && latestInboundSignal.rms < MIN_INTERRUPT_SPEECH_RMS
523
+ && latestInboundSignal.peakAbs < MIN_INTERRUPT_SPEECH_PEAK_ABS) {
524
+ ignoreAssistantInterruption("weak_recent_input_signal", {
525
+ elapsedMs,
526
+ assistantAudioMs: lastAssistantItemAudioMs,
527
+ latestInboundSignal,
528
+ pendingMarks: markQueue.length,
529
+ });
530
+ return;
531
+ }
532
+ const truncateMs = lastAssistantItemAudioMs > 0
533
+ ? Math.min(elapsedMs, Math.max(0, lastAssistantItemAudioMs - 20))
534
+ : elapsedMs;
535
+ counters.interruptions += 1;
536
+ const playoutInterruption = playoutTracker.interrupt({
537
+ elapsedMs: truncateMs,
538
+ providerTruncationSupported: supportsConversationItemTruncate,
539
+ });
540
+ if (supportsConversationItemTruncate && lastAssistantItemId && truncateMs >= 0) {
541
+ counters.truncates += 1;
542
+ sendOpenAIEvent(openAI, {
543
+ type: "conversation.item.truncate",
544
+ item_id: lastAssistantItemId,
545
+ content_index: 0,
546
+ audio_end_ms: truncateMs,
547
+ });
548
+ input.appendEvent("voice.call.audio.truncate_sent", Object.assign(Object.assign({ provider, itemId: lastAssistantItemId, audioEndMs: truncateMs, elapsedMs }, playoutInterruption), { counters }));
549
+ }
550
+ counters.clears += 1;
551
+ const clearMessage = (0, voice_media_bridge_1.createCarrierClearMessage)(mediaProfile, streamSid);
552
+ if (clearMessage)
553
+ sendJson(input.socket, clearMessage);
554
+ input.appendEvent("voice.call.audio.clear_sent", Object.assign(Object.assign({ provider: mediaProfile.provider, streamSid,
555
+ elapsedMs }, playoutInterruption), { counters }));
556
+ if (!supportsConversationItemTruncate) {
557
+ sendOpenAIEvent(openAI, { type: "response.cancel" });
558
+ input.appendEvent("voice.call.response_cancel_sent", {
559
+ provider,
560
+ elapsedMs,
561
+ counters,
562
+ });
563
+ }
564
+ input.log(`${providerLogPrefix}.audio.interrupted`, {
565
+ projectId: input.projectId,
566
+ sessionId: input.sessionId,
567
+ streamSid,
568
+ elapsedMs,
569
+ truncateMs,
570
+ assistantAudioMs: lastAssistantItemAudioMs,
571
+ pendingMarks: markQueue.length,
572
+ });
573
+ input.appendEvent("voice.assistant.interrupted", Object.assign({ provider,
574
+ elapsedMs }, playoutInterruption));
575
+ markQueue.length = 0;
576
+ responseStartTimestampTwilio = null;
577
+ lastAssistantItemId = null;
578
+ lastAssistantItemAudioMs = 0;
579
+ };
580
+ const handleToolCall = async (item) => {
581
+ var _a, _b, _c, _d;
582
+ const toolId = getString(item.name, "");
583
+ const toolCallId = getString(item.call_id, getString(item.id, (0, crypto_1.randomUUID)()));
584
+ if (!toolId)
585
+ return;
586
+ counters.toolCalls += 1;
587
+ input.log(`${providerLogPrefix}.tool.execute.start`, {
588
+ projectId: input.projectId,
589
+ sessionId: input.sessionId,
590
+ providerCallId: (_a = input.providerCallId) !== null && _a !== void 0 ? _a : null,
591
+ toolId,
592
+ toolCallId,
593
+ toolLatencyFillerMs: humanization.toolLatencyFillerMs,
594
+ });
595
+ try {
596
+ const result = await input.executeTool({
597
+ toolId,
598
+ args: normalizeToolInput(getString(item.arguments, "{}")),
599
+ toolCallId,
600
+ providerCallId: (_b = input.providerCallId) !== null && _b !== void 0 ? _b : null,
601
+ });
602
+ sendOpenAIEvent(openAI, {
603
+ type: "conversation.item.create",
604
+ item: {
605
+ type: "function_call_output",
606
+ call_id: toolCallId,
607
+ output: serializeToolResult(result.result),
608
+ },
609
+ });
610
+ sendOpenAIEvent(openAI, { type: "response.create" });
611
+ input.log(`${providerLogPrefix}.tool.execute.completed`, {
612
+ projectId: input.projectId,
613
+ sessionId: input.sessionId,
614
+ providerCallId: (_c = input.providerCallId) !== null && _c !== void 0 ? _c : null,
615
+ toolId,
616
+ toolCallId,
617
+ });
618
+ }
619
+ catch (error) {
620
+ input.error(`${providerLogPrefix}.tool.execute.failed`, error, {
621
+ projectId: input.projectId,
622
+ sessionId: input.sessionId,
623
+ providerCallId: (_d = input.providerCallId) !== null && _d !== void 0 ? _d : null,
624
+ toolId,
625
+ toolCallId,
626
+ });
627
+ sendOpenAIEvent(openAI, {
628
+ type: "conversation.item.create",
629
+ item: {
630
+ type: "function_call_output",
631
+ call_id: toolCallId,
632
+ output: serializeToolResult({ error: error instanceof Error ? error.message : String(error) }),
633
+ },
634
+ });
635
+ sendOpenAIEvent(openAI, { type: "response.create" });
636
+ }
637
+ };
638
+ const handleOpenAIMessage = (rawData) => {
639
+ var _a, _b, _c, _d, _e, _f, _g, _h;
640
+ let event;
641
+ try {
642
+ event = JSON.parse(rawData.toString());
643
+ }
644
+ catch (error) {
645
+ input.error(`${providerLogPrefix}.message.parse.failed`, error, { projectId: input.projectId, sessionId: input.sessionId });
646
+ return;
647
+ }
648
+ counters.providerMessages += 1;
649
+ const eventType = getString(event.type, "");
650
+ if (eventType === "error") {
651
+ input.error(`${providerLogPrefix}.session.error`, event, { projectId: input.projectId, sessionId: input.sessionId });
652
+ input.appendEvent("voice.provider.error", { provider, error: event });
653
+ return;
654
+ }
655
+ if (eventType === "session.updated") {
656
+ phoneAudioReady = (0, voice_media_bridge_1.isRealtimeAudioReadyForRoute)(event, audioRoute);
657
+ const audio = getRecord(getRecord(event.session).audio);
658
+ input.log(`${providerLogPrefix}.session.updated`, {
659
+ projectId: input.projectId,
660
+ sessionId: input.sessionId,
661
+ inputFormat: summarizeOpenAIAudioFormat(getRecord(audio.input).format),
662
+ outputFormat: summarizeOpenAIAudioFormat(getRecord(audio.output).format),
663
+ phoneAudioReady,
664
+ mediaProfile: mediaProfile.id,
665
+ });
666
+ if (!phoneAudioReady) {
667
+ input.appendEvent("voice.provider.warning", {
668
+ provider,
669
+ reason: `${provider === "xai-realtime" ? "xAI" : "OpenAI"} session did not confirm ${audioRoute.aiInput.format} phone audio format`,
670
+ inputFormat: summarizeOpenAIAudioFormat(getRecord(audio.input).format),
671
+ outputFormat: summarizeOpenAIAudioFormat(getRecord(audio.output).format),
672
+ mediaProfile: mediaProfile.id,
673
+ });
674
+ }
675
+ flushPreReadyAudio();
676
+ maybeGreet();
677
+ return;
678
+ }
679
+ if (eventType === "input_audio_buffer.speech_started") {
680
+ interruptAssistantAudio();
681
+ return;
682
+ }
683
+ if (eventType === "conversation.item.input_audio_transcription.completed") {
684
+ const transcript = getString(event.transcript, "");
685
+ if (transcript) {
686
+ input.appendEvent("voice.user.transcribed", {
687
+ provider,
688
+ text: transcript,
689
+ usage: (_a = event.usage) !== null && _a !== void 0 ? _a : undefined,
690
+ }, transcript);
691
+ }
692
+ return;
693
+ }
694
+ if (eventType === "response.created") {
695
+ const response = getRecord(event.response);
696
+ responseStartTimestampTwilio = null;
697
+ lastAssistantItemId = null;
698
+ lastAssistantItemAudioMs = 0;
699
+ responseOutboundFramesAtStart = counters.outboundMediaFrames;
700
+ currentAssistantTranscript = "";
701
+ currentAssistantFinalTranscript = "";
702
+ input.log(`${providerLogPrefix}.response.created`, {
703
+ projectId: input.projectId,
704
+ sessionId: input.sessionId,
705
+ responseId: getString(response.id, ""),
706
+ });
707
+ input.appendEvent("voice.assistant.started", {
708
+ provider,
709
+ responseId: getString(response.id, ""),
710
+ });
711
+ return;
712
+ }
713
+ if (eventType === "response.output_audio.delta" || eventType === "response.audio.delta") {
714
+ const delta = getString(event.delta, "");
715
+ if (!delta)
716
+ return;
717
+ if (!streamSid && mediaProfile.provider !== "telnyx") {
718
+ dropOutboundAudioForMissingStreamSid(eventType, delta);
719
+ return;
720
+ }
721
+ if ((_b = input.isAiPaused) === null || _b === void 0 ? void 0 : _b.call(input)) {
722
+ input.log(`${providerLogPrefix}.audio.output.paused_for_handoff`, {
723
+ projectId: input.projectId,
724
+ sessionId: input.sessionId,
725
+ providerCallId: (_c = input.providerCallId) !== null && _c !== void 0 ? _c : null,
726
+ outboundMediaFrames: counters.outboundMediaFrames,
727
+ });
728
+ return;
729
+ }
730
+ const carrierOutput = (0, voice_media_bridge_1.fromAiOutputAudio)(audioRoute, delta);
731
+ const mediaMessage = (0, voice_media_bridge_1.createCarrierMediaMessage)(mediaProfile, streamSid, carrierOutput.payload);
732
+ const sent = mediaMessage ? sendJson(input.socket, mediaMessage) : false;
733
+ if (!sent)
734
+ return;
735
+ const itemId = getString(event.item_id, lastAssistantItemId !== null && lastAssistantItemId !== void 0 ? lastAssistantItemId : "");
736
+ if (itemId && itemId !== lastAssistantItemId) {
737
+ responseStartTimestampTwilio = latestMediaTimestamp;
738
+ lastAssistantItemAudioMs = 0;
739
+ playoutTracker.startAssistantItem(itemId || null);
740
+ }
741
+ counters.outboundMediaFrames += 1;
742
+ const signal = carrierOutput.signal;
743
+ counters.outboundAudioBytes += signal.bytes;
744
+ counters.outboundAudioPeakAbs = Math.max(counters.outboundAudioPeakAbs, signal.peakAbs);
745
+ counters.outboundAudioLastRms = signal.rms;
746
+ lastAssistantItemAudioMs += carrierOutput.durationMs;
747
+ if (responseStartTimestampTwilio === null)
748
+ responseStartTimestampTwilio = latestMediaTimestamp;
749
+ lastAssistantItemId = itemId;
750
+ const markName = `${providerLogPrefix}:${counters.outboundMediaFrames}`;
751
+ const markMessage = (0, voice_media_bridge_1.createCarrierMarkMessage)(mediaProfile, streamSid, markName);
752
+ if (markMessage)
753
+ sendJson(input.socket, markMessage);
754
+ markQueue.push(markName);
755
+ playoutTracker.recordOutboundFrame({ markName, itemId: itemId || null, durationMs: carrierOutput.durationMs });
756
+ (_d = input.onAudioFrame) === null || _d === void 0 ? void 0 : _d.call(input, {
757
+ source: "assistant",
758
+ payload: carrierOutput.payload,
759
+ metadata: { provider, frame: counters.outboundMediaFrames },
760
+ });
761
+ if (counters.outboundMediaFrames === 1 || counters.outboundMediaFrames % 50 === 0) {
762
+ input.log(`${providerLogPrefix}.audio.output.summary`, {
763
+ projectId: input.projectId,
764
+ sessionId: input.sessionId,
765
+ streamSid,
766
+ outboundMediaFrames: counters.outboundMediaFrames,
767
+ outboundAudioBytes: counters.outboundAudioBytes,
768
+ lastSignal: signal,
769
+ });
770
+ }
771
+ return;
772
+ }
773
+ if (eventType === "response.output_audio_transcript.delta" || eventType === "response.audio_transcript.delta") {
774
+ const delta = getRawString(event.delta);
775
+ currentAssistantTranscript += delta;
776
+ if (delta) {
777
+ input.appendEvent("voice.assistant.transcript.delta", {
778
+ provider,
779
+ delta,
780
+ }, delta);
781
+ }
782
+ return;
783
+ }
784
+ if (eventType === "response.output_audio_transcript.done" || eventType === "response.audio_transcript.done") {
785
+ const transcript = getRawString(event.transcript).trim();
786
+ if (transcript)
787
+ currentAssistantFinalTranscript = transcript;
788
+ return;
789
+ }
790
+ if (eventType === "response.output_audio.done" || eventType === "response.audio.done") {
791
+ input.log(`${providerLogPrefix}.audio.output.done`, {
792
+ projectId: input.projectId,
793
+ sessionId: input.sessionId,
794
+ responseId: getString(event.response_id, ""),
795
+ outboundMediaFrames: counters.outboundMediaFrames,
796
+ outboundAudioBytes: counters.outboundAudioBytes,
797
+ });
798
+ return;
799
+ }
800
+ if (eventType === "response.output_item.done") {
801
+ const item = getRecord(event.item);
802
+ const assistantText = extractOpenAIAssistantItemText(item);
803
+ if (assistantText)
804
+ currentAssistantFinalTranscript = assistantText;
805
+ if (item.type === "function_call" && getString(item.status, "") === "completed") {
806
+ void handleToolCall(item);
807
+ }
808
+ return;
809
+ }
810
+ if (eventType === "response.done") {
811
+ const response = getRecord(event.response);
812
+ const text = currentAssistantFinalTranscript
813
+ || extractOpenAIResponseText(response)
814
+ || currentAssistantTranscript.trim();
815
+ const status = getString(response.status, "");
816
+ if (text && status !== "cancelled" && counters.outboundMediaFrames === responseOutboundFramesAtStart) {
817
+ input.appendEvent("voice.provider.warning", {
818
+ provider,
819
+ reason: "assistant_text_without_outbound_audio",
820
+ responseId: getString(response.id, ""),
821
+ text,
822
+ streamSid,
823
+ counters,
824
+ }, "Assistant response produced text but no outbound phone audio");
825
+ }
826
+ currentAssistantTranscript = "";
827
+ currentAssistantFinalTranscript = "";
828
+ input.log(`${providerLogPrefix}.response.done`, {
829
+ projectId: input.projectId,
830
+ sessionId: input.sessionId,
831
+ responseId: getString(response.id, ""),
832
+ status,
833
+ statusDetails: (_e = response.status_details) !== null && _e !== void 0 ? _e : null,
834
+ outputTypes: summarizeOpenAIResponseOutputTypes(response),
835
+ counters,
836
+ });
837
+ input.appendEvent("voice.assistant.stopped", {
838
+ provider,
839
+ responseId: getString(response.id, ""),
840
+ status: getString(response.status, ""),
841
+ statusDetails: (_f = response.status_details) !== null && _f !== void 0 ? _f : undefined,
842
+ text: text || undefined,
843
+ usage: (_g = response.usage) !== null && _g !== void 0 ? _g : undefined,
844
+ counters,
845
+ }, text || undefined);
846
+ (_h = input.onAssistantTurnComplete) === null || _h === void 0 ? void 0 : _h.call(input);
847
+ }
848
+ };
849
/**
 * Handles one raw message received on the carrier media-stream socket.
 *
 * The message is parsed with `parseCarrierMediaStreamMessage` for the active
 * `mediaProfile` and dispatched on `parsed.event`:
 *  - "connected": logged only.
 *  - "start":     binds the stream id, resets per-call playback/buffer state,
 *                 registers the carrier audio sender and emits "voice.call.started".
 *  - "media":     counts the frame, reports it through `input.onAudioFrame`, then
 *                 drops it (AI paused), forwards it to the realtime provider, or
 *                 buffers it until the provider is ready.
 *  - "mark":      pops one pending mark and notifies the playout tracker.
 *  - "dtmf":      emits "voice.call.dtmf".
 *  - "stop":      emits "voice.call.ended", clears the carrier audio sender and
 *                 closes the provider socket.
 *
 * Any exception thrown while parsing or handling is caught here, reported via
 * `input.error` and recorded as a "voice.phone.codec.validation_failed" event,
 * so this handler never throws to the socket's event emitter.
 *
 * @param {string|{toString(): string}} data Raw socket payload; non-string values
 *   are converted with `toString()` before parsing.
 * @returns {void}
 */
const handleTwilioMessage = (data) => {
    // Temporaries kept from the compiled nullish-coalescing / optional-call expressions.
    var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o, _p, _q, _r, _s;
    try {
        const parsed = parseCarrierMediaStreamMessage(mediaProfile, typeof data === "string" ? data : data.toString());
        if (parsed.event === "connected") {
            input.log(`${mediaProfile.provider}.connected`, {
                projectId: input.projectId,
                sessionId: input.sessionId,
                protocol: parsed.protocol,
                version: parsed.version,
            });
        }
        else if (parsed.event === "start") {
            twilioStarted = true;
            // Prefer the stream id nested in the start payload, then the top-level one.
            bindStreamSid((_b = (_a = parsed.start.streamSid) !== null && _a !== void 0 ? _a : parsed.streamSid) !== null && _b !== void 0 ? _b : null, `${mediaProfile.provider}.start`);
            // A new stream starts from a clean slate: no in-flight assistant item,
            // no buffered caller audio and no outstanding marks.
            responseStartTimestampTwilio = null;
            latestMediaTimestamp = 0;
            lastAssistantItemId = null;
            lastAssistantItemAudioMs = 0;
            preReadyAudioFrames.length = 0;
            preReadyAudioHadSpeech = false;
            markQueue.length = 0;
            input.log(`${mediaProfile.provider}.start`, {
                projectId: input.projectId,
                sessionId: input.sessionId,
                providerCallId: (_d = (_c = parsed.start.callSid) !== null && _c !== void 0 ? _c : input.providerCallId) !== null && _d !== void 0 ? _d : null,
                streamSid,
                mediaFormat: parsed.start.mediaFormat,
            });
            input.updateSessionActive();
            // Hand the caller a function that pushes raw carrier audio onto this socket;
            // it returns false when no media message could be built or sent.
            (_e = input.onCarrierAudioSender) === null || _e === void 0 ? void 0 : _e.call(input, (payload) => {
                const mediaMessage = (0, voice_media_bridge_1.createCarrierMediaMessage)(mediaProfile, streamSid, payload);
                return mediaMessage ? sendJson(input.socket, mediaMessage) : false;
            });
            input.appendEvent("voice.call.started", {
                provider: mediaProfile.provider,
                providerCallId: (_g = (_f = parsed.start.callSid) !== null && _f !== void 0 ? _f : input.providerCallId) !== null && _g !== void 0 ? _g : null,
                streamSid,
                mediaFormat: parsed.start.mediaFormat,
            }, `${mediaProfile.provider} media stream started`);
            maybeGreet();
        }
        else if (parsed.event === "media") {
            bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.media`);
            const mediaTrack = getString(parsed.media.track, "");
            const mediaTrackKey = mediaTrack || "unknown";
            carrierMediaTracks[mediaTrackKey] = ((_h = carrierMediaTracks[mediaTrackKey]) !== null && _h !== void 0 ? _h : 0) + 1;
            // For Telnyx, frames explicitly tagged with a track other than "inbound" are
            // counted and skipped; the skip is logged on the first and every 250th frame.
            if (mediaProfile.provider === "telnyx" && mediaTrack && mediaTrack !== "inbound") {
                ignoredCarrierMediaFrames += 1;
                if (ignoredCarrierMediaFrames === 1 || ignoredCarrierMediaFrames % 250 === 0) {
                    input.log(`${mediaProfile.provider}.media.ignored`, {
                        projectId: input.projectId,
                        sessionId: input.sessionId,
                        providerCallId: (_j = input.providerCallId) !== null && _j !== void 0 ? _j : null,
                        track: mediaTrack,
                        ignoredCarrierMediaFrames,
                        carrierMediaTracks,
                    });
                }
                return;
            }
            counters.inboundMediaFrames += 1;
            latestInboundSignal = (0, voice_media_bridge_1.summarizeCarrierPayload)(mediaProfile, parsed.media.payload);
            const l16EndianSignals = summarizeL16EndianCandidates(mediaProfile, parsed.media.payload);
            // Keep the previous timestamp when the frame carries none or an unparsable/zero one.
            latestMediaTimestamp = Number.parseInt((_k = parsed.media.timestamp) !== null && _k !== void 0 ? _k : "0", 10) || latestMediaTimestamp;
            (_l = input.onAudioFrame) === null || _l === void 0 ? void 0 : _l.call(input, {
                source: "caller",
                payload: parsed.media.payload,
                metadata: Object.assign({ provider: mediaProfile.provider, frame: counters.inboundMediaFrames }, (l16EndianSignals ? { l16ByteOrder: l16EndianSignals.configured } : {})),
            });
            // Snapshot the drop counter so the "first drop" log below fires only on the
            // frame where the counter leaves zero. Comparing the counter to 1 directly
            // stays true on every later frame until a second drop, flooding the log.
            // NOTE(review): bufferPreReadyAudio is not visible here and may also bump
            // this counter; the transition check covers that case as well.
            const droppedBeforeThisFrame = counters.providerInputDroppedBeforeReady;
            if ((_m = input.isAiPaused) === null || _m === void 0 ? void 0 : _m.call(input)) {
                counters.providerInputDroppedBeforeReady += 1;
            }
            else if (openAI.readyState === ws_1.default.OPEN && phoneAudioReady) {
                appendOpenAIAudio(parsed.media.payload);
            }
            else {
                bufferPreReadyAudio(parsed.media.payload);
            }
            const firstDropOnThisFrame = droppedBeforeThisFrame === 0
                && counters.providerInputDroppedBeforeReady > 0;
            if (counters.inboundMediaFrames === 1
                || counters.inboundMediaFrames % 250 === 0
                || firstDropOnThisFrame) {
                input.log(`${providerLogPrefix}.audio.input.summary`, Object.assign(Object.assign({
                    projectId: input.projectId,
                    sessionId: input.sessionId,
                    providerCallId: (_o = input.providerCallId) !== null && _o !== void 0 ? _o : null,
                    inboundMediaFrames: counters.inboundMediaFrames,
                    providerInputFrames: counters.providerInputFrames,
                    providerInputDroppedBeforeReady: counters.providerInputDroppedBeforeReady,
                    providerInputBufferedBeforeReady: preReadyAudioFrames.length,
                    preReadyAudioHadSpeech,
                    latestInboundSignal,
                }, (l16EndianSignals ? { l16EndianSignals } : {})), {
                    carrierMediaTracks,
                    ignoredCarrierMediaFrames,
                    mediaTrack: mediaTrack || null,
                    providerConnected,
                    phoneAudioReady,
                }));
            }
            if (counters.inboundMediaFrames === 1 || counters.inboundMediaFrames % 250 === 0) {
                input.log(`${mediaProfile.provider}.media.summary`, Object.assign(Object.assign({
                    projectId: input.projectId,
                    sessionId: input.sessionId,
                    inboundMediaFrames: counters.inboundMediaFrames,
                    latestInboundSignal,
                }, (l16EndianSignals ? { l16EndianSignals } : {})), {
                    carrierMediaTracks,
                    ignoredCarrierMediaFrames,
                }));
                input.appendEvent("voice.call.audio.summary", {
                    provider: mediaProfile.provider,
                    mediaProfile: mediaProfile.id,
                    inboundMediaFrames: counters.inboundMediaFrames,
                });
            }
        }
        else if (parsed.event === "mark") {
            bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.mark`);
            counters.marks += 1;
            const markName = getString((_p = parsed.mark) === null || _p === void 0 ? void 0 : _p.name, "");
            // The oldest pending mark is popped regardless of the echoed name; the
            // name itself is only passed on to the playout tracker.
            if (markQueue.length > 0)
                markQueue.shift();
            playoutTracker.recordCarrierMark(markName);
        }
        else if (parsed.event === "dtmf") {
            bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.dtmf`);
            counters.dtmf += 1;
            input.appendEvent("voice.call.dtmf", {
                provider: mediaProfile.provider,
                digit: parsed.dtmf.digit,
            }, `DTMF ${parsed.dtmf.digit}`);
        }
        else if (parsed.event === "stop") {
            bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.stop`);
            input.log(`${mediaProfile.provider}.stop`, {
                projectId: input.projectId,
                sessionId: input.sessionId,
                streamSid,
                counters,
            });
            input.appendEvent("voice.call.ended", {
                provider: mediaProfile.provider,
                providerCallId: (_r = (_q = parsed.stop.callSid) !== null && _q !== void 0 ? _q : input.providerCallId) !== null && _r !== void 0 ? _r : null,
                streamSid,
                counters,
            }, `${mediaProfile.provider} media stream stopped`);
            // Passing null withdraws the sender that was registered on "start".
            (_s = input.onCarrierAudioSender) === null || _s === void 0 ? void 0 : _s.call(input, null);
            // Best-effort shutdown: a failure to close the provider socket is ignored on purpose.
            try {
                openAI.close();
            }
            catch (_t) { }
        }
    }
    catch (error) {
        // Every failure inside the handler (not only parse errors) is reported under
        // these names; the runtime strings are kept unchanged for existing consumers.
        input.error(`${mediaProfile.provider}.message.parse.failed`, error, {
            projectId: input.projectId,
            sessionId: input.sessionId,
            mediaProfile: mediaProfile.id,
        });
        input.appendEvent("voice.phone.codec.validation_failed", {
            provider: mediaProfile.provider,
            mediaProfile: mediaProfile.id,
            error: error instanceof Error ? error.message : String(error),
        });
    }
};
996
+ input.socket.on("message", handleTwilioMessage);
997
+ openAI.on("message", handleOpenAIMessage);
998
+ openAI.on("close", (code, reason) => {
999
+ unregisterPlayoutIdleWaiter === null || unregisterPlayoutIdleWaiter === void 0 ? void 0 : unregisterPlayoutIdleWaiter();
1000
+ input.log(`${providerLogPrefix}.closed`, {
1001
+ projectId: input.projectId,
1002
+ sessionId: input.sessionId,
1003
+ code,
1004
+ reason: reason.toString(),
1005
+ counters,
1006
+ });
1007
+ input.appendEvent("voice.provider.closed", {
1008
+ provider,
1009
+ code,
1010
+ reason: reason.toString(),
1011
+ counters,
1012
+ });
1013
+ });
1014
+ openAI.on("error", (error) => {
1015
+ input.error(`${providerLogPrefix}.socket.error`, error, { projectId: input.projectId, sessionId: input.sessionId });
1016
+ input.appendEvent("voice.provider.error", {
1017
+ provider,
1018
+ error: serializeError(error),
1019
+ });
1020
+ });
1021
+ await new Promise((resolve, reject) => {
1022
+ openAI.once("open", () => {
1023
+ var _a, _b;
1024
+ providerConnected = true;
1025
+ sendOpenAIEvent(openAI, createOpenAITwilioRealtimeSessionUpdate(input.snapshot, model, (_a = input.defaultVoice) !== null && _a !== void 0 ? _a : "alloy", provider, mediaProfile, audioRoute));
1026
+ input.log(`${providerLogPrefix}.connected`, {
1027
+ projectId: input.projectId,
1028
+ sessionId: input.sessionId,
1029
+ providerCallId: (_b = input.providerCallId) !== null && _b !== void 0 ? _b : null,
1030
+ model,
1031
+ toolCount: tools.length,
1032
+ tools: tools.map((tool) => ({ name: tool.name, description: tool.description })),
1033
+ bridgeMode,
1034
+ humanization,
1035
+ mediaProfile: mediaProfile.id,
1036
+ waitingForPhoneAudioReady: !phoneAudioReady,
1037
+ });
1038
+ input.appendEvent("voice.provider.connected", {
1039
+ provider,
1040
+ model,
1041
+ toolCount: tools.length,
1042
+ tools: tools.map((tool) => tool.name),
1043
+ bridgeMode,
1044
+ mediaProfile: mediaProfile.id,
1045
+ });
1046
+ maybeGreet();
1047
+ resolve();
1048
+ });
1049
+ openAI.once("error", reject);
1050
+ });
1051
+ return {
1052
+ sendTextTurn: (text) => {
1053
+ if (openAI.readyState !== ws_1.default.OPEN)
1054
+ return false;
1055
+ sendOpenAIUserTextTurn(openAI, text);
1056
+ return true;
1057
+ },
1058
+ close: () => {
1059
+ var _a;
1060
+ input.socket.off("message", handleTwilioMessage);
1061
+ (_a = input.onCarrierAudioSender) === null || _a === void 0 ? void 0 : _a.call(input, null);
1062
+ unregisterPlayoutIdleWaiter === null || unregisterPlayoutIdleWaiter === void 0 ? void 0 : unregisterPlayoutIdleWaiter();
1063
+ try {
1064
+ openAI.close();
1065
+ }
1066
+ catch (_b) { }
1067
+ },
1068
+ };
1069
+ }
1070
/**
 * Runs the realtime phone bridge against the xAI realtime endpoint.
 *
 * This is a thin preset over `runOpenAITwilioRealtimeBridge`: the caller's
 * `input` is copied and the xAI-specific options (provider id, websocket URL,
 * default model/voice, bridge mode, and truncate support disabled) are layered
 * on top, overriding any same-named fields on `input`.
 *
 * @param {object} input Bridge options; `input.snapshot.runtime.model`, when it
 *   yields a string via `getString`, selects the model placed in the URL.
 * @returns {Promise<*>} Whatever `runOpenAITwilioRealtimeBridge` resolves to.
 */
async function runXAITwilioRealtimeBridge(input) {
    const fallbackModel = "grok-voice-think-fast-1.0";
    const runtime = input.snapshot.runtime;
    const configuredModel = runtime === null || runtime === undefined ? undefined : runtime.model;
    const model = getString(configuredModel, fallbackModel);
    const xaiPreset = {
        provider: "xai-realtime",
        // The model name is URL-encoded because it is interpolated into the query string.
        realtimeUrl: `wss://api.x.ai/v1/realtime?model=${encodeURIComponent(model)}`,
        defaultModel: fallbackModel,
        defaultVoice: "eve",
        bridgeMode: "xai-twilio-realtime",
        supportsConversationItemTruncate: false,
    };
    return runOpenAITwilioRealtimeBridge(Object.assign({}, input, xaiPreset));
}