@kognitivedev/backend-cloud 0.2.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.turbo/turbo-build.log +2 -0
  2. package/.turbo/turbo-test.log +14 -0
  3. package/CHANGELOG.md +11 -0
  4. package/README.md +88 -0
  5. package/dist/cloud-voice-parameters.d.ts +11 -0
  6. package/dist/cloud-voice-parameters.js +219 -0
  7. package/dist/cloud-voice-prompt-service.d.ts +24 -0
  8. package/dist/cloud-voice-prompt-service.js +382 -0
  9. package/dist/cloud-voice-runtime-service.d.ts +73 -0
  10. package/dist/cloud-voice-runtime-service.js +443 -0
  11. package/dist/cloud-voice.d.ts +36 -0
  12. package/dist/cloud-voice.js +683 -0
  13. package/dist/index.d.ts +10 -0
  14. package/dist/index.js +26 -0
  15. package/dist/phone-control.d.ts +50 -0
  16. package/dist/phone-control.js +97 -0
  17. package/dist/phone-runtime/audio-playout-tracker.d.ts +51 -0
  18. package/dist/phone-runtime/audio-playout-tracker.js +93 -0
  19. package/dist/phone-runtime/openai-twilio-realtime.d.ts +95 -0
  20. package/dist/phone-runtime/openai-twilio-realtime.js +1074 -0
  21. package/dist/tools.d.ts +2 -0
  22. package/dist/tools.js +216 -0
  23. package/dist/types.d.ts +468 -0
  24. package/dist/types.js +2 -0
  25. package/dist/utils.d.ts +3 -0
  26. package/dist/utils.js +14 -0
  27. package/package.json +47 -0
  28. package/src/__tests__/audio-playout-tracker.test.ts +46 -0
  29. package/src/__tests__/cloud-voice.test.ts +1006 -0
  30. package/src/__tests__/openai-twilio-realtime.test.ts +1193 -0
  31. package/src/__tests__/phone-control.test.ts +105 -0
  32. package/src/cloud-voice-parameters.ts +236 -0
  33. package/src/cloud-voice-prompt-service.ts +493 -0
  34. package/src/cloud-voice-runtime-service.ts +465 -0
  35. package/src/cloud-voice.ts +831 -0
  36. package/src/index.ts +10 -0
  37. package/src/phone-control.ts +156 -0
  38. package/src/phone-runtime/audio-playout-tracker.ts +132 -0
  39. package/src/phone-runtime/openai-twilio-realtime.ts +1250 -0
  40. package/src/tools.ts +227 -0
  41. package/src/types.ts +529 -0
  42. package/src/utils.ts +11 -0
  43. package/tsconfig.json +13 -0
@@ -0,0 +1,1250 @@
1
+ import { randomUUID } from "crypto";
2
+ import NodeWebSocket, { type RawData, type WebSocket } from "ws";
3
+ import {
4
+ normalizeCloudVoiceHumanizationConfig,
5
+ toOpenAITurnDetection,
6
+ } from "../cloud-voice-runtime-service";
7
+ import { resolveCloudVoiceProviderSystemPrompt } from "../cloud-voice-prompt-service";
8
+ import type {
9
+ CloudVoiceFunctionToolManifest,
10
+ PreparedCloudVoiceConfig,
11
+ } from "../types";
12
+ import { PhonePlayoutTracker } from "./audio-playout-tracker";
13
+ import {
14
+ parseTwilioMediaStreamMessage,
15
+ } from "@kognitivedev/telephony";
16
+ import {
17
+ base64ToPcm16Be,
18
+ base64ToPcm16Le,
19
+ createCarrierClearMessage,
20
+ createCarrierMarkMessage,
21
+ createCarrierMediaMessage,
22
+ createPhoneAudioRoute,
23
+ fromAiOutputAudio,
24
+ getPhoneMediaProfile,
25
+ isLikelySpeechPayload,
26
+ isOpenAIRealtimeAudioReady,
27
+ isRealtimeAudioReadyForRoute,
28
+ realtimeAudioFormatForRoute,
29
+ realtimeOutputAudioFormatForRoute,
30
+ summarizeCarrierPayload,
31
+ summarizePcm16Signal as summarizeVoiceMediaPcm16Signal,
32
+ toAiInputAudio,
33
+ type PhoneAudioRoute,
34
+ type PhoneMediaProfile,
35
+ } from "@kognitivedev/voice-media-bridge";
36
+
37
/**
 * Loosely-typed view of a prepared cloud-voice configuration as consumed by
 * the realtime phone bridge. Every section is optional and partial, so
 * readers must tolerate missing fields.
 */
export type OpenAITwilioRealtimeSnapshot = {
  /** Runtime settings; `model`, `voice` and `providerOptions` are read in this file. */
  runtime?: Partial<PreparedCloudVoiceConfig["runtime"]>;
  /** Voice settings (voice name, turn detection, transcription, noise reduction). */
  voiceConfig?: Partial<PreparedCloudVoiceConfig["voiceConfig"]>;
  /** Function tools to expose to the provider; entries without a name are ignored. */
  toolManifest?: Partial<CloudVoiceFunctionToolManifest>[];
  /** Free-form configuration; `humanization` is read from it by the bridge. */
  config?: Record<string, unknown>;
};
43
+
44
/**
 * Running tallies kept by the bridge for one call and attached to log and
 * event payloads.
 */
export type OpenAITwilioRealtimeCounters = {
  // --- caller -> provider ---
  /** Media frames received from the carrier (presumably; the increment is outside this view). */
  inboundMediaFrames: number;
  /** Audio frames appended to the provider's input buffer. */
  providerInputFrames: number;
  /** Frames evicted from the pre-ready buffer because it was full. */
  providerInputDroppedBeforeReady: number;

  // --- provider -> caller ---
  outboundMediaFrames: number;
  /** Assistant audio deltas discarded because no carrier stream id was bound. */
  outboundAudioDroppedMissingStreamSid: number;
  outboundAudioBytes: number;
  outboundAudioPeakAbs: number;
  outboundAudioLastRms: number;

  // --- control traffic ---
  /** Carrier "clear" commands issued on interruption. */
  clears: number;
  /** `conversation.item.truncate` events sent to the provider. */
  truncates: number;
  marks: number;
  dtmf: number;
  /** Interruptions that were acted upon. */
  interruptions: number;
  /** Interruptions that were suppressed (too early or too quiet). */
  ignoredInterruptions: number;
  /** Tool invocations requested by the provider. */
  toolCalls: number;
  /** Provider messages that parsed successfully as JSON. */
  providerMessages: number;
};
62
+
63
/** Structured logger: an event name plus an optional bag of context fields. */
type BridgeLog = (
  event: string,
  data?: Record<string, unknown>,
) => void;

/** Structured error reporter: event name, the thrown/received error, and optional context. */
type BridgeError = (
  event: string,
  error: unknown,
  data?: Record<string, unknown>,
) => void;
65
+
66
/** Cap on caller audio frames held while the provider session is not ready; the oldest frame is evicted beyond this. */
const MAX_PRE_READY_AUDIO_FRAMES = 250;

/** RMS / peak thresholds passed to the speech heuristic for caller audio. */
const LIKELY_SPEECH_RMS = 400;
const LIKELY_SPEECH_PEAK_ABS = 2500;

/** An interruption is ignored unless at least this much assistant audio (ms) was produced for the current item. */
const MIN_INTERRUPT_ASSISTANT_AUDIO_MS = 240;
/** An interruption is ignored unless at least this much media time (ms) elapsed since the response started. */
const MIN_INTERRUPT_ELAPSED_MS = 120;
/** An interruption is ignored when the latest inbound signal is below BOTH of these levels. */
const MIN_INTERRUPT_SPEECH_RMS = 220;
const MIN_INTERRUPT_SPEECH_PEAK_ABS = 1400;

/** Turn detection used for the OpenAI path when the voice config yields `undefined`. */
const DEFAULT_PHONE_TURN_DETECTION = {
  type: "server_vad",
  create_response: true,
  interrupt_response: true,
};

/** Baseline turn detection for the xAI path; individual numeric fields may be overridden by the voice config. */
const DEFAULT_XAI_PHONE_TURN_DETECTION = {
  type: "server_vad",
  threshold: 0.55,
  prefix_padding_ms: 160,
  silence_duration_ms: 450,
};
84
+
85
/**
 * Reduces a turn-detection setting to the subset of fields the phone bridge
 * forwards to OpenAI: `type`, the two response flags, and (for semantic VAD
 * only) `eagerness`.
 *
 * @param value Raw turn-detection setting from the voice config.
 * @returns The reduced config, or the falsy result of `toOpenAITurnDetection`
 *          unchanged.
 */
function toOpenAIPhoneTurnDetection(value: unknown) {
  const normalized = toOpenAITurnDetection(value);
  // Falsy results are handed back as-is so the caller can tell them apart.
  if (!normalized) return normalized;

  const type = getString(normalized.type);
  const phoneConfig: Record<string, unknown> = { type };

  if (typeof normalized.create_response === "boolean") {
    phoneConfig.create_response = normalized.create_response;
  }
  if (typeof normalized.interrupt_response === "boolean") {
    phoneConfig.interrupt_response = normalized.interrupt_response;
  }
  // Eagerness is only forwarded for semantic VAD.
  if (type === "semantic_vad" && typeof normalized.eagerness === "string") {
    phoneConfig.eagerness = normalized.eagerness;
  }
  return phoneConfig;
}
97
+
98
/**
 * Builds the xAI turn-detection block from a raw setting.
 *
 * `null` is passed through. Anything whose `type` is not "server_vad" yields
 * a copy of the xAI defaults. Otherwise the defaults are overlaid with any
 * numeric `threshold`, `prefix_padding_ms` / `prefixPaddingMs` and
 * `silence_duration_ms` / `silenceDurationMs` (snake_case wins over camelCase).
 *
 * @param value Raw turn-detection setting from the voice config.
 * @returns `null`, or a fresh config object.
 */
function toXAIPhoneTurnDetection(value: unknown) {
  if (value === null) return null;

  const source = getRecord(value);
  const merged: Record<string, unknown> = { ...DEFAULT_XAI_PHONE_TURN_DETECTION };
  if (getString(source.type, "server_vad") !== "server_vad") return merged;

  // First candidate that is a number, in priority order.
  const firstNumber = (...candidates: unknown[]) =>
    candidates.find((candidate): candidate is number => typeof candidate === "number");

  const threshold = firstNumber(source.threshold);
  if (threshold !== undefined) merged.threshold = threshold;

  const prefixPadding = firstNumber(source.prefix_padding_ms, source.prefixPaddingMs);
  if (prefixPadding !== undefined) merged.prefix_padding_ms = prefixPadding;

  const silenceDuration = firstNumber(source.silence_duration_ms, source.silenceDurationMs);
  if (silenceDuration !== undefined) merged.silence_duration_ms = silenceDuration;

  return merged;
}
120
+
121
/** Everything `runOpenAITwilioRealtimeBridge` needs to bridge one phone call. */
export type OpenAITwilioRealtimeBridgeInput = {
  /** Carrier-side media stream socket; carrier control messages are written to it. */
  socket: WebSocket;
  /** Sent as a Bearer token when opening the provider websocket. */
  apiKey: string;
  /** Realtime provider; treated as "openai-realtime" when omitted. */
  provider?: "openai-realtime" | "xai-realtime";
  /** Provider websocket URL; defaults to the OpenAI realtime endpoint for the chosen model. */
  realtimeUrl?: string;
  /** Model used when the snapshot does not name one. */
  defaultModel?: string;
  defaultVoice?: string;
  /** Carrier media profile; defaults to the "twilio-pcmu-8k" profile. */
  mediaProfile?: PhoneMediaProfile;
  /** Audio route; derived from the media profile and provider when omitted. */
  audioRoute?: PhoneAudioRoute;
  bridgeMode?: string;
  /** Whether the provider accepts `conversation.item.truncate`; defaults to true only for OpenAI. */
  supportsConversationItemTruncate?: boolean;
  projectId: string;
  sessionId: string;
  providerCallId?: string | null;
  snapshot: OpenAITwilioRealtimeSnapshot;
  /** `null` suppresses the opening turn; `undefined` uses a built-in greeting instruction. */
  initialPrompt?: string | null;
  log: BridgeLog;
  error: BridgeError;
  /** Records a call event with an optional payload and human-readable message. */
  appendEvent: (
    eventType: string,
    payload?: Record<string, unknown>,
    message?: string,
  ) => void;
  /** Invoked when a new carrier stream id is bound. */
  updateCallLegActive: (metadata: { streamSid: string | null }) => void;
  updateSessionActive: () => void;
  onAudioFrame?: (input: {
    source: "caller" | "assistant";
    payload: string;
    metadata?: Record<string, unknown>;
  }) => void;
  onCarrierAudioSender?: (sender: ((payload: string) => boolean) | null) => void;
  isAiPaused?: () => boolean;
  onAssistantTurnComplete?: () => void;
  /** Receives a waiter backed by the playout tracker; returns an unregister function. */
  registerPlayoutIdleWaiter?: (
    waiter: (input?: { timeoutMs?: number; quietMs?: number }) => Promise<unknown>,
  ) => () => void;
  /** Runs a tool requested by the provider; `result` is serialized back to the provider. */
  executeTool: (input: {
    toolId: string;
    args: unknown;
    toolCallId?: string | null;
    providerCallId?: string | null;
  }) => Promise<{ result: unknown }>;
};
158
+
159
/**
 * Coerces an arbitrary value into a plain key/value record.
 *
 * @returns `value` itself when it is a non-null, non-array object;
 *          otherwise a fresh empty object.
 */
function getRecord(value: unknown): Record<string, unknown> {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
162
+
163
/**
 * Returns the trimmed form of `value` when it is a string with
 * non-whitespace content; otherwise returns `fallback`.
 */
function getString(value: unknown, fallback = "") {
  if (typeof value !== "string") return fallback;
  const trimmed = value.trim();
  return trimmed ? trimmed : fallback;
}
166
+
167
/** Returns `value` untouched when it is a string (no trimming); otherwise "". */
function getRawString(value: unknown) {
  if (typeof value === "string") {
    return value;
  }
  return "";
}
170
+
171
/**
 * Decodes tool-call arguments. Non-string input is returned as-is; a string
 * is JSON-parsed, and returned verbatim when it is not valid JSON.
 */
function normalizeToolInput(value: unknown) {
  if (typeof value === "string") {
    try {
      return JSON.parse(value);
    } catch {
      // Not JSON: hand the raw text to the tool.
      return value;
    }
  }
  return value;
}
179
+
180
/**
 * Converts a tool result into the string placed in a `function_call_output`
 * item.
 *
 * Strings pass through unchanged; everything else is JSON-encoded. Unlike a
 * bare `JSON.stringify`, this always returns a string:
 * - `undefined`, functions and symbols (for which `JSON.stringify` returns
 *   `undefined`) become `"null"`, so the `output` field is never omitted;
 * - a top-level bigint becomes its decimal text;
 * - any other value that cannot be encoded (circular structure, nested
 *   bigint, throwing `toJSON`) becomes a small JSON error object instead of
 *   throwing into the caller.
 *
 * @param value Result returned by the tool.
 * @returns A string that is safe to send to the provider.
 */
function serializeToolResult(value: unknown): string {
  if (typeof value === "string") return value;
  try {
    const encoded: unknown = JSON.stringify(value);
    return typeof encoded === "string" ? encoded : "null";
  } catch {
    if (typeof value === "bigint") return value.toString();
    return JSON.stringify({ error: "Tool result could not be serialized" });
  }
}
183
+
184
/**
 * Flattens an `Error` into a plain `{ name, message, stack }` object so it
 * survives JSON encoding; any other value is returned unchanged.
 */
function serializeError(value: unknown) {
  if (!(value instanceof Error)) return value;
  const { name, message, stack } = value;
  return { name, message, stack };
}
194
+
195
/**
 * JSON-encodes `message` and writes it to `socket`, but only while the
 * socket is in the OPEN state.
 *
 * @returns `true` when the message was handed to the socket, `false` when
 *          it was skipped because the socket is not open.
 */
function sendJson(socket: WebSocket | NodeWebSocket, message: unknown) {
  const isOpen = socket.readyState === NodeWebSocket.OPEN;
  if (isOpen) {
    socket.send(JSON.stringify(message));
  }
  return isOpen;
}
200
+
201
/**
 * Sends one realtime client event to the provider socket.
 *
 * @returns Whatever `sendJson` reports: `false` when the socket was not open.
 */
function sendOpenAIEvent(socket: NodeWebSocket, event: Record<string, unknown>) {
  const delivered = sendJson(socket, event);
  return delivered;
}
204
+
205
/**
 * Computes basic level statistics for a buffer of 16-bit PCM samples.
 *
 * @param samples Signed 16-bit samples.
 * @returns Sample count, largest absolute sample value, and the RMS level
 *          rounded to the nearest integer. All zeros for an empty buffer.
 */
function summarizePcm16Signal(samples: Int16Array) {
  const count = samples.length;
  if (count === 0) {
    return { samples: 0, peakAbs: 0, rms: 0 };
  }

  let peakAbs = 0;
  let energy = 0;
  for (let index = 0; index < count; index += 1) {
    const sample = samples[index];
    peakAbs = Math.max(peakAbs, Math.abs(sample));
    energy += sample * sample;
  }

  const rms = Math.round(Math.sqrt(energy / count));
  return { samples: count, peakAbs, rms };
}
222
+
223
/**
 * Summarizes a base64 carrier payload using the "twilio-pcmu-8k" media
 * profile.
 *
 * @param base64Audio Base64-encoded carrier audio payload.
 */
export function summarizeTwilioPcmuSignal(base64Audio: string) {
  const twilioProfile = getPhoneMediaProfile("twilio-pcmu-8k");
  return summarizeCarrierPayload(twilioProfile, base64Audio);
}
226
+
227
/**
 * Applies the module's speech thresholds to a base64 payload interpreted
 * with the "twilio-pcmu-8k" media profile.
 */
function isLikelySpeechTwilioPcmu(base64Audio: string) {
  const twilioProfile = getPhoneMediaProfile("twilio-pcmu-8k");
  const thresholds = {
    rms: LIKELY_SPEECH_RMS,
    peakAbs: LIKELY_SPEECH_PEAK_ABS,
  };
  return isLikelySpeechPayload(twilioProfile, base64Audio, thresholds);
}
233
+
234
/**
 * For L16 profiles, summarizes the payload decoded both as big-endian and as
 * little-endian PCM so the two interpretations can be compared.
 *
 * @returns `null` for non-L16 codecs; otherwise the configured byte order
 *          ("be" when the profile does not set one) plus both summaries.
 */
function summarizeL16EndianCandidates(profile: PhoneMediaProfile, payload: string) {
  if (profile.codec !== "l16") {
    return null;
  }

  const byteCount = Buffer.byteLength(payload, "base64");
  const bigEndian = summarizeVoiceMediaPcm16Signal(base64ToPcm16Be(payload), byteCount);
  const littleEndian = summarizeVoiceMediaPcm16Signal(base64ToPcm16Le(payload), byteCount);

  return {
    configured: profile.carrierByteOrder ?? "be",
    be: bigEndian,
    le: littleEndian,
  };
}
243
+
244
/**
 * Produces a compact, loggable description of a provider audio format.
 *
 * @returns The value itself when it is a string; `{ type }` or
 *          `{ type, rate }` when it is an object with a non-empty `type`
 *          (the rate is included only when it is a non-zero number);
 *          otherwise `null`.
 */
function summarizeOpenAIAudioFormat(value: unknown) {
  if (typeof value === "string") return value;

  const format = getRecord(value);
  const type = getString(format.type, "");
  if (!type) return null;

  const rate = format.rate;
  if (typeof rate === "number" && rate) {
    return { type, rate };
  }
  return { type };
}
251
+
252
/**
 * Chooses the `reasoning` block for models whose name is `gpt-realtime-2`
 * or starts with `gpt-realtime-2-`.
 *
 * @returns `null` for any other model. Otherwise the non-empty
 *          `providerOptions.reasoning` object when present, else
 *          `{ effort }` taken from `providerOptions.reasoningEffort` with
 *          "low" as the fallback.
 */
function createOpenAIRealtimeReasoningConfig(snapshot: OpenAITwilioRealtimeSnapshot, model: string) {
  const isReasoningModel = /^gpt-realtime-2(?:$|-)/.test(model);
  if (!isReasoningModel) return null;

  const options = getRecord(snapshot.runtime?.providerOptions);

  // An explicit reasoning object takes precedence over the effort shorthand.
  const explicit = getRecord(options.reasoning);
  if (Object.keys(explicit).length > 0) {
    return explicit;
  }

  return { effort: getString(options.reasoningEffort, "low") };
}
261
+
262
/**
 * Reports whether a provider event confirms audio readiness for the given
 * media profile, as judged by `isOpenAIRealtimeAudioReady`.
 *
 * @param event Provider event to inspect.
 * @param mediaProfile Carrier media profile; defaults to "twilio-pcmu-8k".
 */
export function isOpenAITwilioRealtimeReady(
  event: unknown,
  mediaProfile: PhoneMediaProfile = getPhoneMediaProfile("twilio-pcmu-8k"),
) {
  const ready = isOpenAIRealtimeAudioReady(event, mediaProfile);
  return ready;
}
265
+
266
/**
 * Converts the snapshot's tool manifest into realtime function-tool
 * definitions.
 *
 * Entries without a usable name are dropped. A missing description falls
 * back to the tool name, and missing parameters fall back to an open object
 * schema.
 *
 * @param snapshot Prepared voice configuration snapshot.
 * @returns One `{ type: "function", name, description, parameters }` entry
 *          per named manifest item, in manifest order.
 */
export function createOpenAITwilioRealtimeTools(snapshot: OpenAITwilioRealtimeSnapshot) {
  const manifests = snapshot.toolManifest ?? [];
  const named = manifests.filter((manifest) => getString(manifest.name, "") !== "");

  return named.map((manifest) => {
    const name = getString(manifest.name, "");
    return {
      type: "function",
      name,
      description: getString(manifest.description, name),
      parameters: manifest.parameters ?? { type: "object", additionalProperties: true },
    };
  });
}
276
+
277
/**
 * Builds the `session.update` event that configures a realtime provider
 * session for a phone call.
 *
 * The two providers use different payload shapes:
 * - OpenAI: `type`/`model`/`output_modalities`, the voice under
 *   `audio.output`, turn detection under `audio.input`, an optional
 *   `reasoning` block, and `tool_choice: "auto"` when tools exist.
 * - xAI: `voice` and `turn_detection` at the top level of the session, and
 *   no `tool_choice`.
 *
 * The default OpenAI turn detection is copied into the payload rather than
 * referenced, so mutating the returned object cannot alter the module-level
 * default used by later sessions.
 *
 * @param snapshot Prepared voice configuration snapshot.
 * @param model Model identifier (sent for OpenAI; also selects reasoning support).
 * @param defaultVoice Voice used when neither the voice config nor the runtime names one.
 * @param provider Target realtime provider.
 * @param mediaProfile Carrier media profile; only used to derive the default `audioRoute`.
 * @param audioRoute Route that determines the input and output audio formats.
 * @returns `{ type: "session.update", session }`.
 */
export function createOpenAITwilioRealtimeSessionUpdate(
  snapshot: OpenAITwilioRealtimeSnapshot,
  model: string,
  defaultVoice = "alloy",
  provider: "openai-realtime" | "xai-realtime" = "openai-realtime",
  mediaProfile: PhoneMediaProfile = getPhoneMediaProfile("twilio-pcmu-8k"),
  audioRoute: PhoneAudioRoute = createPhoneAudioRoute(mediaProfile, provider),
) {
  const voiceConfig = snapshot.voiceConfig ?? {};
  const inputConfig: Record<string, unknown> = {
    format: realtimeAudioFormatForRoute(audioRoute),
  };
  const turnDetection = provider === "xai-realtime"
    ? toXAIPhoneTurnDetection(voiceConfig.turnDetection)
    : toOpenAIPhoneTurnDetection(voiceConfig.turnDetection);
  if (provider !== "xai-realtime") {
    // `undefined` means "not configured" and selects the default; an explicit
    // `null` is forwarded as-is. The default is copied so callers never hold
    // a reference to the shared module-level object.
    inputConfig.turn_detection = typeof turnDetection === "undefined"
      ? { ...DEFAULT_PHONE_TURN_DETECTION }
      : turnDetection;
  }
  if (voiceConfig.transcription && typeof voiceConfig.transcription === "object") {
    inputConfig.transcription = voiceConfig.transcription;
  }
  if (voiceConfig.inputNoiseReduction && typeof voiceConfig.inputNoiseReduction === "object") {
    inputConfig.noise_reduction = voiceConfig.inputNoiseReduction;
  }

  const tools = createOpenAITwilioRealtimeTools(snapshot);
  const voice = getString(voiceConfig.voice ?? snapshot.runtime?.voice, defaultVoice);
  // Reasoning is only ever attached for the OpenAI provider.
  const reasoning = provider === "openai-realtime" ? createOpenAIRealtimeReasoningConfig(snapshot, model) : null;
  const instructions = resolveCloudVoiceProviderSystemPrompt({
    voiceConfig,
    config: snapshot.config,
  });
  const session: Record<string, unknown> = provider === "xai-realtime" ? {
    instructions,
    voice,
    audio: {
      input: inputConfig,
      output: {
        format: realtimeOutputAudioFormatForRoute(audioRoute),
      },
    },
  } : {
    type: "realtime",
    model,
    instructions,
    output_modalities: ["audio"],
    ...(reasoning ? { reasoning } : {}),
    audio: {
      input: inputConfig,
      output: {
        format: realtimeOutputAudioFormatForRoute(audioRoute),
        voice,
      },
    },
  };
  if (provider === "xai-realtime") {
    // xAI takes turn detection at the session level, not under audio.input.
    session.turn_detection = typeof turnDetection === "undefined"
      ? { type: "server_vad" }
      : turnDetection;
  }
  if (tools.length > 0) {
    session.tools = tools;
    if (provider !== "xai-realtime") {
      session.tool_choice = "auto";
    }
  }

  return {
    type: "session.update",
    session,
  };
}
351
+
352
/**
 * Injects a user text message into the conversation and then asks the
 * provider to respond to it. Both sends are skipped without error when the
 * socket is not open.
 */
function sendOpenAIUserTextTurn(socket: NodeWebSocket, text: string) {
  const userMessage = {
    type: "message",
    role: "user",
    content: [{ type: "input_text", text }],
  };
  sendOpenAIEvent(socket, { type: "conversation.item.create", item: userMessage });
  sendOpenAIEvent(socket, { type: "response.create" });
}
363
+
364
/**
 * Lists the `type` of each item in a response's `output` array, using
 * "unknown" for items without one. Returns an empty list when `output` is
 * not an array.
 */
function summarizeOpenAIResponseOutputTypes(response: unknown) {
  const { output } = getRecord(response);
  if (!Array.isArray(output)) {
    return [];
  }
  const types: string[] = [];
  for (const item of output) {
    types.push(getString(getRecord(item).type, "unknown"));
  }
  return types;
}
369
+
370
/**
 * Collects readable text from a message's content parts.
 *
 * For each part the trimmed `transcript` is preferred, falling back to the
 * trimmed `text`; parts with neither are skipped. The pieces are joined
 * with newlines.
 *
 * @returns The combined text, or "" when `content` is not an array.
 */
function extractOpenAIContentText(content: unknown) {
  if (!Array.isArray(content)) return "";

  const pieces: string[] = [];
  for (const part of content) {
    const record = getRecord(part);
    const spoken = getRawString(record.transcript).trim();
    const written = spoken ? "" : getRawString(record.text).trim();
    const piece = spoken || written;
    if (piece) pieces.push(piece);
  }
  return pieces.join("\n").trim();
}
384
+
385
/**
 * Returns the text of a conversation item when it is a message authored by
 * the assistant. Items with no role are treated as assistant output.
 *
 * @returns The extracted text, or "" for non-message or non-assistant items.
 */
function extractOpenAIAssistantItemText(item: Record<string, unknown>) {
  const isMessage = getString(item.type, "") === "message";
  const role = getString(item.role, "");
  const isAssistant = role === "" || role === "assistant";
  if (!isMessage || !isAssistant) {
    return "";
  }
  return extractOpenAIContentText(item.content);
}
391
+
392
/**
 * Concatenates the assistant message text found in a response's `output`
 * items, one item per line.
 *
 * @returns The combined text, or "" when `output` is not an array.
 */
function extractOpenAIResponseText(response: Record<string, unknown>) {
  const { output } = response;
  if (!Array.isArray(output)) return "";

  const texts = output
    .map((item) => extractOpenAIAssistantItemText(getRecord(item)))
    .filter((text) => text !== "");
  return texts.join("\n").trim();
}
403
+
404
/**
 * Re-shapes a Telnyx media-stream message into the field layout the bridge
 * uses for carrier messages (`streamSid`, `start`, `media`, `mark`, `dtmf`,
 * `stop`).
 *
 * Telnyx's `stream_id` is used as the stream sid and `call_control_id` as
 * the call sid. There is no account sid, so it is always `null`. Missing or
 * non-string fields become empty strings.
 *
 * @param raw Parsed JSON message received from Telnyx.
 */
function normalizeTelnyxMediaStreamMessage(raw: Record<string, unknown>) {
  // Read every field up front with the same coercion rules.
  const text = (value: unknown) => getString(value);

  const event = text(raw.event);
  const streamId = text(raw.stream_id);
  const startSection = getRecord(raw.start);
  const mediaSection = getRecord(raw.media);
  const stopSection = getRecord(raw.stop);
  const dtmfSection = getRecord(raw.dtmf);

  const start = {
    streamSid: streamId,
    accountSid: null,
    callSid: text(startSection.call_control_id),
    tracks: [],
    mediaFormat: getRecord(startSection.media_format),
    customParameters: {
      callSessionId: text(startSection.call_session_id),
      clientState: text(startSection.client_state),
    },
  };
  const media = {
    track: text(mediaSection.track),
    chunk: text(mediaSection.chunk),
    timestamp: text(mediaSection.timestamp),
    payload: text(mediaSection.payload),
  };
  const stop = {
    accountSid: null,
    callSid: text(stopSection.call_control_id),
  };

  return {
    event,
    protocol: text(raw.protocol),
    version: text(raw.version),
    streamSid: streamId,
    start,
    media,
    mark: getRecord(raw.mark),
    dtmf: {
      digit: text(dtmfSection.digit),
    },
    stop,
  };
}
443
+
444
/**
 * Parses a raw carrier websocket message and normalizes it according to the
 * media profile's provider: Telnyx messages go through the local
 * normalizer, everything else through the Twilio parser.
 *
 * @param profile Media profile identifying the carrier.
 * @param raw JSON text received from the carrier.
 * @throws SyntaxError when `raw` is not valid JSON.
 */
function parseCarrierMediaStreamMessage(profile: PhoneMediaProfile, raw: string) {
  const message = JSON.parse(raw) as Record<string, unknown>;
  if (profile.provider === "telnyx") {
    return normalizeTelnyxMediaStreamMessage(message);
  }
  return parseTwilioMediaStreamMessage(message);
}
450
+
451
+ export async function runOpenAITwilioRealtimeBridge(input: OpenAITwilioRealtimeBridgeInput) {
452
+ const provider = input.provider ?? "openai-realtime";
453
+ const providerLogPrefix = provider === "xai-realtime" ? "xai" : "openai";
454
+ const bridgeMode = input.bridgeMode ?? (provider === "xai-realtime" ? "xai-twilio-realtime" : "openai-twilio-realtime-ga");
455
+ const supportsConversationItemTruncate = input.supportsConversationItemTruncate ?? provider === "openai-realtime";
456
+ const model = getString(input.snapshot.runtime?.model, input.defaultModel ?? "gpt-realtime");
457
+ const realtimeUrl = input.realtimeUrl ?? `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(model)}`;
458
+ const mediaProfile = input.mediaProfile ?? getPhoneMediaProfile("twilio-pcmu-8k");
459
+ const audioRoute = input.audioRoute ?? createPhoneAudioRoute(mediaProfile, provider);
460
+ const tools = createOpenAITwilioRealtimeTools(input.snapshot);
461
+ const humanization = normalizeCloudVoiceHumanizationConfig(getRecord(input.snapshot.config).humanization);
462
+ const openAI = new NodeWebSocket(realtimeUrl, {
463
+ headers: {
464
+ Authorization: `Bearer ${input.apiKey}`,
465
+ },
466
+ });
467
+
468
+ let twilioStarted = false;
469
+ let providerConnected = false;
470
+ let phoneAudioReady = false;
471
+ let greeted = false;
472
+ let streamSid: string | null = null;
473
+ let latestMediaTimestamp = 0;
474
+ let responseStartTimestampTwilio: number | null = null;
475
+ let lastAssistantItemId: string | null = null;
476
+ let lastAssistantItemAudioMs = 0;
477
+ let responseOutboundFramesAtStart = 0;
478
+ let currentAssistantTranscript = "";
479
+ let currentAssistantFinalTranscript = "";
480
+ let missingStreamSidAudioWarningEmitted = false;
481
+ let preReadyAudioHadSpeech = false;
482
+ let latestInboundSignal = { bytes: 0, samples: 0, peakAbs: 0, rms: 0 };
483
+ let ignoredCarrierMediaFrames = 0;
484
+ const carrierMediaTracks: Record<string, number> = {};
485
+ const preReadyAudioFrames: string[] = [];
486
+ const markQueue: string[] = [];
487
+ const playoutTracker = new PhonePlayoutTracker();
488
+ const unregisterPlayoutIdleWaiter = input.registerPlayoutIdleWaiter?.((waitInput) =>
489
+ playoutTracker.waitForIdle(waitInput),
490
+ );
491
+ const counters: OpenAITwilioRealtimeCounters = {
492
+ inboundMediaFrames: 0,
493
+ providerInputFrames: 0,
494
+ providerInputDroppedBeforeReady: 0,
495
+ outboundMediaFrames: 0,
496
+ outboundAudioDroppedMissingStreamSid: 0,
497
+ outboundAudioBytes: 0,
498
+ outboundAudioPeakAbs: 0,
499
+ outboundAudioLastRms: 0,
500
+ clears: 0,
501
+ truncates: 0,
502
+ marks: 0,
503
+ dtmf: 0,
504
+ interruptions: 0,
505
+ ignoredInterruptions: 0,
506
+ toolCalls: 0,
507
+ providerMessages: 0,
508
+ };
509
+
510
+ const bindStreamSid = (nextStreamSid: unknown, source: string) => {
511
+ const next = getString(nextStreamSid, "");
512
+ if (!next || next === streamSid) return;
513
+ streamSid = next;
514
+ input.updateCallLegActive({ streamSid });
515
+ input.log("twilio.stream_sid.bound", {
516
+ projectId: input.projectId,
517
+ sessionId: input.sessionId,
518
+ streamSid,
519
+ source,
520
+ });
521
+ };
522
+
523
+ const maybeGreet = () => {
524
+ if (greeted || !twilioStarted || !providerConnected || !phoneAudioReady) return;
525
+ greeted = true;
526
+ if (preReadyAudioHadSpeech) {
527
+ input.log(`${providerLogPrefix}.initial_prompt.skipped`, {
528
+ projectId: input.projectId,
529
+ sessionId: input.sessionId,
530
+ providerCallId: input.providerCallId ?? null,
531
+ reason: "caller_audio_before_ready",
532
+ });
533
+ return;
534
+ }
535
+ if (input.initialPrompt === null) {
536
+ input.log(`${providerLogPrefix}.initial_prompt.skipped`, {
537
+ projectId: input.projectId,
538
+ sessionId: input.sessionId,
539
+ providerCallId: input.providerCallId ?? null,
540
+ reason: "openingMode.wait",
541
+ });
542
+ return;
543
+ }
544
+ input.log(`${providerLogPrefix}.initial_prompt.sent`, {
545
+ projectId: input.projectId,
546
+ sessionId: input.sessionId,
547
+ providerCallId: input.providerCallId ?? null,
548
+ method: "conversation.item.create + response.create",
549
+ });
550
+ sendOpenAIUserTextTurn(
551
+ openAI,
552
+ input.initialPrompt ?? "Start the phone call now. Greet the caller briefly and ask how you can help.",
553
+ );
554
+ };
555
+
556
+ const appendOpenAIAudio = (audio: string) => {
557
+ sendOpenAIEvent(openAI, {
558
+ type: "input_audio_buffer.append",
559
+ audio: toAiInputAudio(audioRoute, audio),
560
+ });
561
+ counters.providerInputFrames += 1;
562
+ };
563
+
564
+ const flushPreReadyAudio = () => {
565
+ if (!phoneAudioReady || openAI.readyState !== NodeWebSocket.OPEN || preReadyAudioFrames.length === 0) return;
566
+ const frameCount = preReadyAudioFrames.length;
567
+ for (const audio of preReadyAudioFrames.splice(0)) {
568
+ appendOpenAIAudio(audio);
569
+ }
570
+ input.log(`${providerLogPrefix}.audio.input.pre_ready_flushed`, {
571
+ projectId: input.projectId,
572
+ sessionId: input.sessionId,
573
+ providerCallId: input.providerCallId ?? null,
574
+ frameCount,
575
+ hadSpeech: preReadyAudioHadSpeech,
576
+ });
577
+ };
578
+
579
+ const bufferPreReadyAudio = (audio: string) => {
580
+ if (preReadyAudioFrames.length >= MAX_PRE_READY_AUDIO_FRAMES) {
581
+ preReadyAudioFrames.shift();
582
+ counters.providerInputDroppedBeforeReady += 1;
583
+ }
584
+ preReadyAudioFrames.push(audio);
585
+ preReadyAudioHadSpeech ||= isLikelySpeechPayload(mediaProfile, audio, {
586
+ rms: LIKELY_SPEECH_RMS,
587
+ peakAbs: LIKELY_SPEECH_PEAK_ABS,
588
+ });
589
+ };
590
+
591
+ const dropOutboundAudioForMissingStreamSid = (eventType: string, delta: string) => {
592
+ counters.outboundAudioDroppedMissingStreamSid += 1;
593
+ const output = fromAiOutputAudio(audioRoute, delta);
594
+ const signal = output.signal;
595
+ const payload = {
596
+ provider,
597
+ reason: "missing_twilio_stream_sid",
598
+ eventType,
599
+ droppedFrames: counters.outboundAudioDroppedMissingStreamSid,
600
+ droppedAudioBytes: signal.bytes,
601
+ inboundMediaFrames: counters.inboundMediaFrames,
602
+ providerInputFrames: counters.providerInputFrames,
603
+ providerMessages: counters.providerMessages,
604
+ };
605
+
606
+ input.log(`${providerLogPrefix}.audio.output.dropped`, {
607
+ projectId: input.projectId,
608
+ sessionId: input.sessionId,
609
+ ...payload,
610
+ });
611
+
612
+ if (!missingStreamSidAudioWarningEmitted) {
613
+ missingStreamSidAudioWarningEmitted = true;
614
+ input.appendEvent("voice.provider.warning", {
615
+ ...payload,
616
+ message: `${provider === "xai-realtime" ? "xAI" : "OpenAI"} returned assistant audio, but the bridge has no Twilio streamSid to send it to.`,
617
+ }, "Assistant audio could not be sent to Twilio");
618
+ }
619
+
620
+ if (counters.outboundAudioDroppedMissingStreamSid === 1 || counters.outboundAudioDroppedMissingStreamSid % 50 === 0) {
621
+ input.appendEvent("voice.call.audio.dropped", payload, "Assistant audio dropped before Twilio stream was bound");
622
+ }
623
+ };
624
+
625
+ const ignoreAssistantInterruption = (reason: string, details: Record<string, unknown> = {}) => {
626
+ counters.ignoredInterruptions += 1;
627
+ input.log(`${providerLogPrefix}.audio.interruption_ignored`, {
628
+ projectId: input.projectId,
629
+ sessionId: input.sessionId,
630
+ reason,
631
+ counters,
632
+ ...details,
633
+ });
634
+ if (counters.ignoredInterruptions === 1 || counters.ignoredInterruptions % 10 === 0) {
635
+ input.appendEvent("voice.assistant.interruption_ignored", {
636
+ provider,
637
+ reason,
638
+ counters,
639
+ ...details,
640
+ });
641
+ }
642
+ };
643
+
644
+ const interruptAssistantAudio = () => {
645
+ if (!streamSid || markQueue.length === 0 || responseStartTimestampTwilio === null) return;
646
+
647
+ const elapsedMs = Math.max(0, latestMediaTimestamp - responseStartTimestampTwilio);
648
+ if (elapsedMs < MIN_INTERRUPT_ELAPSED_MS || lastAssistantItemAudioMs < MIN_INTERRUPT_ASSISTANT_AUDIO_MS) {
649
+ ignoreAssistantInterruption("assistant_audio_too_short", {
650
+ elapsedMs,
651
+ assistantAudioMs: lastAssistantItemAudioMs,
652
+ pendingMarks: markQueue.length,
653
+ });
654
+ return;
655
+ }
656
+ if (
657
+ latestInboundSignal.samples > 0
658
+ && latestInboundSignal.rms < MIN_INTERRUPT_SPEECH_RMS
659
+ && latestInboundSignal.peakAbs < MIN_INTERRUPT_SPEECH_PEAK_ABS
660
+ ) {
661
+ ignoreAssistantInterruption("weak_recent_input_signal", {
662
+ elapsedMs,
663
+ assistantAudioMs: lastAssistantItemAudioMs,
664
+ latestInboundSignal,
665
+ pendingMarks: markQueue.length,
666
+ });
667
+ return;
668
+ }
669
+ const truncateMs = lastAssistantItemAudioMs > 0
670
+ ? Math.min(elapsedMs, Math.max(0, lastAssistantItemAudioMs - 20))
671
+ : elapsedMs;
672
+ counters.interruptions += 1;
673
+ const playoutInterruption = playoutTracker.interrupt({
674
+ elapsedMs: truncateMs,
675
+ providerTruncationSupported: supportsConversationItemTruncate,
676
+ });
677
+ if (supportsConversationItemTruncate && lastAssistantItemId && truncateMs >= 0) {
678
+ counters.truncates += 1;
679
+ sendOpenAIEvent(openAI, {
680
+ type: "conversation.item.truncate",
681
+ item_id: lastAssistantItemId,
682
+ content_index: 0,
683
+ audio_end_ms: truncateMs,
684
+ });
685
+ input.appendEvent("voice.call.audio.truncate_sent", {
686
+ provider,
687
+ itemId: lastAssistantItemId,
688
+ audioEndMs: truncateMs,
689
+ elapsedMs,
690
+ ...playoutInterruption,
691
+ counters,
692
+ });
693
+ }
694
+ counters.clears += 1;
695
+ const clearMessage = createCarrierClearMessage(mediaProfile, streamSid);
696
+ if (clearMessage) sendJson(input.socket, clearMessage);
697
+ input.appendEvent("voice.call.audio.clear_sent", {
698
+ provider: mediaProfile.provider,
699
+ streamSid,
700
+ elapsedMs,
701
+ ...playoutInterruption,
702
+ counters,
703
+ });
704
+ if (!supportsConversationItemTruncate) {
705
+ sendOpenAIEvent(openAI, { type: "response.cancel" });
706
+ input.appendEvent("voice.call.response_cancel_sent", {
707
+ provider,
708
+ elapsedMs,
709
+ counters,
710
+ });
711
+ }
712
+ input.log(`${providerLogPrefix}.audio.interrupted`, {
713
+ projectId: input.projectId,
714
+ sessionId: input.sessionId,
715
+ streamSid,
716
+ elapsedMs,
717
+ truncateMs,
718
+ assistantAudioMs: lastAssistantItemAudioMs,
719
+ pendingMarks: markQueue.length,
720
+ });
721
+ input.appendEvent("voice.assistant.interrupted", {
722
+ provider,
723
+ elapsedMs,
724
+ ...playoutInterruption,
725
+ });
726
+ markQueue.length = 0;
727
+ responseStartTimestampTwilio = null;
728
+ lastAssistantItemId = null;
729
+ lastAssistantItemAudioMs = 0;
730
+ };
731
+
732
+ const handleToolCall = async (item: Record<string, unknown>) => {
733
+ const toolId = getString(item.name, "");
734
+ const toolCallId = getString(item.call_id, getString(item.id, randomUUID()));
735
+ if (!toolId) return;
736
+
737
+ counters.toolCalls += 1;
738
+ input.log(`${providerLogPrefix}.tool.execute.start`, {
739
+ projectId: input.projectId,
740
+ sessionId: input.sessionId,
741
+ providerCallId: input.providerCallId ?? null,
742
+ toolId,
743
+ toolCallId,
744
+ toolLatencyFillerMs: humanization.toolLatencyFillerMs,
745
+ });
746
+ try {
747
+ const result = await input.executeTool({
748
+ toolId,
749
+ args: normalizeToolInput(getString(item.arguments, "{}")),
750
+ toolCallId,
751
+ providerCallId: input.providerCallId ?? null,
752
+ });
753
+ sendOpenAIEvent(openAI, {
754
+ type: "conversation.item.create",
755
+ item: {
756
+ type: "function_call_output",
757
+ call_id: toolCallId,
758
+ output: serializeToolResult(result.result),
759
+ },
760
+ });
761
+ sendOpenAIEvent(openAI, { type: "response.create" });
762
+ input.log(`${providerLogPrefix}.tool.execute.completed`, {
763
+ projectId: input.projectId,
764
+ sessionId: input.sessionId,
765
+ providerCallId: input.providerCallId ?? null,
766
+ toolId,
767
+ toolCallId,
768
+ });
769
+ } catch (error) {
770
+ input.error(`${providerLogPrefix}.tool.execute.failed`, error, {
771
+ projectId: input.projectId,
772
+ sessionId: input.sessionId,
773
+ providerCallId: input.providerCallId ?? null,
774
+ toolId,
775
+ toolCallId,
776
+ });
777
+ sendOpenAIEvent(openAI, {
778
+ type: "conversation.item.create",
779
+ item: {
780
+ type: "function_call_output",
781
+ call_id: toolCallId,
782
+ output: serializeToolResult({ error: error instanceof Error ? error.message : String(error) }),
783
+ },
784
+ });
785
+ sendOpenAIEvent(openAI, { type: "response.create" });
786
+ }
787
+ };
788
+
789
/**
 * Handles one raw message from the realtime provider socket.
 *
 * The payload is parsed as JSON and dispatched on its `type` field:
 * - `error`: logged and recorded as `voice.provider.error`.
 * - `session.updated`: recomputes `phoneAudioReady`, flushes pre-ready audio and may greet.
 * - `input_audio_buffer.speech_started`: interrupts assistant audio.
 * - `conversation.item.input_audio_transcription.completed`: records the caller transcript.
 * - `response.created`: resets per-response state and records `voice.assistant.started`.
 * - `response.output_audio.delta` / `response.audio.delta`: forwards audio to the carrier socket.
 * - transcript delta / done events: accumulate the assistant transcript.
 * - `response.output_item.done`: captures assistant text and starts completed tool calls.
 * - `response.done`: records `voice.assistant.stopped` and signals turn completion.
 *
 * Event types not listed above are counted in `counters.providerMessages` and otherwise ignored.
 *
 * @param rawData Raw socket payload; converted with `toString()` before parsing.
 */
const handleOpenAIMessage = (rawData: RawData) => {
  let event: Record<string, unknown>;
  try {
    const parsed: unknown = JSON.parse(rawData.toString());
    // JSON.parse also accepts `null`, primitives and arrays. Reading `.type` from
    // `null` would throw outside this try block and escape the socket listener,
    // so anything that is not a plain JSON object is treated as a parse failure.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      throw new TypeError("Realtime provider message is not a JSON object");
    }
    event = parsed as Record<string, unknown>;
  } catch (error) {
    input.error(`${providerLogPrefix}.message.parse.failed`, error, { projectId: input.projectId, sessionId: input.sessionId });
    return;
  }

  counters.providerMessages += 1;
  const eventType = getString(event.type, "");
  if (eventType === "error") {
    input.error(`${providerLogPrefix}.session.error`, event, { projectId: input.projectId, sessionId: input.sessionId });
    input.appendEvent("voice.provider.error", { provider, error: event });
    return;
  }
  if (eventType === "session.updated") {
    phoneAudioReady = isRealtimeAudioReadyForRoute(event, audioRoute);
    const audio = getRecord(getRecord(event.session).audio);
    input.log(`${providerLogPrefix}.session.updated`, {
      projectId: input.projectId,
      sessionId: input.sessionId,
      inputFormat: summarizeOpenAIAudioFormat(getRecord(audio.input).format),
      outputFormat: summarizeOpenAIAudioFormat(getRecord(audio.output).format),
      phoneAudioReady,
      mediaProfile: mediaProfile.id,
    });
    if (!phoneAudioReady) {
      input.appendEvent("voice.provider.warning", {
        provider,
        reason: `${provider === "xai-realtime" ? "xAI" : "OpenAI"} session did not confirm ${audioRoute.aiInput.format} phone audio format`,
        inputFormat: summarizeOpenAIAudioFormat(getRecord(audio.input).format),
        outputFormat: summarizeOpenAIAudioFormat(getRecord(audio.output).format),
        mediaProfile: mediaProfile.id,
      });
    }
    // Flush and greet run regardless of whether the format was confirmed.
    flushPreReadyAudio();
    maybeGreet();
    return;
  }
  if (eventType === "input_audio_buffer.speech_started") {
    interruptAssistantAudio();
    return;
  }
  if (eventType === "conversation.item.input_audio_transcription.completed") {
    const transcript = getString(event.transcript, "");
    if (transcript) {
      input.appendEvent("voice.user.transcribed", {
        provider,
        text: transcript,
        usage: event.usage ?? undefined,
      }, transcript);
    }
    return;
  }
  if (eventType === "response.created") {
    const response = getRecord(event.response);
    // Reset per-response tracking; the frame counter snapshot lets `response.done`
    // detect a response that produced no outbound audio frames.
    responseStartTimestampTwilio = null;
    lastAssistantItemId = null;
    lastAssistantItemAudioMs = 0;
    responseOutboundFramesAtStart = counters.outboundMediaFrames;
    currentAssistantTranscript = "";
    currentAssistantFinalTranscript = "";
    input.log(`${providerLogPrefix}.response.created`, {
      projectId: input.projectId,
      sessionId: input.sessionId,
      responseId: getString(response.id, ""),
    });
    input.appendEvent("voice.assistant.started", {
      provider,
      responseId: getString(response.id, ""),
    });
    return;
  }
  if (eventType === "response.output_audio.delta" || eventType === "response.audio.delta") {
    const delta = getString(event.delta, "");
    if (!delta) return;
    // A missing stream id blocks outbound audio for every carrier except telnyx.
    if (!streamSid && mediaProfile.provider !== "telnyx") {
      dropOutboundAudioForMissingStreamSid(eventType, delta);
      return;
    }
    if (input.isAiPaused?.()) {
      input.log(`${providerLogPrefix}.audio.output.paused_for_handoff`, {
        projectId: input.projectId,
        sessionId: input.sessionId,
        providerCallId: input.providerCallId ?? null,
        outboundMediaFrames: counters.outboundMediaFrames,
      });
      return;
    }

    const carrierOutput = fromAiOutputAudio(audioRoute, delta);
    const mediaMessage = createCarrierMediaMessage(mediaProfile, streamSid, carrierOutput.payload);
    const sent = mediaMessage ? sendJson(input.socket, mediaMessage) : false;
    // Counters and playout tracking only advance for frames that were actually sent.
    if (!sent) return;

    const itemId = getString(event.item_id, lastAssistantItemId ?? "");
    if (itemId && itemId !== lastAssistantItemId) {
      responseStartTimestampTwilio = latestMediaTimestamp;
      lastAssistantItemAudioMs = 0;
      playoutTracker.startAssistantItem(itemId || null);
    }
    counters.outboundMediaFrames += 1;
    const signal = carrierOutput.signal;
    counters.outboundAudioBytes += signal.bytes;
    counters.outboundAudioPeakAbs = Math.max(counters.outboundAudioPeakAbs, signal.peakAbs);
    counters.outboundAudioLastRms = signal.rms;
    lastAssistantItemAudioMs += carrierOutput.durationMs;
    if (responseStartTimestampTwilio === null) responseStartTimestampTwilio = latestMediaTimestamp;
    lastAssistantItemId = itemId;

    // Each sent frame is followed by a mark named after the running frame count.
    const markName = `${providerLogPrefix}:${counters.outboundMediaFrames}`;
    const markMessage = createCarrierMarkMessage(mediaProfile, streamSid, markName);
    if (markMessage) sendJson(input.socket, markMessage);
    markQueue.push(markName);
    playoutTracker.recordOutboundFrame({ markName, itemId: itemId || null, durationMs: carrierOutput.durationMs });
    input.onAudioFrame?.({
      source: "assistant",
      payload: carrierOutput.payload,
      metadata: { provider, frame: counters.outboundMediaFrames },
    });

    // Summary is sampled: first frame, then every 50th.
    if (counters.outboundMediaFrames === 1 || counters.outboundMediaFrames % 50 === 0) {
      input.log(`${providerLogPrefix}.audio.output.summary`, {
        projectId: input.projectId,
        sessionId: input.sessionId,
        streamSid,
        outboundMediaFrames: counters.outboundMediaFrames,
        outboundAudioBytes: counters.outboundAudioBytes,
        lastSignal: signal,
      });
    }
    return;
  }
  if (eventType === "response.output_audio_transcript.delta" || eventType === "response.audio_transcript.delta") {
    const delta = getRawString(event.delta);
    currentAssistantTranscript += delta;
    if (delta) {
      input.appendEvent("voice.assistant.transcript.delta", {
        provider,
        delta,
      }, delta);
    }
    return;
  }
  if (eventType === "response.output_audio_transcript.done" || eventType === "response.audio_transcript.done") {
    const transcript = getRawString(event.transcript).trim();
    if (transcript) currentAssistantFinalTranscript = transcript;
    return;
  }
  if (eventType === "response.output_audio.done" || eventType === "response.audio.done") {
    input.log(`${providerLogPrefix}.audio.output.done`, {
      projectId: input.projectId,
      sessionId: input.sessionId,
      responseId: getString(event.response_id, ""),
      outboundMediaFrames: counters.outboundMediaFrames,
      outboundAudioBytes: counters.outboundAudioBytes,
    });
    return;
  }
  if (eventType === "response.output_item.done") {
    const item = getRecord(event.item);
    const assistantText = extractOpenAIAssistantItemText(item);
    if (assistantText) currentAssistantFinalTranscript = assistantText;
    if (item.type === "function_call" && getString(item.status, "") === "completed") {
      // Fire-and-forget: the tool call is not awaited here.
      void handleToolCall(item);
    }
    return;
  }
  if (eventType === "response.done") {
    const response = getRecord(event.response);
    // Text preference: final transcript, then text extracted from the response,
    // then the accumulated transcript deltas.
    const text = currentAssistantFinalTranscript
      || extractOpenAIResponseText(response)
      || currentAssistantTranscript.trim();
    const status = getString(response.status, "");
    if (text && status !== "cancelled" && counters.outboundMediaFrames === responseOutboundFramesAtStart) {
      input.appendEvent("voice.provider.warning", {
        provider,
        reason: "assistant_text_without_outbound_audio",
        responseId: getString(response.id, ""),
        text,
        streamSid,
        counters,
      }, "Assistant response produced text but no outbound phone audio");
    }
    currentAssistantTranscript = "";
    currentAssistantFinalTranscript = "";
    input.log(`${providerLogPrefix}.response.done`, {
      projectId: input.projectId,
      sessionId: input.sessionId,
      responseId: getString(response.id, ""),
      status,
      statusDetails: response.status_details ?? null,
      outputTypes: summarizeOpenAIResponseOutputTypes(response),
      counters,
    });
    input.appendEvent("voice.assistant.stopped", {
      provider,
      responseId: getString(response.id, ""),
      status: getString(response.status, ""),
      statusDetails: response.status_details ?? undefined,
      text: text || undefined,
      usage: response.usage ?? undefined,
      counters,
    }, text || undefined);
    input.onAssistantTurnComplete?.();
  }
};
997
+
998
/**
 * Handles one raw message from the carrier media-stream socket.
 *
 * The payload is parsed with `parseCarrierMediaStreamMessage` and dispatched on
 * `parsed.event`:
 * - `connected`: logged only.
 * - `start`: binds the stream id, resets per-stream state, registers the carrier
 *   audio sender, records `voice.call.started` and may greet.
 * - `media`: forwards, buffers or drops caller audio and emits sampled summaries.
 * - `mark`: advances the mark queue and the playout tracker.
 * - `dtmf`: records `voice.call.dtmf`.
 * - `stop`: records `voice.call.ended`, clears the carrier audio sender and closes
 *   the provider socket.
 *
 * Any exception thrown while parsing or handling is logged and recorded as
 * `voice.phone.codec.validation_failed`; it is not rethrown.
 *
 * @param data Raw socket payload; non-string payloads are converted with `toString()`.
 */
const handleTwilioMessage = (data: RawData) => {
  try {
    const parsed = parseCarrierMediaStreamMessage(mediaProfile, typeof data === "string" ? data : data.toString());
    if (parsed.event === "connected") {
      input.log(`${mediaProfile.provider}.connected`, {
        projectId: input.projectId,
        sessionId: input.sessionId,
        protocol: parsed.protocol,
        version: parsed.version,
      });
    } else if (parsed.event === "start") {
      twilioStarted = true;
      bindStreamSid(parsed.start.streamSid ?? parsed.streamSid ?? null, `${mediaProfile.provider}.start`);
      // A new stream starts from clean per-stream state.
      responseStartTimestampTwilio = null;
      latestMediaTimestamp = 0;
      lastAssistantItemId = null;
      lastAssistantItemAudioMs = 0;
      preReadyAudioFrames.length = 0;
      preReadyAudioHadSpeech = false;
      markQueue.length = 0;
      input.log(`${mediaProfile.provider}.start`, {
        projectId: input.projectId,
        sessionId: input.sessionId,
        providerCallId: parsed.start.callSid ?? input.providerCallId ?? null,
        streamSid,
        mediaFormat: parsed.start.mediaFormat,
      });
      input.updateSessionActive();
      // Hand the caller a sender that writes carrier media messages to this socket.
      input.onCarrierAudioSender?.((payload: string) => {
        const mediaMessage = createCarrierMediaMessage(mediaProfile, streamSid, payload);
        return mediaMessage ? sendJson(input.socket, mediaMessage) : false;
      });
      input.appendEvent("voice.call.started", {
        provider: mediaProfile.provider,
        providerCallId: parsed.start.callSid ?? input.providerCallId ?? null,
        streamSid,
        mediaFormat: parsed.start.mediaFormat,
      }, `${mediaProfile.provider} media stream started`);
      maybeGreet();
    } else if (parsed.event === "media") {
      bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.media`);
      const mediaTrack = getString(parsed.media.track, "");
      const mediaTrackKey = mediaTrack || "unknown";
      carrierMediaTracks[mediaTrackKey] = (carrierMediaTracks[mediaTrackKey] ?? 0) + 1;
      // For telnyx, frames on a named track other than "inbound" are counted and skipped.
      if (mediaProfile.provider === "telnyx" && mediaTrack && mediaTrack !== "inbound") {
        ignoredCarrierMediaFrames += 1;
        if (ignoredCarrierMediaFrames === 1 || ignoredCarrierMediaFrames % 250 === 0) {
          input.log(`${mediaProfile.provider}.media.ignored`, {
            projectId: input.projectId,
            sessionId: input.sessionId,
            providerCallId: input.providerCallId ?? null,
            track: mediaTrack,
            ignoredCarrierMediaFrames,
            carrierMediaTracks,
          });
        }
        return;
      }
      counters.inboundMediaFrames += 1;
      latestInboundSignal = summarizeCarrierPayload(mediaProfile, parsed.media.payload);
      const l16EndianSignals = summarizeL16EndianCandidates(mediaProfile, parsed.media.payload);
      // Keep the previous timestamp when the new one is missing, unparsable or zero.
      latestMediaTimestamp = Number.parseInt(parsed.media.timestamp ?? "0", 10) || latestMediaTimestamp;
      input.onAudioFrame?.({
        source: "caller",
        payload: parsed.media.payload,
        metadata: {
          provider: mediaProfile.provider,
          frame: counters.inboundMediaFrames,
          ...(l16EndianSignals ? { l16ByteOrder: l16EndianSignals.configured } : {}),
        },
      });
      // Snapshot the drop counter so the first drop is detected as a transition.
      // Comparing the counter to 1 would stay true on every later frame while
      // the counter remains at 1 and flood the log.
      const droppedBeforeThisFrame = counters.providerInputDroppedBeforeReady;
      if (input.isAiPaused?.()) {
        counters.providerInputDroppedBeforeReady += 1;
      } else if (openAI.readyState === NodeWebSocket.OPEN && phoneAudioReady) {
        appendOpenAIAudio(parsed.media.payload);
      } else {
        bufferPreReadyAudio(parsed.media.payload);
      }
      const isFirstDroppedFrame = droppedBeforeThisFrame === 0 && counters.providerInputDroppedBeforeReady > 0;
      if (
        counters.inboundMediaFrames === 1
        || counters.inboundMediaFrames % 250 === 0
        || isFirstDroppedFrame
      ) {
        input.log(`${providerLogPrefix}.audio.input.summary`, {
          projectId: input.projectId,
          sessionId: input.sessionId,
          providerCallId: input.providerCallId ?? null,
          inboundMediaFrames: counters.inboundMediaFrames,
          providerInputFrames: counters.providerInputFrames,
          providerInputDroppedBeforeReady: counters.providerInputDroppedBeforeReady,
          providerInputBufferedBeforeReady: preReadyAudioFrames.length,
          preReadyAudioHadSpeech,
          latestInboundSignal,
          ...(l16EndianSignals ? { l16EndianSignals } : {}),
          carrierMediaTracks,
          ignoredCarrierMediaFrames,
          mediaTrack: mediaTrack || null,
          providerConnected,
          phoneAudioReady,
        });
      }
      // Carrier-side summary is sampled: first frame, then every 250th.
      if (counters.inboundMediaFrames === 1 || counters.inboundMediaFrames % 250 === 0) {
        input.log(`${mediaProfile.provider}.media.summary`, {
          projectId: input.projectId,
          sessionId: input.sessionId,
          inboundMediaFrames: counters.inboundMediaFrames,
          latestInboundSignal,
          ...(l16EndianSignals ? { l16EndianSignals } : {}),
          carrierMediaTracks,
          ignoredCarrierMediaFrames,
        });
        input.appendEvent("voice.call.audio.summary", {
          provider: mediaProfile.provider,
          mediaProfile: mediaProfile.id,
          inboundMediaFrames: counters.inboundMediaFrames,
        });
      }
    } else if (parsed.event === "mark") {
      bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.mark`);
      counters.marks += 1;
      const markName = getString(parsed.mark?.name, "");
      // The oldest queued mark is dropped without comparing names.
      if (markQueue.length > 0) markQueue.shift();
      playoutTracker.recordCarrierMark(markName);
    } else if (parsed.event === "dtmf") {
      bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.dtmf`);
      counters.dtmf += 1;
      input.appendEvent("voice.call.dtmf", {
        provider: mediaProfile.provider,
        digit: parsed.dtmf.digit,
      }, `DTMF ${parsed.dtmf.digit}`);
    } else if (parsed.event === "stop") {
      bindStreamSid(parsed.streamSid, `${mediaProfile.provider}.stop`);
      input.log(`${mediaProfile.provider}.stop`, {
        projectId: input.projectId,
        sessionId: input.sessionId,
        streamSid,
        counters,
      });
      input.appendEvent("voice.call.ended", {
        provider: mediaProfile.provider,
        providerCallId: parsed.stop.callSid ?? input.providerCallId ?? null,
        streamSid,
        counters,
      }, `${mediaProfile.provider} media stream stopped`);
      input.onCarrierAudioSender?.(null);
      // Best-effort shutdown: a failure to close the provider socket is ignored.
      try {
        openAI.close();
      } catch {}
    }
  } catch (error) {
    input.error(`${mediaProfile.provider}.message.parse.failed`, error, {
      projectId: input.projectId,
      sessionId: input.sessionId,
      mediaProfile: mediaProfile.id,
    });
    input.appendEvent("voice.phone.codec.validation_failed", {
      provider: mediaProfile.provider,
      mediaProfile: mediaProfile.id,
      error: error instanceof Error ? error.message : String(error),
    });
  }
};
1160
+
1161
// Route carrier frames and provider events into their handlers.
input.socket.on("message", handleTwilioMessage);
openAI.on("message", handleOpenAIMessage);

// Provider socket closed: drop the playout-idle waiter, then log and record the closure.
openAI.on("close", (code, reason) => {
  unregisterPlayoutIdleWaiter?.();
  const closure = { code, reason: reason.toString(), counters };
  input.log(`${providerLogPrefix}.closed`, {
    projectId: input.projectId,
    sessionId: input.sessionId,
    ...closure,
  });
  input.appendEvent("voice.provider.closed", { provider, ...closure });
});

// Provider socket errors are logged and recorded; this listener never throws them onward.
openAI.on("error", (error) => {
  const scope = { projectId: input.projectId, sessionId: input.sessionId };
  input.error(`${providerLogPrefix}.socket.error`, error, scope);
  input.appendEvent("voice.provider.error", { provider, error: serializeError(error) });
});

// Wait until the provider socket opens; an error emitted first rejects instead.
await new Promise<void>((resolve, reject) => {
  const onProviderOpen = () => {
    providerConnected = true;
    const sessionUpdate = createOpenAITwilioRealtimeSessionUpdate(
      input.snapshot,
      model,
      input.defaultVoice ?? "alloy",
      provider,
      mediaProfile,
      audioRoute,
    );
    sendOpenAIEvent(openAI, sessionUpdate);

    const toolCount = tools.length;
    const mediaProfileId = mediaProfile.id;
    input.log(`${providerLogPrefix}.connected`, {
      projectId: input.projectId,
      sessionId: input.sessionId,
      providerCallId: input.providerCallId ?? null,
      model,
      toolCount,
      tools: tools.map((tool) => ({ name: tool.name, description: tool.description })),
      bridgeMode,
      humanization,
      mediaProfile: mediaProfileId,
      waitingForPhoneAudioReady: !phoneAudioReady,
    });
    input.appendEvent("voice.provider.connected", {
      provider,
      model,
      toolCount,
      tools: tools.map((tool) => tool.name),
      bridgeMode,
      mediaProfile: mediaProfileId,
    });
    maybeGreet();
    resolve();
  };

  openAI.once("open", onProviderOpen);
  openAI.once("error", reject);
});

// Controller handed back to the caller of the bridge.
return {
  // Sends a user text turn; reports false when the provider socket is not open.
  sendTextTurn: (text: string) => {
    const providerIsOpen = openAI.readyState === NodeWebSocket.OPEN;
    if (providerIsOpen) {
      sendOpenAIUserTextTurn(openAI, text);
    }
    return providerIsOpen;
  },
  // Detaches the carrier listener, clears the audio sender and closes the provider socket.
  close: () => {
    input.socket.off("message", handleTwilioMessage);
    input.onCarrierAudioSender?.(null);
    unregisterPlayoutIdleWaiter?.();
    try {
      openAI.close();
    } catch {
      // Best-effort: a failing close is deliberately ignored.
    }
  },
};
1232
+ }
1233
+
1234
/** Bridge options that `runXAITwilioRealtimeBridge` fills in itself. */
type XAIRealtimePresetKey =
  | "provider"
  | "realtimeUrl"
  | "defaultModel"
  | "defaultVoice"
  | "bridgeMode"
  | "supportsConversationItemTruncate";

/**
 * Input accepted by `runXAITwilioRealtimeBridge`: the OpenAI bridge input
 * without the options that are fixed for the xAI provider.
 */
export type XAITwilioRealtimeBridgeInput = Omit<OpenAITwilioRealtimeBridgeInput, XAIRealtimePresetKey>;
1238
+
1239
/**
 * Runs the realtime phone bridge against the xAI realtime endpoint.
 *
 * Delegates to `runOpenAITwilioRealtimeBridge` with the xAI provider id, URL,
 * default model, default voice and bridge mode filled in, and with
 * conversation-item truncation marked as unsupported.
 *
 * @param input Bridge input without the xAI-specific preset options.
 * @returns Whatever `runOpenAITwilioRealtimeBridge` resolves to.
 */
export async function runXAITwilioRealtimeBridge(input: XAITwilioRealtimeBridgeInput) {
  const fallbackModel = "grok-voice-think-fast-1.0";
  // The snapshot's runtime model wins; the fallback applies when it is absent.
  const selectedModel = getString(input.snapshot.runtime?.model, fallbackModel);
  const realtimeUrl = `wss://api.x.ai/v1/realtime?model=${encodeURIComponent(selectedModel)}`;

  return runOpenAITwilioRealtimeBridge({
    ...input,
    provider: "xai-realtime",
    realtimeUrl,
    defaultModel: fallbackModel,
    defaultVoice: "eve",
    bridgeMode: "xai-twilio-realtime",
    supportsConversationItemTruncate: false,
  });
}