@kognitivedev/backend-cloud 0.2.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. package/.turbo/turbo-build.log +2 -0
  2. package/.turbo/turbo-test.log +14 -0
  3. package/CHANGELOG.md +11 -0
  4. package/README.md +88 -0
  5. package/dist/cloud-voice-parameters.d.ts +11 -0
  6. package/dist/cloud-voice-parameters.js +219 -0
  7. package/dist/cloud-voice-prompt-service.d.ts +24 -0
  8. package/dist/cloud-voice-prompt-service.js +382 -0
  9. package/dist/cloud-voice-runtime-service.d.ts +73 -0
  10. package/dist/cloud-voice-runtime-service.js +443 -0
  11. package/dist/cloud-voice.d.ts +36 -0
  12. package/dist/cloud-voice.js +683 -0
  13. package/dist/index.d.ts +10 -0
  14. package/dist/index.js +26 -0
  15. package/dist/phone-control.d.ts +50 -0
  16. package/dist/phone-control.js +97 -0
  17. package/dist/phone-runtime/audio-playout-tracker.d.ts +51 -0
  18. package/dist/phone-runtime/audio-playout-tracker.js +93 -0
  19. package/dist/phone-runtime/openai-twilio-realtime.d.ts +95 -0
  20. package/dist/phone-runtime/openai-twilio-realtime.js +1074 -0
  21. package/dist/tools.d.ts +2 -0
  22. package/dist/tools.js +216 -0
  23. package/dist/types.d.ts +468 -0
  24. package/dist/types.js +2 -0
  25. package/dist/utils.d.ts +3 -0
  26. package/dist/utils.js +14 -0
  27. package/package.json +47 -0
  28. package/src/__tests__/audio-playout-tracker.test.ts +46 -0
  29. package/src/__tests__/cloud-voice.test.ts +1006 -0
  30. package/src/__tests__/openai-twilio-realtime.test.ts +1193 -0
  31. package/src/__tests__/phone-control.test.ts +105 -0
  32. package/src/cloud-voice-parameters.ts +236 -0
  33. package/src/cloud-voice-prompt-service.ts +493 -0
  34. package/src/cloud-voice-runtime-service.ts +465 -0
  35. package/src/cloud-voice.ts +831 -0
  36. package/src/index.ts +10 -0
  37. package/src/phone-control.ts +156 -0
  38. package/src/phone-runtime/audio-playout-tracker.ts +132 -0
  39. package/src/phone-runtime/openai-twilio-realtime.ts +1250 -0
  40. package/src/tools.ts +227 -0
  41. package/src/types.ts +529 -0
  42. package/src/utils.ts +11 -0
  43. package/tsconfig.json +13 -0
@@ -0,0 +1,1193 @@
1
+ import { EventEmitter } from "events";
2
+ import { afterEach, describe, expect, it, vi } from "vitest";
3
+ import { encodePcm16ToTwilioMulawBase64 } from "@kognitivedev/telephony";
4
+ import {
5
+ base64ToPcm16Le,
6
+ getPhoneMediaProfile,
7
+ pcm16ToBase64Le,
8
+ } from "@kognitivedev/voice-media-bridge";
9
+
10
/**
 * In-memory stand-in for the carrier media-stream socket handed to the bridge
 * under test.
 *
 * Inbound traffic is simulated by calling `emit("message", ...)` on the
 * instance; outbound traffic is captured in `sent` so tests can assert on it.
 */
class MockTwilioSocket extends EventEmitter {
  /** Fixed at 1, the same value the `ws` mocks in this file use for `OPEN`. */
  readyState = 1;

  /** JSON-decoded payload of every frame passed to `send`, oldest first. */
  sent: unknown[] = [];

  /**
   * Records an outbound frame.
   *
   * @param data - JSON text of the frame; a `SyntaxError` from `JSON.parse`
   *   propagates to the caller and nothing is recorded.
   */
  send(data: string) {
    const frame: unknown = JSON.parse(data);
    this.sent.push(frame);
  }
}
18
+
19
/**
 * Polls `assertion` until it passes or the time budget is exhausted.
 *
 * An attempt passes when `assertion` returns without throwing and does not
 * return the literal `false`. A `false` result is converted into an error so
 * that plain boolean predicates and throwing `expect(...)` calls can be used
 * interchangeably.
 *
 * The first attempt runs synchronously, and the deadline is only checked
 * after a failed attempt, so `assertion` is always evaluated at least once.
 *
 * @param assertion - Check to retry; throwing or returning `false` means
 *   "not yet".
 * @param timeoutMs - Time budget in milliseconds, measured from the call.
 * @param intervalMs - Delay in milliseconds between attempts. Defaults to 10,
 *   the interval that was previously hard-coded.
 * @returns A promise that resolves on the first passing attempt and rejects
 *   with the most recent failure once more than `timeoutMs` has elapsed.
 */
function waitFor(
  assertion: () => void | boolean,
  timeoutMs = 1500,
  intervalMs = 10,
): Promise<void> {
  const startedAt = Date.now();
  return new Promise<void>((resolve, reject) => {
    const attempt = (): void => {
      try {
        if (assertion() === false) throw new Error("condition returned false");
      } catch (error) {
        if (Date.now() - startedAt > timeoutMs) {
          // Surface the last failure so the test report shows why we gave up.
          reject(error);
          return;
        }
        setTimeout(attempt, intervalMs);
        return;
      }
      resolve();
    };
    attempt();
  });
}
38
+
39
+ describe("OpenAI Twilio Realtime bridge", () => {
40
+ afterEach(() => {
41
+ vi.doUnmock("ws");
42
+ vi.resetModules();
43
+ });
44
+
45
+ it("uses provider-safe phone VAD defaults when an agent has no explicit turn detection", async () => {
46
+ const { createOpenAITwilioRealtimeSessionUpdate } = await import("../phone-runtime/openai-twilio-realtime");
47
+
48
+ expect(createOpenAITwilioRealtimeSessionUpdate({
49
+ runtime: { voice: "marin" },
50
+ voiceConfig: { system: "Answer briefly.", voice: "marin" },
51
+ toolManifest: [],
52
+ }, "gpt-realtime")).toMatchObject({
53
+ session: {
54
+ audio: {
55
+ input: {
56
+ turn_detection: {
57
+ type: "server_vad",
58
+ create_response: true,
59
+ interrupt_response: true,
60
+ },
61
+ },
62
+ },
63
+ },
64
+ });
65
+ });
66
+
67
+ it("strips phone VAD tuning fields rejected by OpenAI phone sessions", async () => {
68
+ const { createOpenAITwilioRealtimeSessionUpdate } = await import("../phone-runtime/openai-twilio-realtime");
69
+
70
+ const update = createOpenAITwilioRealtimeSessionUpdate({
71
+ runtime: { voice: "marin" },
72
+ voiceConfig: {
73
+ system: "Answer briefly.",
74
+ voice: "marin",
75
+ turnDetection: {
76
+ type: "server_vad",
77
+ createResponse: true,
78
+ interruptResponse: true,
79
+ prefixPaddingMs: 300,
80
+ silenceDurationMs: 650,
81
+ threshold: 0.6,
82
+ },
83
+ },
84
+ toolManifest: [],
85
+ }, "gpt-realtime");
86
+
87
+ expect((update.session as any).audio.input.turn_detection).toEqual({
88
+ type: "server_vad",
89
+ create_response: true,
90
+ interrupt_response: true,
91
+ });
92
+ });
93
+
94
+ it("sets low reasoning by default for gpt-realtime-2 phone sessions", async () => {
95
+ const { createOpenAITwilioRealtimeSessionUpdate } = await import("../phone-runtime/openai-twilio-realtime");
96
+
97
+ expect(createOpenAITwilioRealtimeSessionUpdate({
98
+ runtime: { voice: "marin" },
99
+ voiceConfig: { system: "Answer briefly.", voice: "marin" },
100
+ toolManifest: [],
101
+ }, "gpt-realtime-2")).toMatchObject({
102
+ session: {
103
+ model: "gpt-realtime-2",
104
+ reasoning: { effort: "low" },
105
+ },
106
+ });
107
+ });
108
+
109
+ it("honors explicit gpt-realtime-2 reasoning provider options", async () => {
110
+ const { createOpenAITwilioRealtimeSessionUpdate } = await import("../phone-runtime/openai-twilio-realtime");
111
+
112
+ expect(createOpenAITwilioRealtimeSessionUpdate({
113
+ runtime: {
114
+ voice: "marin",
115
+ providerOptions: { reasoning: { effort: "minimal" } },
116
+ },
117
+ voiceConfig: { system: "Answer briefly.", voice: "marin" },
118
+ toolManifest: [],
119
+ }, "gpt-realtime-2")).toMatchObject({
120
+ session: {
121
+ reasoning: { effort: "minimal" },
122
+ },
123
+ });
124
+ });
125
+
126
+ it("uses tuned xAI phone VAD defaults when UI semantic VAD is selected", async () => {
127
+ const { createOpenAITwilioRealtimeSessionUpdate } = await import("../phone-runtime/openai-twilio-realtime");
128
+
129
+ const update = createOpenAITwilioRealtimeSessionUpdate({
130
+ runtime: { voice: "eve" },
131
+ voiceConfig: {
132
+ system: "Answer briefly.",
133
+ voice: "eve",
134
+ turnDetection: {
135
+ type: "semantic_vad",
136
+ eagerness: "low",
137
+ },
138
+ },
139
+ toolManifest: [],
140
+ }, "grok-voice-think-fast-1.0", "eve", "xai-realtime");
141
+
142
+ expect((update.session as any).turn_detection).toEqual({
143
+ type: "server_vad",
144
+ threshold: 0.55,
145
+ prefix_padding_ms: 160,
146
+ silence_duration_ms: 450,
147
+ });
148
+ });
149
+
150
+ it("executes declared tools and returns function_call_output to OpenAI", async () => {
151
+ const instances = new Array<any>();
152
+ vi.doMock("ws", () => {
153
+ class MockWebSocket extends EventEmitter {
154
+ static OPEN = 1;
155
+ readyState = MockWebSocket.OPEN;
156
+ sent: unknown[] = [];
157
+
158
+ constructor(public url: string, public options: unknown) {
159
+ super();
160
+ instances.push(this);
161
+ queueMicrotask(() => this.emit("open"));
162
+ }
163
+
164
+ send(data: string) {
165
+ this.sent.push(JSON.parse(data));
166
+ }
167
+
168
+ close() {
169
+ this.readyState = 3;
170
+ this.emit("close", 1000, Buffer.from(""));
171
+ }
172
+ }
173
+
174
+ return { default: MockWebSocket };
175
+ });
176
+
177
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
178
+ const twilioSocket = new MockTwilioSocket();
179
+ const executeTool = vi.fn(async () => ({ result: { status: "shipped" } }));
180
+
181
+ const bridge = await runOpenAITwilioRealtimeBridge({
182
+ socket: twilioSocket as any,
183
+ apiKey: "test-key",
184
+ projectId: "project-1",
185
+ sessionId: "session-1",
186
+ providerCallId: "CA123",
187
+ snapshot: {
188
+ runtime: { model: "gpt-realtime", voice: "marin" },
189
+ config: { instructions: "Answer briefly." },
190
+ toolManifest: [{
191
+ type: "function",
192
+ name: "lookup_order",
193
+ description: "Lookup an order",
194
+ parameters: {
195
+ type: "object",
196
+ properties: { orderNumber: { type: "string" } },
197
+ required: ["orderNumber"],
198
+ },
199
+ }],
200
+ },
201
+ log: vi.fn(),
202
+ error: vi.fn(),
203
+ appendEvent: vi.fn(),
204
+ updateCallLegActive: vi.fn(),
205
+ updateSessionActive: vi.fn(),
206
+ executeTool,
207
+ });
208
+
209
+ const openAI = instances[0] as any;
210
+ expect(openAI.url).toContain("gpt-realtime");
211
+ expect(openAI.options).toEqual({
212
+ headers: {
213
+ Authorization: "Bearer test-key",
214
+ },
215
+ });
216
+ expect(openAI.sent[0]).toEqual(expect.objectContaining({
217
+ type: "session.update",
218
+ session: expect.objectContaining({
219
+ type: "realtime",
220
+ model: "gpt-realtime",
221
+ output_modalities: ["audio"],
222
+ }),
223
+ }));
224
+
225
+ twilioSocket.emit("message", JSON.stringify({
226
+ event: "start",
227
+ streamSid: "MZ123",
228
+ start: {
229
+ streamSid: "MZ123",
230
+ callSid: "CA123",
231
+ mediaFormat: { encoding: "audio/x-mulaw", sampleRate: 8000, channels: 1 },
232
+ },
233
+ }));
234
+ openAI.emit("message", JSON.stringify({
235
+ type: "session.updated",
236
+ session: {
237
+ audio: {
238
+ input: { format: { type: "audio/pcmu" } },
239
+ output: { format: { type: "audio/pcmu" } },
240
+ },
241
+ },
242
+ }));
243
+ openAI.emit("message", JSON.stringify({
244
+ type: "response.output_item.done",
245
+ item: {
246
+ type: "function_call",
247
+ status: "completed",
248
+ name: "lookup_order",
249
+ call_id: "call-1",
250
+ arguments: JSON.stringify({ orderNumber: "A100" }),
251
+ },
252
+ }));
253
+
254
+ await waitFor(() => expect(executeTool).toHaveBeenCalledTimes(1));
255
+ expect(executeTool).toHaveBeenCalledWith({
256
+ toolId: "lookup_order",
257
+ args: { orderNumber: "A100" },
258
+ toolCallId: "call-1",
259
+ providerCallId: "CA123",
260
+ });
261
+ expect(openAI.sent).toContainEqual({
262
+ type: "conversation.item.create",
263
+ item: {
264
+ type: "function_call_output",
265
+ call_id: "call-1",
266
+ output: JSON.stringify({ status: "shipped" }),
267
+ },
268
+ });
269
+ expect(openAI.sent).toContainEqual({ type: "response.create" });
270
+
271
+ bridge.close();
272
+ });
273
+
274
+ it("connects xAI phone sessions with PCMU audio and cancels responses on barge-in", async () => {
275
+ const instances = new Array<any>();
276
+ vi.doMock("ws", () => {
277
+ class MockWebSocket extends EventEmitter {
278
+ static OPEN = 1;
279
+ readyState = MockWebSocket.OPEN;
280
+ sent: unknown[] = [];
281
+
282
+ constructor(public url: string, public options: unknown) {
283
+ super();
284
+ instances.push(this);
285
+ queueMicrotask(() => this.emit("open"));
286
+ }
287
+
288
+ send(data: string) {
289
+ this.sent.push(JSON.parse(data));
290
+ }
291
+
292
+ close() {
293
+ this.readyState = 3;
294
+ this.emit("close", 1000, Buffer.from(""));
295
+ }
296
+ }
297
+
298
+ return { default: MockWebSocket };
299
+ });
300
+
301
+ const { runXAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
302
+ const twilioSocket = new MockTwilioSocket();
303
+ const appendEvent = vi.fn();
304
+
305
+ const bridge = await runXAITwilioRealtimeBridge({
306
+ socket: twilioSocket as any,
307
+ apiKey: "test-xai-key",
308
+ projectId: "project-1",
309
+ sessionId: "session-1",
310
+ providerCallId: "CA123",
311
+ snapshot: {
312
+ runtime: { model: "grok-voice-think-fast-1.0", voice: "abcd1234" },
313
+ voiceConfig: {
314
+ system: "Answer briefly.",
315
+ voice: "abcd1234",
316
+ turnDetection: {
317
+ type: "server_vad",
318
+ threshold: 0.7,
319
+ prefixPaddingMs: 333,
320
+ silenceDurationMs: 900,
321
+ },
322
+ },
323
+ },
324
+ initialPrompt: null,
325
+ log: vi.fn(),
326
+ error: vi.fn(),
327
+ appendEvent,
328
+ updateCallLegActive: vi.fn(),
329
+ updateSessionActive: vi.fn(),
330
+ executeTool: vi.fn(),
331
+ });
332
+
333
+ const xai = instances[0] as any;
334
+ expect(xai.url).toBe("wss://api.x.ai/v1/realtime?model=grok-voice-think-fast-1.0");
335
+ expect(xai.options).toEqual({
336
+ headers: {
337
+ Authorization: "Bearer test-xai-key",
338
+ },
339
+ });
340
+ expect(xai.sent[0]).toEqual(expect.objectContaining({
341
+ type: "session.update",
342
+ session: expect.objectContaining({
343
+ voice: "abcd1234",
344
+ turn_detection: {
345
+ type: "server_vad",
346
+ threshold: 0.7,
347
+ prefix_padding_ms: 333,
348
+ silence_duration_ms: 900,
349
+ },
350
+ audio: {
351
+ input: expect.objectContaining({ format: { type: "audio/pcmu" } }),
352
+ output: expect.objectContaining({ format: { type: "audio/pcmu" } }),
353
+ },
354
+ }),
355
+ }));
356
+ expect(xai.sent[0].session).not.toHaveProperty("model");
357
+ expect(xai.sent[0].session).not.toHaveProperty("output_modalities");
358
+ expect(xai.sent[0].session.audio.output).not.toHaveProperty("voice");
359
+
360
+ twilioSocket.emit("message", JSON.stringify({
361
+ event: "start",
362
+ streamSid: "MZ123",
363
+ start: {
364
+ streamSid: "MZ123",
365
+ callSid: "CA123",
366
+ mediaFormat: { encoding: "audio/x-mulaw", sampleRate: 8000, channels: 1 },
367
+ },
368
+ }));
369
+ xai.emit("message", JSON.stringify({
370
+ type: "session.updated",
371
+ session: {
372
+ audio: {
373
+ input: { format: { type: "audio/pcmu" } },
374
+ output: { format: { type: "audio/pcmu" } },
375
+ },
376
+ },
377
+ }));
378
+ xai.emit("message", JSON.stringify({ type: "response.created", response: { id: "resp-1" } }));
379
+ xai.emit("message", JSON.stringify({
380
+ type: "response.output_audio.delta",
381
+ item_id: "item-1",
382
+ delta: Buffer.alloc(8_000).toString("base64"),
383
+ }));
384
+ const speechFrame = new Int16Array(160);
385
+ speechFrame.fill(4000);
386
+ twilioSocket.emit("message", JSON.stringify({
387
+ event: "media",
388
+ streamSid: "MZ123",
389
+ media: {
390
+ track: "inbound",
391
+ chunk: "1",
392
+ timestamp: "5000",
393
+ payload: encodePcm16ToTwilioMulawBase64(speechFrame),
394
+ },
395
+ }));
396
+ xai.emit("message", JSON.stringify({ type: "input_audio_buffer.speech_started" }));
397
+
398
+ expect(xai.sent).toContainEqual({ type: "response.cancel" });
399
+ expect(xai.sent).not.toContainEqual(expect.objectContaining({ type: "conversation.item.truncate" }));
400
+ expect(appendEvent).toHaveBeenCalledWith("voice.call.response_cancel_sent", expect.objectContaining({
401
+ provider: "xai-realtime",
402
+ }));
403
+ expect(twilioSocket.sent).toContainEqual({
404
+ event: "clear",
405
+ streamSid: "MZ123",
406
+ });
407
+
408
+ bridge.close();
409
+ });
410
+
411
+ it("skips automatic greeting when initial prompt is null", async () => {
412
+ const instances = new Array<any>();
413
+ vi.doMock("ws", () => {
414
+ class MockWebSocket extends EventEmitter {
415
+ static OPEN = 1;
416
+ readyState = MockWebSocket.OPEN;
417
+ sent: unknown[] = [];
418
+
419
+ constructor(public url: string, public options: unknown) {
420
+ super();
421
+ instances.push(this);
422
+ queueMicrotask(() => this.emit("open"));
423
+ }
424
+
425
+ send(data: string) {
426
+ this.sent.push(JSON.parse(data));
427
+ }
428
+
429
+ close() {
430
+ this.readyState = 3;
431
+ this.emit("close", 1000, Buffer.from(""));
432
+ }
433
+ }
434
+
435
+ return { default: MockWebSocket };
436
+ });
437
+
438
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
439
+ const twilioSocket = new MockTwilioSocket();
440
+ const log = vi.fn();
441
+
442
+ const bridge = await runOpenAITwilioRealtimeBridge({
443
+ socket: twilioSocket as any,
444
+ apiKey: "test-key",
445
+ projectId: "project-1",
446
+ sessionId: "session-1",
447
+ providerCallId: "CA123",
448
+ snapshot: {
449
+ runtime: { model: "gpt-realtime", voice: "marin" },
450
+ voiceConfig: { system: "Humanized instructions." },
451
+ config: { instructions: "Answer briefly.", humanization: { openingMode: "wait" } },
452
+ },
453
+ initialPrompt: null,
454
+ log,
455
+ error: vi.fn(),
456
+ appendEvent: vi.fn(),
457
+ updateCallLegActive: vi.fn(),
458
+ updateSessionActive: vi.fn(),
459
+ executeTool: vi.fn(),
460
+ });
461
+
462
+ const openAI = instances[0] as any;
463
+ twilioSocket.emit("message", JSON.stringify({
464
+ event: "start",
465
+ streamSid: "MZ123",
466
+ start: {
467
+ streamSid: "MZ123",
468
+ callSid: "CA123",
469
+ mediaFormat: { encoding: "audio/x-mulaw", sampleRate: 8000, channels: 1 },
470
+ },
471
+ }));
472
+ openAI.emit("message", JSON.stringify({
473
+ type: "session.updated",
474
+ session: {
475
+ audio: {
476
+ input: { format: { type: "audio/pcmu" } },
477
+ output: { format: { type: "audio/pcmu" } },
478
+ },
479
+ },
480
+ }));
481
+
482
+ expect(openAI.sent).not.toContainEqual(expect.objectContaining({ type: "conversation.item.create" }));
483
+ expect(log).toHaveBeenCalledWith("openai.initial_prompt.skipped", expect.objectContaining({ reason: "openingMode.wait" }));
484
+
485
+ bridge.close();
486
+ });
487
+
488
+ it("preserves transcript delta whitespace in assistant stopped events", async () => {
489
+ const instances = new Array<any>();
490
+ vi.doMock("ws", () => {
491
+ class MockWebSocket extends EventEmitter {
492
+ static OPEN = 1;
493
+ readyState = MockWebSocket.OPEN;
494
+ sent: unknown[] = [];
495
+
496
+ constructor(public url: string, public options: unknown) {
497
+ super();
498
+ instances.push(this);
499
+ queueMicrotask(() => this.emit("open"));
500
+ }
501
+
502
+ send(data: string) {
503
+ this.sent.push(JSON.parse(data));
504
+ }
505
+
506
+ close() {
507
+ this.readyState = 3;
508
+ this.emit("close", 1000, Buffer.from(""));
509
+ }
510
+ }
511
+
512
+ return { default: MockWebSocket };
513
+ });
514
+
515
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
516
+ const appendEvent = vi.fn();
517
+
518
+ const bridge = await runOpenAITwilioRealtimeBridge({
519
+ socket: new MockTwilioSocket() as any,
520
+ apiKey: "test-key",
521
+ projectId: "project-1",
522
+ sessionId: "session-1",
523
+ providerCallId: "CA123",
524
+ snapshot: {
525
+ runtime: { model: "gpt-realtime", voice: "marin" },
526
+ voiceConfig: { system: "Answer briefly." },
527
+ },
528
+ initialPrompt: null,
529
+ log: vi.fn(),
530
+ error: vi.fn(),
531
+ appendEvent,
532
+ updateCallLegActive: vi.fn(),
533
+ updateSessionActive: vi.fn(),
534
+ executeTool: vi.fn(),
535
+ });
536
+
537
+ const openAI = instances[0] as any;
538
+ openAI.emit("message", JSON.stringify({ type: "response.created", response: { id: "resp-1" } }));
539
+ openAI.emit("message", JSON.stringify({ type: "response.output_audio_transcript.delta", delta: "Hi" }));
540
+ openAI.emit("message", JSON.stringify({ type: "response.output_audio_transcript.delta", delta: " there." }));
541
+ const usage = {
542
+ input_tokens: 12,
543
+ output_tokens: 8,
544
+ input_token_details: { audio_tokens: 10, text_tokens: 2 },
545
+ output_token_details: { audio_tokens: 6, text_tokens: 2 },
546
+ };
547
+ openAI.emit("message", JSON.stringify({ type: "response.done", response: { id: "resp-1", status: "completed", output: [], usage } }));
548
+
549
+ expect(appendEvent).toHaveBeenCalledWith("voice.assistant.stopped", expect.objectContaining({
550
+ responseId: "resp-1",
551
+ text: "Hi there.",
552
+ usage,
553
+ }), "Hi there.");
554
+
555
+ bridge.close();
556
+ });
557
+
558
+ it("prefers final OpenAI assistant transcript over partial transcript deltas", async () => {
559
+ const instances = new Array<any>();
560
+ vi.doMock("ws", () => {
561
+ class MockWebSocket extends EventEmitter {
562
+ static OPEN = 1;
563
+ readyState = MockWebSocket.OPEN;
564
+ sent: unknown[] = [];
565
+
566
+ constructor(public url: string, public options: unknown) {
567
+ super();
568
+ instances.push(this);
569
+ queueMicrotask(() => this.emit("open"));
570
+ }
571
+
572
+ send(data: string) {
573
+ this.sent.push(JSON.parse(data));
574
+ }
575
+
576
+ close() {
577
+ this.readyState = 3;
578
+ this.emit("close", 1000, Buffer.from(""));
579
+ }
580
+ }
581
+
582
+ return { default: MockWebSocket };
583
+ });
584
+
585
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
586
+ const appendEvent = vi.fn();
587
+
588
+ const bridge = await runOpenAITwilioRealtimeBridge({
589
+ socket: new MockTwilioSocket() as any,
590
+ apiKey: "test-key",
591
+ projectId: "project-1",
592
+ sessionId: "session-1",
593
+ providerCallId: "CA123",
594
+ snapshot: {
595
+ runtime: { model: "gpt-realtime", voice: "marin" },
596
+ voiceConfig: { system: "Answer briefly." },
597
+ },
598
+ initialPrompt: null,
599
+ log: vi.fn(),
600
+ error: vi.fn(),
601
+ appendEvent,
602
+ updateCallLegActive: vi.fn(),
603
+ updateSessionActive: vi.fn(),
604
+ executeTool: vi.fn(),
605
+ });
606
+
607
+ const openAI = instances[0] as any;
608
+ openAI.emit("message", JSON.stringify({ type: "response.created", response: { id: "resp-1" } }));
609
+ openAI.emit("message", JSON.stringify({ type: "response.output_audio_transcript.delta", delta: "Sure,letmecheck." }));
610
+ openAI.emit("message", JSON.stringify({
611
+ type: "response.output_audio_transcript.done",
612
+ transcript: "Sure, let me check.",
613
+ }));
614
+ openAI.emit("message", JSON.stringify({ type: "response.done", response: { id: "resp-1", status: "completed", output: [] } }));
615
+
616
+ expect(appendEvent).toHaveBeenCalledWith("voice.assistant.stopped", expect.objectContaining({
617
+ responseId: "resp-1",
618
+ text: "Sure, let me check.",
619
+ }), "Sure, let me check.");
620
+
621
+ bridge.close();
622
+ });
623
+
624
+ it("clamps OpenAI audio truncation to the assistant audio duration on interruption", async () => {
625
+ const instances = new Array<any>();
626
+ vi.doMock("ws", () => {
627
+ class MockWebSocket extends EventEmitter {
628
+ static OPEN = 1;
629
+ readyState = MockWebSocket.OPEN;
630
+ sent: unknown[] = [];
631
+
632
+ constructor(public url: string, public options: unknown) {
633
+ super();
634
+ instances.push(this);
635
+ queueMicrotask(() => this.emit("open"));
636
+ }
637
+
638
+ send(data: string) {
639
+ this.sent.push(JSON.parse(data));
640
+ }
641
+
642
+ close() {
643
+ this.readyState = 3;
644
+ this.emit("close", 1000, Buffer.from(""));
645
+ }
646
+ }
647
+
648
+ return { default: MockWebSocket };
649
+ });
650
+
651
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
652
+ const twilioSocket = new MockTwilioSocket();
653
+
654
+ const input = {
655
+ socket: twilioSocket as any,
656
+ apiKey: "test-key",
657
+ projectId: "project-1",
658
+ sessionId: "session-1",
659
+ providerCallId: "CA123",
660
+ snapshot: {
661
+ runtime: { model: "gpt-realtime", voice: "marin" },
662
+ voiceConfig: { system: "Answer briefly." },
663
+ },
664
+ initialPrompt: null,
665
+ log: vi.fn(),
666
+ error: vi.fn(),
667
+ appendEvent: vi.fn(),
668
+ updateCallLegActive: vi.fn(),
669
+ updateSessionActive: vi.fn(),
670
+ executeTool: vi.fn(),
671
+ };
672
+ const bridge = await runOpenAITwilioRealtimeBridge(input);
673
+
674
+ const openAI = instances[0] as any;
675
+ twilioSocket.emit("message", JSON.stringify({
676
+ event: "start",
677
+ streamSid: "MZ123",
678
+ start: {
679
+ streamSid: "MZ123",
680
+ callSid: "CA123",
681
+ mediaFormat: { encoding: "audio/x-mulaw", sampleRate: 8000, channels: 1 },
682
+ },
683
+ }));
684
+ openAI.emit("message", JSON.stringify({
685
+ type: "session.updated",
686
+ session: {
687
+ audio: {
688
+ input: { format: { type: "audio/pcmu" } },
689
+ output: { format: { type: "audio/pcmu" } },
690
+ },
691
+ },
692
+ }));
693
+ openAI.emit("message", JSON.stringify({
694
+ type: "response.created",
695
+ response: { id: "resp-1" },
696
+ }));
697
+ openAI.emit("message", JSON.stringify({
698
+ type: "response.output_audio.delta",
699
+ item_id: "item-1",
700
+ delta: Buffer.alloc(8_000).toString("base64"),
701
+ }));
702
+ const speechFrame = new Int16Array(160);
703
+ speechFrame.fill(4000);
704
+ twilioSocket.emit("message", JSON.stringify({
705
+ event: "media",
706
+ streamSid: "MZ123",
707
+ media: {
708
+ track: "inbound",
709
+ chunk: "1",
710
+ timestamp: "5000",
711
+ payload: encodePcm16ToTwilioMulawBase64(speechFrame),
712
+ },
713
+ }));
714
+ openAI.emit("message", JSON.stringify({ type: "input_audio_buffer.speech_started" }));
715
+
716
+ expect(openAI.sent).toContainEqual({
717
+ type: "conversation.item.truncate",
718
+ item_id: "item-1",
719
+ content_index: 0,
720
+ audio_end_ms: 980,
721
+ });
722
+ expect(twilioSocket.sent).toContainEqual({
723
+ event: "clear",
724
+ streamSid: "MZ123",
725
+ });
726
+ expect(input.appendEvent).toHaveBeenCalledWith("voice.call.audio.truncate_sent", expect.objectContaining({
727
+ provider: "openai-realtime",
728
+ audioEndMs: 980,
729
+ }));
730
+ expect(input.appendEvent).toHaveBeenCalledWith("voice.call.audio.clear_sent", expect.objectContaining({
731
+ provider: "twilio",
732
+ streamSid: "MZ123",
733
+ }));
734
+
735
+ bridge.close();
736
+ });
737
+
738
+ it("buffers early Twilio audio until the OpenAI phone audio session is ready", async () => {
739
+ const instances = new Array<any>();
740
+ vi.doMock("ws", () => {
741
+ class MockWebSocket extends EventEmitter {
742
+ static OPEN = 1;
743
+ readyState = MockWebSocket.OPEN;
744
+ sent: unknown[] = [];
745
+
746
+ constructor(public url: string, public options: unknown) {
747
+ super();
748
+ instances.push(this);
749
+ queueMicrotask(() => this.emit("open"));
750
+ }
751
+
752
+ send(data: string) {
753
+ this.sent.push(JSON.parse(data));
754
+ }
755
+
756
+ close() {
757
+ this.readyState = 3;
758
+ this.emit("close", 1000, Buffer.from(""));
759
+ }
760
+ }
761
+
762
+ return { default: MockWebSocket };
763
+ });
764
+
765
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
766
+ const twilioSocket = new MockTwilioSocket();
767
+ const log = vi.fn();
768
+
769
+ const bridge = await runOpenAITwilioRealtimeBridge({
770
+ socket: twilioSocket as any,
771
+ apiKey: "test-key",
772
+ projectId: "project-1",
773
+ sessionId: "session-1",
774
+ providerCallId: "CA123",
775
+ snapshot: {
776
+ runtime: { model: "gpt-realtime", voice: "marin" },
777
+ voiceConfig: { system: "Answer briefly." },
778
+ },
779
+ log,
780
+ error: vi.fn(),
781
+ appendEvent: vi.fn(),
782
+ updateCallLegActive: vi.fn(),
783
+ updateSessionActive: vi.fn(),
784
+ executeTool: vi.fn(),
785
+ });
786
+
787
+ const openAI = instances[0] as any;
788
+ const earlySpeech = new Int16Array(160);
789
+ earlySpeech.fill(6000);
790
+ const earlyPayload = encodePcm16ToTwilioMulawBase64(earlySpeech);
791
+
792
+ twilioSocket.emit("message", JSON.stringify({
793
+ event: "start",
794
+ streamSid: "MZ123",
795
+ start: {
796
+ streamSid: "MZ123",
797
+ callSid: "CA123",
798
+ mediaFormat: { encoding: "audio/x-mulaw", sampleRate: 8000, channels: 1 },
799
+ },
800
+ }));
801
+ twilioSocket.emit("message", JSON.stringify({
802
+ event: "media",
803
+ streamSid: "MZ123",
804
+ media: {
805
+ track: "inbound",
806
+ chunk: "1",
807
+ timestamp: "20",
808
+ payload: earlyPayload,
809
+ },
810
+ }));
811
+
812
+ expect(openAI.sent).not.toContainEqual(expect.objectContaining({ type: "input_audio_buffer.append" }));
813
+
814
+ openAI.emit("message", JSON.stringify({
815
+ type: "session.updated",
816
+ session: {
817
+ audio: {
818
+ input: { format: { type: "audio/pcmu" } },
819
+ output: { format: { type: "audio/pcmu" } },
820
+ },
821
+ },
822
+ }));
823
+
824
+ expect(openAI.sent).toContainEqual({
825
+ type: "input_audio_buffer.append",
826
+ audio: earlyPayload,
827
+ });
828
+ expect(log).toHaveBeenCalledWith("openai.initial_prompt.skipped", expect.objectContaining({
829
+ reason: "caller_audio_before_ready",
830
+ }));
831
+
832
+ bridge.close();
833
+ });
834
+
835
+ it("recovers the Twilio stream id from media events and forwards response.audio.delta", async () => {
836
+ const instances = new Array<any>();
837
+ vi.doMock("ws", () => {
838
+ class MockWebSocket extends EventEmitter {
839
+ static OPEN = 1;
840
+ readyState = MockWebSocket.OPEN;
841
+ sent: unknown[] = [];
842
+
843
+ constructor(public url: string, public options: unknown) {
844
+ super();
845
+ instances.push(this);
846
+ queueMicrotask(() => this.emit("open"));
847
+ }
848
+
849
+ send(data: string) {
850
+ this.sent.push(JSON.parse(data));
851
+ }
852
+
853
+ close() {
854
+ this.readyState = 3;
855
+ this.emit("close", 1000, Buffer.from(""));
856
+ }
857
+ }
858
+
859
+ return { default: MockWebSocket };
860
+ });
861
+
862
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime");
863
+ const twilioSocket = new MockTwilioSocket();
864
+ const log = vi.fn();
865
+ const updateCallLegActive = vi.fn();
866
+
867
+ const bridge = await runOpenAITwilioRealtimeBridge({
868
+ socket: twilioSocket as any,
869
+ apiKey: "test-key",
870
+ projectId: "project-1",
871
+ sessionId: "session-1",
872
+ providerCallId: "CA123",
873
+ snapshot: {
874
+ runtime: { model: "gpt-realtime", voice: "marin" },
875
+ voiceConfig: { system: "Answer briefly." },
876
+ },
877
+ initialPrompt: null,
878
+ log,
879
+ error: vi.fn(),
880
+ appendEvent: vi.fn(),
881
+ updateCallLegActive,
882
+ updateSessionActive: vi.fn(),
883
+ executeTool: vi.fn(),
884
+ });
885
+
886
+ const openAI = instances[0] as any;
887
+ twilioSocket.emit("message", JSON.stringify({
888
+ event: "start",
889
+ start: {
890
+ callSid: "CA123",
891
+ mediaFormat: { encoding: "audio/x-mulaw", sampleRate: 8000, channels: 1 },
892
+ },
893
+ }));
894
+ openAI.emit("message", JSON.stringify({
895
+ type: "session.updated",
896
+ session: {
897
+ audio: {
898
+ input: { format: { type: "audio/pcmu" } },
899
+ output: { format: { type: "audio/pcmu" } },
900
+ },
901
+ },
902
+ }));
903
+ twilioSocket.emit("message", JSON.stringify({
904
+ event: "media",
905
+ streamSid: "MZRECOVER",
906
+ media: {
907
+ track: "inbound",
908
+ chunk: "1",
909
+ timestamp: "20",
910
+ payload: Buffer.alloc(160).toString("base64"),
911
+ },
912
+ }));
913
+ openAI.emit("message", JSON.stringify({
914
+ type: "response.created",
915
+ response: { id: "resp-1" },
916
+ }));
917
+ openAI.emit("message", JSON.stringify({
918
+ type: "response.audio.delta",
919
+ item_id: "item-1",
920
+ delta: Buffer.alloc(160).toString("base64"),
921
+ }));
922
+ openAI.emit("message", JSON.stringify({
923
+ type: "response.audio.done",
924
+ response_id: "resp-1",
925
+ }));
926
+
927
+ expect(updateCallLegActive).toHaveBeenCalledWith({ streamSid: "MZRECOVER" });
928
+ expect(log).toHaveBeenCalledWith("twilio.stream_sid.bound", expect.objectContaining({
929
+ streamSid: "MZRECOVER",
930
+ source: "twilio.media",
931
+ }));
932
+ expect(twilioSocket.sent).toContainEqual(expect.objectContaining({
933
+ event: "media",
934
+ streamSid: "MZRECOVER",
935
+ }));
936
+ expect(twilioSocket.sent).toContainEqual(expect.objectContaining({
937
+ event: "mark",
938
+ streamSid: "MZRECOVER",
939
+ }));
940
+ expect(log).toHaveBeenCalledWith("openai.audio.output.done", expect.objectContaining({
941
+ responseId: "resp-1",
942
+ outboundMediaFrames: 1,
943
+ }));
944
+
945
+ bridge.close();
946
+ });
947
+
948
+ it("bridges fake Telnyx L16 media to OpenAI PCM24 and returns L16 audio to Telnyx", async () => { // Drives the bridge with the "telnyx-l16-16k" media profile against a mocked `ws` module and checks audio framing in both directions.
949
+ const instances = new Array<any>(); // Every MockWebSocket constructed below is recorded here so the test can reach it afterwards.
950
+ vi.doMock("ws", () => {
951
+ class MockWebSocket extends EventEmitter {
952
+ static OPEN = 1;
953
+ readyState = MockWebSocket.OPEN;
954
+ sent: unknown[] = [];
955
+
956
+ constructor(public url: string, public options: unknown) {
957
+ super();
958
+ instances.push(this);
959
+ queueMicrotask(() => this.emit("open")); // Emit "open" on a microtask, i.e. only after the constructor has returned.
960
+ }
961
+
962
+ send(data: string) {
963
+ this.sent.push(JSON.parse(data)); // Store the parsed payload so assertions can match on objects rather than strings.
964
+ }
965
+
966
+ close() {
967
+ this.readyState = 3; // 3 is CLOSED in the WebSocket readyState enumeration.
968
+ this.emit("close", 1000, Buffer.from(""));
969
+ }
970
+ }
971
+
972
+ return { default: MockWebSocket };
973
+ });
974
+
975
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime"); // Imported after vi.doMock so the mock is in effect for this import.
976
+ const carrierSocket = new MockTwilioSocket();
977
+ const profile = getPhoneMediaProfile("telnyx-l16-16k");
978
+ const appendEvent = vi.fn();
979
+
980
+ const bridge = await runOpenAITwilioRealtimeBridge({
981
+ socket: carrierSocket as any,
982
+ apiKey: "test-key",
983
+ projectId: "project-1",
984
+ sessionId: "session-1",
985
+ providerCallId: "telnyx-call-1",
986
+ snapshot: {
987
+ runtime: { model: "gpt-realtime", voice: "marin" },
988
+ voiceConfig: { system: "Answer briefly." },
989
+ },
990
+ mediaProfile: profile,
991
+ initialPrompt: null,
992
+ log: vi.fn(),
993
+ error: vi.fn(),
994
+ appendEvent,
995
+ updateCallLegActive: vi.fn(),
996
+ updateSessionActive: vi.fn(),
997
+ executeTool: vi.fn(),
998
+ });
999
+
1000
+ const openAI = instances[0] as any; // First mocked socket created during the bridge call; presumably the OpenAI realtime connection.
1001
+ expect(openAI.sent[0]).toMatchObject({ // The first message sent must be a session.update requesting 24 kHz audio/pcm for input and output.
1002
+ type: "session.update",
1003
+ session: {
1004
+ audio: {
1005
+ input: { format: { type: "audio/pcm", rate: 24000 } },
1006
+ output: { format: { type: "audio/pcm", rate: 24000 } },
1007
+ },
1008
+ },
1009
+ });
1010
+
1011
+ carrierSocket.emit("message", JSON.stringify({ // Carrier "start" event using snake_case fields (stream_id, call_control_id, media_format).
1012
+ event: "start",
1013
+ stream_id: "telnyx-stream-1",
1014
+ start: {
1015
+ call_control_id: "telnyx-call-1",
1016
+ media_format: { encoding: "L16", sample_rate: 16000, channels: 1 },
1017
+ },
1018
+ }));
1019
+ openAI.emit("message", JSON.stringify({ // Simulated acknowledgement echoing the same audio formats.
1020
+ type: "session.updated",
1021
+ session: {
1022
+ audio: {
1023
+ input: { format: { type: "audio/pcm", rate: 24000 } },
1024
+ output: { format: { type: "audio/pcm", rate: 24000 } },
1025
+ },
1026
+ },
1027
+ }));
1028
+
1029
+ const telnyxFrame = new Int16Array(320); // 320 samples, which is 20 ms at the 16000 Hz rate declared in the start event.
1030
+ for (let index = 0; index < telnyxFrame.length; index += 1) {
1031
+ telnyxFrame[index] = Math.round(Math.sin(index / 9) * 5000); // Sine-shaped samples with peak amplitude 5000.
1032
+ }
1033
+ carrierSocket.emit("message", JSON.stringify({
1034
+ event: "media",
1035
+ stream_id: "telnyx-stream-1",
1036
+ media: {
1037
+ track: "inbound",
1038
+ chunk: "1",
1039
+ timestamp: "20",
1040
+ payload: pcm16ToBase64Le(telnyxFrame),
1041
+ },
1042
+ }));
1043
+
1044
+ const append = openAI.sent.find((message: any) => message.type === "input_audio_buffer.append");
1045
+ expect(append).toEqual(expect.objectContaining({
1046
+ type: "input_audio_buffer.append",
1047
+ audio: expect.any(String),
1048
+ }));
1049
+ expect(base64ToPcm16Le(append.audio).length).toBe(480); // 480 = 320 * 24000 / 16000, consistent with resampling the frame from 16 kHz to 24 kHz.
1050
+
1051
+ const openAIOutput = new Int16Array(480); // 480 samples at a constant value, standing in for model audio.
1052
+ openAIOutput.fill(2400);
1053
+ openAI.emit("message", JSON.stringify({ type: "response.created", response: { id: "resp-1" } }));
1054
+ openAI.emit("message", JSON.stringify({
1055
+ type: "response.output_audio.delta",
1056
+ item_id: "item-1",
1057
+ delta: pcm16ToBase64Le(openAIOutput),
1058
+ }));
1059
+
1060
+ const outboundMedia = carrierSocket.sent.find((message: any) => message.event === "media") as any;
1061
+ expect(outboundMedia).toEqual({ // toEqual is an exact match: the outbound frame may carry no defined fields besides event and media.payload.
1062
+ event: "media",
1063
+ media: { payload: expect.any(String) },
1064
+ });
1065
+ expect(base64ToPcm16Le(outboundMedia.media.payload).length).toBe(320); // Back to 320, the same length as the inbound carrier frame.
1066
+ expect(carrierSocket.sent).toContainEqual({ // A mark named "openai:1" must also have been sent to the carrier socket.
1067
+ event: "mark",
1068
+ mark: { name: "openai:1" },
1069
+ });
1070
+ expect(appendEvent).toHaveBeenCalledWith("voice.call.audio.summary", expect.objectContaining({
1071
+ provider: "telnyx",
1072
+ mediaProfile: "telnyx-l16-16k",
1073
+ inboundMediaFrames: 1,
1074
+ }));
1075
+
1076
+ bridge.close();
1077
+ });
1078
+
1079
+ it("emits warnings instead of silently dropping assistant audio when stream id is missing", async () => { // Sends assistant audio while no stream id has been supplied and asserts that the drop is logged and reported as events.
1080
+ const instances = new Array<any>(); // Every MockWebSocket constructed below is recorded here so the test can reach it afterwards.
1081
+ vi.doMock("ws", () => {
1082
+ class MockWebSocket extends EventEmitter {
1083
+ static OPEN = 1;
1084
+ readyState = MockWebSocket.OPEN;
1085
+ sent: unknown[] = [];
1086
+
1087
+ constructor(public url: string, public options: unknown) {
1088
+ super();
1089
+ instances.push(this);
1090
+ queueMicrotask(() => this.emit("open")); // Emit "open" on a microtask, i.e. only after the constructor has returned.
1091
+ }
1092
+
1093
+ send(data: string) {
1094
+ this.sent.push(JSON.parse(data)); // Store the parsed payload so assertions can match on objects rather than strings.
1095
+ }
1096
+
1097
+ close() {
1098
+ this.readyState = 3; // 3 is CLOSED in the WebSocket readyState enumeration.
1099
+ this.emit("close", 1000, Buffer.from(""));
1100
+ }
1101
+ }
1102
+
1103
+ return { default: MockWebSocket };
1104
+ });
1105
+
1106
+ const { runOpenAITwilioRealtimeBridge } = await import("../phone-runtime/openai-twilio-realtime"); // Imported after vi.doMock so the mock is in effect for this import.
1107
+ const twilioSocket = new MockTwilioSocket();
1108
+ const log = vi.fn();
1109
+ const appendEvent = vi.fn();
1110
+
1111
+ const bridge = await runOpenAITwilioRealtimeBridge({ // No mediaProfile is passed here; which profile the bridge then uses is not visible in this test.
1112
+ socket: twilioSocket as any,
1113
+ apiKey: "test-key",
1114
+ projectId: "project-1",
1115
+ sessionId: "session-1",
1116
+ providerCallId: "CA123",
1117
+ snapshot: {
1118
+ runtime: { model: "gpt-realtime", voice: "marin" },
1119
+ voiceConfig: { system: "Answer briefly." },
1120
+ },
1121
+ initialPrompt: null,
1122
+ log,
1123
+ error: vi.fn(),
1124
+ appendEvent,
1125
+ updateCallLegActive: vi.fn(),
1126
+ updateSessionActive: vi.fn(),
1127
+ executeTool: vi.fn(),
1128
+ });
1129
+
1130
+ const openAI = instances[0] as any; // First mocked socket created during the bridge call; presumably the OpenAI realtime connection.
1131
+ twilioSocket.emit("message", JSON.stringify({ // Note: this start event carries a callSid but no streamSid field.
1132
+ event: "start",
1133
+ start: {
1134
+ callSid: "CA123",
1135
+ mediaFormat: { encoding: "audio/x-mulaw", sampleRate: 8000, channels: 1 },
1136
+ },
1137
+ }));
1138
+ openAI.emit("message", JSON.stringify({
1139
+ type: "session.updated",
1140
+ session: {
1141
+ audio: {
1142
+ input: { format: { type: "audio/pcmu" } },
1143
+ output: { format: { type: "audio/pcmu" } },
1144
+ },
1145
+ },
1146
+ }));
1147
+ openAI.emit("message", JSON.stringify({
1148
+ type: "response.created",
1149
+ response: { id: "resp-1" },
1150
+ }));
1151
+ openAI.emit("message", JSON.stringify({ // One audio delta carrying 160 zero bytes; the assertions below expect it to be counted as one dropped frame.
1152
+ type: "response.output_audio.delta",
1153
+ item_id: "item-1",
1154
+ delta: Buffer.alloc(160).toString("base64"),
1155
+ }));
1156
+ openAI.emit("message", JSON.stringify({ // Transcript text for the same response; later expected in the warning and stopped events.
1157
+ type: "response.audio_transcript.delta",
1158
+ delta: "Hello there.",
1159
+ }));
1160
+ openAI.emit("message", JSON.stringify({
1161
+ type: "response.done",
1162
+ response: { id: "resp-1", status: "completed", output: [] },
1163
+ }));
1164
+
1165
+ expect(twilioSocket.sent).not.toContainEqual(expect.objectContaining({ event: "media" })); // No media frame may have been sent to the phone socket.
1166
+ expect(log).toHaveBeenCalledWith("openai.audio.output.dropped", expect.objectContaining({
1167
+ reason: "missing_twilio_stream_sid",
1168
+ eventType: "response.output_audio.delta",
1169
+ droppedFrames: 1,
1170
+ }));
1171
+ expect(appendEvent).toHaveBeenCalledWith("voice.call.audio.dropped", expect.objectContaining({
1172
+ reason: "missing_twilio_stream_sid",
1173
+ droppedFrames: 1,
1174
+ }), "Assistant audio dropped before Twilio stream was bound");
1175
+ expect(appendEvent).toHaveBeenCalledWith("voice.provider.warning", expect.objectContaining({
1176
+ reason: "missing_twilio_stream_sid",
1177
+ }), "Assistant audio could not be sent to Twilio");
1178
+ expect(appendEvent).toHaveBeenCalledWith("voice.provider.warning", expect.objectContaining({ // A second, distinct warning: the response produced text but no outbound audio.
1179
+ reason: "assistant_text_without_outbound_audio",
1180
+ text: "Hello there.",
1181
+ }), "Assistant response produced text but no outbound phone audio");
1182
+ expect(appendEvent).toHaveBeenCalledWith("voice.assistant.stopped", expect.objectContaining({ // The stopped event must expose counters showing zero frames sent and one dropped.
1183
+ responseId: "resp-1",
1184
+ text: "Hello there.",
1185
+ counters: expect.objectContaining({
1186
+ outboundMediaFrames: 0,
1187
+ outboundAudioDroppedMissingStreamSid: 1,
1188
+ }),
1189
+ }), "Hello there.");
1190
+
1191
+ bridge.close();
1192
+ });
1193
+ });