@livekit/agents-plugin-openai 1.0.31 → 1.0.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1665 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { metrics } from '@livekit/agents';
5
+ import {
6
+ type APIConnectOptions,
7
+ APIConnectionError,
8
+ APIError,
9
+ AudioByteStream,
10
+ DEFAULT_API_CONNECT_OPTIONS,
11
+ Future,
12
+ Queue,
13
+ Task,
14
+ cancelAndWait,
15
+ delay,
16
+ isAPIError,
17
+ llm,
18
+ log,
19
+ shortuuid,
20
+ stream,
21
+ } from '@livekit/agents';
22
+ import { Mutex } from '@livekit/mutex';
23
+ import type { AudioResampler } from '@livekit/rtc-node';
24
+ import { AudioFrame, combineAudioFrames } from '@livekit/rtc-node';
25
+ import { type MessageEvent, WebSocket } from 'ws';
26
+ import * as api_proto from './api_proto.js';
27
+
28
+ // if LK_OPENAI_DEBUG is set, convert it to a number; otherwise default to 0
29
+ const lkOaiDebug = process.env.LK_OPENAI_DEBUG ? Number(process.env.LK_OPENAI_DEBUG) : 0;
30
+
31
+ const SAMPLE_RATE = 24000;
32
+ const NUM_CHANNELS = 1;
33
+ const BASE_URL = 'https://api.openai.com/v1';
34
+
35
+ const MOCK_AUDIO_ID_PREFIX = 'lk_mock_audio_item_';
36
+
37
+ type Modality = 'text' | 'audio';
38
+
39
+ interface RealtimeOptions {
40
+ model: api_proto.Model;
41
+ voice: api_proto.Voice;
42
+ temperature: number;
43
+ toolChoice?: llm.ToolChoice;
44
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
45
+ // TODO(shubhra): add inputAudioNoiseReduction
46
+ turnDetection?: api_proto.TurnDetectionType | null;
47
+ maxResponseOutputTokens?: number | 'inf';
48
+ speed?: number;
49
+ // TODO(shubhra): add openai tracing options
50
+ apiKey?: string;
51
+ baseURL: string;
52
+ isAzure: boolean;
53
+ azureDeployment?: string;
54
+ entraToken?: string;
55
+ apiVersion?: string;
56
+ maxSessionDuration: number;
57
+ // reset the connection after this many milliseconds if provided
58
+ connOptions: APIConnectOptions;
59
+ modalities: Modality[];
60
+ }
61
+
62
+ interface MessageGeneration {
63
+ messageId: string;
64
+ textChannel: stream.StreamChannel<string>;
65
+ audioChannel: stream.StreamChannel<AudioFrame>;
66
+ audioTranscript: string;
67
+ modalities: Future<('text' | 'audio')[]>;
68
+ }
69
+
70
+ interface ResponseGeneration {
71
+ messageChannel: stream.StreamChannel<llm.MessageGeneration>;
72
+ functionChannel: stream.StreamChannel<llm.FunctionCall>;
73
+ messages: Map<string, MessageGeneration>;
74
+
75
+ /** @internal */
76
+ _doneFut: Future;
77
+ /** @internal */
78
+ _createdTimestamp: number;
79
+ /** @internal */
80
+ _firstTokenTimestamp?: number;
81
+ }
82
+
83
+ class CreateResponseHandle {
84
+ instructions?: string;
85
+ doneFut: Future<llm.GenerationCreatedEvent>;
86
+ // TODO(shubhra): add timeout
87
+ constructor({ instructions }: { instructions?: string }) {
88
+ this.instructions = instructions;
89
+ this.doneFut = new Future();
90
+ }
91
+ }
92
+
93
+ // default values taken from a "default" session returned by the OpenAI API
94
+ const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
95
+ const DEFAULT_TEMPERATURE = 0.8;
96
+ const DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
97
+ type: 'semantic_vad',
98
+ eagerness: 'medium',
99
+ create_response: true,
100
+ interrupt_response: true,
101
+ };
102
+ const DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
103
+ model: 'gpt-4o-mini-transcribe',
104
+ };
105
+ const DEFAULT_TOOL_CHOICE: llm.ToolChoice = 'auto';
106
+ const DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS: number | 'inf' = 'inf';
107
+
108
+ const AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
109
+ model: 'whisper-1',
110
+ };
111
+
112
+ const AZURE_DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
113
+ type: 'server_vad',
114
+ threshold: 0.5,
115
+ prefix_padding_ms: 300,
116
+ silence_duration_ms: 200,
117
+ create_response: true,
118
+ };
119
+
120
+ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1000; // 20 minutes
121
+
122
+ const DEFAULT_REALTIME_MODEL_OPTIONS = {
123
+ model: 'gpt-realtime',
124
+ voice: 'marin',
125
+ temperature: DEFAULT_TEMPERATURE,
126
+ inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
127
+ turnDetection: DEFAULT_TURN_DETECTION,
128
+ toolChoice: DEFAULT_TOOL_CHOICE,
129
+ maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
130
+ maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
131
+ connOptions: DEFAULT_API_CONNECT_OPTIONS,
132
+ modalities: ['text', 'audio'] as Modality[],
133
+ };
134
+ export class RealtimeModel extends llm.RealtimeModel {
135
+ sampleRate = api_proto.SAMPLE_RATE;
136
+ numChannels = api_proto.NUM_CHANNELS;
137
+ inFrameSize = api_proto.IN_FRAME_SIZE;
138
+ outFrameSize = api_proto.OUT_FRAME_SIZE;
139
+
140
+ /* @internal */
141
+ _options: RealtimeOptions;
142
+
143
+ get model(): string {
144
+ return this._options.model;
145
+ }
146
+
147
+ constructor(
148
+ options: {
149
+ model?: string;
150
+ voice?: string;
151
+ temperature?: number;
152
+ toolChoice?: llm.ToolChoice;
153
+ baseURL?: string;
154
+ modalities?: Modality[];
155
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
156
+ // TODO(shubhra): add inputAudioNoiseReduction
157
+ turnDetection?: api_proto.TurnDetectionType | null;
158
+ speed?: number;
159
+ // TODO(shubhra): add openai tracing options
160
+ azureDeployment?: string;
161
+ apiKey?: string;
162
+ entraToken?: string;
163
+ apiVersion?: string;
164
+ maxSessionDuration?: number;
165
+ connOptions?: APIConnectOptions;
166
+ } = {},
167
+ ) {
168
+ const modalities = (options.modalities ||
169
+ DEFAULT_REALTIME_MODEL_OPTIONS.modalities) as Modality[];
170
+
171
+ super({
172
+ messageTruncation: true,
173
+ turnDetection: options.turnDetection !== null,
174
+ userTranscription: options.inputAudioTranscription !== null,
175
+ autoToolReplyGeneration: false,
176
+ audioOutput: modalities.includes('audio'),
177
+ });
178
+
179
+ const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
180
+
181
+ if (options.apiKey === '' && !isAzure) {
182
+ throw new Error(
183
+ 'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environment variable',
184
+ );
185
+ }
186
+
187
+ const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
188
+
189
+ if (!apiKey && !isAzure) {
190
+ throw new Error(
191
+ 'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environment variable',
192
+ );
193
+ }
194
+
195
+ if (!options.baseURL && isAzure) {
196
+ const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
197
+ if (!azureEndpoint) {
198
+ throw new Error(
199
+ 'Missing Azure endpoint. Please pass `baseURL` or set the `AZURE_OPENAI_ENDPOINT` environment variable.',
200
+ );
201
+ }
202
+ options.baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
203
+ }
204
+
205
+ const { modalities: _, ...optionsWithoutModalities } = options;
206
+ this._options = {
207
+ ...DEFAULT_REALTIME_MODEL_OPTIONS,
208
+ ...optionsWithoutModalities,
209
+ baseURL: options.baseURL || BASE_URL,
210
+ apiKey,
211
+ isAzure,
212
+ model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
213
+ modalities,
214
+ };
215
+ }
216
+
217
+ /**
218
+ * Create a RealtimeModel instance configured for Azure OpenAI Service.
219
+ *
220
+ * @param azureDeployment - The name of your Azure OpenAI deployment.
221
+ * @param azureEndpoint - The endpoint URL for your Azure OpenAI resource. If undefined, will attempt to read from the environment variable AZURE_OPENAI_ENDPOINT.
222
+ * @param apiVersion - API version to use with Azure OpenAI Service. If undefined, will attempt to read from the environment variable OPENAI_API_VERSION.
223
+ * @param apiKey - Azure OpenAI API key. If undefined, will attempt to read from the environment variable AZURE_OPENAI_API_KEY.
224
+ * @param entraToken - Azure Entra authentication token. Required if not using API key authentication.
225
+ * @param baseURL - Base URL for the API endpoint. If undefined, constructed from azureEndpoint.
226
+ * @param voice - Voice setting for audio outputs. Defaults to "alloy".
227
+ * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
228
+ * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see AZURE_DEFAULT_TURN_DETECTION.
229
+ * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
230
+ * @param speed - Speed of the audio output. Defaults to 1.0.
231
+ * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
232
+ * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
233
+ *
234
+ * @returns A RealtimeModel instance configured for Azure OpenAI Service.
235
+ *
236
+ * @throws Error if required Azure parameters are missing or invalid.
237
+ */
238
+ static withAzure({
239
+ azureDeployment,
240
+ azureEndpoint,
241
+ apiVersion,
242
+ apiKey,
243
+ entraToken,
244
+ baseURL,
245
+ voice = 'alloy',
246
+ inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
247
+ turnDetection = AZURE_DEFAULT_TURN_DETECTION,
248
+ temperature = 0.8,
249
+ speed,
250
+ }: {
251
+ azureDeployment: string;
252
+ azureEndpoint?: string;
253
+ apiVersion?: string;
254
+ apiKey?: string;
255
+ entraToken?: string;
256
+ baseURL?: string;
257
+ voice?: string;
258
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
259
+ // TODO(shubhra): add inputAudioNoiseReduction
260
+ turnDetection?: api_proto.TurnDetectionType;
261
+ temperature?: number;
262
+ speed?: number;
263
+ }) {
264
+ apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
265
+ if (!apiKey && !entraToken) {
266
+ throw new Error(
267
+ 'Missing credentials. Please pass one of `apiKey`, `entraToken`, or the `AZURE_OPENAI_API_KEY` environment variable.',
268
+ );
269
+ }
270
+
271
+ apiVersion = apiVersion || process.env.OPENAI_API_VERSION;
272
+ if (!apiVersion) {
273
+ throw new Error(
274
+ 'Must provide either the `apiVersion` argument or the `OPENAI_API_VERSION` environment variable',
275
+ );
276
+ }
277
+
278
+ if (!baseURL) {
279
+ azureEndpoint = azureEndpoint || process.env.AZURE_OPENAI_ENDPOINT;
280
+ if (!azureEndpoint) {
281
+ throw new Error(
282
+ 'Missing Azure endpoint. Please pass the `azureEndpoint` parameter or set the `AZURE_OPENAI_ENDPOINT` environment variable.',
283
+ );
284
+ }
285
+ baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
286
+ }
287
+
288
+ return new RealtimeModel({
289
+ voice,
290
+ inputAudioTranscription,
291
+ turnDetection,
292
+ temperature,
293
+ speed,
294
+ apiKey,
295
+ azureDeployment,
296
+ apiVersion,
297
+ entraToken,
298
+ baseURL,
299
+ });
300
+ }
301
+
302
+ session() {
303
+ return new RealtimeSession(this);
304
+ }
305
+
306
+ async close() {
307
+ return;
308
+ }
309
+ }
310
+
311
+ function processBaseURL({
312
+ baseURL,
313
+ model,
314
+ isAzure = false,
315
+ azureDeployment,
316
+ apiVersion,
317
+ }: {
318
+ baseURL: string;
319
+ model: string;
320
+ isAzure: boolean;
321
+ azureDeployment?: string;
322
+ apiVersion?: string;
323
+ }): string {
324
+ const url = new URL([baseURL, 'realtime'].join('/'));
325
+
326
+ if (url.protocol === 'https:') {
327
+ url.protocol = 'wss:';
328
+ }
329
+
330
+ // ensure "/realtime" is added if the path is empty OR "/v1"
331
+ if (!url.pathname || ['', '/v1', '/openai'].includes(url.pathname.replace(/\/$/, ''))) {
332
+ url.pathname = url.pathname.replace(/\/$/, '') + '/realtime';
333
+ } else {
334
+ url.pathname = url.pathname.replace(/\/$/, '');
335
+ }
336
+
337
+ const queryParams: Record<string, string> = {};
338
+ if (isAzure) {
339
+ if (apiVersion) {
340
+ queryParams['api-version'] = apiVersion;
341
+ }
342
+ if (azureDeployment) {
343
+ queryParams['deployment'] = azureDeployment;
344
+ }
345
+ } else {
346
+ queryParams['model'] = model;
347
+ }
348
+
349
+ for (const [key, value] of Object.entries(queryParams)) {
350
+ url.searchParams.set(key, value);
351
+ }
352
+
353
+ return url.toString();
354
+ }
355
+
356
+ /**
357
+ * A session for the OpenAI Realtime API.
358
+ *
359
+ * This class is used to interact with the OpenAI Realtime API.
360
+ * It is responsible for sending events to the OpenAI Realtime API and receiving events from it.
361
+ *
362
+ * It exposes two more events:
363
+ * - openai_server_event_received: expose the raw server events from the OpenAI Realtime API
364
+ * - openai_client_event_queued: expose the raw client events sent to the OpenAI Realtime API
365
+ */
366
+ export class RealtimeSession extends llm.RealtimeSession {
367
+ private _tools: llm.ToolContext = {};
368
+ private remoteChatCtx: llm.RemoteChatContext = new llm.RemoteChatContext();
369
+ private messageChannel = new Queue<api_proto.ClientEvent>();
370
+ private inputResampler?: AudioResampler;
371
+ private instructions?: string;
372
+ private oaiRealtimeModel: RealtimeModel;
373
+ private currentGeneration?: ResponseGeneration;
374
+ private responseCreatedFutures: { [id: string]: CreateResponseHandle } = {};
375
+
376
+ private textModeRecoveryRetries: number = 0;
377
+
378
+ private itemCreateFutures: { [id: string]: Future } = {};
379
+ private itemDeleteFutures: { [id: string]: Future } = {};
380
+
381
+ private updateChatCtxLock = new Mutex();
382
+ private updateFuncCtxLock = new Mutex();
383
+
384
+ // 100ms chunks
385
+ private bstream = new AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, SAMPLE_RATE / 10);
386
+
387
+ private pushedDurationMs: number = 0;
388
+
389
+ #logger = log();
390
+ #task: Task<void>;
391
+ #closed = false;
392
+
393
+ constructor(realtimeModel: RealtimeModel) {
394
+ super(realtimeModel);
395
+
396
+ this.oaiRealtimeModel = realtimeModel;
397
+
398
+ this.#task = Task.from(({ signal }) => this.#mainTask(signal));
399
+
400
+ this.sendEvent(this.createSessionUpdateEvent());
401
+ }
402
+
403
+ sendEvent(command: api_proto.ClientEvent): void {
404
+ this.messageChannel.put(command);
405
+ }
406
+
407
+ private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
408
+ // OpenAI supports ['text'] or ['text', 'audio'] (audio always includes text transcript)
409
+ // We normalize to ensure 'text' is always present when using audio
410
+ const modalities: Modality[] = this.oaiRealtimeModel._options.modalities.includes('audio')
411
+ ? ['text', 'audio']
412
+ : ['text'];
413
+
414
+ return {
415
+ type: 'session.update',
416
+ session: {
417
+ model: this.oaiRealtimeModel._options.model,
418
+ voice: this.oaiRealtimeModel._options.voice,
419
+ input_audio_format: 'pcm16',
420
+ output_audio_format: 'pcm16',
421
+ modalities: modalities,
422
+ turn_detection: this.oaiRealtimeModel._options.turnDetection,
423
+ input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
424
+ // TODO(shubhra): add inputAudioNoiseReduction
425
+ temperature: this.oaiRealtimeModel._options.temperature,
426
+ tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
427
+ max_response_output_tokens:
428
+ this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity
429
+ ? 'inf'
430
+ : this.oaiRealtimeModel._options.maxResponseOutputTokens,
431
+ // TODO(shubhra): add tracing options
432
+ instructions: this.instructions,
433
+ speed: this.oaiRealtimeModel._options.speed,
434
+ },
435
+ };
436
+ }
437
+
438
+ get chatCtx() {
439
+ return this.remoteChatCtx.toChatCtx();
440
+ }
441
+
442
+ get tools() {
443
+ return { ...this._tools } as llm.ToolContext;
444
+ }
445
+
446
+ async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
447
+ const unlock = await this.updateChatCtxLock.lock();
448
+ const events = this.createChatCtxUpdateEvents(_chatCtx);
449
+ const futures: Future<void>[] = [];
450
+
451
+ for (const event of events) {
452
+ const future = new Future<void>();
453
+ futures.push(future);
454
+
455
+ if (event.type === 'conversation.item.create') {
456
+ this.itemCreateFutures[event.item.id] = future;
457
+ } else if (event.type === 'conversation.item.delete') {
458
+ this.itemDeleteFutures[event.item_id] = future;
459
+ }
460
+
461
+ this.sendEvent(event);
462
+ }
463
+
464
+ if (futures.length === 0) {
465
+ unlock();
466
+ return;
467
+ }
468
+
469
+ try {
470
+ // wait for futures to resolve or timeout
471
+ await Promise.race([
472
+ Promise.all(futures),
473
+ delay(5000).then(() => {
474
+ throw new Error('Chat ctx update events timed out');
475
+ }),
476
+ ]);
477
+ } catch (e) {
478
+ this.#logger.error((e as Error).message);
479
+ throw e;
480
+ } finally {
481
+ unlock();
482
+ }
483
+ }
484
+
485
+ private createChatCtxUpdateEvents(
486
+ chatCtx: llm.ChatContext,
487
+ addMockAudio: boolean = false,
488
+ ): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
489
+ const newChatCtx = chatCtx.copy();
490
+ if (addMockAudio) {
491
+ newChatCtx.items.push(createMockAudioItem());
492
+ } else {
493
+ // clean up existing mock audio items
494
+ newChatCtx.items = newChatCtx.items.filter(
495
+ (item) => !item.id.startsWith(MOCK_AUDIO_ID_PREFIX),
496
+ );
497
+ }
498
+
499
+ const events: (
500
+ | api_proto.ConversationItemCreateEvent
501
+ | api_proto.ConversationItemDeleteEvent
502
+ )[] = [];
503
+
504
+ const diffOps = llm.computeChatCtxDiff(this.chatCtx, newChatCtx);
505
+ for (const op of diffOps.toRemove) {
506
+ events.push({
507
+ type: 'conversation.item.delete',
508
+ item_id: op,
509
+ event_id: shortuuid('chat_ctx_delete_'),
510
+ } as api_proto.ConversationItemDeleteEvent);
511
+ }
512
+
513
+ for (const [previousId, id] of diffOps.toCreate) {
514
+ const chatItem = newChatCtx.getById(id);
515
+ if (!chatItem) {
516
+ throw new Error(`Chat item ${id} not found`);
517
+ }
518
+ events.push({
519
+ type: 'conversation.item.create',
520
+ item: livekitItemToOpenAIItem(chatItem),
521
+ previous_item_id: previousId ?? undefined,
522
+ event_id: shortuuid('chat_ctx_create_'),
523
+ } as api_proto.ConversationItemCreateEvent);
524
+ }
525
+ return events;
526
+ }
527
+
528
+ async updateTools(_tools: llm.ToolContext): Promise<void> {
529
+ const unlock = await this.updateFuncCtxLock.lock();
530
+ const ev = this.createToolsUpdateEvent(_tools);
531
+ this.sendEvent(ev);
532
+
533
+ if (!ev.session.tools) {
534
+ throw new Error('Tools are missing in the session update event');
535
+ }
536
+
537
+ // TODO(brian): the logic below is likely a no-op; keeping it for parity with
538
+ // the Python implementation, but we should remove it later
539
+ const retainedToolNames = new Set(ev.session.tools.map((tool) => tool.name));
540
+ const retainedTools = Object.fromEntries(
541
+ Object.entries(_tools).filter(
542
+ ([name, tool]) => llm.isFunctionTool(tool) && retainedToolNames.has(name),
543
+ ),
544
+ );
545
+
546
+ this._tools = retainedTools as llm.ToolContext;
547
+
548
+ unlock();
549
+ }
550
+
551
+ private createToolsUpdateEvent(_tools: llm.ToolContext): api_proto.SessionUpdateEvent {
552
+ const oaiTools: api_proto.Tool[] = [];
553
+
554
+ for (const [name, tool] of Object.entries(_tools)) {
555
+ if (!llm.isFunctionTool(tool)) {
556
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
557
+ continue;
558
+ }
559
+
560
+ const { parameters: toolParameters, description } = tool;
561
+ try {
562
+ const parameters = llm.toJsonSchema(
563
+ toolParameters,
564
+ ) as unknown as api_proto.Tool['parameters'];
565
+
566
+ oaiTools.push({
567
+ name,
568
+ description,
569
+ parameters: parameters,
570
+ type: 'function',
571
+ });
572
+ } catch (e) {
573
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
574
+ continue;
575
+ }
576
+ }
577
+
578
+ return {
579
+ type: 'session.update',
580
+ session: {
581
+ model: this.oaiRealtimeModel._options.model,
582
+ tools: oaiTools,
583
+ },
584
+ event_id: shortuuid('tools_update_'),
585
+ };
586
+ }
587
+
588
+ async updateInstructions(_instructions: string): Promise<void> {
589
+ const eventId = shortuuid('instructions_update_');
590
+ this.sendEvent({
591
+ type: 'session.update',
592
+ session: {
593
+ instructions: _instructions,
594
+ },
595
+ event_id: eventId,
596
+ } as api_proto.SessionUpdateEvent);
597
+ this.instructions = _instructions;
598
+ }
599
+
600
+ updateOptions({ toolChoice }: { toolChoice?: llm.ToolChoice }): void {
601
+ const options: api_proto.SessionUpdateEvent['session'] = {};
602
+
603
+ this.oaiRealtimeModel._options.toolChoice = toolChoice;
604
+ options.tool_choice = toOaiToolChoice(toolChoice);
605
+
606
+ // TODO(brian): add other options here
607
+
608
+ this.sendEvent({
609
+ type: 'session.update',
610
+ session: options,
611
+ event_id: shortuuid('options_update_'),
612
+ });
613
+ }
614
+
615
+ pushAudio(frame: AudioFrame): void {
616
+ for (const f of this.resampleAudio(frame)) {
617
+ for (const nf of this.bstream.write(f.data.buffer as ArrayBuffer)) {
618
+ this.sendEvent({
619
+ type: 'input_audio_buffer.append',
620
+ audio: Buffer.from(nf.data.buffer).toString('base64'),
621
+ } as api_proto.InputAudioBufferAppendEvent);
622
+ // TODO(AJS-102): use frame.durationMs once available in rtc-node
623
+ this.pushedDurationMs += (nf.samplesPerChannel / nf.sampleRate) * 1000;
624
+ }
625
+ }
626
+ }
627
+
628
+ async commitAudio(): Promise<void> {
629
+ if (this.pushedDurationMs > 100) {
630
+ // OpenAI requires at least 100ms of audio
631
+ this.sendEvent({
632
+ type: 'input_audio_buffer.commit',
633
+ } as api_proto.InputAudioBufferCommitEvent);
634
+ this.pushedDurationMs = 0;
635
+ }
636
+ }
637
+
638
+ async clearAudio(): Promise<void> {
639
+ this.sendEvent({
640
+ type: 'input_audio_buffer.clear',
641
+ } as api_proto.InputAudioBufferClearEvent);
642
+ this.pushedDurationMs = 0;
643
+ }
644
+
645
+ async generateReply(instructions?: string): Promise<llm.GenerationCreatedEvent> {
646
+ const handle = this.createResponse({ instructions, userInitiated: true });
647
+ this.textModeRecoveryRetries = 0;
648
+ return handle.doneFut.await;
649
+ }
650
+
651
+ async interrupt(): Promise<void> {
652
+ this.sendEvent({
653
+ type: 'response.cancel',
654
+ } as api_proto.ResponseCancelEvent);
655
+ }
656
+
657
+ async truncate(_options: {
658
+ messageId: string;
659
+ audioEndMs: number;
660
+ modalities?: Modality[];
661
+ audioTranscript?: string;
662
+ }): Promise<void> {
663
+ if (!_options.modalities || _options.modalities.includes('audio')) {
664
+ this.sendEvent({
665
+ type: 'conversation.item.truncate',
666
+ content_index: 0,
667
+ item_id: _options.messageId,
668
+ audio_end_ms: _options.audioEndMs,
669
+ } as api_proto.ConversationItemTruncateEvent);
670
+ } else if (_options.audioTranscript !== undefined) {
671
+ // sync it to the remote chat context
672
+ const chatCtx = this.chatCtx.copy();
673
+ const idx = chatCtx.indexById(_options.messageId);
674
+ if (idx !== undefined) {
675
+ const item = chatCtx.items[idx];
676
+ if (item && item.type === 'message') {
677
+ const newItem = llm.ChatMessage.create({
678
+ ...item,
679
+ content: [_options.audioTranscript],
680
+ });
681
+ chatCtx.items[idx] = newItem;
682
+ const events = this.createChatCtxUpdateEvents(chatCtx);
683
+ for (const ev of events) {
684
+ this.sendEvent(ev);
685
+ }
686
+ }
687
+ }
688
+ }
689
+ }
690
+
691
+ private loggableEvent(
692
+ event: api_proto.ClientEvent | api_proto.ServerEvent,
693
+ ): Record<string, unknown> {
694
+ const untypedEvent: Record<string, unknown> = {};
695
+ for (const [key, value] of Object.entries(event)) {
696
+ if (value !== undefined) {
697
+ untypedEvent[key] = value;
698
+ }
699
+ }
700
+
701
+ if (untypedEvent.audio && typeof untypedEvent.audio === 'string') {
702
+ return { ...untypedEvent, audio: '...' };
703
+ }
704
+ if (
705
+ untypedEvent.delta &&
706
+ typeof untypedEvent.delta === 'string' &&
707
+ event.type === 'response.audio.delta'
708
+ ) {
709
+ return { ...untypedEvent, delta: '...' };
710
+ }
711
+ return untypedEvent;
712
+ }
713
+
714
+ private async createWsConn(): Promise<WebSocket> {
715
+ const headers: Record<string, string> = {
716
+ 'User-Agent': 'LiveKit-Agents-JS',
717
+ };
718
+
719
+ if (this.oaiRealtimeModel._options.isAzure) {
720
+ // The Microsoft (Azure) API supports two authentication methods:
721
+ // 1. Entra token sent as a `Bearer` token
722
+ // 2. API key sent in the `api-key` header (a query string is also accepted)
723
+ if (this.oaiRealtimeModel._options.entraToken) {
724
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.entraToken}`;
725
+ } else if (this.oaiRealtimeModel._options.apiKey) {
726
+ headers['api-key'] = this.oaiRealtimeModel._options.apiKey;
727
+ } else {
728
+ throw new Error('Microsoft API key or entraToken is required');
729
+ }
730
+ } else {
731
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
732
+ headers['OpenAI-Beta'] = 'realtime=v1';
733
+ }
734
+
735
+ const url = processBaseURL({
736
+ baseURL: this.oaiRealtimeModel._options.baseURL,
737
+ model: this.oaiRealtimeModel._options.model,
738
+ isAzure: this.oaiRealtimeModel._options.isAzure,
739
+ apiVersion: this.oaiRealtimeModel._options.apiVersion,
740
+ azureDeployment: this.oaiRealtimeModel._options.azureDeployment,
741
+ });
742
+
743
+ if (lkOaiDebug) {
744
+ this.#logger.debug(`Connecting to OpenAI Realtime API at ${url}`);
745
+ }
746
+
747
+ return new Promise((resolve, reject) => {
748
+ const ws = new WebSocket(url, { headers });
749
+ let waiting = true;
750
+
751
+ const timeout = setTimeout(() => {
752
+ ws.close();
753
+ reject(new Error('WebSocket connection timeout'));
754
+ }, this.oaiRealtimeModel._options.connOptions.timeoutMs);
755
+
756
+ ws.once('open', () => {
757
+ if (!waiting) return;
758
+ waiting = false;
759
+ clearTimeout(timeout);
760
+ resolve(ws);
761
+ });
762
+
763
+ ws.once('close', () => {
764
+ if (!waiting) return;
765
+ waiting = false;
766
+ clearTimeout(timeout);
767
+ reject(new Error('OpenAI Realtime API connection closed'));
768
+ });
769
+ });
770
+ }
771
+
772
+ async #mainTask(signal: AbortSignal): Promise<void> {
773
+ let reconnecting = false;
774
+ let numRetries = 0;
775
+ let wsConn: WebSocket | null = null;
776
+ const maxRetries = this.oaiRealtimeModel._options.connOptions.maxRetry;
777
+
778
+ const reconnect = async () => {
779
+ this.#logger.debug(
780
+ {
781
+ maxSessionDuration: this.oaiRealtimeModel._options.maxSessionDuration,
782
+ },
783
+ 'Reconnecting to OpenAI Realtime API',
784
+ );
785
+
786
+ const events: api_proto.ClientEvent[] = [];
787
+
788
+ // options and instructions
789
+ events.push(this.createSessionUpdateEvent());
790
+
791
+ // tools
792
+ if (Object.keys(this._tools).length > 0) {
793
+ events.push(this.createToolsUpdateEvent(this._tools));
794
+ }
795
+
796
+ // chat context
797
+ const chatCtx = this.chatCtx.copy({
798
+ excludeFunctionCall: true,
799
+ excludeInstructions: true,
800
+ excludeEmptyMessage: true,
801
+ });
802
+
803
+ const oldChatCtx = this.remoteChatCtx;
804
+ this.remoteChatCtx = new llm.RemoteChatContext();
805
+ events.push(...this.createChatCtxUpdateEvents(chatCtx));
806
+
807
+ try {
808
+ for (const ev of events) {
809
+ this.emit('openai_client_event_queued', ev);
810
+ wsConn!.send(JSON.stringify(ev));
811
+ }
812
+ } catch (error) {
813
+ this.remoteChatCtx = oldChatCtx;
814
+ throw new APIConnectionError({
815
+ message: 'Failed to send message to OpenAI Realtime API during session re-connection',
816
+ });
817
+ }
818
+
819
+ this.#logger.debug('Reconnected to OpenAI Realtime API');
820
+
821
+ this.emit('session_reconnected', {} as llm.RealtimeSessionReconnectedEvent);
822
+ };
823
+
824
+ reconnecting = false;
825
+ while (!this.#closed && !signal.aborted) {
826
+ this.#logger.debug('Creating WebSocket connection to OpenAI Realtime API');
827
+ wsConn = await this.createWsConn();
828
+ if (signal.aborted) break;
829
+
830
+ try {
831
+ if (reconnecting) {
832
+ await reconnect();
833
+ if (signal.aborted) break;
834
+ numRetries = 0;
835
+ }
836
+
837
+ await this.runWs(wsConn);
838
+ if (signal.aborted) break;
839
+ } catch (error) {
840
+ if (!isAPIError(error)) {
841
+ this.emitError({ error: error as Error, recoverable: false });
842
+ throw error;
843
+ }
844
+
845
+ if (maxRetries === 0 || !error.retryable) {
846
+ this.emitError({ error: error as Error, recoverable: false });
847
+ throw error;
848
+ }
849
+
850
+ if (numRetries === maxRetries) {
851
+ this.emitError({ error: error as Error, recoverable: false });
852
+ throw new APIConnectionError({
853
+ message: `OpenAI Realtime API connection failed after ${numRetries} attempts`,
854
+ options: {
855
+ body: error,
856
+ retryable: false,
857
+ },
858
+ });
859
+ }
860
+
861
+ this.emitError({ error: error as Error, recoverable: true });
862
+ const retryInterval =
863
+ numRetries === 0
864
+ ? DEFAULT_FIRST_RETRY_INTERVAL_MS
865
+ : this.oaiRealtimeModel._options.connOptions.retryIntervalMs;
866
+ this.#logger.warn(
867
+ {
868
+ attempt: numRetries,
869
+ maxRetries,
870
+ error,
871
+ },
872
+ `OpenAI Realtime API connection failed, retrying in ${retryInterval / 1000}s`,
873
+ );
874
+
875
+ await delay(retryInterval);
876
+ numRetries++;
877
+ }
878
+
879
+ reconnecting = true;
880
+ }
881
+ }
882
+
883
+ private async runWs(wsConn: WebSocket): Promise<void> {
884
+ const forwardEvents = async (signal: AbortSignal): Promise<void> => {
885
+ const abortFuture = new Future<void>();
886
+ signal.addEventListener('abort', () => abortFuture.resolve());
887
+
888
+ while (!this.#closed && wsConn.readyState === WebSocket.OPEN && !signal.aborted) {
889
+ try {
890
+ const event = await Promise.race([this.messageChannel.get(), abortFuture.await]);
891
+ if (signal.aborted || abortFuture.done || event === undefined) {
892
+ break;
893
+ }
894
+
895
+ if (lkOaiDebug) {
896
+ this.#logger.debug(this.loggableEvent(event), `(client) -> ${event.type}`);
897
+ }
898
+
899
+ this.emit('openai_client_event_queued', event);
900
+ wsConn.send(JSON.stringify(event));
901
+ } catch (error) {
902
+ break;
903
+ }
904
+ }
905
+
906
+ wsConn.close();
907
+ };
908
+
909
+ const wsCloseFuture = new Future<void | Error>();
910
+
911
+ wsConn.onerror = (error) => {
912
+ wsCloseFuture.resolve(new APIConnectionError({ message: error.message }));
913
+ };
914
+ wsConn.onclose = () => {
915
+ wsCloseFuture.resolve();
916
+ };
917
+
918
+ wsConn.onmessage = (message: MessageEvent) => {
919
+ const event: api_proto.ServerEvent = JSON.parse(message.data as string);
920
+
921
+ this.emit('openai_server_event_received', event);
922
+ if (lkOaiDebug) {
923
+ this.#logger.debug(this.loggableEvent(event), `(server) <- ${event.type}`);
924
+ }
925
+
926
+ switch (event.type) {
927
+ case 'input_audio_buffer.speech_started':
928
+ this.handleInputAudioBufferSpeechStarted(event);
929
+ break;
930
+ case 'input_audio_buffer.speech_stopped':
931
+ this.handleInputAudioBufferSpeechStopped(event);
932
+ break;
933
+ case 'response.created':
934
+ this.handleResponseCreated(event);
935
+ break;
936
+ case 'response.output_item.added':
937
+ this.handleResponseOutputItemAdded(event);
938
+ break;
939
+ case 'conversation.item.created':
940
+ this.handleConversationItemCreated(event);
941
+ break;
942
+ case 'conversation.item.deleted':
943
+ this.handleConversationItemDeleted(event);
944
+ break;
945
+ case 'conversation.item.input_audio_transcription.completed':
946
+ this.handleConversationItemInputAudioTranscriptionCompleted(event);
947
+ break;
948
+ case 'conversation.item.input_audio_transcription.failed':
949
+ this.handleConversationItemInputAudioTranscriptionFailed(event);
950
+ break;
951
+ case 'response.content_part.added':
952
+ this.handleResponseContentPartAdded(event);
953
+ break;
954
+ case 'response.content_part.done':
955
+ this.handleResponseContentPartDone(event);
956
+ break;
957
+ case 'response.text.delta':
958
+ this.handleResponseTextDelta(event);
959
+ break;
960
+ case 'response.text.done':
961
+ this.handleResponseTextDone(event);
962
+ break;
963
+ case 'response.audio_transcript.delta':
964
+ this.handleResponseAudioTranscriptDelta(event);
965
+ break;
966
+ case 'response.audio.delta':
967
+ this.handleResponseAudioDelta(event);
968
+ break;
969
+ case 'response.audio_transcript.done':
970
+ this.handleResponseAudioTranscriptDone(event);
971
+ break;
972
+ case 'response.audio.done':
973
+ this.handleResponseAudioDone(event);
974
+ break;
975
+ case 'response.output_item.done':
976
+ this.handleResponseOutputItemDone(event);
977
+ break;
978
+ case 'response.done':
979
+ this.handleResponseDone(event);
980
+ break;
981
+ case 'error':
982
+ this.handleError(event);
983
+ break;
984
+ default:
985
+ if (lkOaiDebug) {
986
+ this.#logger.debug(`unhandled event: ${event.type}`);
987
+ }
988
+ break;
989
+ }
990
+ };
991
+
992
+ const sendTask = Task.from(({ signal }) => forwardEvents(signal));
993
+
994
+ const wsTask = Task.from(({ signal }) => {
995
+ const abortPromise = new Promise<void>((resolve) => {
996
+ signal.addEventListener('abort', () => {
997
+ resolve();
998
+ });
999
+ });
1000
+
1001
+ return Promise.race([wsCloseFuture.await, abortPromise]);
1002
+ });
1003
+
1004
+ const waitReconnectTask = Task.from(async ({ signal }) => {
1005
+ await delay(this.oaiRealtimeModel._options.maxSessionDuration, { signal });
1006
+ return new APIConnectionError({
1007
+ message: 'OpenAI Realtime API connection timeout',
1008
+ });
1009
+ });
1010
+
1011
+ try {
1012
+ const result = await Promise.race([wsTask.result, sendTask.result, waitReconnectTask.result]);
1013
+
1014
+ if (waitReconnectTask.done && this.currentGeneration) {
1015
+ await this.currentGeneration._doneFut.await;
1016
+ }
1017
+
1018
+ if (result instanceof Error) {
1019
+ throw result;
1020
+ }
1021
+ } finally {
1022
+ await cancelAndWait([wsTask, sendTask, waitReconnectTask], 2000);
1023
+ wsConn.close();
1024
+ }
1025
+ }
1026
+
1027
+ async close() {
1028
+ super.close();
1029
+ this.#closed = true;
1030
+ await this.#task;
1031
+ }
1032
+
1033
+ private handleInputAudioBufferSpeechStarted(
1034
+ _event: api_proto.InputAudioBufferSpeechStartedEvent,
1035
+ ): void {
1036
+ this.emit('input_speech_started', {} as llm.InputSpeechStartedEvent);
1037
+ }
1038
+
1039
+ private handleInputAudioBufferSpeechStopped(
1040
+ _event: api_proto.InputAudioBufferSpeechStoppedEvent,
1041
+ ): void {
1042
+ this.emit('input_speech_stopped', {
1043
+ userTranscriptionEnabled: this.oaiRealtimeModel._options.inputAudioTranscription !== null,
1044
+ } as llm.InputSpeechStoppedEvent);
1045
+ }
1046
+
1047
+ private handleResponseCreated(event: api_proto.ResponseCreatedEvent): void {
1048
+ if (!event.response.id) {
1049
+ throw new Error('response.id is missing');
1050
+ }
1051
+
1052
+ this.currentGeneration = {
1053
+ messageChannel: stream.createStreamChannel<llm.MessageGeneration>(),
1054
+ functionChannel: stream.createStreamChannel<llm.FunctionCall>(),
1055
+ messages: new Map(),
1056
+ _doneFut: new Future(),
1057
+ _createdTimestamp: Date.now(),
1058
+ };
1059
+
1060
+ // Build generation event and resolve client future (if any) before emitting,
1061
+ // matching Python behavior.
1062
+ const generationEv = {
1063
+ messageStream: this.currentGeneration.messageChannel.stream(),
1064
+ functionStream: this.currentGeneration.functionChannel.stream(),
1065
+ userInitiated: false,
1066
+ responseId: event.response.id,
1067
+ } as llm.GenerationCreatedEvent;
1068
+
1069
+ const clientEventId = event.response.metadata?.client_event_id;
1070
+ if (clientEventId) {
1071
+ const handle = this.responseCreatedFutures[clientEventId];
1072
+ if (handle) {
1073
+ delete this.responseCreatedFutures[clientEventId];
1074
+ generationEv.userInitiated = true;
1075
+ if (!handle.doneFut.done) {
1076
+ handle.doneFut.resolve(generationEv);
1077
+ }
1078
+ }
1079
+ }
1080
+
1081
+ this.emit('generation_created', generationEv);
1082
+ }
1083
+
1084
+ private handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
1085
+ if (!this.currentGeneration) {
1086
+ throw new Error('currentGeneration is not set');
1087
+ }
1088
+
1089
+ if (!event.item.type) {
1090
+ throw new Error('item.type is not set');
1091
+ }
1092
+
1093
+ if (!event.response_id) {
1094
+ throw new Error('response_id is not set');
1095
+ }
1096
+
1097
+ const itemType = event.item.type;
1098
+ const responseId = event.response_id;
1099
+
1100
+ if (itemType !== 'message') {
1101
+ // emit immediately if it's not a message; otherwise wait for response.content_part.added
1102
+ this.resolveGeneration(responseId);
1103
+ this.textModeRecoveryRetries = 0;
1104
+ return;
1105
+ }
1106
+
1107
+ const itemId = event.item.id;
1108
+ if (!itemId) {
1109
+ throw new Error('item.id is not set');
1110
+ }
1111
+
1112
+ const modalitiesFut = new Future<Modality[]>();
1113
+ const itemGeneration: MessageGeneration = {
1114
+ messageId: itemId,
1115
+ textChannel: stream.createStreamChannel<string>(),
1116
+ audioChannel: stream.createStreamChannel<AudioFrame>(),
1117
+ audioTranscript: '',
1118
+ modalities: modalitiesFut,
1119
+ };
1120
+
1121
+ // If audioOutput is not supported, close audio channel immediately
1122
+ if (!this.oaiRealtimeModel.capabilities.audioOutput) {
1123
+ itemGeneration.audioChannel.close();
1124
+ modalitiesFut.resolve(['text']);
1125
+ }
1126
+
1127
+ this.currentGeneration.messageChannel.write({
1128
+ messageId: itemId,
1129
+ textStream: itemGeneration.textChannel.stream(),
1130
+ audioStream: itemGeneration.audioChannel.stream(),
1131
+ modalities: modalitiesFut.await,
1132
+ });
1133
+
1134
+ this.currentGeneration.messages.set(itemId, itemGeneration);
1135
+ }
1136
+
1137
+ private handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {
1138
+ if (!event.item.id) {
1139
+ throw new Error('item.id is not set');
1140
+ }
1141
+
1142
+ try {
1143
+ this.remoteChatCtx.insert(event.previous_item_id, openAIItemToLivekitItem(event.item));
1144
+ } catch (error) {
1145
+ this.#logger.error({ error, itemId: event.item.id }, 'failed to insert conversation item');
1146
+ }
1147
+
1148
+ const fut = this.itemCreateFutures[event.item.id];
1149
+ if (fut) {
1150
+ fut.resolve();
1151
+ delete this.itemCreateFutures[event.item.id];
1152
+ }
1153
+ }
1154
+
1155
+ private handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {
1156
+ if (!event.item_id) {
1157
+ throw new Error('item_id is not set');
1158
+ }
1159
+
1160
+ try {
1161
+ this.remoteChatCtx.delete(event.item_id);
1162
+ } catch (error) {
1163
+ this.#logger.error({ error, itemId: event.item_id }, 'failed to delete conversation item');
1164
+ }
1165
+
1166
+ const fut = this.itemDeleteFutures[event.item_id];
1167
+ if (fut) {
1168
+ fut.resolve();
1169
+ delete this.itemDeleteFutures[event.item_id];
1170
+ }
1171
+ }
1172
+
1173
+ private handleConversationItemInputAudioTranscriptionCompleted(
1174
+ event: api_proto.ConversationItemInputAudioTranscriptionCompletedEvent,
1175
+ ): void {
1176
+ const remoteItem = this.remoteChatCtx.get(event.item_id);
1177
+ if (!remoteItem) {
1178
+ return;
1179
+ }
1180
+
1181
+ const item = remoteItem.item;
1182
+ if (item instanceof llm.ChatMessage) {
1183
+ item.content.push(event.transcript);
1184
+ } else {
1185
+ throw new Error('item is not a chat message');
1186
+ }
1187
+
1188
+ this.emit('input_audio_transcription_completed', {
1189
+ itemId: event.item_id,
1190
+ transcript: event.transcript,
1191
+ isFinal: true,
1192
+ } as llm.InputTranscriptionCompleted);
1193
+ }
1194
+
1195
+ private handleConversationItemInputAudioTranscriptionFailed(
1196
+ event: api_proto.ConversationItemInputAudioTranscriptionFailedEvent,
1197
+ ): void {
1198
+ this.#logger.error(
1199
+ { error: event.error },
1200
+ 'OpenAI Realtime API failed to transcribe input audio',
1201
+ );
1202
+ }
1203
+
1204
+ private handleResponseContentPartAdded(event: api_proto.ResponseContentPartAddedEvent): void {
1205
+ if (!this.currentGeneration) {
1206
+ throw new Error('currentGeneration is not set');
1207
+ }
1208
+
1209
+ const itemId = event.item_id;
1210
+ const itemType = event.part.type;
1211
+
1212
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
1213
+ if (!itemGeneration) {
1214
+ this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
1215
+ return;
1216
+ }
1217
+
1218
+ if (itemType === 'text' && this.oaiRealtimeModel.capabilities.audioOutput) {
1219
+ this.#logger.warn('Text response received from OpenAI Realtime API in audio modality.');
1220
+ }
1221
+
1222
+ if (!itemGeneration.modalities.done) {
1223
+ const modalityResult: Modality[] = itemType === 'text' ? ['text'] : ['audio', 'text'];
1224
+ itemGeneration.modalities.resolve(modalityResult);
1225
+ }
1226
+
1227
+ if (this.currentGeneration._firstTokenTimestamp === undefined) {
1228
+ this.currentGeneration._firstTokenTimestamp = Date.now();
1229
+ }
1230
+ }
1231
+
1232
+ private handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
1233
+ if (!event.part) {
1234
+ return;
1235
+ }
1236
+ if (event.part.type !== 'text') {
1237
+ return;
1238
+ }
1239
+
1240
+ if (!this.currentGeneration) {
1241
+ throw new Error('currentGeneration is not set');
1242
+ }
1243
+
1244
+ // TODO(shubhra): handle text mode recovery
1245
+ }
1246
+
1247
+ private handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
1248
+ if (!this.currentGeneration) {
1249
+ throw new Error('currentGeneration is not set');
1250
+ }
1251
+
1252
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
1253
+ if (!itemGeneration) {
1254
+ throw new Error('itemGeneration is not set');
1255
+ }
1256
+
1257
+ if (
1258
+ !this.oaiRealtimeModel.capabilities.audioOutput &&
1259
+ !this.currentGeneration._firstTokenTimestamp
1260
+ ) {
1261
+ this.currentGeneration._firstTokenTimestamp = Date.now();
1262
+ }
1263
+
1264
+ itemGeneration.textChannel.write(event.delta);
1265
+ itemGeneration.audioTranscript += event.delta;
1266
+ }
1267
+
1268
+ private handleResponseTextDone(_event: api_proto.ResponseTextDoneEvent): void {
1269
+ if (!this.currentGeneration) {
1270
+ throw new Error('currentGeneration is not set');
1271
+ }
1272
+ }
1273
+
1274
+ private handleResponseAudioTranscriptDelta(
1275
+ event: api_proto.ResponseAudioTranscriptDeltaEvent,
1276
+ ): void {
1277
+ if (!this.currentGeneration) {
1278
+ throw new Error('currentGeneration is not set');
1279
+ }
1280
+
1281
+ const itemId = event.item_id;
1282
+ const delta = event.delta;
1283
+
1284
+ // TODO(shubhra): add timed string support
1285
+
1286
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
1287
+ if (!itemGeneration) {
1288
+ throw new Error('itemGeneration is not set');
1289
+ } else {
1290
+ itemGeneration.textChannel.write(delta);
1291
+ itemGeneration.audioTranscript += delta;
1292
+ }
1293
+ }
1294
+
1295
+ private handleResponseAudioDelta(event: api_proto.ResponseAudioDeltaEvent): void {
1296
+ if (!this.currentGeneration) {
1297
+ throw new Error('currentGeneration is not set');
1298
+ }
1299
+
1300
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
1301
+ if (!itemGeneration) {
1302
+ throw new Error('itemGeneration is not set');
1303
+ }
1304
+
1305
+ if (this.currentGeneration._firstTokenTimestamp === undefined) {
1306
+ this.currentGeneration._firstTokenTimestamp = Date.now();
1307
+ }
1308
+
1309
+ if (!itemGeneration.modalities.done) {
1310
+ itemGeneration.modalities.resolve(['audio', 'text']);
1311
+ }
1312
+
1313
+ const binaryString = atob(event.delta);
1314
+ const len = binaryString.length;
1315
+ const bytes = new Uint8Array(len);
1316
+ for (let i = 0; i < len; i++) {
1317
+ bytes[i] = binaryString.charCodeAt(i);
1318
+ }
1319
+
1320
+ itemGeneration.audioChannel.write(
1321
+ new AudioFrame(
1322
+ new Int16Array(bytes.buffer),
1323
+ api_proto.SAMPLE_RATE,
1324
+ api_proto.NUM_CHANNELS,
1325
+ bytes.length / 2,
1326
+ ),
1327
+ );
1328
+ }
1329
+
1330
+ private handleResponseAudioTranscriptDone(
1331
+ _event: api_proto.ResponseAudioTranscriptDoneEvent,
1332
+ ): void {
1333
+ if (!this.currentGeneration) {
1334
+ throw new Error('currentGeneration is not set');
1335
+ }
1336
+ }
1337
+
1338
+ private handleResponseAudioDone(_event: api_proto.ResponseAudioDoneEvent): void {
1339
+ if (!this.currentGeneration) {
1340
+ throw new Error('currentGeneration is not set');
1341
+ }
1342
+ }
1343
+
1344
+ private handleResponseOutputItemDone(event: api_proto.ResponseOutputItemDoneEvent): void {
1345
+ if (!this.currentGeneration) {
1346
+ throw new Error('currentGeneration is not set');
1347
+ }
1348
+
1349
+ const itemId = event.item.id;
1350
+ const itemType = event.item.type;
1351
+
1352
+ if (itemType === 'function_call') {
1353
+ const item = event.item;
1354
+ if (!item.call_id || !item.name || !item.arguments) {
1355
+ throw new Error('item is not a function call');
1356
+ }
1357
+ this.currentGeneration.functionChannel.write(
1358
+ llm.FunctionCall.create({
1359
+ callId: item.call_id,
1360
+ name: item.name,
1361
+ args: item.arguments,
1362
+ }),
1363
+ );
1364
+ } else if (itemType === 'message') {
1365
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
1366
+ if (!itemGeneration) {
1367
+ return;
1368
+ }
1369
+ // text-only responses don't have an itemGeneration (hence the early return above)
1370
+ itemGeneration.textChannel.close();
1371
+ itemGeneration.audioChannel.close();
1372
+ if (!itemGeneration.modalities.done) {
1373
+ // fallback in case the message modalities future was never resolved; this shouldn't happen
1374
+ itemGeneration.modalities.resolve(this.oaiRealtimeModel._options.modalities);
1375
+ }
1376
+ }
1377
+ }
1378
+
1379
+ private handleResponseDone(_event: api_proto.ResponseDoneEvent): void {
1380
+ if (!this.currentGeneration) {
1381
+ // OpenAI has a race condition where we could receive response.done without any
1382
+ // previous response.created (This happens generally during interruption)
1383
+ return;
1384
+ }
1385
+
1386
+ const createdTimestamp = this.currentGeneration._createdTimestamp;
1387
+ const firstTokenTimestamp = this.currentGeneration._firstTokenTimestamp;
1388
+
1389
+ this.#logger.debug(
1390
+ {
1391
+ messageCount: this.currentGeneration.messages.size,
1392
+ },
1393
+ 'Closing generation channels in handleResponseDone',
1394
+ );
1395
+
1396
+ for (const generation of this.currentGeneration.messages.values()) {
1397
+ generation.textChannel.close();
1398
+ generation.audioChannel.close();
1399
+ if (!generation.modalities.done) {
1400
+ generation.modalities.resolve(this.oaiRealtimeModel._options.modalities);
1401
+ }
1402
+ }
1403
+
1404
+ this.currentGeneration.functionChannel.close();
1405
+ this.currentGeneration.messageChannel.close();
1406
+
1407
+ for (const itemId of this.currentGeneration.messages.keys()) {
1408
+ const remoteItem = this.remoteChatCtx.get(itemId);
1409
+ if (remoteItem && remoteItem.item instanceof llm.ChatMessage) {
1410
+ remoteItem.item.content.push(this.currentGeneration.messages.get(itemId)!.audioTranscript);
1411
+ }
1412
+ }
1413
+
1414
+ this.currentGeneration._doneFut.resolve();
1415
+ this.currentGeneration = undefined;
1416
+
1417
+ // Calculate and emit metrics
1418
+ const usage = _event.response.usage;
1419
+ const ttftMs = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
1420
+ const durationMs = Date.now() - createdTimestamp;
1421
+
1422
+ const realtimeMetrics: metrics.RealtimeModelMetrics = {
1423
+ type: 'realtime_model_metrics',
1424
+ timestamp: createdTimestamp,
1425
+ requestId: _event.response.id || '',
1426
+ ttftMs,
1427
+ durationMs,
1428
+ cancelled: _event.response.status === 'cancelled',
1429
+ label: 'openai_realtime',
1430
+ inputTokens: usage?.input_tokens ?? 0,
1431
+ outputTokens: usage?.output_tokens ?? 0,
1432
+ totalTokens: usage?.total_tokens ?? 0,
1433
+ tokensPerSecond: durationMs > 0 ? (usage?.output_tokens ?? 0) / (durationMs / 1000) : 0,
1434
+ inputTokenDetails: {
1435
+ audioTokens: usage?.input_token_details?.audio_tokens ?? 0,
1436
+ textTokens: usage?.input_token_details?.text_tokens ?? 0,
1437
+ imageTokens: 0, // Not supported yet
1438
+ cachedTokens: usage?.input_token_details?.cached_tokens ?? 0,
1439
+ cachedTokensDetails: usage?.input_token_details?.cached_tokens_details
1440
+ ? {
1441
+ audioTokens: usage?.input_token_details?.cached_tokens_details?.audio_tokens ?? 0,
1442
+ textTokens: usage?.input_token_details?.cached_tokens_details?.text_tokens ?? 0,
1443
+ imageTokens: usage?.input_token_details?.cached_tokens_details?.image_tokens ?? 0,
1444
+ }
1445
+ : undefined,
1446
+ },
1447
+ outputTokenDetails: {
1448
+ textTokens: usage?.output_token_details?.text_tokens ?? 0,
1449
+ audioTokens: usage?.output_token_details?.audio_tokens ?? 0,
1450
+ imageTokens: 0,
1451
+ },
1452
+ };
1453
+
1454
+ this.emit('metrics_collected', realtimeMetrics);
1455
+ // TODO(brian): handle response done but not complete
1456
+ }
1457
+
1458
+ private handleError(event: api_proto.ErrorEvent): void {
1459
+ if (event.error.message.startsWith('Cancellation failed')) {
1460
+ return;
1461
+ }
1462
+
1463
+ this.#logger.error({ error: event.error }, 'OpenAI Realtime API returned an error');
1464
+ this.emitError({
1465
+ error: new APIError(event.error.message, {
1466
+ body: event.error,
1467
+ retryable: true,
1468
+ }),
1469
+ recoverable: true,
1470
+ });
1471
+
1472
+ // TODO(brian): set error for response future if it exists
1473
+ }
1474
+
1475
+ private emitError({ error, recoverable }: { error: Error; recoverable: boolean }): void {
1476
+ // IMPORTANT: only emit error if there are listeners; otherwise emit will throw an error
1477
+ this.emit('error', {
1478
+ timestamp: Date.now(),
1479
+ // TODO(brian): add label
1480
+ label: '',
1481
+ error,
1482
+ recoverable,
1483
+ } as llm.RealtimeModelError);
1484
+ }
1485
+
1486
+ private *resampleAudio(frame: AudioFrame): Generator<AudioFrame> {
1487
+ yield frame;
1488
+ }
1489
+
1490
+ private createResponse({
1491
+ userInitiated,
1492
+ instructions,
1493
+ oldHandle,
1494
+ }: {
1495
+ userInitiated: boolean;
1496
+ instructions?: string;
1497
+ oldHandle?: CreateResponseHandle;
1498
+ }): CreateResponseHandle {
1499
+ const handle = oldHandle || new CreateResponseHandle({ instructions });
1500
+ if (oldHandle && instructions) {
1501
+ handle.instructions = instructions;
1502
+ }
1503
+
1504
+ const eventId = shortuuid('response_create_');
1505
+ if (userInitiated) {
1506
+ this.responseCreatedFutures[eventId] = handle;
1507
+ }
1508
+
1509
+ const response: api_proto.ResponseCreateEvent['response'] = {};
1510
+ if (instructions) response.instructions = instructions;
1511
+ if (userInitiated) response.metadata = { client_event_id: eventId };
1512
+
1513
+ this.sendEvent({
1514
+ type: 'response.create',
1515
+ event_id: eventId,
1516
+ response: Object.keys(response).length > 0 ? response : undefined,
1517
+ });
1518
+
1519
+ return handle;
1520
+ }
1521
+
1522
+ private resolveGeneration(responseId: string): void {
1523
+ if (!this.currentGeneration) {
1524
+ throw new Error('currentGeneration is not set');
1525
+ }
1526
+
1527
+ const generation_ev = {
1528
+ messageStream: this.currentGeneration.messageChannel.stream(),
1529
+ functionStream: this.currentGeneration.functionChannel.stream(),
1530
+ userInitiated: false,
1531
+ responseId,
1532
+ } as llm.GenerationCreatedEvent;
1533
+
1534
+ const handle = this.responseCreatedFutures[responseId];
1535
+ if (handle) {
1536
+ delete this.responseCreatedFutures[responseId];
1537
+ generation_ev.userInitiated = true;
1538
+ if (handle.doneFut.done) {
1539
+ this.#logger.warn({ responseId }, 'response received after timeout');
1540
+ } else {
1541
+ handle.doneFut.resolve(generation_ev);
1542
+ }
1543
+ }
1544
+ }
1545
+ }
1546
+
1547
+ function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
1548
+ switch (item.type) {
1549
+ case 'function_call':
1550
+ return {
1551
+ id: item.id,
1552
+ type: 'function_call',
1553
+ call_id: item.callId,
1554
+ name: item.name,
1555
+ arguments: item.args,
1556
+ } as api_proto.FunctionCallItem;
1557
+ case 'function_call_output':
1558
+ return {
1559
+ id: item.id,
1560
+ type: 'function_call_output',
1561
+ call_id: item.callId,
1562
+ output: item.output,
1563
+ } as api_proto.FunctionCallOutputItem;
1564
+ case 'message':
1565
+ const role = item.role === 'developer' ? 'system' : item.role;
1566
+ const contentList: api_proto.Content[] = [];
1567
+ for (const c of item.content) {
1568
+ if (typeof c === 'string') {
1569
+ contentList.push({
1570
+ type: role === 'assistant' ? 'text' : 'input_text',
1571
+ text: c,
1572
+ } as api_proto.InputTextContent);
1573
+ } else if (c.type === 'image_content') {
1574
+ // not supported for now
1575
+ continue;
1576
+ } else if (c.type === 'audio_content') {
1577
+ if (role === 'user') {
1578
+ const encodedAudio = Buffer.from(combineAudioFrames(c.frame).data).toString('base64');
1579
+ contentList.push({
1580
+ type: 'input_audio',
1581
+ audio: encodedAudio,
1582
+ } as api_proto.InputAudioContent);
1583
+ }
1584
+ }
1585
+ }
1586
+ return {
1587
+ id: item.id,
1588
+ type: 'message',
1589
+ role,
1590
+ content: contentList,
1591
+ } as api_proto.UserItem;
1592
+ default:
1593
+ throw new Error(`Unsupported item type: ${(item as any).type}`);
1594
+ }
1595
+ }
1596
+
1597
+ function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
1598
+ if (!item.id) {
1599
+ throw new Error('item.id is not set');
1600
+ }
1601
+
1602
+ switch (item.type) {
1603
+ case 'function_call':
1604
+ return llm.FunctionCall.create({
1605
+ id: item.id,
1606
+ callId: item.call_id,
1607
+ name: item.name,
1608
+ args: item.arguments,
1609
+ });
1610
+ case 'function_call_output':
1611
+ return llm.FunctionCallOutput.create({
1612
+ id: item.id,
1613
+ callId: item.call_id,
1614
+ output: item.output,
1615
+ isError: false,
1616
+ });
1617
+ case 'message':
1618
+ const content: llm.ChatContent[] = [];
1619
+ // item.content can be a single object or an array; normalize to array
1620
+ const contents = Array.isArray(item.content) ? item.content : [item.content];
1621
+ for (const c of contents) {
1622
+ if (c.type === 'text' || c.type === 'input_text') {
1623
+ content.push(c.text);
1624
+ }
1625
+ }
1626
+ return llm.ChatMessage.create({
1627
+ id: item.id,
1628
+ role: item.role,
1629
+ content,
1630
+ });
1631
+ }
1632
+ }
1633
+
1634
+ function createMockAudioItem(durationSeconds: number = 2): llm.ChatMessage {
1635
+ const audioData = Buffer.alloc(durationSeconds * SAMPLE_RATE);
1636
+ return llm.ChatMessage.create({
1637
+ id: shortuuid(MOCK_AUDIO_ID_PREFIX),
1638
+ role: 'user',
1639
+ content: [
1640
+ {
1641
+ type: 'audio_content',
1642
+ frame: [
1643
+ new AudioFrame(
1644
+ new Int16Array(audioData.buffer),
1645
+ SAMPLE_RATE,
1646
+ NUM_CHANNELS,
1647
+ audioData.length / 2,
1648
+ ),
1649
+ ],
1650
+ } as llm.AudioContent,
1651
+ ],
1652
+ });
1653
+ }
1654
+
1655
+ function toOaiToolChoice(toolChoice?: llm.ToolChoice): api_proto.ToolChoice {
1656
+ if (typeof toolChoice === 'string') {
1657
+ return toolChoice;
1658
+ }
1659
+
1660
+ if (toolChoice?.type === 'function') {
1661
+ return toolChoice.function.name;
1662
+ }
1663
+
1664
+ return 'auto';
1665
+ }
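
For orientation, here is a minimal usage sketch of the RealtimeModel API introduced in this file. It is not part of the package diff: the root import path, the Azure deployment name, and the assumption that OPENAI_API_KEY / AZURE_OPENAI_API_KEY are set in the environment are illustrative only, based on the constructors and doc comments above.

// Minimal usage sketch (not from the diff); the import path is an assumption.
import { RealtimeModel } from '@livekit/agents-plugin-openai';

// Plain OpenAI: unset options fall back to DEFAULT_REALTIME_MODEL_OPTIONS
// (model 'gpt-realtime', voice 'marin', semantic VAD, 20-minute max session).
// Requires OPENAI_API_KEY in the environment when apiKey is not passed.
const model = new RealtimeModel({
  voice: 'marin',
  temperature: 0.8,
  modalities: ['text', 'audio'],
});

// Azure OpenAI: azureDeployment is required; apiVersion and the endpoint may
// come from OPENAI_API_VERSION and AZURE_OPENAI_ENDPOINT, and the key from
// AZURE_OPENAI_API_KEY (or pass entraToken). The deployment name is hypothetical.
const azureModel = RealtimeModel.withAzure({
  azureDeployment: 'my-realtime-deployment',
});

// Each model produces RealtimeSession instances, which own the WebSocket,
// reconnection, and event translation implemented above.
const session = model.session();
void session.updateInstructions('You are a helpful voice assistant.');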