@livekit/agents-plugin-openai 0.9.3 → 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/dist/index.cjs +16 -5
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +4 -4
  4. package/dist/index.d.ts +4 -4
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +14 -3
  7. package/dist/index.js.map +1 -1
  8. package/dist/llm.cjs +156 -197
  9. package/dist/llm.cjs.map +1 -1
  10. package/dist/llm.d.cts +27 -8
  11. package/dist/llm.d.ts +27 -8
  12. package/dist/llm.d.ts.map +1 -1
  13. package/dist/llm.js +164 -188
  14. package/dist/llm.js.map +1 -1
  15. package/dist/models.cjs +14 -0
  16. package/dist/models.cjs.map +1 -1
  17. package/dist/models.d.cts +11 -6
  18. package/dist/models.d.ts +11 -6
  19. package/dist/models.d.ts.map +1 -1
  20. package/dist/models.js +6 -0
  21. package/dist/models.js.map +1 -1
  22. package/dist/realtime/api_proto.cjs.map +1 -1
  23. package/dist/realtime/api_proto.d.cts +15 -0
  24. package/dist/realtime/api_proto.d.ts +15 -0
  25. package/dist/realtime/api_proto.d.ts.map +1 -1
  26. package/dist/realtime/api_proto.js.map +1 -1
  27. package/dist/realtime/realtime_model.cjs +1057 -820
  28. package/dist/realtime/realtime_model.cjs.map +1 -1
  29. package/dist/realtime/realtime_model.d.cts +126 -160
  30. package/dist/realtime/realtime_model.d.ts +126 -160
  31. package/dist/realtime/realtime_model.d.ts.map +1 -1
  32. package/dist/realtime/realtime_model.js +1067 -825
  33. package/dist/realtime/realtime_model.js.map +1 -1
  34. package/dist/tts.cjs +5 -5
  35. package/dist/tts.cjs.map +1 -1
  36. package/dist/tts.d.cts +2 -1
  37. package/dist/tts.d.ts +2 -1
  38. package/dist/tts.d.ts.map +1 -1
  39. package/dist/tts.js +6 -6
  40. package/dist/tts.js.map +1 -1
  41. package/package.json +9 -7
  42. package/src/index.ts +19 -5
  43. package/src/llm.ts +227 -228
  44. package/src/models.ts +83 -5
  45. package/src/realtime/api_proto.ts +15 -1
  46. package/src/realtime/realtime_model.ts +1305 -996
  47. package/src/tts.ts +6 -6
@@ -1,561 +1,644 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
+ import type { metrics } from '@livekit/agents';
4
5
  import {
5
- AsyncIterableQueue,
6
+ type APIConnectOptions,
7
+ APIConnectionError,
8
+ APIError,
9
+ AudioByteStream,
10
+ DEFAULT_API_CONNECT_OPTIONS,
6
11
  Future,
7
12
  Queue,
13
+ Task,
14
+ cancelAndWait,
15
+ isAPIError,
8
16
  llm,
9
17
  log,
10
- mergeFrames,
11
- metrics,
12
- multimodal,
18
+ shortuuid,
19
+ stream,
13
20
  } from '@livekit/agents';
14
- import { AudioFrame } from '@livekit/rtc-node';
15
- import { once } from 'node:events';
16
- import { WebSocket } from 'ws';
21
+ import { Mutex } from '@livekit/mutex';
22
+ import type { AudioResampler } from '@livekit/rtc-node';
23
+ import { AudioFrame, combineAudioFrames } from '@livekit/rtc-node';
24
+ import { delay } from '@std/async';
25
+ import type { GenerationCreatedEvent } from 'agents/dist/llm/realtime.js';
26
+ import { type MessageEvent, WebSocket } from 'ws';
17
27
  import * as api_proto from './api_proto.js';
18
28
 
19
- interface ModelOptions {
20
- modalities: ['text', 'audio'] | ['text'];
21
- instructions: string;
29
+ const SAMPLE_RATE = 24000;
30
+ const NUM_CHANNELS = 1;
31
+ const BASE_URL = 'https://api.openai.com/v1';
32
+
33
+ const MOCK_AUDIO_ID_PREFIX = 'lk_mock_audio_item_';
34
+
35
+ interface RealtimeOptions {
36
+ model: api_proto.Model;
22
37
  voice: api_proto.Voice;
23
- inputAudioFormat: api_proto.AudioFormat;
24
- outputAudioFormat: api_proto.AudioFormat;
25
- inputAudioTranscription: api_proto.InputAudioTranscription | null;
26
- turnDetection: api_proto.TurnDetectionType | null;
27
38
  temperature: number;
28
- maxResponseOutputTokens: number;
29
- model: api_proto.Model;
39
+ toolChoice?: llm.ToolChoice;
40
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
41
+ // TODO(shubhra): add inputAudioNoiseReduction
42
+ turnDetection?: api_proto.TurnDetectionType | null;
43
+ maxResponseOutputTokens?: number | 'inf';
44
+ speed?: number;
45
+ // TODO(shubhra): add openai tracing options
30
46
  apiKey?: string;
31
47
  baseURL: string;
32
48
  isAzure: boolean;
49
+ azureDeployment?: string;
33
50
  entraToken?: string;
34
51
  apiVersion?: string;
52
+ maxSessionDuration: number;
53
+ // reset the connection after this many milliseconds if provided
54
+ connOptions: APIConnectOptions;
35
55
  }
36
56
 
37
- export interface RealtimeResponse {
38
- id: string;
39
- status: api_proto.ResponseStatus;
40
- statusDetails: api_proto.ResponseStatusDetails | null;
41
- usage: api_proto.ModelUsage | null;
42
- output: RealtimeOutput[];
43
- doneFut: Future;
44
- createdTimestamp: number;
45
- firstTokenTimestamp?: number;
57
+ interface MessageGeneration {
58
+ messageId: string;
59
+ textChannel: stream.StreamChannel<string>;
60
+ audioChannel: stream.StreamChannel<AudioFrame>;
61
+ audioTranscript: string;
46
62
  }
47
63
 
48
- export interface RealtimeOutput {
49
- responseId: string;
50
- itemId: string;
51
- outputIndex: number;
52
- role: api_proto.Role;
53
- type: 'message' | 'function_call';
54
- content: RealtimeContent[];
55
- doneFut: Future;
64
+ interface ResponseGeneration {
65
+ messageChannel: stream.StreamChannel<llm.MessageGeneration>;
66
+ functionChannel: stream.StreamChannel<llm.FunctionCall>;
67
+ messages: Map<string, MessageGeneration>;
68
+
69
+ /** @internal */
70
+ _doneFut: Future;
71
+ /** @internal */
72
+ _createdTimestamp: number;
73
+ /** @internal */
74
+ _firstTokenTimestamp?: number;
56
75
  }
57
76
 
58
- export interface RealtimeContent {
59
- responseId: string;
60
- itemId: string;
61
- outputIndex: number;
62
- contentIndex: number;
63
- text: string;
64
- audio: AudioFrame[];
65
- textStream: AsyncIterableQueue<string>;
66
- audioStream: AsyncIterableQueue<AudioFrame>;
67
- toolCalls: RealtimeToolCall[];
68
- contentType: api_proto.Modality;
77
+ class CreateResponseHandle {
78
+ instructions?: string;
79
+ doneFut: Future<llm.GenerationCreatedEvent>;
80
+ // TODO(shubhra): add timeout
81
+ constructor({ instructions }: { instructions?: string }) {
82
+ this.instructions = instructions;
83
+ this.doneFut = new Future();
84
+ }
69
85
  }
70
86
 
71
- export interface RealtimeToolCall {
72
- name: string;
73
- arguments: string;
74
- toolCallID: string;
75
- }
87
+ // default values got from a "default" session from their API
88
+ const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
89
+ const DEFAULT_TEMPERATURE = 0.8;
90
+ const DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
91
+ type: 'server_vad',
92
+ threshold: 0.5,
93
+ prefix_padding_ms: 300,
94
+ silence_duration_ms: 200,
95
+ create_response: true,
96
+ interrupt_response: true,
97
+ };
98
+ const DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
99
+ model: 'gpt-4o-mini-transcribe',
100
+ };
101
+ const DEFAULT_TOOL_CHOICE: llm.ToolChoice = 'auto';
102
+ const DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS: number | 'inf' = 'inf';
103
+
104
+ const AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
105
+ model: 'whisper-1',
106
+ };
107
+
108
+ const AZURE_DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
109
+ type: 'server_vad',
110
+ threshold: 0.5,
111
+ prefix_padding_ms: 300,
112
+ silence_duration_ms: 200,
113
+ create_response: true,
114
+ };
115
+
116
+ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1000; // 20 minutes
117
+
118
+ const DEFAULT_REALTIME_MODEL_OPTIONS = {
119
+ model: 'gpt-4o-realtime-preview',
120
+ voice: 'alloy',
121
+ temperature: DEFAULT_TEMPERATURE,
122
+ inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
123
+ turnDetection: DEFAULT_TURN_DETECTION,
124
+ toolChoice: DEFAULT_TOOL_CHOICE,
125
+ maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
126
+ maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
127
+ connOptions: DEFAULT_API_CONNECT_OPTIONS,
128
+ };
129
+ export class RealtimeModel extends llm.RealtimeModel {
130
+ sampleRate = api_proto.SAMPLE_RATE;
131
+ numChannels = api_proto.NUM_CHANNELS;
132
+ inFrameSize = api_proto.IN_FRAME_SIZE;
133
+ outFrameSize = api_proto.OUT_FRAME_SIZE;
76
134
 
77
- export interface InputSpeechTranscriptionCompleted {
78
- itemId: string;
79
- transcript: string;
80
- }
135
+ /** @internal */
136
+ _options: RealtimeOptions;
81
137
 
82
- export interface InputSpeechTranscriptionFailed {
83
- itemId: string;
84
- message: string;
85
- }
138
+ constructor(
139
+ options: {
140
+ model?: string;
141
+ voice?: string;
142
+ temperature?: number;
143
+ toolChoice?: llm.ToolChoice;
144
+ baseURL?: string;
145
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
146
+ // TODO(shubhra): add inputAudioNoiseReduction
147
+ turnDetection?: api_proto.TurnDetectionType | null;
148
+ speed?: number;
149
+ // TODO(shubhra): add openai tracing options
150
+ azureDeployment?: string;
151
+ apiKey?: string;
152
+ entraToken?: string;
153
+ apiVersion?: string;
154
+ maxSessionDuration?: number;
155
+ connOptions?: APIConnectOptions;
156
+ } = {},
157
+ ) {
158
+ super({
159
+ messageTruncation: true,
160
+ turnDetection: options.turnDetection !== null,
161
+ userTranscription: options.inputAudioTranscription !== null,
162
+ autoToolReplyGeneration: false,
163
+ });
86
164
 
87
- export interface InputSpeechStarted {
88
- itemId: string;
89
- }
165
+ const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
90
166
 
91
- export interface InputSpeechCommitted {
92
- itemId: string;
93
- }
167
+ if (options.apiKey === '' && !isAzure) {
168
+ throw new Error(
169
+ 'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
170
+ );
171
+ }
172
+
173
+ const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
174
+
175
+ if (!apiKey && !isAzure) {
176
+ throw new Error(
177
+ 'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
178
+ );
179
+ }
94
180
 
95
- class InputAudioBuffer {
96
- #session: RealtimeSession;
181
+ if (!options.baseURL && isAzure) {
182
+ const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
183
+ if (!azureEndpoint) {
184
+ throw new Error(
185
+ 'Missing Azure endpoint. Please pass base_url or set AZURE_OPENAI_ENDPOINT environment variable.',
186
+ );
187
+ }
188
+ options.baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
189
+ }
97
190
 
98
- constructor(session: RealtimeSession) {
99
- this.#session = session;
191
+ this._options = {
192
+ ...DEFAULT_REALTIME_MODEL_OPTIONS,
193
+ ...options,
194
+ baseURL: options.baseURL || BASE_URL,
195
+ apiKey,
196
+ isAzure,
197
+ model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
198
+ };
100
199
  }
101
200
 
102
- append(frame: AudioFrame) {
103
- this.#session.queueMsg({
104
- type: 'input_audio_buffer.append',
105
- audio: Buffer.from(frame.data.buffer).toString('base64'),
201
+ /**
202
+ * Create a RealtimeModel instance configured for Azure OpenAI Service.
203
+ *
204
+ * @param azureDeployment - The name of your Azure OpenAI deployment.
205
+ * @param azureEndpoint - The endpoint URL for your Azure OpenAI resource. If undefined, will attempt to read from the environment variable AZURE_OPENAI_ENDPOINT.
206
+ * @param apiVersion - API version to use with Azure OpenAI Service. If undefined, will attempt to read from the environment variable OPENAI_API_VERSION.
207
+ * @param apiKey - Azure OpenAI API key. If undefined, will attempt to read from the environment variable AZURE_OPENAI_API_KEY.
208
+ * @param entraToken - Azure Entra authentication token. Required if not using API key authentication.
209
+ * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
210
+ * @param voice - Voice setting for audio outputs. Defaults to "alloy".
211
+ * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
212
+ * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see AZURE_DEFAULT_TURN_DETECTION.
213
+ * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
214
+ * @param speed - Speed of the audio output. Defaults to 1.0.
215
+ * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
216
+ * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
217
+ *
218
+ * @returns A RealtimeModel instance configured for Azure OpenAI Service.
219
+ *
220
+ * @throws Error if required Azure parameters are missing or invalid.
221
+ */
222
+ static withAzure({
223
+ azureDeployment,
224
+ azureEndpoint,
225
+ apiVersion,
226
+ apiKey,
227
+ entraToken,
228
+ baseURL,
229
+ voice = 'alloy',
230
+ inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
231
+ turnDetection = AZURE_DEFAULT_TURN_DETECTION,
232
+ temperature = 0.8,
233
+ speed,
234
+ }: {
235
+ azureDeployment: string;
236
+ azureEndpoint?: string;
237
+ apiVersion?: string;
238
+ apiKey?: string;
239
+ entraToken?: string;
240
+ baseURL?: string;
241
+ voice?: string;
242
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
243
+ // TODO(shubhra): add inputAudioNoiseReduction
244
+ turnDetection?: api_proto.TurnDetectionType;
245
+ temperature?: number;
246
+ speed?: number;
247
+ }) {
248
+ apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
249
+ if (!apiKey && !entraToken) {
250
+ throw new Error(
251
+ 'Missing credentials. Please pass one of `apiKey`, `entraToken`, or the `AZURE_OPENAI_API_KEY` environment variable.',
252
+ );
253
+ }
254
+
255
+ apiVersion = apiVersion || process.env.OPENAI_API_VERSION;
256
+ if (!apiVersion) {
257
+ throw new Error(
258
+ 'Must provide either the `apiVersion` argument or the `OPENAI_API_VERSION` environment variable',
259
+ );
260
+ }
261
+
262
+ if (!baseURL) {
263
+ azureEndpoint = azureEndpoint || process.env.AZURE_OPENAI_ENDPOINT;
264
+ if (!azureEndpoint) {
265
+ throw new Error(
266
+ 'Missing Azure endpoint. Please pass the `azure_endpoint` parameter or set the `AZURE_OPENAI_ENDPOINT` environment variable.',
267
+ );
268
+ }
269
+ baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
270
+ }
271
+
272
+ return new RealtimeModel({
273
+ voice,
274
+ inputAudioTranscription,
275
+ turnDetection,
276
+ temperature,
277
+ speed,
278
+ apiKey,
279
+ azureDeployment,
280
+ apiVersion,
281
+ entraToken,
282
+ baseURL,
106
283
  });
107
284
  }
108
285
 
109
- clear() {
110
- this.#session.queueMsg({
111
- type: 'input_audio_buffer.clear',
112
- });
286
+ session() {
287
+ return new RealtimeSession(this);
113
288
  }
114
289
 
115
- commit() {
116
- this.#session.queueMsg({
117
- type: 'input_audio_buffer.commit',
118
- });
290
+ async close() {
291
+ return;
119
292
  }
120
293
  }
121
294
 
122
- class ConversationItem {
123
- #session: RealtimeSession;
124
- #logger = log();
125
-
126
- constructor(session: RealtimeSession) {
127
- this.#session = session;
128
- }
295
+ function processBaseURL({
296
+ baseURL,
297
+ model,
298
+ isAzure = false,
299
+ azureDeployment,
300
+ apiVersion,
301
+ }: {
302
+ baseURL: string;
303
+ model: string;
304
+ isAzure: boolean;
305
+ azureDeployment?: string;
306
+ apiVersion?: string;
307
+ }): string {
308
+ const url = new URL([baseURL, 'realtime'].join('/'));
129
309
 
130
- truncate(itemId: string, contentIndex: number, audioEnd: number) {
131
- this.#session.queueMsg({
132
- type: 'conversation.item.truncate',
133
- item_id: itemId,
134
- content_index: contentIndex,
135
- audio_end_ms: audioEnd,
136
- });
310
+ if (url.protocol === 'https:') {
311
+ url.protocol = 'wss:';
137
312
  }
138
313
 
139
- delete(itemId: string) {
140
- this.#session.queueMsg({
141
- type: 'conversation.item.delete',
142
- item_id: itemId,
143
- });
314
+ // ensure "/realtime" is added if the path is empty OR "/v1"
315
+ if (!url.pathname || ['', '/v1', '/openai'].includes(url.pathname.replace(/\/$/, ''))) {
316
+ url.pathname = url.pathname.replace(/\/$/, '') + '/realtime';
317
+ } else {
318
+ url.pathname = url.pathname.replace(/\/$/, '');
144
319
  }
145
320
 
146
- create(message: llm.ChatMessage, previousItemId?: string): void {
147
- if (!message.content) {
148
- return;
321
+ const queryParams: Record<string, string> = {};
322
+ if (isAzure) {
323
+ if (apiVersion) {
324
+ queryParams['api-version'] = apiVersion;
149
325
  }
326
+ if (azureDeployment) {
327
+ queryParams['deployment'] = azureDeployment;
328
+ }
329
+ } else {
330
+ queryParams['model'] = model;
331
+ }
150
332
 
151
- let event: api_proto.ConversationItemCreateEvent;
333
+ for (const [key, value] of Object.entries(queryParams)) {
334
+ url.searchParams.set(key, value);
335
+ }
152
336
 
153
- if (message.toolCallId) {
154
- if (typeof message.content !== 'string') {
155
- throw new TypeError('message.content must be a string');
156
- }
337
+ return url.toString();
338
+ }
157
339
 
158
- event = {
159
- type: 'conversation.item.create',
160
- previous_item_id: previousItemId,
161
- item: {
162
- type: 'function_call_output',
163
- call_id: message.toolCallId,
164
- output: message.content,
165
- },
166
- };
167
- } else {
168
- let content = message.content;
169
- if (!Array.isArray(content)) {
170
- content = [content];
171
- }
340
+ /**
341
+ * A session for the OpenAI Realtime API.
342
+ *
343
+ * This class is used to interact with the OpenAI Realtime API.
344
+ * It is responsible for sending events to the OpenAI Realtime API and receiving events from it.
345
+ *
346
+ * It exposes two more events:
347
+ * - openai_server_event_received: expose the raw server events from the OpenAI Realtime API
348
+ * - openai_client_event_queued: expose the raw client events sent to the OpenAI Realtime API
349
+ */
350
+ export class RealtimeSession extends llm.RealtimeSession {
351
+ private _tools: llm.ToolContext = {};
352
+ private remoteChatCtx: llm.RemoteChatContext = new llm.RemoteChatContext();
353
+ private messageChannel = new Queue<api_proto.ClientEvent>();
354
+ private inputResampler?: AudioResampler;
355
+ private instructions?: string;
356
+ private oaiRealtimeModel: RealtimeModel;
357
+ private currentGeneration?: ResponseGeneration;
358
+ private responseCreatedFutures: { [id: string]: CreateResponseHandle } = {};
359
+
360
+ private textModeRecoveryRetries: number = 0;
361
+
362
+ private itemCreateFutures: { [id: string]: Future } = {};
363
+ private itemDeleteFutures: { [id: string]: Future } = {};
364
+
365
+ private updateChatCtxLock = new Mutex();
366
+ private updateFuncCtxLock = new Mutex();
367
+
368
+ // 100ms chunks
369
+ private bstream = new AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, SAMPLE_RATE / 10);
370
+
371
+ private pushedDurationMs: number = 0;
172
372
 
173
- if (message.role === llm.ChatRole.USER) {
174
- const contents: (api_proto.InputTextContent | api_proto.InputAudioContent)[] = [];
175
- for (const c of content) {
176
- if (typeof c === 'string') {
177
- contents.push({
178
- type: 'input_text',
179
- text: c,
180
- });
181
- } else if (
182
- // typescript type guard for determining ChatAudio vs ChatImage
183
- ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
184
- return (c as llm.ChatAudio).frame !== undefined;
185
- })(c)
186
- ) {
187
- contents.push({
188
- type: 'input_audio',
189
- audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'),
190
- });
191
- }
192
- }
373
+ #logger = log();
374
+ #task: Promise<void>;
375
+ #closed = false;
193
376
 
194
- event = {
195
- type: 'conversation.item.create',
196
- previous_item_id: previousItemId,
197
- item: {
198
- type: 'message',
199
- role: 'user',
200
- content: contents,
201
- },
202
- };
203
- } else if (message.role === llm.ChatRole.ASSISTANT) {
204
- const contents: api_proto.TextContent[] = [];
205
- for (const c of content) {
206
- if (typeof c === 'string') {
207
- contents.push({
208
- type: 'text',
209
- text: c,
210
- });
211
- } else if (
212
- // typescript type guard for determining ChatAudio vs ChatImage
213
- ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
214
- return (c as llm.ChatAudio).frame !== undefined;
215
- })(c)
216
- ) {
217
- this.#logger.warn('audio content in assistant message is not supported');
218
- }
219
- }
377
+ constructor(realtimeModel: RealtimeModel) {
378
+ super(realtimeModel);
220
379
 
221
- event = {
222
- type: 'conversation.item.create',
223
- previous_item_id: previousItemId,
224
- item: {
225
- type: 'message',
226
- role: 'assistant',
227
- content: contents,
228
- },
229
- };
230
- } else if (message.role === llm.ChatRole.SYSTEM) {
231
- const contents: api_proto.InputTextContent[] = [];
232
- for (const c of content) {
233
- if (typeof c === 'string') {
234
- contents.push({
235
- type: 'input_text',
236
- text: c,
237
- });
238
- } else if (
239
- // typescript type guard for determining ChatAudio vs ChatImage
240
- ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
241
- return (c as llm.ChatAudio).frame !== undefined;
242
- })(c)
243
- ) {
244
- this.#logger.warn('audio content in system message is not supported');
245
- }
246
- }
380
+ this.oaiRealtimeModel = realtimeModel;
247
381
 
248
- event = {
249
- type: 'conversation.item.create',
250
- previous_item_id: previousItemId,
251
- item: {
252
- type: 'message',
253
- role: 'system',
254
- content: contents,
255
- },
256
- };
257
- } else {
258
- this.#logger
259
- .child({ message })
260
- .warn('chat message is not supported inside the realtime API');
261
- return;
262
- }
263
- }
382
+ this.#task = this.#mainTask();
264
383
 
265
- this.#session.queueMsg(event);
384
+ this.sendEvent(this.createSessionUpdateEvent());
266
385
  }
267
- }
268
-
269
- class Conversation {
270
- #session: RealtimeSession;
271
386
 
272
- constructor(session: RealtimeSession) {
273
- this.#session = session;
387
+ sendEvent(command: api_proto.ClientEvent): void {
388
+ this.messageChannel.put(command);
274
389
  }
275
390
 
276
- get item(): ConversationItem {
277
- return new ConversationItem(this.#session);
391
+ private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
392
+ return {
393
+ type: 'session.update',
394
+ session: {
395
+ model: this.oaiRealtimeModel._options.model,
396
+ voice: this.oaiRealtimeModel._options.voice,
397
+ input_audio_format: 'pcm16',
398
+ output_audio_format: 'pcm16',
399
+ modalities: ['text', 'audio'],
400
+ turn_detection: this.oaiRealtimeModel._options.turnDetection,
401
+ input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
402
+ // TODO(shubhra): add inputAudioNoiseReduction
403
+ temperature: this.oaiRealtimeModel._options.temperature,
404
+ tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
405
+ max_response_output_tokens:
406
+ this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity
407
+ ? 'inf'
408
+ : this.oaiRealtimeModel._options.maxResponseOutputTokens,
409
+ // TODO(shubhra): add tracing options
410
+ instructions: this.instructions,
411
+ speed: this.oaiRealtimeModel._options.speed,
412
+ },
413
+ };
278
414
  }
279
- }
280
-
281
- class Response {
282
- #session: RealtimeSession;
283
415
 
284
- constructor(session: RealtimeSession) {
285
- this.#session = session;
416
+ get chatCtx() {
417
+ return this.remoteChatCtx.toChatCtx();
286
418
  }
287
419
 
288
- create() {
289
- this.#session.queueMsg({
290
- type: 'response.create',
291
- });
420
+ get tools() {
421
+ return { ...this._tools } as llm.ToolContext;
292
422
  }
293
423
 
294
- cancel() {
295
- this.#session.queueMsg({
296
- type: 'response.cancel',
297
- });
298
- }
299
- }
424
+ async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
425
+ const unlock = await this.updateChatCtxLock.lock();
426
+ const events = this.createChatCtxUpdateEvents(_chatCtx);
427
+ const futures: Future<void>[] = [];
300
428
 
301
- interface ContentPtr {
302
- response_id: string;
303
- output_index: number;
304
- content_index: number;
305
- }
429
+ for (const event of events) {
430
+ const future = new Future<void>();
431
+ futures.push(future);
306
432
 
307
- export class RealtimeModel extends multimodal.RealtimeModel {
308
- sampleRate = api_proto.SAMPLE_RATE;
309
- numChannels = api_proto.NUM_CHANNELS;
310
- inFrameSize = api_proto.IN_FRAME_SIZE;
311
- outFrameSize = api_proto.OUT_FRAME_SIZE;
433
+ if (event.type === 'conversation.item.create') {
434
+ this.itemCreateFutures[event.item.id] = future;
435
+ } else if (event.type == 'conversation.item.delete') {
436
+ this.itemDeleteFutures[event.item_id] = future;
437
+ }
312
438
 
313
- #defaultOpts: ModelOptions;
314
- #sessions: RealtimeSession[] = [];
439
+ this.sendEvent(event);
440
+ }
315
441
 
316
- static withAzure({
317
- baseURL,
318
- azureDeployment,
319
- apiVersion = '2024-10-01-preview',
320
- apiKey = undefined,
321
- entraToken = undefined,
322
- instructions = '',
323
- modalities = ['text', 'audio'],
324
- voice = 'alloy',
325
- inputAudioFormat = 'pcm16',
326
- outputAudioFormat = 'pcm16',
327
- inputAudioTranscription = { model: 'whisper-1' },
328
- turnDetection = { type: 'server_vad' },
329
- temperature = 0.8,
330
- maxResponseOutputTokens = Infinity,
331
- }: {
332
- baseURL: string;
333
- azureDeployment: string;
334
- apiVersion?: string;
335
- apiKey?: string;
336
- entraToken?: string;
337
- instructions?: string;
338
- modalities?: ['text', 'audio'] | ['text'];
339
- voice?: api_proto.Voice;
340
- inputAudioFormat?: api_proto.AudioFormat;
341
- outputAudioFormat?: api_proto.AudioFormat;
342
- inputAudioTranscription?: api_proto.InputAudioTranscription;
343
- turnDetection?: api_proto.TurnDetectionType;
344
- temperature?: number;
345
- maxResponseOutputTokens?: number;
346
- }) {
347
- return new RealtimeModel({
348
- isAzure: true,
349
- baseURL: new URL('openai', baseURL).toString(),
350
- model: azureDeployment,
351
- apiVersion,
352
- apiKey,
353
- entraToken,
354
- instructions,
355
- modalities,
356
- voice,
357
- inputAudioFormat,
358
- outputAudioFormat,
359
- inputAudioTranscription,
360
- turnDetection,
361
- temperature,
362
- maxResponseOutputTokens,
363
- });
364
- }
442
+ if (futures.length === 0) {
443
+ unlock();
444
+ return;
445
+ }
365
446
 
366
- constructor({
367
- modalities = ['text', 'audio'],
368
- instructions = '',
369
- voice = 'alloy',
370
- inputAudioFormat = 'pcm16',
371
- outputAudioFormat = 'pcm16',
372
- inputAudioTranscription = { model: 'whisper-1' },
373
- turnDetection = { type: 'server_vad' },
374
- temperature = 0.8,
375
- maxResponseOutputTokens = Infinity,
376
- model = 'gpt-4o-realtime-preview-2024-10-01',
377
- apiKey = process.env.OPENAI_API_KEY || '',
378
- baseURL = api_proto.BASE_URL,
379
- // used for microsoft
380
- isAzure = false,
381
- apiVersion = undefined,
382
- entraToken = undefined,
383
- }: {
384
- modalities?: ['text', 'audio'] | ['text'];
385
- instructions?: string;
386
- voice?: api_proto.Voice;
387
- inputAudioFormat?: api_proto.AudioFormat;
388
- outputAudioFormat?: api_proto.AudioFormat;
389
- inputAudioTranscription?: api_proto.InputAudioTranscription;
390
- turnDetection?: api_proto.TurnDetectionType;
391
- temperature?: number;
392
- maxResponseOutputTokens?: number;
393
- model?: api_proto.Model;
394
- apiKey?: string;
395
- baseURL?: string;
396
- isAzure?: boolean;
397
- apiVersion?: string;
398
- entraToken?: string;
399
- }) {
400
- super();
447
+ try {
448
+ // wait for futures to resolve or timeout
449
+ await Promise.race([
450
+ Promise.all(futures),
451
+ delay(5000).then(() => {
452
+ throw new Error('Chat ctx update events timed out');
453
+ }),
454
+ ]);
455
+ } catch (e) {
456
+ this.#logger.error((e as Error).message);
457
+ throw e;
458
+ } finally {
459
+ unlock();
460
+ }
461
+ }
401
462
 
402
- if (apiKey === '' && !(isAzure && entraToken)) {
403
- throw new Error(
404
- 'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
463
+ private createChatCtxUpdateEvents(
464
+ chatCtx: llm.ChatContext,
465
+ addMockAudio: boolean = false,
466
+ ): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
467
+ const newChatCtx = chatCtx.copy();
468
+ if (addMockAudio) {
469
+ newChatCtx.items.push(createMockAudioItem());
470
+ } else {
471
+ // clean up existing mock audio items
472
+ newChatCtx.items = newChatCtx.items.filter(
473
+ (item) => !item.id.startsWith(MOCK_AUDIO_ID_PREFIX),
405
474
  );
406
475
  }
407
476
 
408
- this.#defaultOpts = {
409
- modalities,
410
- instructions,
411
- voice,
412
- inputAudioFormat,
413
- outputAudioFormat,
414
- inputAudioTranscription,
415
- turnDetection,
416
- temperature,
417
- maxResponseOutputTokens,
418
- model,
419
- apiKey,
420
- baseURL,
421
- isAzure,
422
- apiVersion,
423
- entraToken,
424
- };
425
- }
477
+ const events: (
478
+ | api_proto.ConversationItemCreateEvent
479
+ | api_proto.ConversationItemDeleteEvent
480
+ )[] = [];
481
+
482
+ const diffOps = llm.computeChatCtxDiff(this.chatCtx, newChatCtx);
483
+ for (const op of diffOps.toRemove) {
484
+ events.push({
485
+ type: 'conversation.item.delete',
486
+ item_id: op,
487
+ event_id: shortuuid('chat_ctx_delete_'),
488
+ } as api_proto.ConversationItemDeleteEvent);
489
+ }
426
490
 
427
- get sessions(): RealtimeSession[] {
428
- return this.#sessions;
491
+ for (const [previousId, id] of diffOps.toCreate) {
492
+ const chatItem = newChatCtx.getById(id);
493
+ if (!chatItem) {
494
+ throw new Error(`Chat item ${id} not found`);
495
+ }
496
+ events.push({
497
+ type: 'conversation.item.create',
498
+ item: livekitItemToOpenAIItem(chatItem),
499
+ previous_item_id: previousId ?? undefined,
500
+ event_id: shortuuid('chat_ctx_create_'),
501
+ } as api_proto.ConversationItemCreateEvent);
502
+ }
503
+ return events;
429
504
  }
430
505
 
431
- session({
432
- fncCtx,
433
- chatCtx,
434
- modalities = this.#defaultOpts.modalities,
435
- instructions = this.#defaultOpts.instructions,
436
- voice = this.#defaultOpts.voice,
437
- inputAudioFormat = this.#defaultOpts.inputAudioFormat,
438
- outputAudioFormat = this.#defaultOpts.outputAudioFormat,
439
- inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
440
- turnDetection = this.#defaultOpts.turnDetection,
441
- temperature = this.#defaultOpts.temperature,
442
- maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens,
443
- }: {
444
- fncCtx?: llm.FunctionContext;
445
- chatCtx?: llm.ChatContext;
446
- modalities?: ['text', 'audio'] | ['text'];
447
- instructions?: string;
448
- voice?: api_proto.Voice;
449
- inputAudioFormat?: api_proto.AudioFormat;
450
- outputAudioFormat?: api_proto.AudioFormat;
451
- inputAudioTranscription?: api_proto.InputAudioTranscription | null;
452
- turnDetection?: api_proto.TurnDetectionType | null;
453
- temperature?: number;
454
- maxResponseOutputTokens?: number;
455
- }): RealtimeSession {
456
- const opts: ModelOptions = {
457
- modalities,
458
- instructions,
459
- voice,
460
- inputAudioFormat,
461
- outputAudioFormat,
462
- inputAudioTranscription,
463
- turnDetection,
464
- temperature,
465
- maxResponseOutputTokens,
466
- model: this.#defaultOpts.model,
467
- apiKey: this.#defaultOpts.apiKey,
468
- baseURL: this.#defaultOpts.baseURL,
469
- isAzure: this.#defaultOpts.isAzure,
470
- apiVersion: this.#defaultOpts.apiVersion,
471
- entraToken: this.#defaultOpts.entraToken,
472
- };
506
+ async updateTools(_tools: llm.ToolContext): Promise<void> {
507
+ const unlock = await this.updateFuncCtxLock.lock();
508
+ const ev = this.createToolsUpdateEvent(_tools);
509
+ this.sendEvent(ev);
473
510
 
474
- const newSession = new RealtimeSession(opts, {
475
- chatCtx: chatCtx || new llm.ChatContext(),
476
- fncCtx,
477
- });
478
- this.#sessions.push(newSession);
479
- return newSession;
480
- }
511
+ if (!ev.session.tools) {
512
+ throw new Error('Tools are missing in the session update event');
513
+ }
481
514
 
482
- async close() {
483
- await Promise.allSettled(this.#sessions.map((session) => session.close()));
515
+ // TODO(brian): these logics below are noops I think, leaving it here to keep
516
+ // parity with the python but we should remove them later
517
+ const retainedToolNames = new Set(ev.session.tools.map((tool) => tool.name));
518
+ const retainedTools = Object.fromEntries(
519
+ Object.entries(_tools).filter(
520
+ ([name, tool]) => llm.isFunctionTool(tool) && retainedToolNames.has(name),
521
+ ),
522
+ );
523
+
524
+ this._tools = retainedTools as llm.ToolContext;
525
+
526
+ unlock();
484
527
  }
485
- }
486
528
 
487
- export class RealtimeSession extends multimodal.RealtimeSession {
488
- #chatCtx: llm.ChatContext | undefined = undefined;
489
- #fncCtx: llm.FunctionContext | undefined = undefined;
490
- #opts: ModelOptions;
491
- #pendingResponses: { [id: string]: RealtimeResponse } = {};
492
- #sessionId = 'not-connected';
493
- #ws: WebSocket | null = null;
494
- #expiresAt: number | null = null;
495
- #logger = log();
496
- #task: Promise<void>;
497
- #closing = true;
498
- #sendQueue = new Queue<api_proto.ClientEvent>();
529
+ private createToolsUpdateEvent(_tools: llm.ToolContext): api_proto.SessionUpdateEvent {
530
+ const oaiTools: api_proto.Tool[] = [];
499
531
 
500
- constructor(
501
- opts: ModelOptions,
502
- { fncCtx, chatCtx }: { fncCtx?: llm.FunctionContext; chatCtx?: llm.ChatContext },
503
- ) {
504
- super();
505
-
506
- this.#opts = opts;
507
- this.#chatCtx = chatCtx;
508
- this.#fncCtx = fncCtx;
509
-
510
- this.#task = this.#start();
511
-
512
- this.sessionUpdate({
513
- modalities: this.#opts.modalities,
514
- instructions: this.#opts.instructions,
515
- voice: this.#opts.voice,
516
- inputAudioFormat: this.#opts.inputAudioFormat,
517
- outputAudioFormat: this.#opts.outputAudioFormat,
518
- inputAudioTranscription: this.#opts.inputAudioTranscription,
519
- turnDetection: this.#opts.turnDetection,
520
- temperature: this.#opts.temperature,
521
- maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
522
- toolChoice: 'auto',
523
- });
532
+ for (const [name, tool] of Object.entries(_tools)) {
533
+ if (!llm.isFunctionTool(tool)) {
534
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
535
+ continue;
536
+ }
537
+
538
+ const { parameters: toolParameters, description } = tool;
539
+ try {
540
+ const parameters = llm.toJsonSchema(
541
+ toolParameters,
542
+ ) as unknown as api_proto.Tool['parameters'];
543
+
544
+ oaiTools.push({
545
+ name,
546
+ description,
547
+ parameters: parameters,
548
+ type: 'function',
549
+ });
550
+ } catch (e) {
551
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
552
+ continue;
553
+ }
554
+ }
555
+
556
+ return {
557
+ type: 'session.update',
558
+ session: {
559
+ model: this.oaiRealtimeModel._options.model,
560
+ tools: oaiTools,
561
+ },
562
+ event_id: shortuuid('tools_update_'),
563
+ };
524
564
  }
525
565
 
526
- get chatCtx(): llm.ChatContext | undefined {
527
- return this.#chatCtx;
566
+ async updateInstructions(_instructions: string): Promise<void> {
567
+ const eventId = shortuuid('instructions_update_');
568
+ this.sendEvent({
569
+ type: 'session.update',
570
+ session: {
571
+ instructions: _instructions,
572
+ },
573
+ event_id: eventId,
574
+ } as api_proto.SessionUpdateEvent);
575
+ this.instructions = _instructions;
528
576
  }
529
577
 
530
- get fncCtx(): llm.FunctionContext | undefined {
531
- return this.#fncCtx;
578
+ updateOptions({ toolChoice }: { toolChoice?: llm.ToolChoice }): void {
579
+ const options: api_proto.SessionUpdateEvent['session'] = {};
580
+
581
+ this.oaiRealtimeModel._options.toolChoice = toolChoice;
582
+ options.tool_choice = toOaiToolChoice(toolChoice);
583
+
584
+ // TODO(brian): add other options here
585
+
586
+ this.sendEvent({
587
+ type: 'session.update',
588
+ session: options,
589
+ event_id: shortuuid('options_update_'),
590
+ });
532
591
  }
533
592
 
534
- set fncCtx(ctx: llm.FunctionContext | undefined) {
535
- this.#fncCtx = ctx;
593
+ pushAudio(frame: AudioFrame): void {
594
+ for (const f of this.resampleAudio(frame)) {
595
+ for (const nf of this.bstream.write(f.data.buffer)) {
596
+ this.sendEvent({
597
+ type: 'input_audio_buffer.append',
598
+ audio: Buffer.from(nf.data.buffer).toString('base64'),
599
+ } as api_proto.InputAudioBufferAppendEvent);
600
+ // TODO(AJS-102): use frame.durationMs once available in rtc-node
601
+ this.pushedDurationMs += (nf.samplesPerChannel / nf.sampleRate) * 1000;
602
+ }
603
+ }
536
604
  }
537
605
 
538
- get conversation(): Conversation {
539
- return new Conversation(this);
606
+ async commitAudio(): Promise<void> {
607
+ if (this.pushedDurationMs > 100) {
608
+ // OpenAI requires at least 100ms of audio
609
+ this.sendEvent({
610
+ type: 'input_audio_buffer.commit',
611
+ } as api_proto.InputAudioBufferCommitEvent);
612
+ this.pushedDurationMs = 0;
613
+ }
540
614
  }
541
615
 
542
- get inputAudioBuffer(): InputAudioBuffer {
543
- return new InputAudioBuffer(this);
616
+ async clearAudio(): Promise<void> {
617
+ this.sendEvent({
618
+ type: 'input_audio_buffer.clear',
619
+ } as api_proto.InputAudioBufferClearEvent);
620
+ this.pushedDurationMs = 0;
544
621
  }
545
622
 
546
- get response(): Response {
547
- return new Response(this);
623
+ async generateReply(instructions?: string): Promise<llm.GenerationCreatedEvent> {
624
+ const handle = this.createResponse({ instructions, userInitiated: true });
625
+ this.textModeRecoveryRetries = 0;
626
+ return handle.doneFut.await;
548
627
  }
549
628
 
550
- get expiration(): number {
551
- if (!this.#expiresAt) {
552
- throw new Error('session not started');
553
- }
554
- return this.#expiresAt * 1000;
629
+ async interrupt(): Promise<void> {
630
+ this.sendEvent({
631
+ type: 'response.cancel',
632
+ } as api_proto.ResponseCancelEvent);
555
633
  }
556
634
 
557
- queueMsg(command: api_proto.ClientEvent): void {
558
- this.#sendQueue.put(command);
635
+ async truncate(_options: { messageId: string; audioEndMs: number }): Promise<void> {
636
+ this.sendEvent({
637
+ type: 'conversation.item.truncate',
638
+ content_index: 0,
639
+ item_id: _options.messageId,
640
+ audio_end_ms: _options.audioEndMs,
641
+ } as api_proto.ConversationItemTruncateEvent);
559
642
  }
560
643
 
561
644
  /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
@@ -588,646 +671,872 @@ export class RealtimeSession extends multimodal.RealtimeSession {
588
671
  return untypedEvent;
589
672
  }
590
673
 
591
- sessionUpdate({
592
- modalities = this.#opts.modalities,
593
- instructions = this.#opts.instructions,
594
- voice = this.#opts.voice,
595
- inputAudioFormat = this.#opts.inputAudioFormat,
596
- outputAudioFormat = this.#opts.outputAudioFormat,
597
- inputAudioTranscription = this.#opts.inputAudioTranscription,
598
- turnDetection = this.#opts.turnDetection,
599
- temperature = this.#opts.temperature,
600
- maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
601
- toolChoice = 'auto',
602
- selectedTools = Object.keys(this.#fncCtx || {}),
603
- }: {
604
- modalities: ['text', 'audio'] | ['text'];
605
- instructions?: string;
606
- voice?: api_proto.Voice;
607
- inputAudioFormat?: api_proto.AudioFormat;
608
- outputAudioFormat?: api_proto.AudioFormat;
609
- inputAudioTranscription?: api_proto.InputAudioTranscription | null;
610
- turnDetection?: api_proto.TurnDetectionType | null;
611
- temperature?: number;
612
- maxResponseOutputTokens?: number;
613
- toolChoice?: api_proto.ToolChoice;
614
- selectedTools?: string[];
615
- }) {
616
- this.#opts = {
617
- modalities,
618
- instructions,
619
- voice,
620
- inputAudioFormat,
621
- outputAudioFormat,
622
- inputAudioTranscription,
623
- turnDetection,
624
- temperature,
625
- maxResponseOutputTokens,
626
- model: this.#opts.model,
627
- apiKey: this.#opts.apiKey,
628
- baseURL: this.#opts.baseURL,
629
- isAzure: this.#opts.isAzure,
630
- apiVersion: this.#opts.apiVersion,
631
- entraToken: this.#opts.entraToken,
674
+ private async createWsConn(): Promise<WebSocket> {
675
+ const headers: Record<string, string> = {
676
+ 'User-Agent': 'LiveKit-Agents-JS',
632
677
  };
633
678
 
634
- const tools = this.#fncCtx
635
- ? Object.entries(this.#fncCtx)
636
- .filter(([name]) => selectedTools.includes(name))
637
- .map(([name, func]) => ({
638
- type: 'function' as const,
639
- name,
640
- description: func.description,
641
- parameters:
642
- // don't format parameters if they are raw openai params
643
- func.parameters.type == ('object' as const)
644
- ? func.parameters
645
- : llm.oaiParams(func.parameters),
646
- }))
647
- : [];
648
-
649
- const sessionUpdateEvent: api_proto.SessionUpdateEvent = {
650
- type: 'session.update',
651
- session: {
652
- modalities: this.#opts.modalities,
653
- instructions: this.#opts.instructions,
654
- voice: this.#opts.voice,
655
- input_audio_format: this.#opts.inputAudioFormat,
656
- output_audio_format: this.#opts.outputAudioFormat,
657
- input_audio_transcription: this.#opts.inputAudioTranscription,
658
- turn_detection: this.#opts.turnDetection,
659
- temperature: this.#opts.temperature,
660
- max_response_output_tokens:
661
- this.#opts.maxResponseOutputTokens === Infinity
662
- ? 'inf'
663
- : this.#opts.maxResponseOutputTokens,
664
- tools,
665
- tool_choice: toolChoice,
666
- },
667
- };
668
-
669
- if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
670
- // microsoft doesn't support inf for max_response_output_tokens, but accepts no args
671
- sessionUpdateEvent.session.max_response_output_tokens = undefined;
672
- }
679
+ if (this.oaiRealtimeModel._options.isAzure) {
680
+ // Microsoft API has two ways of authentication
681
+ // 1. Entra token set as `Bearer` token
682
+ // 2. API key set as `api_key` header (also accepts query string)
683
+ if (this.oaiRealtimeModel._options.entraToken) {
684
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.entraToken}`;
685
+ } else if (this.oaiRealtimeModel._options.apiKey) {
686
+ headers['api-key'] = this.oaiRealtimeModel._options.apiKey;
687
+ } else {
688
+ throw new Error('Microsoft API key or entraToken is required');
689
+ }
690
+ } else {
691
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
692
+ headers['OpenAI-Beta'] = 'realtime=v1';
693
+ }
673
694
 
674
- this.queueMsg(sessionUpdateEvent);
675
- }
695
+ const url = processBaseURL({
696
+ baseURL: this.oaiRealtimeModel._options.baseURL,
697
+ model: this.oaiRealtimeModel._options.model,
698
+ isAzure: this.oaiRealtimeModel._options.isAzure,
699
+ apiVersion: this.oaiRealtimeModel._options.apiVersion,
700
+ azureDeployment: this.oaiRealtimeModel._options.azureDeployment,
701
+ });
676
702
 
677
- /** Create an empty audio message with the given duration. */
678
- #createEmptyUserAudioMessage(duration: number): llm.ChatMessage {
679
- const samples = duration * api_proto.SAMPLE_RATE;
680
- return new llm.ChatMessage({
681
- role: llm.ChatRole.USER,
682
- content: {
683
- frame: new AudioFrame(
684
- new Int16Array(samples * api_proto.NUM_CHANNELS),
685
- api_proto.SAMPLE_RATE,
686
- api_proto.NUM_CHANNELS,
687
- samples,
688
- ),
689
- },
703
+ this.#logger.debug(`Connecting to OpenAI Realtime API at ${url}`);
704
+
705
+ return new Promise((resolve, reject) => {
706
+ const ws = new WebSocket(url, { headers });
707
+ let waiting = true;
708
+
709
+ const timeout = setTimeout(() => {
710
+ ws.close();
711
+ reject(new Error('WebSocket connection timeout'));
712
+ }, this.oaiRealtimeModel._options.connOptions.timeoutMs);
713
+
714
+ ws.once('open', () => {
715
+ if (!waiting) return;
716
+ waiting = false;
717
+ clearTimeout(timeout);
718
+ resolve(ws);
719
+ });
720
+
721
+ ws.once('close', () => {
722
+ if (!waiting) return;
723
+ waiting = false;
724
+ clearTimeout(timeout);
725
+ reject(new Error('OpenAI Realtime API connection closed'));
726
+ });
690
727
  });
691
728
  }
692
729
 
693
- /**
694
- * Try to recover from a text response to audio mode.
695
- *
696
- * @remarks
697
- * Sometimes the OpenAI Realtime API returns text instead of audio responses.
698
- * This method tries to recover from this by requesting a new response after deleting the text
699
- * response and creating an empty user audio message.
700
- */
701
- recoverFromTextResponse(itemId: string) {
702
- if (itemId) {
703
- this.conversation.item.delete(itemId);
704
- }
705
- this.conversation.item.create(this.#createEmptyUserAudioMessage(1));
706
- this.response.create();
707
- }
730
+ async #mainTask(): Promise<void> {
731
+ let reconnecting = false;
732
+ let numRetries = 0;
733
+ let wsConn: WebSocket | null = null;
734
+ const maxRetries = this.oaiRealtimeModel._options.connOptions.maxRetry;
708
735
 
709
- #start(): Promise<void> {
710
- return new Promise(async (resolve, reject) => {
711
- const headers: Record<string, string> = {
712
- 'User-Agent': 'LiveKit-Agents-JS',
713
- };
714
- if (this.#opts.isAzure) {
715
- // Microsoft API has two ways of authentication
716
- // 1. Entra token set as `Bearer` token
717
- // 2. API key set as `api_key` header (also accepts query string)
718
- if (this.#opts.entraToken) {
719
- headers.Authorization = `Bearer ${this.#opts.entraToken}`;
720
- } else if (this.#opts.apiKey) {
721
- headers['api-key'] = this.#opts.apiKey;
722
- } else {
723
- reject(new Error('Microsoft API key or entraToken is required'));
724
- return;
725
- }
726
- } else {
727
- headers.Authorization = `Bearer ${this.#opts.apiKey}`;
728
- headers['OpenAI-Beta'] = 'realtime=v1';
729
- }
730
- const url = new URL([this.#opts.baseURL, 'realtime'].join('/'));
731
- if (url.protocol === 'https:') {
732
- url.protocol = 'wss:';
733
- }
736
+ const reconnect = async () => {
737
+ this.#logger.debug(
738
+ {
739
+ maxSessionDuration: this.oaiRealtimeModel._options.maxSessionDuration,
740
+ },
741
+ 'Reconnecting to OpenAI Realtime API',
742
+ );
734
743
 
735
- // Construct query parameters
736
- const queryParams: Record<string, string> = {};
737
- if (this.#opts.isAzure) {
738
- queryParams['api-version'] = this.#opts.apiVersion ?? '2024-10-01-preview';
739
- queryParams['deployment'] = this.#opts.model;
740
- } else {
741
- queryParams['model'] = this.#opts.model;
742
- }
744
+ const events: api_proto.ClientEvent[] = [];
743
745
 
744
- for (const [key, value] of Object.entries(queryParams)) {
745
- url.searchParams.set(key, value);
746
+ // options and instructions
747
+ events.push(this.createSessionUpdateEvent());
748
+
749
+ // tools
750
+ if (Object.keys(this._tools).length > 0) {
751
+ events.push(this.createToolsUpdateEvent(this._tools));
746
752
  }
747
753
 
748
- console.debug('Connecting to OpenAI Realtime API at ', url.toString());
749
- this.#ws = new WebSocket(url.toString(), {
750
- headers: headers,
754
+ // chat context
755
+ const chatCtx = this.chatCtx.copy({
756
+ excludeFunctionCall: true,
757
+ excludeInstructions: true,
758
+ excludeEmptyMessage: true,
751
759
  });
752
760
 
753
- this.#ws.onerror = (error) => {
754
- reject(new Error('OpenAI Realtime WebSocket error: ' + error.message));
755
- };
756
-
757
- await once(this.#ws, 'open');
758
- this.#closing = false;
761
+ const oldChatCtx = this.remoteChatCtx;
762
+ this.remoteChatCtx = new llm.RemoteChatContext();
763
+ events.push(...this.createChatCtxUpdateEvents(chatCtx));
759
764
 
760
- this.#ws.onmessage = (message) => {
761
- const event: api_proto.ServerEvent = JSON.parse(message.data as string);
762
- this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
763
- switch (event.type) {
764
- case 'error':
765
- this.#handleError(event);
766
- break;
767
- case 'session.created':
768
- this.#handleSessionCreated(event);
769
- break;
770
- case 'session.updated':
771
- this.#handleSessionUpdated(event);
772
- break;
773
- case 'conversation.created':
774
- this.#handleConversationCreated(event);
775
- break;
776
- case 'input_audio_buffer.committed':
777
- this.#handleInputAudioBufferCommitted(event);
778
- break;
779
- case 'input_audio_buffer.cleared':
780
- this.#handleInputAudioBufferCleared(event);
781
- break;
782
- case 'input_audio_buffer.speech_started':
783
- this.#handleInputAudioBufferSpeechStarted(event);
784
- break;
785
- case 'input_audio_buffer.speech_stopped':
786
- this.#handleInputAudioBufferSpeechStopped(event);
787
- break;
788
- case 'conversation.item.created':
789
- this.#handleConversationItemCreated(event);
790
- break;
791
- case 'conversation.item.input_audio_transcription.completed':
792
- this.#handleConversationItemInputAudioTranscriptionCompleted(event);
793
- break;
794
- case 'conversation.item.input_audio_transcription.failed':
795
- this.#handleConversationItemInputAudioTranscriptionFailed(event);
796
- break;
797
- case 'conversation.item.truncated':
798
- this.#handleConversationItemTruncated(event);
799
- break;
800
- case 'conversation.item.deleted':
801
- this.#handleConversationItemDeleted(event);
802
- break;
803
- case 'response.created':
804
- this.#handleResponseCreated(event);
805
- break;
806
- case 'response.done':
807
- this.#handleResponseDone(event);
808
- break;
809
- case 'response.output_item.added':
810
- this.#handleResponseOutputItemAdded(event);
811
- break;
812
- case 'response.output_item.done':
813
- this.#handleResponseOutputItemDone(event);
814
- break;
815
- case 'response.content_part.added':
816
- this.#handleResponseContentPartAdded(event);
817
- break;
818
- case 'response.content_part.done':
819
- this.#handleResponseContentPartDone(event);
820
- break;
821
- case 'response.text.delta':
822
- this.#handleResponseTextDelta(event);
823
- break;
824
- case 'response.text.done':
825
- this.#handleResponseTextDone(event);
826
- break;
827
- case 'response.audio_transcript.delta':
828
- this.#handleResponseAudioTranscriptDelta(event);
829
- break;
830
- case 'response.audio_transcript.done':
831
- this.#handleResponseAudioTranscriptDone(event);
832
- break;
833
- case 'response.audio.delta':
834
- this.#handleResponseAudioDelta(event);
835
- break;
836
- case 'response.audio.done':
837
- this.#handleResponseAudioDone(event);
838
- break;
839
- case 'response.function_call_arguments.delta':
840
- this.#handleResponseFunctionCallArgumentsDelta(event);
841
- break;
842
- case 'response.function_call_arguments.done':
843
- this.#handleResponseFunctionCallArgumentsDone(event);
844
- break;
845
- case 'rate_limits.updated':
846
- this.#handleRateLimitsUpdated(event);
847
- break;
765
+ try {
766
+ for (const ev of events) {
767
+ this.emit('openai_client_event_queued', ev);
768
+ wsConn!.send(JSON.stringify(ev));
848
769
  }
849
- };
770
+ } catch (error) {
771
+ this.remoteChatCtx = oldChatCtx;
772
+ throw new APIConnectionError({
773
+ message: 'Failed to send message to OpenAI Realtime API during session re-connection',
774
+ });
775
+ }
850
776
 
851
- const sendTask = async () => {
852
- while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
853
- try {
854
- const event = await this.#sendQueue.get();
855
- if (event.type !== 'input_audio_buffer.append') {
856
- this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
857
- }
858
- this.#ws.send(JSON.stringify(event));
859
- } catch (error) {
860
- this.#logger.error('Error sending event:', error);
861
- }
777
+ this.#logger.debug('Reconnected to OpenAI Realtime API');
778
+
779
+ this.emit('session_reconnected', {} as llm.RealtimeSessionReconnectedEvent);
780
+ };
781
+
782
+ reconnecting = false;
783
+ while (!this.#closed) {
784
+ this.#logger.debug('Creating WebSocket connection to OpenAI Realtime API');
785
+ wsConn = await this.createWsConn();
786
+
787
+ try {
788
+ if (reconnecting) {
789
+ await reconnect();
790
+ numRetries = 0;
791
+ }
792
+ await this.runWs(wsConn);
793
+ } catch (error) {
794
+ if (!isAPIError(error)) {
795
+ this.emitError({ error: error as Error, recoverable: false });
796
+ throw error;
862
797
  }
863
- };
864
798
 
865
- sendTask();
799
+ if (maxRetries === 0 || !error.retryable) {
800
+ this.emitError({ error: error as Error, recoverable: false });
801
+ throw error;
802
+ }
866
803
 
867
- this.#ws.onclose = () => {
868
- if (this.#expiresAt && Date.now() >= this.#expiresAt * 1000) {
869
- this.#closing = true;
804
+ if (numRetries === maxRetries) {
805
+ this.emitError({ error: error as Error, recoverable: false });
806
+ throw new APIConnectionError({
807
+ message: `OpenAI Realtime API connection failed after ${numRetries} attempts`,
808
+ options: {
809
+ body: error,
810
+ retryable: false,
811
+ },
812
+ });
870
813
  }
871
- if (!this.#closing) {
872
- reject(new Error('OpenAI Realtime connection closed unexpectedly'));
814
+
815
+ this.emitError({ error: error as Error, recoverable: true });
816
+ const retryInterval =
817
+ numRetries === 0
818
+ ? DEFAULT_FIRST_RETRY_INTERVAL_MS
819
+ : this.oaiRealtimeModel._options.connOptions.retryIntervalMs;
820
+ this.#logger.warn(
821
+ {
822
+ attempt: numRetries,
823
+ maxRetries,
824
+ error,
825
+ },
826
+ `OpenAI Realtime API connection failed, retrying in ${retryInterval / 1000}s`,
827
+ );
828
+
829
+ await delay(retryInterval);
830
+ numRetries++;
831
+ }
832
+
833
+ reconnecting = true;
834
+ }
835
+ }
836
+
837
+ private async runWs(wsConn: WebSocket): Promise<void> {
838
+ const forwardEvents = async (signal: AbortSignal): Promise<void> => {
839
+ while (!this.#closed && wsConn.readyState === WebSocket.OPEN && !signal.aborted) {
840
+ try {
841
+ const event = await this.messageChannel.get();
842
+ if (signal.aborted) {
843
+ break;
844
+ }
845
+
846
+ if (event.type !== 'input_audio_buffer.append') {
847
+ this.#logger.debug(`(client) -> ${JSON.stringify(this.#loggableEvent(event))}`);
848
+ }
849
+
850
+ this.emit('openai_client_event_queued', event);
851
+ wsConn.send(JSON.stringify(event));
852
+ } catch (error) {
853
+ break;
873
854
  }
874
- this.#ws = null;
875
- resolve();
876
- };
855
+ }
856
+
857
+ wsConn.close();
858
+ };
859
+
860
+ const wsCloseFuture = new Future<void | Error>();
861
+
862
+ wsConn.onerror = (error) => {
863
+ wsCloseFuture.resolve(new APIConnectionError({ message: error.message }));
864
+ };
865
+ wsConn.onclose = () => {
866
+ wsCloseFuture.resolve();
867
+ };
868
+
869
+ wsConn.onmessage = (message: MessageEvent) => {
870
+ const event: api_proto.ServerEvent = JSON.parse(message.data as string);
871
+
872
+ this.emit('openai_server_event_received', event);
873
+ this.#logger.debug(`(server) <- ${JSON.stringify(this.#loggableEvent(event))}`);
874
+
875
+ switch (event.type) {
876
+ case 'input_audio_buffer.speech_started':
877
+ this.handleInputAudioBufferSpeechStarted(event);
878
+ break;
879
+ case 'input_audio_buffer.speech_stopped':
880
+ this.handleInputAudioBufferSpeechStopped(event);
881
+ break;
882
+ case 'response.created':
883
+ this.handleResponseCreated(event);
884
+ break;
885
+ case 'response.output_item.added':
886
+ this.handleResponseOutputItemAdded(event);
887
+ break;
888
+ case 'conversation.item.created':
889
+ this.handleConversationItemCreated(event);
890
+ break;
891
+ case 'conversation.item.deleted':
892
+ this.handleConversationItemDeleted(event);
893
+ break;
894
+ case 'conversation.item.input_audio_transcription.completed':
895
+ this.handleConversationItemInputAudioTranscriptionCompleted(event);
896
+ break;
897
+ case 'conversation.item.input_audio_transcription.failed':
898
+ this.handleConversationItemInputAudioTranscriptionFailed(event);
899
+ break;
900
+ case 'response.content_part.added':
901
+ this.handleResponseContentPartAdded(event);
902
+ break;
903
+ case 'response.content_part.done':
904
+ this.handleResponseContentPartDone(event);
905
+ break;
906
+ case 'response.audio_transcript.delta':
907
+ this.handleResponseAudioTranscriptDelta(event);
908
+ break;
909
+ case 'response.audio.delta':
910
+ this.handleResponseAudioDelta(event);
911
+ break;
912
+ case 'response.audio_transcript.done':
913
+ this.handleResponseAudioTranscriptDone(event);
914
+ break;
915
+ case 'response.audio.done':
916
+ this.handleResponseAudioDone(event);
917
+ break;
918
+ case 'response.output_item.done':
919
+ this.handleResponseOutputItemDone(event);
920
+ break;
921
+ case 'response.done':
922
+ this.handleResponseDone(event);
923
+ break;
924
+ case 'error':
925
+ this.handleError(event);
926
+ break;
927
+ default:
928
+ this.#logger.debug(`unhandled event: ${event.type}`);
929
+ break;
930
+ }
931
+ };
932
+
933
+ const sendTask = Task.from(({ signal }) => forwardEvents(signal));
934
+
935
+ const wsTask = Task.from(({ signal }) => {
936
+ const abortPromise = new Promise<void>((resolve) => {
937
+ signal.addEventListener('abort', () => {
938
+ resolve();
939
+ });
940
+ });
941
+
942
+ return Promise.race([wsCloseFuture.await, abortPromise]);
943
+ });
944
+
945
+ const waitReconnectTask = Task.from(async ({ signal }) => {
946
+ await delay(this.oaiRealtimeModel._options.maxSessionDuration, { signal });
947
+ return new APIConnectionError({
948
+ message: 'OpenAI Realtime API connection timeout',
949
+ });
877
950
  });
951
+
952
+ try {
953
+ const result = await Promise.race([wsTask.result, sendTask.result, waitReconnectTask.result]);
954
+
955
+ if (waitReconnectTask.done && this.currentGeneration) {
956
+ await this.currentGeneration._doneFut.await;
957
+ }
958
+
959
+ if (result instanceof Error) {
960
+ throw result;
961
+ }
962
+ } finally {
963
+ await cancelAndWait([wsTask, sendTask, waitReconnectTask], 2000);
964
+ wsConn.close();
965
+ }
878
966
  }
879
967
 
880
968
  async close() {
881
- if (!this.#ws) return;
882
- this.#closing = true;
883
- this.#ws.close();
969
+ super.close();
970
+ this.#closed = true;
884
971
  await this.#task;
885
972
  }
886
973
 
887
- #getContent(ptr: ContentPtr): RealtimeContent {
888
- const response = this.#pendingResponses[ptr.response_id];
889
- const output = response!.output[ptr.output_index];
890
- const content = output!.content[ptr.content_index]!;
891
- return content;
974
+ private handleInputAudioBufferSpeechStarted(
975
+ _event: api_proto.InputAudioBufferSpeechStartedEvent,
976
+ ): void {
977
+ this.emit('input_speech_started', {} as llm.InputSpeechStartedEvent);
892
978
  }
893
979
 
894
- #handleError(event: api_proto.ErrorEvent): void {
895
- this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
980
+ private handleInputAudioBufferSpeechStopped(
981
+ _event: api_proto.InputAudioBufferSpeechStoppedEvent,
982
+ ): void {
983
+ this.emit('input_speech_stopped', {
984
+ userTranscriptionEnabled: this.oaiRealtimeModel._options.inputAudioTranscription !== null,
985
+ } as llm.InputSpeechStoppedEvent);
896
986
  }
897
987
 
898
- #handleSessionCreated(event: api_proto.SessionCreatedEvent): void {
899
- this.#sessionId = event.session.id;
900
- this.#expiresAt = event.session.expires_at;
901
- this.#logger = this.#logger.child({ sessionId: this.#sessionId });
902
- }
988
+ private handleResponseCreated(event: api_proto.ResponseCreatedEvent): void {
989
+ if (!event.response.id) {
990
+ throw new Error('response.id is missing');
991
+ }
992
+
993
+ this.currentGeneration = {
994
+ messageChannel: stream.createStreamChannel<llm.MessageGeneration>(),
995
+ functionChannel: stream.createStreamChannel<llm.FunctionCall>(),
996
+ messages: new Map(),
997
+ _doneFut: new Future(),
998
+ _createdTimestamp: Date.now(),
999
+ };
903
1000
 
904
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
905
- #handleSessionUpdated(event: api_proto.SessionUpdatedEvent): void {}
1001
+ if (!event.response.metadata || !event.response.metadata.client_event_id) return;
906
1002
 
907
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
908
- #handleConversationCreated(event: api_proto.ConversationCreatedEvent): void {}
1003
+ const handle = this.responseCreatedFutures[event.response.metadata.client_event_id];
1004
+ if (handle) {
1005
+ delete this.responseCreatedFutures[event.response.metadata.client_event_id];
909
1006
 
910
- #handleInputAudioBufferCommitted(event: api_proto.InputAudioBufferCommittedEvent): void {
911
- this.emit('input_speech_committed', {
912
- itemId: event.item_id,
913
- } as InputSpeechCommitted);
1007
+ // set key to the response id
1008
+ this.responseCreatedFutures[event.response.id] = handle;
1009
+ }
1010
+
1011
+ // the generation_created event is emitted when
1012
+ // 1. the response is not a message on response.output_item.added event
1013
+ // 2. the content is audio on response.content_part.added event
1014
+ // will try to recover from text response on response.content_part.done event
1015
+ this.emit('generation_created', {
1016
+ messageStream: this.currentGeneration.messageChannel.stream(),
1017
+ functionStream: this.currentGeneration.functionChannel.stream(),
1018
+ userInitiated: false,
1019
+ } as GenerationCreatedEvent);
914
1020
  }
915
1021
 
916
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
917
- #handleInputAudioBufferCleared(event: api_proto.InputAudioBufferClearedEvent): void {}
1022
+ private handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
1023
+ if (!this.currentGeneration) {
1024
+ throw new Error('currentGeneration is not set');
1025
+ }
918
1026
 
919
- #handleInputAudioBufferSpeechStarted(
920
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
921
- event: api_proto.InputAudioBufferSpeechStartedEvent,
922
- ): void {
923
- this.emit('input_speech_started', {
924
- itemId: event.item_id,
925
- } as InputSpeechStarted);
1027
+ if (!event.item.type) {
1028
+ throw new Error('item.type is not set');
1029
+ }
1030
+
1031
+ if (!event.response_id) {
1032
+ throw new Error('response_id is not set');
1033
+ }
1034
+
1035
+ const itemType = event.item.type;
1036
+ const responseId = event.response_id;
1037
+
1038
+ if (itemType !== 'message') {
1039
+ // emit immediately if it's not a message, otherwise wait response.content_part.added
1040
+ this.emitGenerationEvent(responseId);
1041
+ this.textModeRecoveryRetries = 0;
1042
+ return;
1043
+ }
926
1044
  }
927
1045
 
928
- #handleInputAudioBufferSpeechStopped(
929
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
930
- event: api_proto.InputAudioBufferSpeechStoppedEvent,
931
- ): void {
932
- this.emit('input_speech_stopped');
1046
+ private handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {
1047
+ if (!event.item.id) {
1048
+ throw new Error('item.id is not set');
1049
+ }
1050
+
1051
+ try {
1052
+ this.remoteChatCtx.insert(event.previous_item_id, openAIItemToLivekitItem(event.item));
1053
+ } catch (error) {
1054
+ this.#logger.error({ error, itemId: event.item.id }, 'failed to insert conversation item');
1055
+ }
1056
+
1057
+ const fut = this.itemCreateFutures[event.item.id];
1058
+ if (fut) {
1059
+ fut.resolve();
1060
+ delete this.itemCreateFutures[event.item.id];
1061
+ }
933
1062
  }
934
1063
 
935
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
936
- #handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {}
1064
+ private handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {
1065
+ if (!event.item_id) {
1066
+ throw new Error('item_id is not set');
1067
+ }
1068
+
1069
+ try {
1070
+ this.remoteChatCtx.delete(event.item_id);
1071
+ } catch (error) {
1072
+ this.#logger.error({ error, itemId: event.item_id }, 'failed to delete conversation item');
1073
+ }
937
1074
 
938
- #handleConversationItemInputAudioTranscriptionCompleted(
1075
+ const fut = this.itemDeleteFutures[event.item_id];
1076
+ if (fut) {
1077
+ fut.resolve();
1078
+ delete this.itemDeleteFutures[event.item_id];
1079
+ }
1080
+ }
1081
+
1082
+ private handleConversationItemInputAudioTranscriptionCompleted(
939
1083
  event: api_proto.ConversationItemInputAudioTranscriptionCompletedEvent,
940
1084
  ): void {
941
- const transcript = event.transcript;
942
- this.emit('input_speech_transcription_completed', {
1085
+ const remoteItem = this.remoteChatCtx.get(event.item_id);
1086
+ if (!remoteItem) {
1087
+ return;
1088
+ }
1089
+
1090
+ const item = remoteItem.item;
1091
+ if (item instanceof llm.ChatMessage) {
1092
+ item.content.push(event.transcript);
1093
+ } else {
1094
+ throw new Error('item is not a chat message');
1095
+ }
1096
+
1097
+ this.emit('input_audio_transcription_completed', {
943
1098
  itemId: event.item_id,
944
- transcript: transcript,
945
- } as InputSpeechTranscriptionCompleted);
1099
+ transcript: event.transcript,
1100
+ isFinal: true,
1101
+ } as llm.InputTranscriptionCompleted);
946
1102
  }
947
1103
 
948
- #handleConversationItemInputAudioTranscriptionFailed(
1104
+ private handleConversationItemInputAudioTranscriptionFailed(
949
1105
  event: api_proto.ConversationItemInputAudioTranscriptionFailedEvent,
950
1106
  ): void {
951
- const error = event.error;
952
- this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
953
- this.emit('input_speech_transcription_failed', {
954
- itemId: event.item_id,
955
- message: error.message,
956
- } as InputSpeechTranscriptionFailed);
1107
+ this.#logger.error(
1108
+ { error: event.error },
1109
+ 'OpenAI Realtime API failed to transcribe input audio',
1110
+ );
957
1111
  }
958
1112
 
959
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
960
- #handleConversationItemTruncated(event: api_proto.ConversationItemTruncatedEvent): void {}
961
-
962
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
963
- #handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {}
964
-
965
- #handleResponseCreated(responseCreated: api_proto.ResponseCreatedEvent): void {
966
- const response = responseCreated.response;
967
- const doneFut = new Future();
968
- const newResponse: RealtimeResponse = {
969
- id: response.id,
970
- status: response.status,
971
- statusDetails: response.status_details,
972
- usage: null,
973
- output: [],
974
- doneFut: doneFut,
975
- createdTimestamp: Date.now(),
976
- };
977
- this.#pendingResponses[newResponse.id] = newResponse;
978
- this.emit('response_created', newResponse);
979
- }
1113
+ private handleResponseContentPartAdded(event: api_proto.ResponseContentPartAddedEvent): void {
1114
+ if (!this.currentGeneration) {
1115
+ throw new Error('currentGeneration is not set');
1116
+ }
980
1117
 
981
- #handleResponseDone(event: api_proto.ResponseDoneEvent): void {
982
- const responseData = event.response;
983
- const responseId = responseData.id;
984
- const response = this.#pendingResponses[responseId]!;
985
- response.status = responseData.status;
986
- response.statusDetails = responseData.status_details;
987
- response.usage = responseData.usage ?? null;
988
- this.#pendingResponses[responseId] = response;
989
- response.doneFut.resolve();
990
-
991
- let metricsError: Error | undefined;
992
- let cancelled = false;
993
- switch (response.status) {
994
- case 'failed': {
995
- if (response.statusDetails.type !== 'failed') break;
996
- const err = response.statusDetails.error;
997
- metricsError = new metrics.MultimodalLLMError({
998
- type: response.statusDetails.type,
999
- code: err?.code,
1000
- message: err?.message,
1001
- });
1002
- this.#logger
1003
- .child({ code: err?.code, error: err?.message })
1004
- .error('response generation failed');
1005
- break;
1006
- }
1007
- case 'incomplete': {
1008
- if (response.statusDetails.type !== 'incomplete') break;
1009
- const reason = response.statusDetails.reason;
1010
- metricsError = new metrics.MultimodalLLMError({
1011
- type: response.statusDetails.type,
1012
- reason,
1013
- });
1014
- this.#logger.child({ reason }).error('response generation incomplete');
1015
- break;
1118
+ const itemId = event.item_id;
1119
+ const itemType = event.part.type;
1120
+ const responseId = event.response_id;
1121
+
1122
+ if (itemType === 'audio') {
1123
+ this.emitGenerationEvent(responseId);
1124
+ if (this.textModeRecoveryRetries > 0) {
1125
+ this.#logger.info(
1126
+ { retries: this.textModeRecoveryRetries },
1127
+ 'recovered from text-only response',
1128
+ );
1129
+ this.textModeRecoveryRetries = 0;
1016
1130
  }
1017
- case 'cancelled': {
1018
- cancelled = true;
1019
- break;
1131
+
1132
+ const itemGeneration: MessageGeneration = {
1133
+ messageId: itemId,
1134
+ textChannel: stream.createStreamChannel<string>(),
1135
+ audioChannel: stream.createStreamChannel<AudioFrame>(),
1136
+ audioTranscript: '',
1137
+ };
1138
+
1139
+ this.currentGeneration.messageChannel.write({
1140
+ messageId: itemId,
1141
+ textStream: itemGeneration.textChannel.stream(),
1142
+ audioStream: itemGeneration.audioChannel.stream(),
1143
+ });
1144
+
1145
+ this.currentGeneration.messages.set(itemId, itemGeneration);
1146
+ this.currentGeneration._firstTokenTimestamp = Date.now();
1147
+ return;
1148
+ } else {
1149
+ this.interrupt();
1150
+ if (this.textModeRecoveryRetries === 0) {
1151
+ this.#logger.warn({ responseId }, 'received text-only response from OpenAI Realtime API');
1020
1152
  }
1021
1153
  }
1022
- this.emit('response_done', response);
1154
+ }
1023
1155
 
1024
- let ttft: number | undefined;
1025
- if (response.firstTokenTimestamp) {
1026
- ttft = response.firstTokenTimestamp - response.createdTimestamp;
1156
+ private handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
1157
+ if (event.part.type !== 'text') {
1158
+ return;
1027
1159
  }
1028
- const duration = Date.now() - response.createdTimestamp;
1029
1160
 
1030
- const usage = response.usage;
1031
- const metric: metrics.MultimodalLLMMetrics = {
1032
- timestamp: response.createdTimestamp,
1033
- requestId: response.id,
1034
- ttft: ttft!,
1035
- duration,
1036
- cancelled,
1037
- label: this.constructor.name,
1038
- completionTokens: usage?.output_tokens || 0,
1039
- promptTokens: usage?.input_tokens || 0,
1040
- totalTokens: usage?.total_tokens || 0,
1041
- tokensPerSecond: ((usage?.output_tokens || 0) / duration) * 1000,
1042
- error: metricsError,
1043
- inputTokenDetails: {
1044
- cachedTokens: usage?.input_token_details.cached_tokens || 0,
1045
- textTokens: usage?.input_token_details.text_tokens || 0,
1046
- audioTokens: usage?.input_token_details.audio_tokens || 0,
1047
- },
1048
- outputTokenDetails: {
1049
- textTokens: usage?.output_token_details.text_tokens || 0,
1050
- audioTokens: usage?.output_token_details.audio_tokens || 0,
1051
- },
1052
- };
1053
- this.emit('metrics_collected', metric);
1054
- }
1161
+ if (!this.currentGeneration) {
1162
+ throw new Error('currentGeneration is not set');
1163
+ }
1055
1164
 
1056
- #handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
1057
- const responseId = event.response_id;
1058
- const response = this.#pendingResponses[responseId];
1059
- const itemData = event.item;
1165
+ // TODO(shubhra): handle text mode recovery
1166
+ }
1060
1167
 
1061
- if (itemData.type !== 'message' && itemData.type !== 'function_call') {
1062
- throw new Error(`Unexpected item type: ${itemData.type}`);
1168
+ private handleResponseAudioTranscriptDelta(
1169
+ event: api_proto.ResponseAudioTranscriptDeltaEvent,
1170
+ ): void {
1171
+ if (!this.currentGeneration) {
1172
+ throw new Error('currentGeneration is not set');
1063
1173
  }
1064
1174
 
1065
- let role: api_proto.Role;
1066
- if (itemData.type === 'function_call') {
1067
- role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
1175
+ const itemId = event.item_id;
1176
+ const delta = event.delta;
1177
+
1178
+ // TODO (shubhra): add timed string support
1179
+
1180
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
1181
+ if (!itemGeneration) {
1182
+ throw new Error('itemGeneration is not set');
1068
1183
  } else {
1069
- role = itemData.role;
1184
+ itemGeneration.textChannel.write(delta);
1185
+ itemGeneration.audioTranscript += delta;
1070
1186
  }
1187
+ }
1071
1188
 
1072
- const newOutput: RealtimeOutput = {
1073
- responseId: responseId,
1074
- itemId: itemData.id,
1075
- outputIndex: event.output_index,
1076
- type: itemData.type,
1077
- role: role,
1078
- content: [],
1079
- doneFut: new Future(),
1080
- };
1081
- response?.output.push(newOutput);
1082
- this.emit('response_output_added', newOutput);
1189
+ private handleResponseAudioDelta(event: api_proto.ResponseAudioDeltaEvent): void {
1190
+ if (!this.currentGeneration) {
1191
+ throw new Error('currentGeneration is not set');
1192
+ }
1193
+
1194
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
1195
+ if (!itemGeneration) {
1196
+ throw new Error('itemGeneration is not set');
1197
+ }
1198
+
1199
+ const binaryString = atob(event.delta);
1200
+ const len = binaryString.length;
1201
+ const bytes = new Uint8Array(len);
1202
+ for (let i = 0; i < len; i++) {
1203
+ bytes[i] = binaryString.charCodeAt(i);
1204
+ }
1205
+
1206
+ itemGeneration.audioChannel.write(
1207
+ new AudioFrame(
1208
+ new Int16Array(bytes.buffer),
1209
+ api_proto.SAMPLE_RATE,
1210
+ api_proto.NUM_CHANNELS,
1211
+ bytes.length / 2,
1212
+ ),
1213
+ );
1083
1214
  }
1084
1215
 
1085
- #handleResponseOutputItemDone(event: api_proto.ResponseOutputItemDoneEvent): void {
1086
- const responseId = event.response_id;
1087
- const response = this.#pendingResponses[responseId];
1088
- const outputIndex = event.output_index;
1089
- const output = response!.output[outputIndex];
1216
+ private handleResponseAudioTranscriptDone(
1217
+ _event: api_proto.ResponseAudioTranscriptDoneEvent,
1218
+ ): void {
1219
+ if (!this.currentGeneration) {
1220
+ throw new Error('currentGeneration is not set');
1221
+ }
1222
+ }
1090
1223
 
1091
- if (output?.type === 'function_call') {
1092
- if (!this.#fncCtx) {
1093
- this.#logger.error('function call received but no fncCtx is available');
1094
- return;
1095
- }
1224
+ private handleResponseAudioDone(_event: api_proto.ResponseAudioDoneEvent): void {
1225
+ if (!this.currentGeneration) {
1226
+ throw new Error('currentGeneration is not set');
1227
+ }
1228
+ }
1229
+
1230
+ private handleResponseOutputItemDone(event: api_proto.ResponseOutputItemDoneEvent): void {
1231
+ if (!this.currentGeneration) {
1232
+ throw new Error('currentGeneration is not set');
1233
+ }
1234
+
1235
+ const itemId = event.item.id;
1236
+ const itemType = event.item.type;
1096
1237
 
1097
- // parse the arguments and call the function inside the fnc_ctx
1238
+ if (itemType === 'function_call') {
1098
1239
  const item = event.item;
1099
- if (item.type !== 'function_call') {
1100
- throw new Error('Expected function_call item');
1240
+ if (!item.call_id || !item.name || !item.arguments) {
1241
+ throw new Error('item is not a function call');
1101
1242
  }
1102
- const func = this.#fncCtx[item.name];
1103
- if (!func) {
1104
- this.#logger.error(`no function with name ${item.name} in fncCtx`);
1243
+ this.currentGeneration.functionChannel.write({
1244
+ callId: item.call_id,
1245
+ name: item.name,
1246
+ args: item.arguments,
1247
+ } as llm.FunctionCall);
1248
+ } else if (itemType === 'message') {
1249
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
1250
+ if (!itemGeneration) {
1105
1251
  return;
1106
1252
  }
1253
+ // text response doesn't have itemGeneration
1254
+ itemGeneration.textChannel.close();
1255
+ itemGeneration.audioChannel.close();
1256
+ }
1257
+ }
1107
1258
 
1108
- this.emit('function_call_started', {
1109
- callId: item.call_id,
1110
- });
1259
+ private handleResponseDone(_event: api_proto.ResponseDoneEvent): void {
1260
+ if (!this.currentGeneration) {
1261
+ // OpenAI has a race condition where we could receive response.done without any
1262
+ // previous response.created (This happens generally during interruption)
1263
+ return;
1264
+ }
1111
1265
 
1112
- const parsedArgs = JSON.parse(item.arguments);
1266
+ const createdTimestamp = this.currentGeneration._createdTimestamp;
1267
+ const firstTokenTimestamp = this.currentGeneration._firstTokenTimestamp;
1113
1268
 
1114
- this.#logger.debug(
1115
- `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`,
1116
- );
1269
+ this.#logger.debug(
1270
+ {
1271
+ messageCount: this.currentGeneration.messages.size,
1272
+ },
1273
+ 'Closing generation channels in handleResponseDone',
1274
+ );
1117
1275
 
1118
- func.execute(parsedArgs).then(
1119
- (content) => {
1120
- this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
1121
- this.emit('function_call_completed', {
1122
- callId: item.call_id,
1123
- });
1124
- this.conversation.item.create(
1125
- llm.ChatMessage.createToolFromFunctionResult({
1126
- name: item.name,
1127
- toolCallId: item.call_id,
1128
- result: content,
1129
- }),
1130
- output.itemId,
1131
- );
1132
- this.response.create();
1133
- },
1134
- (error) => {
1135
- this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
1136
- // TODO: send it back up as failed?
1137
- this.emit('function_call_failed', {
1138
- callId: item.call_id,
1139
- });
1140
- },
1141
- );
1276
+ for (const generation of this.currentGeneration.messages.values()) {
1277
+ generation.textChannel.close();
1278
+ generation.audioChannel.close();
1142
1279
  }
1143
1280
 
1144
- output?.doneFut.resolve();
1145
- this.emit('response_output_done', output);
1146
- }
1281
+ this.currentGeneration.functionChannel.close();
1282
+ this.currentGeneration.messageChannel.close();
1147
1283
 
1148
- #handleResponseContentPartAdded(event: api_proto.ResponseContentPartAddedEvent): void {
1149
- const responseId = event.response_id;
1150
- const response = this.#pendingResponses[responseId];
1151
- const outputIndex = event.output_index;
1152
- const output = response!.output[outputIndex];
1284
+ for (const itemId of this.currentGeneration.messages.keys()) {
1285
+ const remoteItem = this.remoteChatCtx.get(itemId);
1286
+ if (remoteItem && remoteItem.item instanceof llm.ChatMessage) {
1287
+ remoteItem.item.content.push(this.currentGeneration.messages.get(itemId)!.audioTranscript);
1288
+ }
1289
+ }
1153
1290
 
1154
- const textStream = new AsyncIterableQueue<string>();
1155
- const audioStream = new AsyncIterableQueue<AudioFrame>();
1291
+ this.currentGeneration._doneFut.resolve();
1292
+ this.currentGeneration = undefined;
1156
1293
 
1157
- const newContent: RealtimeContent = {
1158
- responseId: responseId,
1159
- itemId: event.item_id,
1160
- outputIndex: outputIndex,
1161
- contentIndex: event.content_index,
1162
- text: '',
1163
- audio: [],
1164
- textStream: textStream,
1165
- audioStream: audioStream,
1166
- toolCalls: [],
1167
- contentType: event.part.type,
1294
+ // Calculate and emit metrics
1295
+ const usage = _event.response.usage;
1296
+ const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
1297
+ const duration = (Date.now() - createdTimestamp) / 1000; // Convert to seconds
1298
+
1299
+ const realtimeMetrics: metrics.RealtimeModelMetrics = {
1300
+ type: 'realtime_model_metrics',
1301
+ timestamp: createdTimestamp / 1000, // Convert to seconds
1302
+ requestId: _event.response.id || '',
1303
+ ttft,
1304
+ duration,
1305
+ cancelled: _event.response.status === 'cancelled',
1306
+ label: 'openai_realtime',
1307
+ inputTokens: usage?.input_tokens ?? 0,
1308
+ outputTokens: usage?.output_tokens ?? 0,
1309
+ totalTokens: usage?.total_tokens ?? 0,
1310
+ tokensPerSecond: duration > 0 ? (usage?.output_tokens ?? 0) / duration : 0,
1311
+ inputTokenDetails: {
1312
+ audioTokens: usage?.input_token_details?.audio_tokens ?? 0,
1313
+ textTokens: usage?.input_token_details?.text_tokens ?? 0,
1314
+ imageTokens: 0, // Not supported yet
1315
+ cachedTokens: usage?.input_token_details?.cached_tokens ?? 0,
1316
+ cachedTokensDetails: usage?.input_token_details?.cached_tokens_details
1317
+ ? {
1318
+ audioTokens: usage?.input_token_details?.cached_tokens_details?.audio_tokens ?? 0,
1319
+ textTokens: usage?.input_token_details?.cached_tokens_details?.text_tokens ?? 0,
1320
+ imageTokens: usage?.input_token_details?.cached_tokens_details?.image_tokens ?? 0,
1321
+ }
1322
+ : undefined,
1323
+ },
1324
+ outputTokenDetails: {
1325
+ textTokens: usage?.output_token_details?.text_tokens ?? 0,
1326
+ audioTokens: usage?.output_token_details?.audio_tokens ?? 0,
1327
+ imageTokens: 0,
1328
+ },
1168
1329
  };
1169
- output?.content.push(newContent);
1170
- response!.firstTokenTimestamp = Date.now();
1171
- this.emit('response_content_added', newContent);
1330
+
1331
+ this.emit('metrics_collected', realtimeMetrics);
1332
+ // TODO(brian): handle response done but not complete
1172
1333
  }
1173
1334
 
1174
- #handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
1175
- const content = this.#getContent(event);
1176
- this.emit('response_content_done', content);
1335
+ private handleError(event: api_proto.ErrorEvent): void {
1336
+ if (event.error.message.startsWith('Cancellation failed')) {
1337
+ return;
1338
+ }
1339
+
1340
+ this.#logger.error({ error: event.error }, 'OpenAI Realtime API returned an error');
1341
+ this.emitError({
1342
+ error: new APIError(event.error.message, {
1343
+ body: event.error,
1344
+ retryable: true,
1345
+ }),
1346
+ recoverable: true,
1347
+ });
1348
+
1349
+ // TODO(brian): set error for response future if it exists
1177
1350
  }
1178
1351
 
1179
- #handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
1180
- this.emit('response_text_delta', event);
1352
+ private emitError({ error, recoverable }: { error: Error; recoverable: boolean }): void {
1353
+ // IMPORTANT: only emit error if there are listeners; otherwise emit will throw an error
1354
+ this.emit('error', {
1355
+ timestamp: Date.now(),
1356
+ // TODO(brian): add label
1357
+ label: '',
1358
+ error,
1359
+ recoverable,
1360
+ } as llm.RealtimeModelError);
1181
1361
  }
1182
1362
 
1183
- #handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {
1184
- const content = this.#getContent(event);
1185
- content.text = event.text;
1186
- this.emit('response_text_done', event);
1363
+ private *resampleAudio(frame: AudioFrame): Generator<AudioFrame> {
1364
+ yield frame;
1187
1365
  }
1188
1366
 
1189
- #handleResponseAudioTranscriptDelta(event: api_proto.ResponseAudioTranscriptDeltaEvent): void {
1190
- const content = this.#getContent(event);
1191
- const transcript = event.delta;
1192
- content.text += transcript;
1367
+ private createResponse({
1368
+ userInitiated,
1369
+ instructions,
1370
+ oldHandle,
1371
+ }: {
1372
+ userInitiated: boolean;
1373
+ instructions?: string;
1374
+ oldHandle?: CreateResponseHandle;
1375
+ }): CreateResponseHandle {
1376
+ const handle = oldHandle || new CreateResponseHandle({ instructions });
1377
+ if (oldHandle && instructions) {
1378
+ handle.instructions = instructions;
1379
+ }
1193
1380
 
1194
- content.textStream.put(transcript);
1381
+ const eventId = shortuuid('response_create_');
1382
+ if (userInitiated) {
1383
+ this.responseCreatedFutures[eventId] = handle;
1384
+ }
1385
+
1386
+ const response: api_proto.ResponseCreateEvent['response'] = {};
1387
+ if (instructions) response.instructions = instructions;
1388
+ if (userInitiated) response.metadata = { client_event_id: eventId };
1389
+
1390
+ this.sendEvent({
1391
+ type: 'response.create',
1392
+ event_id: eventId,
1393
+ response: Object.keys(response).length > 0 ? response : undefined,
1394
+ });
1395
+
1396
+ return handle;
1195
1397
  }
1196
1398
 
1197
- #handleResponseAudioTranscriptDone(event: api_proto.ResponseAudioTranscriptDoneEvent): void {
1198
- const content = this.#getContent(event);
1199
- content.textStream.close();
1399
+ private emitGenerationEvent(responseId: string): void {
1400
+ if (!this.currentGeneration) {
1401
+ throw new Error('currentGeneration is not set');
1402
+ }
1403
+
1404
+ const generation_ev: llm.GenerationCreatedEvent = {
1405
+ messageStream: this.currentGeneration.messageChannel.stream(),
1406
+ functionStream: this.currentGeneration.functionChannel.stream(),
1407
+ userInitiated: false,
1408
+ };
1409
+
1410
+ const handle = this.responseCreatedFutures[responseId];
1411
+ if (handle) {
1412
+ delete this.responseCreatedFutures[responseId];
1413
+ generation_ev.userInitiated = true;
1414
+ if (handle.doneFut.done) {
1415
+ this.#logger.warn({ responseId }, 'response received after timeout');
1416
+ } else {
1417
+ handle.doneFut.resolve(generation_ev);
1418
+ }
1419
+ }
1420
+
1421
+ this.#logger.debug({ responseId }, 'Emitting generation_created event');
1422
+ this.emit('generation_created', generation_ev);
1200
1423
  }
1424
+ }
1201
1425
 
1202
- #handleResponseAudioDelta(event: api_proto.ResponseAudioDeltaEvent): void {
1203
- const content = this.#getContent(event);
1204
- const data = Buffer.from(event.delta, 'base64');
1205
- const audio = new AudioFrame(
1206
- new Int16Array(data.buffer),
1207
- api_proto.SAMPLE_RATE,
1208
- api_proto.NUM_CHANNELS,
1209
- data.length / 2,
1210
- );
1211
- content.audio.push(audio);
1426
+ function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
1427
+ switch (item.type) {
1428
+ case 'function_call':
1429
+ return {
1430
+ id: item.id,
1431
+ type: 'function_call',
1432
+ call_id: item.callId,
1433
+ name: item.name,
1434
+ arguments: item.args,
1435
+ } as api_proto.FunctionCallItem;
1436
+ case 'function_call_output':
1437
+ return {
1438
+ id: item.id,
1439
+ type: 'function_call_output',
1440
+ call_id: item.callId,
1441
+ output: item.output,
1442
+ } as api_proto.FunctionCallOutputItem;
1443
+ case 'message':
1444
+ const role = item.role === 'developer' ? 'system' : item.role;
1445
+ const contentList: api_proto.Content[] = [];
1446
+ for (const c of item.content) {
1447
+ if (typeof c === 'string') {
1448
+ contentList.push({
1449
+ type: role === 'assistant' ? 'text' : 'input_text',
1450
+ text: c,
1451
+ } as api_proto.InputTextContent);
1452
+ } else if (c.type === 'image_content') {
1453
+ // not supported for now
1454
+ continue;
1455
+ } else if (c.type === 'audio_content') {
1456
+ if (role === 'user') {
1457
+ const encodedAudio = Buffer.from(combineAudioFrames(c.frame).data).toString('base64');
1458
+ contentList.push({
1459
+ type: 'input_audio',
1460
+ audio: encodedAudio,
1461
+ } as api_proto.InputAudioContent);
1462
+ }
1463
+ }
1464
+ }
1465
+ return {
1466
+ id: item.id,
1467
+ type: 'message',
1468
+ role,
1469
+ content: contentList,
1470
+ } as api_proto.UserItem;
1471
+ }
1472
+ }
1212
1473
 
1213
- content.audioStream.put(audio);
1474
+ function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
1475
+ if (!item.id) {
1476
+ throw new Error('item.id is not set');
1214
1477
  }
1215
1478
 
1216
- #handleResponseAudioDone(event: api_proto.ResponseAudioDoneEvent): void {
1217
- const content = this.#getContent(event);
1218
- content.audioStream.close();
1479
+ switch (item.type) {
1480
+ case 'function_call':
1481
+ return llm.FunctionCall.create({
1482
+ id: item.id,
1483
+ callId: item.call_id,
1484
+ name: item.name,
1485
+ args: item.arguments,
1486
+ });
1487
+ case 'function_call_output':
1488
+ return llm.FunctionCallOutput.create({
1489
+ id: item.id,
1490
+ callId: item.call_id,
1491
+ output: item.output,
1492
+ isError: false,
1493
+ });
1494
+ case 'message':
1495
+ const content: llm.ChatContent[] = [];
1496
+ // item.content can be a single object or an array; normalize to array
1497
+ const contents = Array.isArray(item.content) ? item.content : [item.content];
1498
+ for (const c of contents) {
1499
+ if (c.type === 'text' || c.type === 'input_text') {
1500
+ content.push(c.text);
1501
+ }
1502
+ }
1503
+ return llm.ChatMessage.create({
1504
+ id: item.id,
1505
+ role: item.role,
1506
+ content,
1507
+ });
1219
1508
  }
1509
+ }
1220
1510
 
1221
- #handleResponseFunctionCallArgumentsDelta(
1222
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
1223
- event: api_proto.ResponseFunctionCallArgumentsDeltaEvent,
1224
- ): void {}
1511
+ function createMockAudioItem(durationSeconds: number = 2): llm.ChatMessage {
1512
+ const audioData = Buffer.alloc(durationSeconds * SAMPLE_RATE);
1513
+ return llm.ChatMessage.create({
1514
+ id: shortuuid(MOCK_AUDIO_ID_PREFIX),
1515
+ role: 'user',
1516
+ content: [
1517
+ {
1518
+ type: 'audio_content',
1519
+ frame: [
1520
+ new AudioFrame(
1521
+ new Int16Array(audioData.buffer),
1522
+ SAMPLE_RATE,
1523
+ NUM_CHANNELS,
1524
+ audioData.length / 2,
1525
+ ),
1526
+ ],
1527
+ } as llm.AudioContent,
1528
+ ],
1529
+ });
1530
+ }
1531
+
1532
+ function toOaiToolChoice(toolChoice?: llm.ToolChoice): api_proto.ToolChoice {
1533
+ if (typeof toolChoice === 'string') {
1534
+ return toolChoice;
1535
+ }
1225
1536
 
1226
- #handleResponseFunctionCallArgumentsDone(
1227
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
1228
- event: api_proto.ResponseFunctionCallArgumentsDoneEvent,
1229
- ): void {}
1537
+ if (toolChoice?.type === 'function') {
1538
+ return toolChoice.function.name;
1539
+ }
1230
1540
 
1231
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
1232
- #handleRateLimitsUpdated(event: api_proto.RateLimitsUpdatedEvent): void {}
1541
+ return 'auto';
1233
1542
  }