@livekit/agents-plugin-openai 0.9.2 → 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/dist/index.cjs +16 -5
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +4 -4
  4. package/dist/index.d.ts +4 -4
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +14 -3
  7. package/dist/index.js.map +1 -1
  8. package/dist/llm.cjs +156 -188
  9. package/dist/llm.cjs.map +1 -1
  10. package/dist/llm.d.cts +27 -8
  11. package/dist/llm.d.ts +27 -8
  12. package/dist/llm.d.ts.map +1 -1
  13. package/dist/llm.js +164 -179
  14. package/dist/llm.js.map +1 -1
  15. package/dist/models.cjs +14 -0
  16. package/dist/models.cjs.map +1 -1
  17. package/dist/models.d.cts +11 -6
  18. package/dist/models.d.ts +11 -6
  19. package/dist/models.d.ts.map +1 -1
  20. package/dist/models.js +6 -0
  21. package/dist/models.js.map +1 -1
  22. package/dist/realtime/api_proto.cjs.map +1 -1
  23. package/dist/realtime/api_proto.d.cts +15 -0
  24. package/dist/realtime/api_proto.d.ts +15 -0
  25. package/dist/realtime/api_proto.d.ts.map +1 -1
  26. package/dist/realtime/api_proto.js.map +1 -1
  27. package/dist/realtime/realtime_model.cjs +1057 -820
  28. package/dist/realtime/realtime_model.cjs.map +1 -1
  29. package/dist/realtime/realtime_model.d.cts +126 -160
  30. package/dist/realtime/realtime_model.d.ts +126 -160
  31. package/dist/realtime/realtime_model.d.ts.map +1 -1
  32. package/dist/realtime/realtime_model.js +1067 -825
  33. package/dist/realtime/realtime_model.js.map +1 -1
  34. package/dist/tts.cjs +5 -5
  35. package/dist/tts.cjs.map +1 -1
  36. package/dist/tts.d.cts +2 -1
  37. package/dist/tts.d.ts +2 -1
  38. package/dist/tts.d.ts.map +1 -1
  39. package/dist/tts.js +6 -6
  40. package/dist/tts.js.map +1 -1
  41. package/package.json +9 -7
  42. package/src/index.ts +19 -5
  43. package/src/llm.ts +227 -218
  44. package/src/models.ts +83 -5
  45. package/src/realtime/api_proto.ts +15 -1
  46. package/src/realtime/realtime_model.ts +1305 -996
  47. package/src/tts.ts +6 -6
@@ -1,380 +1,455 @@
1
1
  import {
2
- AsyncIterableQueue,
2
+ APIConnectionError,
3
+ APIError,
4
+ AudioByteStream,
5
+ DEFAULT_API_CONNECT_OPTIONS,
3
6
  Future,
4
7
  Queue,
8
+ Task,
9
+ cancelAndWait,
10
+ isAPIError,
5
11
  llm,
6
12
  log,
7
- mergeFrames,
8
- metrics,
9
- multimodal
13
+ shortuuid,
14
+ stream
10
15
  } from "@livekit/agents";
11
- import { AudioFrame } from "@livekit/rtc-node";
12
- import { once } from "node:events";
16
+ import { Mutex } from "@livekit/mutex";
17
+ import { AudioFrame, combineAudioFrames } from "@livekit/rtc-node";
18
+ import { delay } from "@std/async";
13
19
  import { WebSocket } from "ws";
14
20
  import * as api_proto from "./api_proto.js";
15
- class InputAudioBuffer {
16
- #session;
17
- constructor(session) {
18
- this.#session = session;
19
- }
20
- append(frame) {
21
- this.#session.queueMsg({
22
- type: "input_audio_buffer.append",
23
- audio: Buffer.from(frame.data.buffer).toString("base64")
24
- });
25
- }
26
- clear() {
27
- this.#session.queueMsg({
28
- type: "input_audio_buffer.clear"
29
- });
30
- }
31
- commit() {
32
- this.#session.queueMsg({
33
- type: "input_audio_buffer.commit"
34
- });
21
+ const SAMPLE_RATE = 24e3;
22
+ const NUM_CHANNELS = 1;
23
+ const BASE_URL = "https://api.openai.com/v1";
24
+ const MOCK_AUDIO_ID_PREFIX = "lk_mock_audio_item_";
25
+ class CreateResponseHandle {
26
+ instructions;
27
+ doneFut;
28
+ // TODO(shubhra): add timeout
29
+ constructor({ instructions }) {
30
+ this.instructions = instructions;
31
+ this.doneFut = new Future();
35
32
  }
36
33
  }
37
- class ConversationItem {
38
- #session;
39
- #logger = log();
40
- constructor(session) {
41
- this.#session = session;
42
- }
43
- truncate(itemId, contentIndex, audioEnd) {
44
- this.#session.queueMsg({
45
- type: "conversation.item.truncate",
46
- item_id: itemId,
47
- content_index: contentIndex,
48
- audio_end_ms: audioEnd
49
- });
50
- }
51
- delete(itemId) {
52
- this.#session.queueMsg({
53
- type: "conversation.item.delete",
54
- item_id: itemId
34
+ const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
35
+ const DEFAULT_TEMPERATURE = 0.8;
36
+ const DEFAULT_TURN_DETECTION = {
37
+ type: "server_vad",
38
+ threshold: 0.5,
39
+ prefix_padding_ms: 300,
40
+ silence_duration_ms: 200,
41
+ create_response: true,
42
+ interrupt_response: true
43
+ };
44
+ const DEFAULT_INPUT_AUDIO_TRANSCRIPTION = {
45
+ model: "gpt-4o-mini-transcribe"
46
+ };
47
+ const DEFAULT_TOOL_CHOICE = "auto";
48
+ const DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS = "inf";
49
+ const AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION = {
50
+ model: "whisper-1"
51
+ };
52
+ const AZURE_DEFAULT_TURN_DETECTION = {
53
+ type: "server_vad",
54
+ threshold: 0.5,
55
+ prefix_padding_ms: 300,
56
+ silence_duration_ms: 200,
57
+ create_response: true
58
+ };
59
+ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1e3;
60
+ const DEFAULT_REALTIME_MODEL_OPTIONS = {
61
+ model: "gpt-4o-realtime-preview",
62
+ voice: "alloy",
63
+ temperature: DEFAULT_TEMPERATURE,
64
+ inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
65
+ turnDetection: DEFAULT_TURN_DETECTION,
66
+ toolChoice: DEFAULT_TOOL_CHOICE,
67
+ maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
68
+ maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
69
+ connOptions: DEFAULT_API_CONNECT_OPTIONS
70
+ };
71
+ class RealtimeModel extends llm.RealtimeModel {
72
+ sampleRate = api_proto.SAMPLE_RATE;
73
+ numChannels = api_proto.NUM_CHANNELS;
74
+ inFrameSize = api_proto.IN_FRAME_SIZE;
75
+ outFrameSize = api_proto.OUT_FRAME_SIZE;
76
+ /* @internal */
77
+ _options;
78
+ constructor(options = {}) {
79
+ super({
80
+ messageTruncation: true,
81
+ turnDetection: options.turnDetection !== null,
82
+ userTranscription: options.inputAudioTranscription !== null,
83
+ autoToolReplyGeneration: false
55
84
  });
56
- }
57
- create(message, previousItemId) {
58
- if (!message.content) {
59
- return;
85
+ const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
86
+ if (options.apiKey === "" && !isAzure) {
87
+ throw new Error(
88
+ "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
89
+ );
60
90
  }
61
- let event;
62
- if (message.toolCallId) {
63
- if (typeof message.content !== "string") {
64
- throw new TypeError("message.content must be a string");
65
- }
66
- event = {
67
- type: "conversation.item.create",
68
- previous_item_id: previousItemId,
69
- item: {
70
- type: "function_call_output",
71
- call_id: message.toolCallId,
72
- output: message.content
73
- }
74
- };
75
- } else {
76
- let content = message.content;
77
- if (!Array.isArray(content)) {
78
- content = [content];
79
- }
80
- if (message.role === llm.ChatRole.USER) {
81
- const contents = [];
82
- for (const c of content) {
83
- if (typeof c === "string") {
84
- contents.push({
85
- type: "input_text",
86
- text: c
87
- });
88
- } else if (
89
- // typescript type guard for determining ChatAudio vs ChatImage
90
- ((c2) => {
91
- return c2.frame !== void 0;
92
- })(c)
93
- ) {
94
- contents.push({
95
- type: "input_audio",
96
- audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString("base64")
97
- });
98
- }
99
- }
100
- event = {
101
- type: "conversation.item.create",
102
- previous_item_id: previousItemId,
103
- item: {
104
- type: "message",
105
- role: "user",
106
- content: contents
107
- }
108
- };
109
- } else if (message.role === llm.ChatRole.ASSISTANT) {
110
- const contents = [];
111
- for (const c of content) {
112
- if (typeof c === "string") {
113
- contents.push({
114
- type: "text",
115
- text: c
116
- });
117
- } else if (
118
- // typescript type guard for determining ChatAudio vs ChatImage
119
- ((c2) => {
120
- return c2.frame !== void 0;
121
- })(c)
122
- ) {
123
- this.#logger.warn("audio content in assistant message is not supported");
124
- }
125
- }
126
- event = {
127
- type: "conversation.item.create",
128
- previous_item_id: previousItemId,
129
- item: {
130
- type: "message",
131
- role: "assistant",
132
- content: contents
133
- }
134
- };
135
- } else if (message.role === llm.ChatRole.SYSTEM) {
136
- const contents = [];
137
- for (const c of content) {
138
- if (typeof c === "string") {
139
- contents.push({
140
- type: "input_text",
141
- text: c
142
- });
143
- } else if (
144
- // typescript type guard for determining ChatAudio vs ChatImage
145
- ((c2) => {
146
- return c2.frame !== void 0;
147
- })(c)
148
- ) {
149
- this.#logger.warn("audio content in system message is not supported");
150
- }
151
- }
152
- event = {
153
- type: "conversation.item.create",
154
- previous_item_id: previousItemId,
155
- item: {
156
- type: "message",
157
- role: "system",
158
- content: contents
159
- }
160
- };
161
- } else {
162
- this.#logger.child({ message }).warn("chat message is not supported inside the realtime API");
163
- return;
91
+ const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
92
+ if (!apiKey && !isAzure) {
93
+ throw new Error(
94
+ "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
95
+ );
96
+ }
97
+ if (!options.baseURL && isAzure) {
98
+ const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
99
+ if (!azureEndpoint) {
100
+ throw new Error(
101
+ "Missing Azure endpoint. Please pass base_url or set AZURE_OPENAI_ENDPOINT environment variable."
102
+ );
164
103
  }
104
+ options.baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
165
105
  }
166
- this.#session.queueMsg(event);
167
- }
168
- }
169
- class Conversation {
170
- #session;
171
- constructor(session) {
172
- this.#session = session;
173
- }
174
- get item() {
175
- return new ConversationItem(this.#session);
176
- }
177
- }
178
- class Response {
179
- #session;
180
- constructor(session) {
181
- this.#session = session;
182
- }
183
- create() {
184
- this.#session.queueMsg({
185
- type: "response.create"
186
- });
187
- }
188
- cancel() {
189
- this.#session.queueMsg({
190
- type: "response.cancel"
191
- });
106
+ this._options = {
107
+ ...DEFAULT_REALTIME_MODEL_OPTIONS,
108
+ ...options,
109
+ baseURL: options.baseURL || BASE_URL,
110
+ apiKey,
111
+ isAzure,
112
+ model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model
113
+ };
192
114
  }
193
- }
194
- class RealtimeModel extends multimodal.RealtimeModel {
195
- sampleRate = api_proto.SAMPLE_RATE;
196
- numChannels = api_proto.NUM_CHANNELS;
197
- inFrameSize = api_proto.IN_FRAME_SIZE;
198
- outFrameSize = api_proto.OUT_FRAME_SIZE;
199
- #defaultOpts;
200
- #sessions = [];
115
+ /**
116
+ * Create a RealtimeModel instance configured for Azure OpenAI Service.
117
+ *
118
+ * @param azureDeployment - The name of your Azure OpenAI deployment.
119
+ * @param azureEndpoint - The endpoint URL for your Azure OpenAI resource. If undefined, will attempt to read from the environment variable AZURE_OPENAI_ENDPOINT.
120
+ * @param apiVersion - API version to use with Azure OpenAI Service. If undefined, will attempt to read from the environment variable OPENAI_API_VERSION.
121
+ * @param apiKey - Azure OpenAI API key. If undefined, will attempt to read from the environment variable AZURE_OPENAI_API_KEY.
122
+ * @param entraToken - Azure Entra authentication token. Required if not using API key authentication.
123
+ * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
124
+ * @param voice - Voice setting for audio outputs. Defaults to "alloy".
125
+ * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
126
+ * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
127
+ * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
128
+ * @param speed - Speed of the audio output. Defaults to 1.0.
129
+ * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
130
+ * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
131
+ *
132
+ * @returns A RealtimeModel instance configured for Azure OpenAI Service.
133
+ *
134
+ * @throws Error if required Azure parameters are missing or invalid.
135
+ */
201
136
  static withAzure({
202
- baseURL,
203
137
  azureDeployment,
204
- apiVersion = "2024-10-01-preview",
205
- apiKey = void 0,
206
- entraToken = void 0,
207
- instructions = "",
208
- modalities = ["text", "audio"],
209
- voice = "alloy",
210
- inputAudioFormat = "pcm16",
211
- outputAudioFormat = "pcm16",
212
- inputAudioTranscription = { model: "whisper-1" },
213
- turnDetection = { type: "server_vad" },
214
- temperature = 0.8,
215
- maxResponseOutputTokens = Infinity
216
- }) {
217
- return new RealtimeModel({
218
- isAzure: true,
219
- baseURL: new URL("openai", baseURL).toString(),
220
- model: azureDeployment,
221
- apiVersion,
222
- apiKey,
223
- entraToken,
224
- instructions,
225
- modalities,
226
- voice,
227
- inputAudioFormat,
228
- outputAudioFormat,
229
- inputAudioTranscription,
230
- turnDetection,
231
- temperature,
232
- maxResponseOutputTokens
233
- });
234
- }
235
- constructor({
236
- modalities = ["text", "audio"],
237
- instructions = "",
138
+ azureEndpoint,
139
+ apiVersion,
140
+ apiKey,
141
+ entraToken,
142
+ baseURL,
238
143
  voice = "alloy",
239
- inputAudioFormat = "pcm16",
240
- outputAudioFormat = "pcm16",
241
- inputAudioTranscription = { model: "whisper-1" },
242
- turnDetection = { type: "server_vad" },
144
+ inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
145
+ turnDetection = AZURE_DEFAULT_TURN_DETECTION,
243
146
  temperature = 0.8,
244
- maxResponseOutputTokens = Infinity,
245
- model = "gpt-4o-realtime-preview-2024-10-01",
246
- apiKey = process.env.OPENAI_API_KEY || "",
247
- baseURL = api_proto.BASE_URL,
248
- // used for microsoft
249
- isAzure = false,
250
- apiVersion = void 0,
251
- entraToken = void 0
147
+ speed
252
148
  }) {
253
- super();
254
- if (apiKey === "" && !(isAzure && entraToken)) {
149
+ apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
150
+ if (!apiKey && !entraToken) {
255
151
  throw new Error(
256
- "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
152
+ "Missing credentials. Please pass one of `apiKey`, `entraToken`, or the `AZURE_OPENAI_API_KEY` environment variable."
257
153
  );
258
154
  }
259
- this.#defaultOpts = {
260
- modalities,
261
- instructions,
155
+ apiVersion = apiVersion || process.env.OPENAI_API_VERSION;
156
+ if (!apiVersion) {
157
+ throw new Error(
158
+ "Must provide either the `apiVersion` argument or the `OPENAI_API_VERSION` environment variable"
159
+ );
160
+ }
161
+ if (!baseURL) {
162
+ azureEndpoint = azureEndpoint || process.env.AZURE_OPENAI_ENDPOINT;
163
+ if (!azureEndpoint) {
164
+ throw new Error(
165
+ "Missing Azure endpoint. Please pass the `azure_endpoint` parameter or set the `AZURE_OPENAI_ENDPOINT` environment variable."
166
+ );
167
+ }
168
+ baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
169
+ }
170
+ return new RealtimeModel({
262
171
  voice,
263
- inputAudioFormat,
264
- outputAudioFormat,
265
172
  inputAudioTranscription,
266
173
  turnDetection,
267
174
  temperature,
268
- maxResponseOutputTokens,
269
- model,
175
+ speed,
270
176
  apiKey,
271
- baseURL,
272
- isAzure,
177
+ azureDeployment,
273
178
  apiVersion,
274
- entraToken
275
- };
276
- }
277
- get sessions() {
278
- return this.#sessions;
279
- }
280
- session({
281
- fncCtx,
282
- chatCtx,
283
- modalities = this.#defaultOpts.modalities,
284
- instructions = this.#defaultOpts.instructions,
285
- voice = this.#defaultOpts.voice,
286
- inputAudioFormat = this.#defaultOpts.inputAudioFormat,
287
- outputAudioFormat = this.#defaultOpts.outputAudioFormat,
288
- inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
289
- turnDetection = this.#defaultOpts.turnDetection,
290
- temperature = this.#defaultOpts.temperature,
291
- maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens
292
- }) {
293
- const opts = {
294
- modalities,
295
- instructions,
296
- voice,
297
- inputAudioFormat,
298
- outputAudioFormat,
299
- inputAudioTranscription,
300
- turnDetection,
301
- temperature,
302
- maxResponseOutputTokens,
303
- model: this.#defaultOpts.model,
304
- apiKey: this.#defaultOpts.apiKey,
305
- baseURL: this.#defaultOpts.baseURL,
306
- isAzure: this.#defaultOpts.isAzure,
307
- apiVersion: this.#defaultOpts.apiVersion,
308
- entraToken: this.#defaultOpts.entraToken
309
- };
310
- const newSession = new RealtimeSession(opts, {
311
- chatCtx: chatCtx || new llm.ChatContext(),
312
- fncCtx
179
+ entraToken,
180
+ baseURL
313
181
  });
314
- this.#sessions.push(newSession);
315
- return newSession;
182
+ }
183
+ session() {
184
+ return new RealtimeSession(this);
316
185
  }
317
186
  async close() {
318
- await Promise.allSettled(this.#sessions.map((session) => session.close()));
187
+ return;
188
+ }
189
+ }
190
+ function processBaseURL({
191
+ baseURL,
192
+ model,
193
+ isAzure = false,
194
+ azureDeployment,
195
+ apiVersion
196
+ }) {
197
+ const url = new URL([baseURL, "realtime"].join("/"));
198
+ if (url.protocol === "https:") {
199
+ url.protocol = "wss:";
200
+ }
201
+ if (!url.pathname || ["", "/v1", "/openai"].includes(url.pathname.replace(/\/$/, ""))) {
202
+ url.pathname = url.pathname.replace(/\/$/, "") + "/realtime";
203
+ } else {
204
+ url.pathname = url.pathname.replace(/\/$/, "");
205
+ }
206
+ const queryParams = {};
207
+ if (isAzure) {
208
+ if (apiVersion) {
209
+ queryParams["api-version"] = apiVersion;
210
+ }
211
+ if (azureDeployment) {
212
+ queryParams["deployment"] = azureDeployment;
213
+ }
214
+ } else {
215
+ queryParams["model"] = model;
216
+ }
217
+ for (const [key, value] of Object.entries(queryParams)) {
218
+ url.searchParams.set(key, value);
319
219
  }
220
+ return url.toString();
320
221
  }
321
- class RealtimeSession extends multimodal.RealtimeSession {
322
- #chatCtx = void 0;
323
- #fncCtx = void 0;
324
- #opts;
325
- #pendingResponses = {};
326
- #sessionId = "not-connected";
327
- #ws = null;
328
- #expiresAt = null;
222
+ class RealtimeSession extends llm.RealtimeSession {
223
+ _tools = {};
224
+ remoteChatCtx = new llm.RemoteChatContext();
225
+ messageChannel = new Queue();
226
+ inputResampler;
227
+ instructions;
228
+ oaiRealtimeModel;
229
+ currentGeneration;
230
+ responseCreatedFutures = {};
231
+ textModeRecoveryRetries = 0;
232
+ itemCreateFutures = {};
233
+ itemDeleteFutures = {};
234
+ updateChatCtxLock = new Mutex();
235
+ updateFuncCtxLock = new Mutex();
236
+ // 100ms chunks
237
+ bstream = new AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, SAMPLE_RATE / 10);
238
+ pushedDurationMs = 0;
329
239
  #logger = log();
330
240
  #task;
331
- #closing = true;
332
- #sendQueue = new Queue();
333
- constructor(opts, { fncCtx, chatCtx }) {
334
- super();
335
- this.#opts = opts;
336
- this.#chatCtx = chatCtx;
337
- this.#fncCtx = fncCtx;
338
- this.#task = this.#start();
339
- this.sessionUpdate({
340
- modalities: this.#opts.modalities,
341
- instructions: this.#opts.instructions,
342
- voice: this.#opts.voice,
343
- inputAudioFormat: this.#opts.inputAudioFormat,
344
- outputAudioFormat: this.#opts.outputAudioFormat,
345
- inputAudioTranscription: this.#opts.inputAudioTranscription,
346
- turnDetection: this.#opts.turnDetection,
347
- temperature: this.#opts.temperature,
348
- maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
349
- toolChoice: "auto"
350
- });
241
+ #closed = false;
242
+ constructor(realtimeModel) {
243
+ super(realtimeModel);
244
+ this.oaiRealtimeModel = realtimeModel;
245
+ this.#task = this.#mainTask();
246
+ this.sendEvent(this.createSessionUpdateEvent());
247
+ }
248
+ sendEvent(command) {
249
+ this.messageChannel.put(command);
250
+ }
251
+ createSessionUpdateEvent() {
252
+ return {
253
+ type: "session.update",
254
+ session: {
255
+ model: this.oaiRealtimeModel._options.model,
256
+ voice: this.oaiRealtimeModel._options.voice,
257
+ input_audio_format: "pcm16",
258
+ output_audio_format: "pcm16",
259
+ modalities: ["text", "audio"],
260
+ turn_detection: this.oaiRealtimeModel._options.turnDetection,
261
+ input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
262
+ // TODO(shubhra): add inputAudioNoiseReduction
263
+ temperature: this.oaiRealtimeModel._options.temperature,
264
+ tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
265
+ max_response_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
266
+ // TODO(shubhra): add tracing options
267
+ instructions: this.instructions,
268
+ speed: this.oaiRealtimeModel._options.speed
269
+ }
270
+ };
351
271
  }
352
272
  get chatCtx() {
353
- return this.#chatCtx;
273
+ return this.remoteChatCtx.toChatCtx();
354
274
  }
355
- get fncCtx() {
356
- return this.#fncCtx;
275
+ get tools() {
276
+ return { ...this._tools };
357
277
  }
358
- set fncCtx(ctx) {
359
- this.#fncCtx = ctx;
278
+ async updateChatCtx(_chatCtx) {
279
+ const unlock = await this.updateChatCtxLock.lock();
280
+ const events = this.createChatCtxUpdateEvents(_chatCtx);
281
+ const futures = [];
282
+ for (const event of events) {
283
+ const future = new Future();
284
+ futures.push(future);
285
+ if (event.type === "conversation.item.create") {
286
+ this.itemCreateFutures[event.item.id] = future;
287
+ } else if (event.type == "conversation.item.delete") {
288
+ this.itemDeleteFutures[event.item_id] = future;
289
+ }
290
+ this.sendEvent(event);
291
+ }
292
+ if (futures.length === 0) {
293
+ unlock();
294
+ return;
295
+ }
296
+ try {
297
+ await Promise.race([
298
+ Promise.all(futures),
299
+ delay(5e3).then(() => {
300
+ throw new Error("Chat ctx update events timed out");
301
+ })
302
+ ]);
303
+ } catch (e) {
304
+ this.#logger.error(e.message);
305
+ throw e;
306
+ } finally {
307
+ unlock();
308
+ }
360
309
  }
361
- get conversation() {
362
- return new Conversation(this);
310
+ createChatCtxUpdateEvents(chatCtx, addMockAudio = false) {
311
+ const newChatCtx = chatCtx.copy();
312
+ if (addMockAudio) {
313
+ newChatCtx.items.push(createMockAudioItem());
314
+ } else {
315
+ newChatCtx.items = newChatCtx.items.filter(
316
+ (item) => !item.id.startsWith(MOCK_AUDIO_ID_PREFIX)
317
+ );
318
+ }
319
+ const events = [];
320
+ const diffOps = llm.computeChatCtxDiff(this.chatCtx, newChatCtx);
321
+ for (const op of diffOps.toRemove) {
322
+ events.push({
323
+ type: "conversation.item.delete",
324
+ item_id: op,
325
+ event_id: shortuuid("chat_ctx_delete_")
326
+ });
327
+ }
328
+ for (const [previousId, id] of diffOps.toCreate) {
329
+ const chatItem = newChatCtx.getById(id);
330
+ if (!chatItem) {
331
+ throw new Error(`Chat item ${id} not found`);
332
+ }
333
+ events.push({
334
+ type: "conversation.item.create",
335
+ item: livekitItemToOpenAIItem(chatItem),
336
+ previous_item_id: previousId ?? void 0,
337
+ event_id: shortuuid("chat_ctx_create_")
338
+ });
339
+ }
340
+ return events;
363
341
  }
364
- get inputAudioBuffer() {
365
- return new InputAudioBuffer(this);
342
+ async updateTools(_tools) {
343
+ const unlock = await this.updateFuncCtxLock.lock();
344
+ const ev = this.createToolsUpdateEvent(_tools);
345
+ this.sendEvent(ev);
346
+ if (!ev.session.tools) {
347
+ throw new Error("Tools are missing in the session update event");
348
+ }
349
+ const retainedToolNames = new Set(ev.session.tools.map((tool) => tool.name));
350
+ const retainedTools = Object.fromEntries(
351
+ Object.entries(_tools).filter(
352
+ ([name, tool]) => llm.isFunctionTool(tool) && retainedToolNames.has(name)
353
+ )
354
+ );
355
+ this._tools = retainedTools;
356
+ unlock();
366
357
  }
367
- get response() {
368
- return new Response(this);
358
+ createToolsUpdateEvent(_tools) {
359
+ const oaiTools = [];
360
+ for (const [name, tool] of Object.entries(_tools)) {
361
+ if (!llm.isFunctionTool(tool)) {
362
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
363
+ continue;
364
+ }
365
+ const { parameters: toolParameters, description } = tool;
366
+ try {
367
+ const parameters = llm.toJsonSchema(
368
+ toolParameters
369
+ );
370
+ oaiTools.push({
371
+ name,
372
+ description,
373
+ parameters,
374
+ type: "function"
375
+ });
376
+ } catch (e) {
377
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
378
+ continue;
379
+ }
380
+ }
381
+ return {
382
+ type: "session.update",
383
+ session: {
384
+ model: this.oaiRealtimeModel._options.model,
385
+ tools: oaiTools
386
+ },
387
+ event_id: shortuuid("tools_update_")
388
+ };
389
+ }
390
+ async updateInstructions(_instructions) {
391
+ const eventId = shortuuid("instructions_update_");
392
+ this.sendEvent({
393
+ type: "session.update",
394
+ session: {
395
+ instructions: _instructions
396
+ },
397
+ event_id: eventId
398
+ });
399
+ this.instructions = _instructions;
400
+ }
401
+ updateOptions({ toolChoice }) {
402
+ const options = {};
403
+ this.oaiRealtimeModel._options.toolChoice = toolChoice;
404
+ options.tool_choice = toOaiToolChoice(toolChoice);
405
+ this.sendEvent({
406
+ type: "session.update",
407
+ session: options,
408
+ event_id: shortuuid("options_update_")
409
+ });
410
+ }
411
+ pushAudio(frame) {
412
+ for (const f of this.resampleAudio(frame)) {
413
+ for (const nf of this.bstream.write(f.data.buffer)) {
414
+ this.sendEvent({
415
+ type: "input_audio_buffer.append",
416
+ audio: Buffer.from(nf.data.buffer).toString("base64")
417
+ });
418
+ this.pushedDurationMs += nf.samplesPerChannel / nf.sampleRate * 1e3;
419
+ }
420
+ }
369
421
  }
370
- get expiration() {
371
- if (!this.#expiresAt) {
372
- throw new Error("session not started");
422
+ async commitAudio() {
423
+ if (this.pushedDurationMs > 100) {
424
+ this.sendEvent({
425
+ type: "input_audio_buffer.commit"
426
+ });
427
+ this.pushedDurationMs = 0;
373
428
  }
374
- return this.#expiresAt * 1e3;
375
429
  }
376
- queueMsg(command) {
377
- this.#sendQueue.put(command);
430
+ async clearAudio() {
431
+ this.sendEvent({
432
+ type: "input_audio_buffer.clear"
433
+ });
434
+ this.pushedDurationMs = 0;
435
+ }
436
+ async generateReply(instructions) {
437
+ const handle = this.createResponse({ instructions, userInitiated: true });
438
+ this.textModeRecoveryRetries = 0;
439
+ return handle.doneFut.await;
440
+ }
441
+ async interrupt() {
442
+ this.sendEvent({
443
+ type: "response.cancel"
444
+ });
445
+ }
446
+ async truncate(_options) {
447
+ this.sendEvent({
448
+ type: "conversation.item.truncate",
449
+ content_index: 0,
450
+ item_id: _options.messageId,
451
+ audio_end_ms: _options.audioEndMs
452
+ });
378
453
  }
379
454
  /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
380
455
  /// with large amounts of base64 audio data.
@@ -395,549 +470,716 @@ class RealtimeSession extends multimodal.RealtimeSession {
395
470
  }
396
471
  return untypedEvent;
397
472
  }
398
- sessionUpdate({
399
- modalities = this.#opts.modalities,
400
- instructions = this.#opts.instructions,
401
- voice = this.#opts.voice,
402
- inputAudioFormat = this.#opts.inputAudioFormat,
403
- outputAudioFormat = this.#opts.outputAudioFormat,
404
- inputAudioTranscription = this.#opts.inputAudioTranscription,
405
- turnDetection = this.#opts.turnDetection,
406
- temperature = this.#opts.temperature,
407
- maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
408
- toolChoice = "auto",
409
- selectedTools = Object.keys(this.#fncCtx || {})
410
- }) {
411
- this.#opts = {
412
- modalities,
413
- instructions,
414
- voice,
415
- inputAudioFormat,
416
- outputAudioFormat,
417
- inputAudioTranscription,
418
- turnDetection,
419
- temperature,
420
- maxResponseOutputTokens,
421
- model: this.#opts.model,
422
- apiKey: this.#opts.apiKey,
423
- baseURL: this.#opts.baseURL,
424
- isAzure: this.#opts.isAzure,
425
- apiVersion: this.#opts.apiVersion,
426
- entraToken: this.#opts.entraToken
427
- };
428
- const tools = this.#fncCtx ? Object.entries(this.#fncCtx).filter(([name]) => selectedTools.includes(name)).map(([name, func]) => ({
429
- type: "function",
430
- name,
431
- description: func.description,
432
- parameters: (
433
- // don't format parameters if they are raw openai params
434
- func.parameters.type == "object" ? func.parameters : llm.oaiParams(func.parameters)
435
- )
436
- })) : [];
437
- const sessionUpdateEvent = {
438
- type: "session.update",
439
- session: {
440
- modalities: this.#opts.modalities,
441
- instructions: this.#opts.instructions,
442
- voice: this.#opts.voice,
443
- input_audio_format: this.#opts.inputAudioFormat,
444
- output_audio_format: this.#opts.outputAudioFormat,
445
- input_audio_transcription: this.#opts.inputAudioTranscription,
446
- turn_detection: this.#opts.turnDetection,
447
- temperature: this.#opts.temperature,
448
- max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity ? "inf" : this.#opts.maxResponseOutputTokens,
449
- tools,
450
- tool_choice: toolChoice
451
- }
473
+ async createWsConn() {
474
+ const headers = {
475
+ "User-Agent": "LiveKit-Agents-JS"
452
476
  };
453
- if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
454
- sessionUpdateEvent.session.max_response_output_tokens = void 0;
455
- }
456
- this.queueMsg(sessionUpdateEvent);
457
- }
458
- /** Create an empty audio message with the given duration. */
459
- #createEmptyUserAudioMessage(duration) {
460
- const samples = duration * api_proto.SAMPLE_RATE;
461
- return new llm.ChatMessage({
462
- role: llm.ChatRole.USER,
463
- content: {
464
- frame: new AudioFrame(
465
- new Int16Array(samples * api_proto.NUM_CHANNELS),
466
- api_proto.SAMPLE_RATE,
467
- api_proto.NUM_CHANNELS,
468
- samples
469
- )
477
+ if (this.oaiRealtimeModel._options.isAzure) {
478
+ if (this.oaiRealtimeModel._options.entraToken) {
479
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.entraToken}`;
480
+ } else if (this.oaiRealtimeModel._options.apiKey) {
481
+ headers["api-key"] = this.oaiRealtimeModel._options.apiKey;
482
+ } else {
483
+ throw new Error("Microsoft API key or entraToken is required");
470
484
  }
471
- });
472
- }
473
- /**
474
- * Try to recover from a text response to audio mode.
475
- *
476
- * @remarks
477
- * Sometimes the OpenAI Realtime API returns text instead of audio responses.
478
- * This method tries to recover from this by requesting a new response after deleting the text
479
- * response and creating an empty user audio message.
480
- */
481
- recoverFromTextResponse(itemId) {
482
- if (itemId) {
483
- this.conversation.item.delete(itemId);
485
+ } else {
486
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
487
+ headers["OpenAI-Beta"] = "realtime=v1";
484
488
  }
485
- this.conversation.item.create(this.#createEmptyUserAudioMessage(1));
486
- this.response.create();
489
+ const url = processBaseURL({
490
+ baseURL: this.oaiRealtimeModel._options.baseURL,
491
+ model: this.oaiRealtimeModel._options.model,
492
+ isAzure: this.oaiRealtimeModel._options.isAzure,
493
+ apiVersion: this.oaiRealtimeModel._options.apiVersion,
494
+ azureDeployment: this.oaiRealtimeModel._options.azureDeployment
495
+ });
496
+ this.#logger.debug(`Connecting to OpenAI Realtime API at ${url}`);
497
+ return new Promise((resolve, reject) => {
498
+ const ws = new WebSocket(url, { headers });
499
+ let waiting = true;
500
+ const timeout = setTimeout(() => {
501
+ ws.close();
502
+ reject(new Error("WebSocket connection timeout"));
503
+ }, this.oaiRealtimeModel._options.connOptions.timeoutMs);
504
+ ws.once("open", () => {
505
+ if (!waiting) return;
506
+ waiting = false;
507
+ clearTimeout(timeout);
508
+ resolve(ws);
509
+ });
510
+ ws.once("close", () => {
511
+ if (!waiting) return;
512
+ waiting = false;
513
+ clearTimeout(timeout);
514
+ reject(new Error("OpenAI Realtime API connection closed"));
515
+ });
516
+ });
487
517
  }
488
- #start() {
489
- return new Promise(async (resolve, reject) => {
490
- const headers = {
491
- "User-Agent": "LiveKit-Agents-JS"
492
- };
493
- if (this.#opts.isAzure) {
494
- if (this.#opts.entraToken) {
495
- headers.Authorization = `Bearer ${this.#opts.entraToken}`;
496
- } else if (this.#opts.apiKey) {
497
- headers["api-key"] = this.#opts.apiKey;
498
- } else {
499
- reject(new Error("Microsoft API key or entraToken is required"));
500
- return;
501
- }
502
- } else {
503
- headers.Authorization = `Bearer ${this.#opts.apiKey}`;
504
- headers["OpenAI-Beta"] = "realtime=v1";
505
- }
506
- const url = new URL([this.#opts.baseURL, "realtime"].join("/"));
507
- if (url.protocol === "https:") {
508
- url.protocol = "wss:";
509
- }
510
- const queryParams = {};
511
- if (this.#opts.isAzure) {
512
- queryParams["api-version"] = this.#opts.apiVersion ?? "2024-10-01-preview";
513
- queryParams["deployment"] = this.#opts.model;
514
- } else {
515
- queryParams["model"] = this.#opts.model;
516
- }
517
- for (const [key, value] of Object.entries(queryParams)) {
518
- url.searchParams.set(key, value);
518
+ async #mainTask() {
519
+ let reconnecting = false;
520
+ let numRetries = 0;
521
+ let wsConn = null;
522
+ const maxRetries = this.oaiRealtimeModel._options.connOptions.maxRetry;
523
+ const reconnect = async () => {
524
+ this.#logger.debug(
525
+ {
526
+ maxSessionDuration: this.oaiRealtimeModel._options.maxSessionDuration
527
+ },
528
+ "Reconnecting to OpenAI Realtime API"
529
+ );
530
+ const events = [];
531
+ events.push(this.createSessionUpdateEvent());
532
+ if (Object.keys(this._tools).length > 0) {
533
+ events.push(this.createToolsUpdateEvent(this._tools));
519
534
  }
520
- console.debug("Connecting to OpenAI Realtime API at ", url.toString());
521
- this.#ws = new WebSocket(url.toString(), {
522
- headers
535
+ const chatCtx = this.chatCtx.copy({
536
+ excludeFunctionCall: true,
537
+ excludeInstructions: true,
538
+ excludeEmptyMessage: true
523
539
  });
524
- this.#ws.onerror = (error) => {
525
- reject(new Error("OpenAI Realtime WebSocket error: " + error.message));
526
- };
527
- await once(this.#ws, "open");
528
- this.#closing = false;
529
- this.#ws.onmessage = (message) => {
530
- const event = JSON.parse(message.data);
531
- this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
532
- switch (event.type) {
533
- case "error":
534
- this.#handleError(event);
535
- break;
536
- case "session.created":
537
- this.#handleSessionCreated(event);
538
- break;
539
- case "session.updated":
540
- this.#handleSessionUpdated(event);
541
- break;
542
- case "conversation.created":
543
- this.#handleConversationCreated(event);
544
- break;
545
- case "input_audio_buffer.committed":
546
- this.#handleInputAudioBufferCommitted(event);
547
- break;
548
- case "input_audio_buffer.cleared":
549
- this.#handleInputAudioBufferCleared(event);
550
- break;
551
- case "input_audio_buffer.speech_started":
552
- this.#handleInputAudioBufferSpeechStarted(event);
553
- break;
554
- case "input_audio_buffer.speech_stopped":
555
- this.#handleInputAudioBufferSpeechStopped(event);
556
- break;
557
- case "conversation.item.created":
558
- this.#handleConversationItemCreated(event);
559
- break;
560
- case "conversation.item.input_audio_transcription.completed":
561
- this.#handleConversationItemInputAudioTranscriptionCompleted(event);
562
- break;
563
- case "conversation.item.input_audio_transcription.failed":
564
- this.#handleConversationItemInputAudioTranscriptionFailed(event);
565
- break;
566
- case "conversation.item.truncated":
567
- this.#handleConversationItemTruncated(event);
568
- break;
569
- case "conversation.item.deleted":
570
- this.#handleConversationItemDeleted(event);
571
- break;
572
- case "response.created":
573
- this.#handleResponseCreated(event);
574
- break;
575
- case "response.done":
576
- this.#handleResponseDone(event);
577
- break;
578
- case "response.output_item.added":
579
- this.#handleResponseOutputItemAdded(event);
580
- break;
581
- case "response.output_item.done":
582
- this.#handleResponseOutputItemDone(event);
583
- break;
584
- case "response.content_part.added":
585
- this.#handleResponseContentPartAdded(event);
586
- break;
587
- case "response.content_part.done":
588
- this.#handleResponseContentPartDone(event);
589
- break;
590
- case "response.text.delta":
591
- this.#handleResponseTextDelta(event);
592
- break;
593
- case "response.text.done":
594
- this.#handleResponseTextDone(event);
595
- break;
596
- case "response.audio_transcript.delta":
597
- this.#handleResponseAudioTranscriptDelta(event);
598
- break;
599
- case "response.audio_transcript.done":
600
- this.#handleResponseAudioTranscriptDone(event);
601
- break;
602
- case "response.audio.delta":
603
- this.#handleResponseAudioDelta(event);
604
- break;
605
- case "response.audio.done":
606
- this.#handleResponseAudioDone(event);
607
- break;
608
- case "response.function_call_arguments.delta":
609
- this.#handleResponseFunctionCallArgumentsDelta(event);
610
- break;
611
- case "response.function_call_arguments.done":
612
- this.#handleResponseFunctionCallArgumentsDone(event);
613
- break;
614
- case "rate_limits.updated":
615
- this.#handleRateLimitsUpdated(event);
616
- break;
540
+ const oldChatCtx = this.remoteChatCtx;
541
+ this.remoteChatCtx = new llm.RemoteChatContext();
542
+ events.push(...this.createChatCtxUpdateEvents(chatCtx));
543
+ try {
544
+ for (const ev of events) {
545
+ this.emit("openai_client_event_queued", ev);
546
+ wsConn.send(JSON.stringify(ev));
617
547
  }
618
- };
619
- const sendTask = async () => {
620
- while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
621
- try {
622
- const event = await this.#sendQueue.get();
623
- if (event.type !== "input_audio_buffer.append") {
624
- this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
625
- }
626
- this.#ws.send(JSON.stringify(event));
627
- } catch (error) {
628
- this.#logger.error("Error sending event:", error);
629
- }
548
+ } catch (error) {
549
+ this.remoteChatCtx = oldChatCtx;
550
+ throw new APIConnectionError({
551
+ message: "Failed to send message to OpenAI Realtime API during session re-connection"
552
+ });
553
+ }
554
+ this.#logger.debug("Reconnected to OpenAI Realtime API");
555
+ this.emit("session_reconnected", {});
556
+ };
557
+ reconnecting = false;
558
+ while (!this.#closed) {
559
+ this.#logger.debug("Creating WebSocket connection to OpenAI Realtime API");
560
+ wsConn = await this.createWsConn();
561
+ try {
562
+ if (reconnecting) {
563
+ await reconnect();
564
+ numRetries = 0;
630
565
  }
631
- };
632
- sendTask();
633
- this.#ws.onclose = () => {
634
- if (this.#expiresAt && Date.now() >= this.#expiresAt * 1e3) {
635
- this.#closing = true;
566
+ await this.runWs(wsConn);
567
+ } catch (error) {
568
+ if (!isAPIError(error)) {
569
+ this.emitError({ error, recoverable: false });
570
+ throw error;
636
571
  }
637
- if (!this.#closing) {
638
- reject(new Error("OpenAI Realtime connection closed unexpectedly"));
572
+ if (maxRetries === 0 || !error.retryable) {
573
+ this.emitError({ error, recoverable: false });
574
+ throw error;
639
575
  }
640
- this.#ws = null;
641
- resolve();
642
- };
576
+ if (numRetries === maxRetries) {
577
+ this.emitError({ error, recoverable: false });
578
+ throw new APIConnectionError({
579
+ message: `OpenAI Realtime API connection failed after ${numRetries} attempts`,
580
+ options: {
581
+ body: error,
582
+ retryable: false
583
+ }
584
+ });
585
+ }
586
+ this.emitError({ error, recoverable: true });
587
+ const retryInterval = numRetries === 0 ? DEFAULT_FIRST_RETRY_INTERVAL_MS : this.oaiRealtimeModel._options.connOptions.retryIntervalMs;
588
+ this.#logger.warn(
589
+ {
590
+ attempt: numRetries,
591
+ maxRetries,
592
+ error
593
+ },
594
+ `OpenAI Realtime API connection failed, retrying in ${retryInterval / 1e3}s`
595
+ );
596
+ await delay(retryInterval);
597
+ numRetries++;
598
+ }
599
+ reconnecting = true;
600
+ }
601
+ }
602
+ async runWs(wsConn) {
603
+ const forwardEvents = async (signal) => {
604
+ while (!this.#closed && wsConn.readyState === WebSocket.OPEN && !signal.aborted) {
605
+ try {
606
+ const event = await this.messageChannel.get();
607
+ if (signal.aborted) {
608
+ break;
609
+ }
610
+ if (event.type !== "input_audio_buffer.append") {
611
+ this.#logger.debug(`(client) -> ${JSON.stringify(this.#loggableEvent(event))}`);
612
+ }
613
+ this.emit("openai_client_event_queued", event);
614
+ wsConn.send(JSON.stringify(event));
615
+ } catch (error) {
616
+ break;
617
+ }
618
+ }
619
+ wsConn.close();
620
+ };
621
+ const wsCloseFuture = new Future();
622
+ wsConn.onerror = (error) => {
623
+ wsCloseFuture.resolve(new APIConnectionError({ message: error.message }));
624
+ };
625
+ wsConn.onclose = () => {
626
+ wsCloseFuture.resolve();
627
+ };
628
+ wsConn.onmessage = (message) => {
629
+ const event = JSON.parse(message.data);
630
+ this.emit("openai_server_event_received", event);
631
+ this.#logger.debug(`(server) <- ${JSON.stringify(this.#loggableEvent(event))}`);
632
+ switch (event.type) {
633
+ case "input_audio_buffer.speech_started":
634
+ this.handleInputAudioBufferSpeechStarted(event);
635
+ break;
636
+ case "input_audio_buffer.speech_stopped":
637
+ this.handleInputAudioBufferSpeechStopped(event);
638
+ break;
639
+ case "response.created":
640
+ this.handleResponseCreated(event);
641
+ break;
642
+ case "response.output_item.added":
643
+ this.handleResponseOutputItemAdded(event);
644
+ break;
645
+ case "conversation.item.created":
646
+ this.handleConversationItemCreated(event);
647
+ break;
648
+ case "conversation.item.deleted":
649
+ this.handleConversationItemDeleted(event);
650
+ break;
651
+ case "conversation.item.input_audio_transcription.completed":
652
+ this.handleConversationItemInputAudioTranscriptionCompleted(event);
653
+ break;
654
+ case "conversation.item.input_audio_transcription.failed":
655
+ this.handleConversationItemInputAudioTranscriptionFailed(event);
656
+ break;
657
+ case "response.content_part.added":
658
+ this.handleResponseContentPartAdded(event);
659
+ break;
660
+ case "response.content_part.done":
661
+ this.handleResponseContentPartDone(event);
662
+ break;
663
+ case "response.audio_transcript.delta":
664
+ this.handleResponseAudioTranscriptDelta(event);
665
+ break;
666
+ case "response.audio.delta":
667
+ this.handleResponseAudioDelta(event);
668
+ break;
669
+ case "response.audio_transcript.done":
670
+ this.handleResponseAudioTranscriptDone(event);
671
+ break;
672
+ case "response.audio.done":
673
+ this.handleResponseAudioDone(event);
674
+ break;
675
+ case "response.output_item.done":
676
+ this.handleResponseOutputItemDone(event);
677
+ break;
678
+ case "response.done":
679
+ this.handleResponseDone(event);
680
+ break;
681
+ case "error":
682
+ this.handleError(event);
683
+ break;
684
+ default:
685
+ this.#logger.debug(`unhandled event: ${event.type}`);
686
+ break;
687
+ }
688
+ };
689
+ const sendTask = Task.from(({ signal }) => forwardEvents(signal));
690
+ const wsTask = Task.from(({ signal }) => {
691
+ const abortPromise = new Promise((resolve) => {
692
+ signal.addEventListener("abort", () => {
693
+ resolve();
694
+ });
695
+ });
696
+ return Promise.race([wsCloseFuture.await, abortPromise]);
643
697
  });
698
+ const waitReconnectTask = Task.from(async ({ signal }) => {
699
+ await delay(this.oaiRealtimeModel._options.maxSessionDuration, { signal });
700
+ return new APIConnectionError({
701
+ message: "OpenAI Realtime API connection timeout"
702
+ });
703
+ });
704
+ try {
705
+ const result = await Promise.race([wsTask.result, sendTask.result, waitReconnectTask.result]);
706
+ if (waitReconnectTask.done && this.currentGeneration) {
707
+ await this.currentGeneration._doneFut.await;
708
+ }
709
+ if (result instanceof Error) {
710
+ throw result;
711
+ }
712
+ } finally {
713
+ await cancelAndWait([wsTask, sendTask, waitReconnectTask], 2e3);
714
+ wsConn.close();
715
+ }
644
716
  }
645
717
  async close() {
646
- if (!this.#ws) return;
647
- this.#closing = true;
648
- this.#ws.close();
718
+ super.close();
719
+ this.#closed = true;
649
720
  await this.#task;
650
721
  }
651
- #getContent(ptr) {
652
- const response = this.#pendingResponses[ptr.response_id];
653
- const output = response.output[ptr.output_index];
654
- const content = output.content[ptr.content_index];
655
- return content;
722
+ handleInputAudioBufferSpeechStarted(_event) {
723
+ this.emit("input_speech_started", {});
656
724
  }
657
- #handleError(event) {
658
- this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
725
+ handleInputAudioBufferSpeechStopped(_event) {
726
+ this.emit("input_speech_stopped", {
727
+ userTranscriptionEnabled: this.oaiRealtimeModel._options.inputAudioTranscription !== null
728
+ });
659
729
  }
660
- #handleSessionCreated(event) {
661
- this.#sessionId = event.session.id;
662
- this.#expiresAt = event.session.expires_at;
663
- this.#logger = this.#logger.child({ sessionId: this.#sessionId });
730
+ handleResponseCreated(event) {
731
+ if (!event.response.id) {
732
+ throw new Error("response.id is missing");
733
+ }
734
+ this.currentGeneration = {
735
+ messageChannel: stream.createStreamChannel(),
736
+ functionChannel: stream.createStreamChannel(),
737
+ messages: /* @__PURE__ */ new Map(),
738
+ _doneFut: new Future(),
739
+ _createdTimestamp: Date.now()
740
+ };
741
+ if (!event.response.metadata || !event.response.metadata.client_event_id) return;
742
+ const handle = this.responseCreatedFutures[event.response.metadata.client_event_id];
743
+ if (handle) {
744
+ delete this.responseCreatedFutures[event.response.metadata.client_event_id];
745
+ this.responseCreatedFutures[event.response.id] = handle;
746
+ }
747
+ this.emit("generation_created", {
748
+ messageStream: this.currentGeneration.messageChannel.stream(),
749
+ functionStream: this.currentGeneration.functionChannel.stream(),
750
+ userInitiated: false
751
+ });
664
752
  }
665
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
666
- #handleSessionUpdated(event) {
753
+ handleResponseOutputItemAdded(event) {
754
+ if (!this.currentGeneration) {
755
+ throw new Error("currentGeneration is not set");
756
+ }
757
+ if (!event.item.type) {
758
+ throw new Error("item.type is not set");
759
+ }
760
+ if (!event.response_id) {
761
+ throw new Error("response_id is not set");
762
+ }
763
+ const itemType = event.item.type;
764
+ const responseId = event.response_id;
765
+ if (itemType !== "message") {
766
+ this.emitGenerationEvent(responseId);
767
+ this.textModeRecoveryRetries = 0;
768
+ return;
769
+ }
667
770
  }
668
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
669
- #handleConversationCreated(event) {
771
+ handleConversationItemCreated(event) {
772
+ if (!event.item.id) {
773
+ throw new Error("item.id is not set");
774
+ }
775
+ try {
776
+ this.remoteChatCtx.insert(event.previous_item_id, openAIItemToLivekitItem(event.item));
777
+ } catch (error) {
778
+ this.#logger.error({ error, itemId: event.item.id }, "failed to insert conversation item");
779
+ }
780
+ const fut = this.itemCreateFutures[event.item.id];
781
+ if (fut) {
782
+ fut.resolve();
783
+ delete this.itemCreateFutures[event.item.id];
784
+ }
670
785
  }
671
- #handleInputAudioBufferCommitted(event) {
672
- this.emit("input_speech_committed", {
673
- itemId: event.item_id
786
+ handleConversationItemDeleted(event) {
787
+ if (!event.item_id) {
788
+ throw new Error("item_id is not set");
789
+ }
790
+ try {
791
+ this.remoteChatCtx.delete(event.item_id);
792
+ } catch (error) {
793
+ this.#logger.error({ error, itemId: event.item_id }, "failed to delete conversation item");
794
+ }
795
+ const fut = this.itemDeleteFutures[event.item_id];
796
+ if (fut) {
797
+ fut.resolve();
798
+ delete this.itemDeleteFutures[event.item_id];
799
+ }
800
+ }
801
+ handleConversationItemInputAudioTranscriptionCompleted(event) {
802
+ const remoteItem = this.remoteChatCtx.get(event.item_id);
803
+ if (!remoteItem) {
804
+ return;
805
+ }
806
+ const item = remoteItem.item;
807
+ if (item instanceof llm.ChatMessage) {
808
+ item.content.push(event.transcript);
809
+ } else {
810
+ throw new Error("item is not a chat message");
811
+ }
812
+ this.emit("input_audio_transcription_completed", {
813
+ itemId: event.item_id,
814
+ transcript: event.transcript,
815
+ isFinal: true
674
816
  });
675
817
  }
676
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
677
- #handleInputAudioBufferCleared(event) {
818
+ handleConversationItemInputAudioTranscriptionFailed(event) {
819
+ this.#logger.error(
820
+ { error: event.error },
821
+ "OpenAI Realtime API failed to transcribe input audio"
822
+ );
678
823
  }
679
- #handleInputAudioBufferSpeechStarted(event) {
680
- this.emit("input_speech_started", {
681
- itemId: event.item_id
682
- });
824
+ handleResponseContentPartAdded(event) {
825
+ if (!this.currentGeneration) {
826
+ throw new Error("currentGeneration is not set");
827
+ }
828
+ const itemId = event.item_id;
829
+ const itemType = event.part.type;
830
+ const responseId = event.response_id;
831
+ if (itemType === "audio") {
832
+ this.emitGenerationEvent(responseId);
833
+ if (this.textModeRecoveryRetries > 0) {
834
+ this.#logger.info(
835
+ { retries: this.textModeRecoveryRetries },
836
+ "recovered from text-only response"
837
+ );
838
+ this.textModeRecoveryRetries = 0;
839
+ }
840
+ const itemGeneration = {
841
+ messageId: itemId,
842
+ textChannel: stream.createStreamChannel(),
843
+ audioChannel: stream.createStreamChannel(),
844
+ audioTranscript: ""
845
+ };
846
+ this.currentGeneration.messageChannel.write({
847
+ messageId: itemId,
848
+ textStream: itemGeneration.textChannel.stream(),
849
+ audioStream: itemGeneration.audioChannel.stream()
850
+ });
851
+ this.currentGeneration.messages.set(itemId, itemGeneration);
852
+ this.currentGeneration._firstTokenTimestamp = Date.now();
853
+ return;
854
+ } else {
855
+ this.interrupt();
856
+ if (this.textModeRecoveryRetries === 0) {
857
+ this.#logger.warn({ responseId }, "received text-only response from OpenAI Realtime API");
858
+ }
859
+ }
683
860
  }
684
- #handleInputAudioBufferSpeechStopped(event) {
685
- this.emit("input_speech_stopped");
861
+ handleResponseContentPartDone(event) {
862
+ if (event.part.type !== "text") {
863
+ return;
864
+ }
865
+ if (!this.currentGeneration) {
866
+ throw new Error("currentGeneration is not set");
867
+ }
686
868
  }
687
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
688
- #handleConversationItemCreated(event) {
869
+ handleResponseAudioTranscriptDelta(event) {
870
+ if (!this.currentGeneration) {
871
+ throw new Error("currentGeneration is not set");
872
+ }
873
+ const itemId = event.item_id;
874
+ const delta = event.delta;
875
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
876
+ if (!itemGeneration) {
877
+ throw new Error("itemGeneration is not set");
878
+ } else {
879
+ itemGeneration.textChannel.write(delta);
880
+ itemGeneration.audioTranscript += delta;
881
+ }
689
882
  }
690
- #handleConversationItemInputAudioTranscriptionCompleted(event) {
691
- const transcript = event.transcript;
692
- this.emit("input_speech_transcription_completed", {
693
- itemId: event.item_id,
694
- transcript
695
- });
883
+ handleResponseAudioDelta(event) {
884
+ if (!this.currentGeneration) {
885
+ throw new Error("currentGeneration is not set");
886
+ }
887
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
888
+ if (!itemGeneration) {
889
+ throw new Error("itemGeneration is not set");
890
+ }
891
+ const binaryString = atob(event.delta);
892
+ const len = binaryString.length;
893
+ const bytes = new Uint8Array(len);
894
+ for (let i = 0; i < len; i++) {
895
+ bytes[i] = binaryString.charCodeAt(i);
896
+ }
897
+ itemGeneration.audioChannel.write(
898
+ new AudioFrame(
899
+ new Int16Array(bytes.buffer),
900
+ api_proto.SAMPLE_RATE,
901
+ api_proto.NUM_CHANNELS,
902
+ bytes.length / 2
903
+ )
904
+ );
696
905
  }
697
- #handleConversationItemInputAudioTranscriptionFailed(event) {
698
- const error = event.error;
699
- this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
700
- this.emit("input_speech_transcription_failed", {
701
- itemId: event.item_id,
702
- message: error.message
703
- });
906
+ handleResponseAudioTranscriptDone(_event) {
907
+ if (!this.currentGeneration) {
908
+ throw new Error("currentGeneration is not set");
909
+ }
704
910
  }
705
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
706
- #handleConversationItemTruncated(event) {
707
- }
708
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
709
- #handleConversationItemDeleted(event) {
710
- }
711
- #handleResponseCreated(responseCreated) {
712
- const response = responseCreated.response;
713
- const doneFut = new Future();
714
- const newResponse = {
715
- id: response.id,
716
- status: response.status,
717
- statusDetails: response.status_details,
718
- usage: null,
719
- output: [],
720
- doneFut,
721
- createdTimestamp: Date.now()
722
- };
723
- this.#pendingResponses[newResponse.id] = newResponse;
724
- this.emit("response_created", newResponse);
725
- }
726
- #handleResponseDone(event) {
727
- const responseData = event.response;
728
- const responseId = responseData.id;
729
- const response = this.#pendingResponses[responseId];
730
- response.status = responseData.status;
731
- response.statusDetails = responseData.status_details;
732
- response.usage = responseData.usage ?? null;
733
- this.#pendingResponses[responseId] = response;
734
- response.doneFut.resolve();
735
- let metricsError;
736
- let cancelled = false;
737
- switch (response.status) {
738
- case "failed": {
739
- if (response.statusDetails.type !== "failed") break;
740
- const err = response.statusDetails.error;
741
- metricsError = new metrics.MultimodalLLMError({
742
- type: response.statusDetails.type,
743
- code: err == null ? void 0 : err.code,
744
- message: err == null ? void 0 : err.message
745
- });
746
- this.#logger.child({ code: err == null ? void 0 : err.code, error: err == null ? void 0 : err.message }).error("response generation failed");
747
- break;
748
- }
749
- case "incomplete": {
750
- if (response.statusDetails.type !== "incomplete") break;
751
- const reason = response.statusDetails.reason;
752
- metricsError = new metrics.MultimodalLLMError({
753
- type: response.statusDetails.type,
754
- reason
755
- });
756
- this.#logger.child({ reason }).error("response generation incomplete");
757
- break;
911
+ handleResponseAudioDone(_event) {
912
+ if (!this.currentGeneration) {
913
+ throw new Error("currentGeneration is not set");
914
+ }
915
+ }
916
+ handleResponseOutputItemDone(event) {
917
+ if (!this.currentGeneration) {
918
+ throw new Error("currentGeneration is not set");
919
+ }
920
+ const itemId = event.item.id;
921
+ const itemType = event.item.type;
922
+ if (itemType === "function_call") {
923
+ const item = event.item;
924
+ if (!item.call_id || !item.name || !item.arguments) {
925
+ throw new Error("item is not a function call");
758
926
  }
759
- case "cancelled": {
760
- cancelled = true;
761
- break;
927
+ this.currentGeneration.functionChannel.write({
928
+ callId: item.call_id,
929
+ name: item.name,
930
+ args: item.arguments
931
+ });
932
+ } else if (itemType === "message") {
933
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
934
+ if (!itemGeneration) {
935
+ return;
762
936
  }
937
+ itemGeneration.textChannel.close();
938
+ itemGeneration.audioChannel.close();
763
939
  }
764
- this.emit("response_done", response);
765
- let ttft;
766
- if (response.firstTokenTimestamp) {
767
- ttft = response.firstTokenTimestamp - response.createdTimestamp;
940
+ }
941
+ handleResponseDone(_event) {
942
+ var _a, _b, _c, _d, _e, _f, _g, _h, _i, _j, _k, _l;
943
+ if (!this.currentGeneration) {
944
+ return;
768
945
  }
769
- const duration = Date.now() - response.createdTimestamp;
770
- const usage = response.usage;
771
- const metric = {
772
- timestamp: response.createdTimestamp,
773
- requestId: response.id,
946
+ const createdTimestamp = this.currentGeneration._createdTimestamp;
947
+ const firstTokenTimestamp = this.currentGeneration._firstTokenTimestamp;
948
+ this.#logger.debug(
949
+ {
950
+ messageCount: this.currentGeneration.messages.size
951
+ },
952
+ "Closing generation channels in handleResponseDone"
953
+ );
954
+ for (const generation of this.currentGeneration.messages.values()) {
955
+ generation.textChannel.close();
956
+ generation.audioChannel.close();
957
+ }
958
+ this.currentGeneration.functionChannel.close();
959
+ this.currentGeneration.messageChannel.close();
960
+ for (const itemId of this.currentGeneration.messages.keys()) {
961
+ const remoteItem = this.remoteChatCtx.get(itemId);
962
+ if (remoteItem && remoteItem.item instanceof llm.ChatMessage) {
963
+ remoteItem.item.content.push(this.currentGeneration.messages.get(itemId).audioTranscript);
964
+ }
965
+ }
966
+ this.currentGeneration._doneFut.resolve();
967
+ this.currentGeneration = void 0;
968
+ const usage = _event.response.usage;
969
+ const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
970
+ const duration = (Date.now() - createdTimestamp) / 1e3;
971
+ const realtimeMetrics = {
972
+ type: "realtime_model_metrics",
973
+ timestamp: createdTimestamp / 1e3,
974
+ // Convert to seconds
975
+ requestId: _event.response.id || "",
774
976
  ttft,
775
977
  duration,
776
- cancelled,
777
- label: this.constructor.name,
778
- completionTokens: (usage == null ? void 0 : usage.output_tokens) || 0,
779
- promptTokens: (usage == null ? void 0 : usage.input_tokens) || 0,
780
- totalTokens: (usage == null ? void 0 : usage.total_tokens) || 0,
781
- tokensPerSecond: ((usage == null ? void 0 : usage.output_tokens) || 0) / duration * 1e3,
782
- error: metricsError,
978
+ cancelled: _event.response.status === "cancelled",
979
+ label: "openai_realtime",
980
+ inputTokens: (usage == null ? void 0 : usage.input_tokens) ?? 0,
981
+ outputTokens: (usage == null ? void 0 : usage.output_tokens) ?? 0,
982
+ totalTokens: (usage == null ? void 0 : usage.total_tokens) ?? 0,
983
+ tokensPerSecond: duration > 0 ? ((usage == null ? void 0 : usage.output_tokens) ?? 0) / duration : 0,
783
984
  inputTokenDetails: {
784
- cachedTokens: (usage == null ? void 0 : usage.input_token_details.cached_tokens) || 0,
785
- textTokens: (usage == null ? void 0 : usage.input_token_details.text_tokens) || 0,
786
- audioTokens: (usage == null ? void 0 : usage.input_token_details.audio_tokens) || 0
985
+ audioTokens: ((_a = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _a.audio_tokens) ?? 0,
986
+ textTokens: ((_b = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _b.text_tokens) ?? 0,
987
+ imageTokens: 0,
988
+ // Not supported yet
989
+ cachedTokens: ((_c = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _c.cached_tokens) ?? 0,
990
+ cachedTokensDetails: ((_d = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _d.cached_tokens_details) ? {
991
+ audioTokens: ((_f = (_e = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _e.cached_tokens_details) == null ? void 0 : _f.audio_tokens) ?? 0,
992
+ textTokens: ((_h = (_g = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _g.cached_tokens_details) == null ? void 0 : _h.text_tokens) ?? 0,
993
+ imageTokens: ((_j = (_i = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _i.cached_tokens_details) == null ? void 0 : _j.image_tokens) ?? 0
994
+ } : void 0
787
995
  },
788
996
  outputTokenDetails: {
789
- textTokens: (usage == null ? void 0 : usage.output_token_details.text_tokens) || 0,
790
- audioTokens: (usage == null ? void 0 : usage.output_token_details.audio_tokens) || 0
997
+ textTokens: ((_k = usage == null ? void 0 : usage.output_token_details) == null ? void 0 : _k.text_tokens) ?? 0,
998
+ audioTokens: ((_l = usage == null ? void 0 : usage.output_token_details) == null ? void 0 : _l.audio_tokens) ?? 0,
999
+ imageTokens: 0
791
1000
  }
792
1001
  };
793
- this.emit("metrics_collected", metric);
1002
+ this.emit("metrics_collected", realtimeMetrics);
794
1003
  }
795
- #handleResponseOutputItemAdded(event) {
796
- const responseId = event.response_id;
797
- const response = this.#pendingResponses[responseId];
798
- const itemData = event.item;
799
- if (itemData.type !== "message" && itemData.type !== "function_call") {
800
- throw new Error(`Unexpected item type: ${itemData.type}`);
801
- }
802
- let role;
803
- if (itemData.type === "function_call") {
804
- role = "assistant";
805
- } else {
806
- role = itemData.role;
807
- }
808
- const newOutput = {
809
- responseId,
810
- itemId: itemData.id,
811
- outputIndex: event.output_index,
812
- type: itemData.type,
813
- role,
814
- content: [],
815
- doneFut: new Future()
816
- };
817
- response == null ? void 0 : response.output.push(newOutput);
818
- this.emit("response_output_added", newOutput);
1004
+ handleError(event) {
1005
+ if (event.error.message.startsWith("Cancellation failed")) {
1006
+ return;
1007
+ }
1008
+ this.#logger.error({ error: event.error }, "OpenAI Realtime API returned an error");
1009
+ this.emitError({
1010
+ error: new APIError(event.error.message, {
1011
+ body: event.error,
1012
+ retryable: true
1013
+ }),
1014
+ recoverable: true
1015
+ });
819
1016
  }
820
- #handleResponseOutputItemDone(event) {
821
- const responseId = event.response_id;
822
- const response = this.#pendingResponses[responseId];
823
- const outputIndex = event.output_index;
824
- const output = response.output[outputIndex];
825
- if ((output == null ? void 0 : output.type) === "function_call") {
826
- if (!this.#fncCtx) {
827
- this.#logger.error("function call received but no fncCtx is available");
828
- return;
829
- }
830
- const item = event.item;
831
- if (item.type !== "function_call") {
832
- throw new Error("Expected function_call item");
833
- }
834
- const func = this.#fncCtx[item.name];
835
- if (!func) {
836
- this.#logger.error(`no function with name ${item.name} in fncCtx`);
837
- return;
838
- }
839
- this.emit("function_call_started", {
840
- callId: item.call_id
841
- });
842
- const parsedArgs = JSON.parse(item.arguments);
843
- this.#logger.debug(
844
- `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`
845
- );
846
- func.execute(parsedArgs).then(
847
- (content) => {
848
- this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
849
- this.emit("function_call_completed", {
850
- callId: item.call_id
851
- });
852
- this.conversation.item.create(
853
- llm.ChatMessage.createToolFromFunctionResult({
854
- name: item.name,
855
- toolCallId: item.call_id,
856
- result: content
857
- }),
858
- output.itemId
859
- );
860
- this.response.create();
861
- },
862
- (error) => {
863
- this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
864
- this.emit("function_call_failed", {
865
- callId: item.call_id
866
- });
867
- }
868
- );
1017
+ emitError({ error, recoverable }) {
1018
+ this.emit("error", {
1019
+ timestamp: Date.now(),
1020
+ // TODO(brian): add label
1021
+ label: "",
1022
+ error,
1023
+ recoverable
1024
+ });
1025
+ }
1026
+ *resampleAudio(frame) {
1027
+ yield frame;
1028
+ }
1029
+ createResponse({
1030
+ userInitiated,
1031
+ instructions,
1032
+ oldHandle
1033
+ }) {
1034
+ const handle = oldHandle || new CreateResponseHandle({ instructions });
1035
+ if (oldHandle && instructions) {
1036
+ handle.instructions = instructions;
869
1037
  }
870
- output == null ? void 0 : output.doneFut.resolve();
871
- this.emit("response_output_done", output);
1038
+ const eventId = shortuuid("response_create_");
1039
+ if (userInitiated) {
1040
+ this.responseCreatedFutures[eventId] = handle;
1041
+ }
1042
+ const response = {};
1043
+ if (instructions) response.instructions = instructions;
1044
+ if (userInitiated) response.metadata = { client_event_id: eventId };
1045
+ this.sendEvent({
1046
+ type: "response.create",
1047
+ event_id: eventId,
1048
+ response: Object.keys(response).length > 0 ? response : void 0
1049
+ });
1050
+ return handle;
872
1051
  }
873
- #handleResponseContentPartAdded(event) {
874
- const responseId = event.response_id;
875
- const response = this.#pendingResponses[responseId];
876
- const outputIndex = event.output_index;
877
- const output = response.output[outputIndex];
878
- const textStream = new AsyncIterableQueue();
879
- const audioStream = new AsyncIterableQueue();
880
- const newContent = {
881
- responseId,
882
- itemId: event.item_id,
883
- outputIndex,
884
- contentIndex: event.content_index,
885
- text: "",
886
- audio: [],
887
- textStream,
888
- audioStream,
889
- toolCalls: [],
890
- contentType: event.part.type
1052
+ emitGenerationEvent(responseId) {
1053
+ if (!this.currentGeneration) {
1054
+ throw new Error("currentGeneration is not set");
1055
+ }
1056
+ const generation_ev = {
1057
+ messageStream: this.currentGeneration.messageChannel.stream(),
1058
+ functionStream: this.currentGeneration.functionChannel.stream(),
1059
+ userInitiated: false
891
1060
  };
892
- output == null ? void 0 : output.content.push(newContent);
893
- response.firstTokenTimestamp = Date.now();
894
- this.emit("response_content_added", newContent);
895
- }
896
- #handleResponseContentPartDone(event) {
897
- const content = this.#getContent(event);
898
- this.emit("response_content_done", content);
899
- }
900
- #handleResponseTextDelta(event) {
901
- this.emit("response_text_delta", event);
902
- }
903
- #handleResponseTextDone(event) {
904
- const content = this.#getContent(event);
905
- content.text = event.text;
906
- this.emit("response_text_done", event);
907
- }
908
- #handleResponseAudioTranscriptDelta(event) {
909
- const content = this.#getContent(event);
910
- const transcript = event.delta;
911
- content.text += transcript;
912
- content.textStream.put(transcript);
913
- }
914
- #handleResponseAudioTranscriptDone(event) {
915
- const content = this.#getContent(event);
916
- content.textStream.close();
917
- }
918
- #handleResponseAudioDelta(event) {
919
- const content = this.#getContent(event);
920
- const data = Buffer.from(event.delta, "base64");
921
- const audio = new AudioFrame(
922
- new Int16Array(data.buffer),
923
- api_proto.SAMPLE_RATE,
924
- api_proto.NUM_CHANNELS,
925
- data.length / 2
926
- );
927
- content.audio.push(audio);
928
- content.audioStream.put(audio);
1061
+ const handle = this.responseCreatedFutures[responseId];
1062
+ if (handle) {
1063
+ delete this.responseCreatedFutures[responseId];
1064
+ generation_ev.userInitiated = true;
1065
+ if (handle.doneFut.done) {
1066
+ this.#logger.warn({ responseId }, "response received after timeout");
1067
+ } else {
1068
+ handle.doneFut.resolve(generation_ev);
1069
+ }
1070
+ }
1071
+ this.#logger.debug({ responseId }, "Emitting generation_created event");
1072
+ this.emit("generation_created", generation_ev);
929
1073
  }
930
- #handleResponseAudioDone(event) {
931
- const content = this.#getContent(event);
932
- content.audioStream.close();
1074
+ }
1075
+ function livekitItemToOpenAIItem(item) {
1076
+ switch (item.type) {
1077
+ case "function_call":
1078
+ return {
1079
+ id: item.id,
1080
+ type: "function_call",
1081
+ call_id: item.callId,
1082
+ name: item.name,
1083
+ arguments: item.args
1084
+ };
1085
+ case "function_call_output":
1086
+ return {
1087
+ id: item.id,
1088
+ type: "function_call_output",
1089
+ call_id: item.callId,
1090
+ output: item.output
1091
+ };
1092
+ case "message":
1093
+ const role = item.role === "developer" ? "system" : item.role;
1094
+ const contentList = [];
1095
+ for (const c of item.content) {
1096
+ if (typeof c === "string") {
1097
+ contentList.push({
1098
+ type: role === "assistant" ? "text" : "input_text",
1099
+ text: c
1100
+ });
1101
+ } else if (c.type === "image_content") {
1102
+ continue;
1103
+ } else if (c.type === "audio_content") {
1104
+ if (role === "user") {
1105
+ const encodedAudio = Buffer.from(combineAudioFrames(c.frame).data).toString("base64");
1106
+ contentList.push({
1107
+ type: "input_audio",
1108
+ audio: encodedAudio
1109
+ });
1110
+ }
1111
+ }
1112
+ }
1113
+ return {
1114
+ id: item.id,
1115
+ type: "message",
1116
+ role,
1117
+ content: contentList
1118
+ };
933
1119
  }
934
- #handleResponseFunctionCallArgumentsDelta(event) {
1120
+ }
1121
+ function openAIItemToLivekitItem(item) {
1122
+ if (!item.id) {
1123
+ throw new Error("item.id is not set");
935
1124
  }
936
- #handleResponseFunctionCallArgumentsDone(event) {
1125
+ switch (item.type) {
1126
+ case "function_call":
1127
+ return llm.FunctionCall.create({
1128
+ id: item.id,
1129
+ callId: item.call_id,
1130
+ name: item.name,
1131
+ args: item.arguments
1132
+ });
1133
+ case "function_call_output":
1134
+ return llm.FunctionCallOutput.create({
1135
+ id: item.id,
1136
+ callId: item.call_id,
1137
+ output: item.output,
1138
+ isError: false
1139
+ });
1140
+ case "message":
1141
+ const content = [];
1142
+ const contents = Array.isArray(item.content) ? item.content : [item.content];
1143
+ for (const c of contents) {
1144
+ if (c.type === "text" || c.type === "input_text") {
1145
+ content.push(c.text);
1146
+ }
1147
+ }
1148
+ return llm.ChatMessage.create({
1149
+ id: item.id,
1150
+ role: item.role,
1151
+ content
1152
+ });
1153
+ }
1154
+ }
1155
+ function createMockAudioItem(durationSeconds = 2) {
1156
+ const audioData = Buffer.alloc(durationSeconds * SAMPLE_RATE);
1157
+ return llm.ChatMessage.create({
1158
+ id: shortuuid(MOCK_AUDIO_ID_PREFIX),
1159
+ role: "user",
1160
+ content: [
1161
+ {
1162
+ type: "audio_content",
1163
+ frame: [
1164
+ new AudioFrame(
1165
+ new Int16Array(audioData.buffer),
1166
+ SAMPLE_RATE,
1167
+ NUM_CHANNELS,
1168
+ audioData.length / 2
1169
+ )
1170
+ ]
1171
+ }
1172
+ ]
1173
+ });
1174
+ }
1175
+ function toOaiToolChoice(toolChoice) {
1176
+ if (typeof toolChoice === "string") {
1177
+ return toolChoice;
937
1178
  }
938
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
939
- #handleRateLimitsUpdated(event) {
1179
+ if ((toolChoice == null ? void 0 : toolChoice.type) === "function") {
1180
+ return toolChoice.function.name;
940
1181
  }
1182
+ return "auto";
941
1183
  }
942
1184
  export {
943
1185
  RealtimeModel,