@livekit/agents-plugin-openai 0.9.3 → 1.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/dist/index.cjs +16 -5
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +4 -4
  4. package/dist/index.d.ts +4 -4
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +14 -3
  7. package/dist/index.js.map +1 -1
  8. package/dist/llm.cjs +156 -197
  9. package/dist/llm.cjs.map +1 -1
  10. package/dist/llm.d.cts +27 -8
  11. package/dist/llm.d.ts +27 -8
  12. package/dist/llm.d.ts.map +1 -1
  13. package/dist/llm.js +164 -188
  14. package/dist/llm.js.map +1 -1
  15. package/dist/models.cjs +14 -0
  16. package/dist/models.cjs.map +1 -1
  17. package/dist/models.d.cts +11 -6
  18. package/dist/models.d.ts +11 -6
  19. package/dist/models.d.ts.map +1 -1
  20. package/dist/models.js +6 -0
  21. package/dist/models.js.map +1 -1
  22. package/dist/realtime/api_proto.cjs.map +1 -1
  23. package/dist/realtime/api_proto.d.cts +15 -0
  24. package/dist/realtime/api_proto.d.ts +15 -0
  25. package/dist/realtime/api_proto.d.ts.map +1 -1
  26. package/dist/realtime/api_proto.js.map +1 -1
  27. package/dist/realtime/realtime_model.cjs +1056 -820
  28. package/dist/realtime/realtime_model.cjs.map +1 -1
  29. package/dist/realtime/realtime_model.d.cts +126 -160
  30. package/dist/realtime/realtime_model.d.ts +126 -160
  31. package/dist/realtime/realtime_model.d.ts.map +1 -1
  32. package/dist/realtime/realtime_model.js +1067 -825
  33. package/dist/realtime/realtime_model.js.map +1 -1
  34. package/dist/tts.cjs +5 -5
  35. package/dist/tts.cjs.map +1 -1
  36. package/dist/tts.d.cts +2 -1
  37. package/dist/tts.d.ts +2 -1
  38. package/dist/tts.d.ts.map +1 -1
  39. package/dist/tts.js +6 -6
  40. package/dist/tts.js.map +1 -1
  41. package/package.json +8 -7
  42. package/src/index.ts +19 -5
  43. package/src/llm.ts +227 -228
  44. package/src/models.ts +83 -5
  45. package/src/realtime/api_proto.ts +15 -1
  46. package/src/realtime/realtime_model.ts +1305 -996
  47. package/src/tts.ts +6 -6
@@ -33,373 +33,442 @@ __export(realtime_model_exports, {
33
33
  });
34
34
  module.exports = __toCommonJS(realtime_model_exports);
35
35
  var import_agents = require("@livekit/agents");
36
+ var import_mutex = require("@livekit/mutex");
36
37
  var import_rtc_node = require("@livekit/rtc-node");
37
- var import_node_events = require("node:events");
38
38
  var import_ws = require("ws");
39
39
  var api_proto = __toESM(require("./api_proto.cjs"), 1);
40
- class InputAudioBuffer {
41
- #session;
42
- constructor(session) {
43
- this.#session = session;
44
- }
45
- append(frame) {
46
- this.#session.queueMsg({
47
- type: "input_audio_buffer.append",
48
- audio: Buffer.from(frame.data.buffer).toString("base64")
49
- });
50
- }
51
- clear() {
52
- this.#session.queueMsg({
53
- type: "input_audio_buffer.clear"
54
- });
55
- }
56
- commit() {
57
- this.#session.queueMsg({
58
- type: "input_audio_buffer.commit"
59
- });
40
+ const SAMPLE_RATE = 24e3;
41
+ const NUM_CHANNELS = 1;
42
+ const BASE_URL = "https://api.openai.com/v1";
43
+ const MOCK_AUDIO_ID_PREFIX = "lk_mock_audio_item_";
44
+ class CreateResponseHandle {
45
+ instructions;
46
+ doneFut;
47
+ // TODO(shubhra): add timeout
48
+ constructor({ instructions }) {
49
+ this.instructions = instructions;
50
+ this.doneFut = new import_agents.Future();
60
51
  }
61
52
  }
62
- class ConversationItem {
63
- #session;
64
- #logger = (0, import_agents.log)();
65
- constructor(session) {
66
- this.#session = session;
67
- }
68
- truncate(itemId, contentIndex, audioEnd) {
69
- this.#session.queueMsg({
70
- type: "conversation.item.truncate",
71
- item_id: itemId,
72
- content_index: contentIndex,
73
- audio_end_ms: audioEnd
74
- });
75
- }
76
- delete(itemId) {
77
- this.#session.queueMsg({
78
- type: "conversation.item.delete",
79
- item_id: itemId
53
+ const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
54
+ const DEFAULT_TEMPERATURE = 0.8;
55
+ const DEFAULT_TURN_DETECTION = {
56
+ type: "server_vad",
57
+ threshold: 0.5,
58
+ prefix_padding_ms: 300,
59
+ silence_duration_ms: 200,
60
+ create_response: true,
61
+ interrupt_response: true
62
+ };
63
+ const DEFAULT_INPUT_AUDIO_TRANSCRIPTION = {
64
+ model: "gpt-4o-mini-transcribe"
65
+ };
66
+ const DEFAULT_TOOL_CHOICE = "auto";
67
+ const DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS = "inf";
68
+ const AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION = {
69
+ model: "whisper-1"
70
+ };
71
+ const AZURE_DEFAULT_TURN_DETECTION = {
72
+ type: "server_vad",
73
+ threshold: 0.5,
74
+ prefix_padding_ms: 300,
75
+ silence_duration_ms: 200,
76
+ create_response: true
77
+ };
78
+ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1e3;
79
+ const DEFAULT_REALTIME_MODEL_OPTIONS = {
80
+ model: "gpt-4o-realtime-preview",
81
+ voice: "alloy",
82
+ temperature: DEFAULT_TEMPERATURE,
83
+ inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
84
+ turnDetection: DEFAULT_TURN_DETECTION,
85
+ toolChoice: DEFAULT_TOOL_CHOICE,
86
+ maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
87
+ maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
88
+ connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS
89
+ };
90
+ class RealtimeModel extends import_agents.llm.RealtimeModel {
91
+ sampleRate = api_proto.SAMPLE_RATE;
92
+ numChannels = api_proto.NUM_CHANNELS;
93
+ inFrameSize = api_proto.IN_FRAME_SIZE;
94
+ outFrameSize = api_proto.OUT_FRAME_SIZE;
95
+ /* @internal */
96
+ _options;
97
+ constructor(options = {}) {
98
+ super({
99
+ messageTruncation: true,
100
+ turnDetection: options.turnDetection !== null,
101
+ userTranscription: options.inputAudioTranscription !== null,
102
+ autoToolReplyGeneration: false
80
103
  });
81
- }
82
- create(message, previousItemId) {
83
- if (!message.content) {
84
- return;
104
+ const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
105
+ if (options.apiKey === "" && !isAzure) {
106
+ throw new Error(
107
+ "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
108
+ );
85
109
  }
86
- let event;
87
- if (message.toolCallId) {
88
- if (typeof message.content !== "string") {
89
- throw new TypeError("message.content must be a string");
90
- }
91
- event = {
92
- type: "conversation.item.create",
93
- previous_item_id: previousItemId,
94
- item: {
95
- type: "function_call_output",
96
- call_id: message.toolCallId,
97
- output: message.content
98
- }
99
- };
100
- } else {
101
- let content = message.content;
102
- if (!Array.isArray(content)) {
103
- content = [content];
104
- }
105
- if (message.role === import_agents.llm.ChatRole.USER) {
106
- const contents = [];
107
- for (const c of content) {
108
- if (typeof c === "string") {
109
- contents.push({
110
- type: "input_text",
111
- text: c
112
- });
113
- } else if (
114
- // typescript type guard for determining ChatAudio vs ChatImage
115
- ((c2) => {
116
- return c2.frame !== void 0;
117
- })(c)
118
- ) {
119
- contents.push({
120
- type: "input_audio",
121
- audio: Buffer.from((0, import_agents.mergeFrames)(c.frame).data.buffer).toString("base64")
122
- });
123
- }
124
- }
125
- event = {
126
- type: "conversation.item.create",
127
- previous_item_id: previousItemId,
128
- item: {
129
- type: "message",
130
- role: "user",
131
- content: contents
132
- }
133
- };
134
- } else if (message.role === import_agents.llm.ChatRole.ASSISTANT) {
135
- const contents = [];
136
- for (const c of content) {
137
- if (typeof c === "string") {
138
- contents.push({
139
- type: "text",
140
- text: c
141
- });
142
- } else if (
143
- // typescript type guard for determining ChatAudio vs ChatImage
144
- ((c2) => {
145
- return c2.frame !== void 0;
146
- })(c)
147
- ) {
148
- this.#logger.warn("audio content in assistant message is not supported");
149
- }
150
- }
151
- event = {
152
- type: "conversation.item.create",
153
- previous_item_id: previousItemId,
154
- item: {
155
- type: "message",
156
- role: "assistant",
157
- content: contents
158
- }
159
- };
160
- } else if (message.role === import_agents.llm.ChatRole.SYSTEM) {
161
- const contents = [];
162
- for (const c of content) {
163
- if (typeof c === "string") {
164
- contents.push({
165
- type: "input_text",
166
- text: c
167
- });
168
- } else if (
169
- // typescript type guard for determining ChatAudio vs ChatImage
170
- ((c2) => {
171
- return c2.frame !== void 0;
172
- })(c)
173
- ) {
174
- this.#logger.warn("audio content in system message is not supported");
175
- }
176
- }
177
- event = {
178
- type: "conversation.item.create",
179
- previous_item_id: previousItemId,
180
- item: {
181
- type: "message",
182
- role: "system",
183
- content: contents
184
- }
185
- };
186
- } else {
187
- this.#logger.child({ message }).warn("chat message is not supported inside the realtime API");
188
- return;
110
+ const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
111
+ if (!apiKey && !isAzure) {
112
+ throw new Error(
113
+ "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
114
+ );
115
+ }
116
+ if (!options.baseURL && isAzure) {
117
+ const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
118
+ if (!azureEndpoint) {
119
+ throw new Error(
120
+ "Missing Azure endpoint. Please pass base_url or set AZURE_OPENAI_ENDPOINT environment variable."
121
+ );
189
122
  }
123
+ options.baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
190
124
  }
191
- this.#session.queueMsg(event);
192
- }
193
- }
194
- class Conversation {
195
- #session;
196
- constructor(session) {
197
- this.#session = session;
198
- }
199
- get item() {
200
- return new ConversationItem(this.#session);
201
- }
202
- }
203
- class Response {
204
- #session;
205
- constructor(session) {
206
- this.#session = session;
207
- }
208
- create() {
209
- this.#session.queueMsg({
210
- type: "response.create"
211
- });
212
- }
213
- cancel() {
214
- this.#session.queueMsg({
215
- type: "response.cancel"
216
- });
125
+ this._options = {
126
+ ...DEFAULT_REALTIME_MODEL_OPTIONS,
127
+ ...options,
128
+ baseURL: options.baseURL || BASE_URL,
129
+ apiKey,
130
+ isAzure,
131
+ model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model
132
+ };
217
133
  }
218
- }
219
- class RealtimeModel extends import_agents.multimodal.RealtimeModel {
220
- sampleRate = api_proto.SAMPLE_RATE;
221
- numChannels = api_proto.NUM_CHANNELS;
222
- inFrameSize = api_proto.IN_FRAME_SIZE;
223
- outFrameSize = api_proto.OUT_FRAME_SIZE;
224
- #defaultOpts;
225
- #sessions = [];
134
+ /**
135
+ * Create a RealtimeModel instance configured for Azure OpenAI Service.
136
+ *
137
+ * @param azureDeployment - The name of your Azure OpenAI deployment.
138
+ * @param azureEndpoint - The endpoint URL for your Azure OpenAI resource. If undefined, will attempt to read from the environment variable AZURE_OPENAI_ENDPOINT.
139
+ * @param apiVersion - API version to use with Azure OpenAI Service. If undefined, will attempt to read from the environment variable OPENAI_API_VERSION.
140
+ * @param apiKey - Azure OpenAI API key. If undefined, will attempt to read from the environment variable AZURE_OPENAI_API_KEY.
141
+ * @param entraToken - Azure Entra authentication token. Required if not using API key authentication.
142
+ * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
143
+ * @param voice - Voice setting for audio outputs. Defaults to "alloy".
144
+ * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
145
+ * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
146
+ * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
147
+ * @param speed - Speed of the audio output. Defaults to 1.0.
148
+ * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
149
+ * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
150
+ *
151
+ * @returns A RealtimeModel instance configured for Azure OpenAI Service.
152
+ *
153
+ * @throws Error if required Azure parameters are missing or invalid.
154
+ */
226
155
  static withAzure({
227
- baseURL,
228
156
  azureDeployment,
229
- apiVersion = "2024-10-01-preview",
230
- apiKey = void 0,
231
- entraToken = void 0,
232
- instructions = "",
233
- modalities = ["text", "audio"],
234
- voice = "alloy",
235
- inputAudioFormat = "pcm16",
236
- outputAudioFormat = "pcm16",
237
- inputAudioTranscription = { model: "whisper-1" },
238
- turnDetection = { type: "server_vad" },
239
- temperature = 0.8,
240
- maxResponseOutputTokens = Infinity
241
- }) {
242
- return new RealtimeModel({
243
- isAzure: true,
244
- baseURL: new URL("openai", baseURL).toString(),
245
- model: azureDeployment,
246
- apiVersion,
247
- apiKey,
248
- entraToken,
249
- instructions,
250
- modalities,
251
- voice,
252
- inputAudioFormat,
253
- outputAudioFormat,
254
- inputAudioTranscription,
255
- turnDetection,
256
- temperature,
257
- maxResponseOutputTokens
258
- });
259
- }
260
- constructor({
261
- modalities = ["text", "audio"],
262
- instructions = "",
157
+ azureEndpoint,
158
+ apiVersion,
159
+ apiKey,
160
+ entraToken,
161
+ baseURL,
263
162
  voice = "alloy",
264
- inputAudioFormat = "pcm16",
265
- outputAudioFormat = "pcm16",
266
- inputAudioTranscription = { model: "whisper-1" },
267
- turnDetection = { type: "server_vad" },
163
+ inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
164
+ turnDetection = AZURE_DEFAULT_TURN_DETECTION,
268
165
  temperature = 0.8,
269
- maxResponseOutputTokens = Infinity,
270
- model = "gpt-4o-realtime-preview-2024-10-01",
271
- apiKey = process.env.OPENAI_API_KEY || "",
272
- baseURL = api_proto.BASE_URL,
273
- // used for microsoft
274
- isAzure = false,
275
- apiVersion = void 0,
276
- entraToken = void 0
166
+ speed
277
167
  }) {
278
- super();
279
- if (apiKey === "" && !(isAzure && entraToken)) {
168
+ apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
169
+ if (!apiKey && !entraToken) {
280
170
  throw new Error(
281
- "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
171
+ "Missing credentials. Please pass one of `apiKey`, `entraToken`, or the `AZURE_OPENAI_API_KEY` environment variable."
282
172
  );
283
173
  }
284
- this.#defaultOpts = {
285
- modalities,
286
- instructions,
174
+ apiVersion = apiVersion || process.env.OPENAI_API_VERSION;
175
+ if (!apiVersion) {
176
+ throw new Error(
177
+ "Must provide either the `apiVersion` argument or the `OPENAI_API_VERSION` environment variable"
178
+ );
179
+ }
180
+ if (!baseURL) {
181
+ azureEndpoint = azureEndpoint || process.env.AZURE_OPENAI_ENDPOINT;
182
+ if (!azureEndpoint) {
183
+ throw new Error(
184
+ "Missing Azure endpoint. Please pass the `azure_endpoint` parameter or set the `AZURE_OPENAI_ENDPOINT` environment variable."
185
+ );
186
+ }
187
+ baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
188
+ }
189
+ return new RealtimeModel({
287
190
  voice,
288
- inputAudioFormat,
289
- outputAudioFormat,
290
191
  inputAudioTranscription,
291
192
  turnDetection,
292
193
  temperature,
293
- maxResponseOutputTokens,
294
- model,
194
+ speed,
295
195
  apiKey,
296
- baseURL,
297
- isAzure,
196
+ azureDeployment,
298
197
  apiVersion,
299
- entraToken
300
- };
301
- }
302
- get sessions() {
303
- return this.#sessions;
304
- }
305
- session({
306
- fncCtx,
307
- chatCtx,
308
- modalities = this.#defaultOpts.modalities,
309
- instructions = this.#defaultOpts.instructions,
310
- voice = this.#defaultOpts.voice,
311
- inputAudioFormat = this.#defaultOpts.inputAudioFormat,
312
- outputAudioFormat = this.#defaultOpts.outputAudioFormat,
313
- inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
314
- turnDetection = this.#defaultOpts.turnDetection,
315
- temperature = this.#defaultOpts.temperature,
316
- maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens
317
- }) {
318
- const opts = {
319
- modalities,
320
- instructions,
321
- voice,
322
- inputAudioFormat,
323
- outputAudioFormat,
324
- inputAudioTranscription,
325
- turnDetection,
326
- temperature,
327
- maxResponseOutputTokens,
328
- model: this.#defaultOpts.model,
329
- apiKey: this.#defaultOpts.apiKey,
330
- baseURL: this.#defaultOpts.baseURL,
331
- isAzure: this.#defaultOpts.isAzure,
332
- apiVersion: this.#defaultOpts.apiVersion,
333
- entraToken: this.#defaultOpts.entraToken
334
- };
335
- const newSession = new RealtimeSession(opts, {
336
- chatCtx: chatCtx || new import_agents.llm.ChatContext(),
337
- fncCtx
198
+ entraToken,
199
+ baseURL
338
200
  });
339
- this.#sessions.push(newSession);
340
- return newSession;
201
+ }
202
+ session() {
203
+ return new RealtimeSession(this);
341
204
  }
342
205
  async close() {
343
- await Promise.allSettled(this.#sessions.map((session) => session.close()));
206
+ return;
207
+ }
208
+ }
209
+ function processBaseURL({
210
+ baseURL,
211
+ model,
212
+ isAzure = false,
213
+ azureDeployment,
214
+ apiVersion
215
+ }) {
216
+ const url = new URL([baseURL, "realtime"].join("/"));
217
+ if (url.protocol === "https:") {
218
+ url.protocol = "wss:";
219
+ }
220
+ if (!url.pathname || ["", "/v1", "/openai"].includes(url.pathname.replace(/\/$/, ""))) {
221
+ url.pathname = url.pathname.replace(/\/$/, "") + "/realtime";
222
+ } else {
223
+ url.pathname = url.pathname.replace(/\/$/, "");
224
+ }
225
+ const queryParams = {};
226
+ if (isAzure) {
227
+ if (apiVersion) {
228
+ queryParams["api-version"] = apiVersion;
229
+ }
230
+ if (azureDeployment) {
231
+ queryParams["deployment"] = azureDeployment;
232
+ }
233
+ } else {
234
+ queryParams["model"] = model;
235
+ }
236
+ for (const [key, value] of Object.entries(queryParams)) {
237
+ url.searchParams.set(key, value);
344
238
  }
239
+ return url.toString();
345
240
  }
346
- class RealtimeSession extends import_agents.multimodal.RealtimeSession {
347
- #chatCtx = void 0;
348
- #fncCtx = void 0;
349
- #opts;
350
- #pendingResponses = {};
351
- #sessionId = "not-connected";
352
- #ws = null;
353
- #expiresAt = null;
241
+ class RealtimeSession extends import_agents.llm.RealtimeSession {
242
+ _tools = {};
243
+ remoteChatCtx = new import_agents.llm.RemoteChatContext();
244
+ messageChannel = new import_agents.Queue();
245
+ inputResampler;
246
+ instructions;
247
+ oaiRealtimeModel;
248
+ currentGeneration;
249
+ responseCreatedFutures = {};
250
+ textModeRecoveryRetries = 0;
251
+ itemCreateFutures = {};
252
+ itemDeleteFutures = {};
253
+ updateChatCtxLock = new import_mutex.Mutex();
254
+ updateFuncCtxLock = new import_mutex.Mutex();
255
+ // 100ms chunks
256
+ bstream = new import_agents.AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, SAMPLE_RATE / 10);
257
+ pushedDurationMs = 0;
354
258
  #logger = (0, import_agents.log)();
355
259
  #task;
356
- #closing = true;
357
- #sendQueue = new import_agents.Queue();
358
- constructor(opts, { fncCtx, chatCtx }) {
359
- super();
360
- this.#opts = opts;
361
- this.#chatCtx = chatCtx;
362
- this.#fncCtx = fncCtx;
363
- this.#task = this.#start();
364
- this.sessionUpdate({
365
- modalities: this.#opts.modalities,
366
- instructions: this.#opts.instructions,
367
- voice: this.#opts.voice,
368
- inputAudioFormat: this.#opts.inputAudioFormat,
369
- outputAudioFormat: this.#opts.outputAudioFormat,
370
- inputAudioTranscription: this.#opts.inputAudioTranscription,
371
- turnDetection: this.#opts.turnDetection,
372
- temperature: this.#opts.temperature,
373
- maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
374
- toolChoice: "auto"
375
- });
260
+ #closed = false;
261
+ constructor(realtimeModel) {
262
+ super(realtimeModel);
263
+ this.oaiRealtimeModel = realtimeModel;
264
+ this.#task = this.#mainTask();
265
+ this.sendEvent(this.createSessionUpdateEvent());
266
+ }
267
+ sendEvent(command) {
268
+ this.messageChannel.put(command);
269
+ }
270
+ createSessionUpdateEvent() {
271
+ return {
272
+ type: "session.update",
273
+ session: {
274
+ model: this.oaiRealtimeModel._options.model,
275
+ voice: this.oaiRealtimeModel._options.voice,
276
+ input_audio_format: "pcm16",
277
+ output_audio_format: "pcm16",
278
+ modalities: ["text", "audio"],
279
+ turn_detection: this.oaiRealtimeModel._options.turnDetection,
280
+ input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
281
+ // TODO(shubhra): add inputAudioNoiseReduction
282
+ temperature: this.oaiRealtimeModel._options.temperature,
283
+ tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
284
+ max_response_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
285
+ // TODO(shubhra): add tracing options
286
+ instructions: this.instructions,
287
+ speed: this.oaiRealtimeModel._options.speed
288
+ }
289
+ };
376
290
  }
377
291
  get chatCtx() {
378
- return this.#chatCtx;
292
+ return this.remoteChatCtx.toChatCtx();
379
293
  }
380
- get fncCtx() {
381
- return this.#fncCtx;
294
+ get tools() {
295
+ return { ...this._tools };
382
296
  }
383
- set fncCtx(ctx) {
384
- this.#fncCtx = ctx;
297
+ async updateChatCtx(_chatCtx) {
298
+ const unlock = await this.updateChatCtxLock.lock();
299
+ const events = this.createChatCtxUpdateEvents(_chatCtx);
300
+ const futures = [];
301
+ for (const event of events) {
302
+ const future = new import_agents.Future();
303
+ futures.push(future);
304
+ if (event.type === "conversation.item.create") {
305
+ this.itemCreateFutures[event.item.id] = future;
306
+ } else if (event.type == "conversation.item.delete") {
307
+ this.itemDeleteFutures[event.item_id] = future;
308
+ }
309
+ this.sendEvent(event);
310
+ }
311
+ if (futures.length === 0) {
312
+ unlock();
313
+ return;
314
+ }
315
+ try {
316
+ await Promise.race([
317
+ Promise.all(futures),
318
+ (0, import_agents.delay)(5e3).then(() => {
319
+ throw new Error("Chat ctx update events timed out");
320
+ })
321
+ ]);
322
+ } catch (e) {
323
+ this.#logger.error(e.message);
324
+ throw e;
325
+ } finally {
326
+ unlock();
327
+ }
385
328
  }
386
- get conversation() {
387
- return new Conversation(this);
329
+ createChatCtxUpdateEvents(chatCtx, addMockAudio = false) {
330
+ const newChatCtx = chatCtx.copy();
331
+ if (addMockAudio) {
332
+ newChatCtx.items.push(createMockAudioItem());
333
+ } else {
334
+ newChatCtx.items = newChatCtx.items.filter(
335
+ (item) => !item.id.startsWith(MOCK_AUDIO_ID_PREFIX)
336
+ );
337
+ }
338
+ const events = [];
339
+ const diffOps = import_agents.llm.computeChatCtxDiff(this.chatCtx, newChatCtx);
340
+ for (const op of diffOps.toRemove) {
341
+ events.push({
342
+ type: "conversation.item.delete",
343
+ item_id: op,
344
+ event_id: (0, import_agents.shortuuid)("chat_ctx_delete_")
345
+ });
346
+ }
347
+ for (const [previousId, id] of diffOps.toCreate) {
348
+ const chatItem = newChatCtx.getById(id);
349
+ if (!chatItem) {
350
+ throw new Error(`Chat item ${id} not found`);
351
+ }
352
+ events.push({
353
+ type: "conversation.item.create",
354
+ item: livekitItemToOpenAIItem(chatItem),
355
+ previous_item_id: previousId ?? void 0,
356
+ event_id: (0, import_agents.shortuuid)("chat_ctx_create_")
357
+ });
358
+ }
359
+ return events;
388
360
  }
389
- get inputAudioBuffer() {
390
- return new InputAudioBuffer(this);
361
+ async updateTools(_tools) {
362
+ const unlock = await this.updateFuncCtxLock.lock();
363
+ const ev = this.createToolsUpdateEvent(_tools);
364
+ this.sendEvent(ev);
365
+ if (!ev.session.tools) {
366
+ throw new Error("Tools are missing in the session update event");
367
+ }
368
+ const retainedToolNames = new Set(ev.session.tools.map((tool) => tool.name));
369
+ const retainedTools = Object.fromEntries(
370
+ Object.entries(_tools).filter(
371
+ ([name, tool]) => import_agents.llm.isFunctionTool(tool) && retainedToolNames.has(name)
372
+ )
373
+ );
374
+ this._tools = retainedTools;
375
+ unlock();
391
376
  }
392
- get response() {
393
- return new Response(this);
377
+ createToolsUpdateEvent(_tools) {
378
+ const oaiTools = [];
379
+ for (const [name, tool] of Object.entries(_tools)) {
380
+ if (!import_agents.llm.isFunctionTool(tool)) {
381
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
382
+ continue;
383
+ }
384
+ const { parameters: toolParameters, description } = tool;
385
+ try {
386
+ const parameters = import_agents.llm.toJsonSchema(
387
+ toolParameters
388
+ );
389
+ oaiTools.push({
390
+ name,
391
+ description,
392
+ parameters,
393
+ type: "function"
394
+ });
395
+ } catch (e) {
396
+ this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
397
+ continue;
398
+ }
399
+ }
400
+ return {
401
+ type: "session.update",
402
+ session: {
403
+ model: this.oaiRealtimeModel._options.model,
404
+ tools: oaiTools
405
+ },
406
+ event_id: (0, import_agents.shortuuid)("tools_update_")
407
+ };
408
+ }
409
+ async updateInstructions(_instructions) {
410
+ const eventId = (0, import_agents.shortuuid)("instructions_update_");
411
+ this.sendEvent({
412
+ type: "session.update",
413
+ session: {
414
+ instructions: _instructions
415
+ },
416
+ event_id: eventId
417
+ });
418
+ this.instructions = _instructions;
419
+ }
420
+ updateOptions({ toolChoice }) {
421
+ const options = {};
422
+ this.oaiRealtimeModel._options.toolChoice = toolChoice;
423
+ options.tool_choice = toOaiToolChoice(toolChoice);
424
+ this.sendEvent({
425
+ type: "session.update",
426
+ session: options,
427
+ event_id: (0, import_agents.shortuuid)("options_update_")
428
+ });
429
+ }
430
+ pushAudio(frame) {
431
+ for (const f of this.resampleAudio(frame)) {
432
+ for (const nf of this.bstream.write(f.data.buffer)) {
433
+ this.sendEvent({
434
+ type: "input_audio_buffer.append",
435
+ audio: Buffer.from(nf.data.buffer).toString("base64")
436
+ });
437
+ this.pushedDurationMs += nf.samplesPerChannel / nf.sampleRate * 1e3;
438
+ }
439
+ }
394
440
  }
395
- get expiration() {
396
- if (!this.#expiresAt) {
397
- throw new Error("session not started");
441
+ async commitAudio() {
442
+ if (this.pushedDurationMs > 100) {
443
+ this.sendEvent({
444
+ type: "input_audio_buffer.commit"
445
+ });
446
+ this.pushedDurationMs = 0;
398
447
  }
399
- return this.#expiresAt * 1e3;
400
448
  }
401
- queueMsg(command) {
402
- this.#sendQueue.put(command);
449
+ async clearAudio() {
450
+ this.sendEvent({
451
+ type: "input_audio_buffer.clear"
452
+ });
453
+ this.pushedDurationMs = 0;
454
+ }
455
+ async generateReply(instructions) {
456
+ const handle = this.createResponse({ instructions, userInitiated: true });
457
+ this.textModeRecoveryRetries = 0;
458
+ return handle.doneFut.await;
459
+ }
460
+ async interrupt() {
461
+ this.sendEvent({
462
+ type: "response.cancel"
463
+ });
464
+ }
465
+ async truncate(_options) {
466
+ this.sendEvent({
467
+ type: "conversation.item.truncate",
468
+ content_index: 0,
469
+ item_id: _options.messageId,
470
+ audio_end_ms: _options.audioEndMs
471
+ });
403
472
  }
404
473
  /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
405
474
  /// with large amounts of base64 audio data.
@@ -420,549 +489,716 @@ class RealtimeSession extends import_agents.multimodal.RealtimeSession {
420
489
  }
421
490
  return untypedEvent;
422
491
  }
423
- sessionUpdate({
424
- modalities = this.#opts.modalities,
425
- instructions = this.#opts.instructions,
426
- voice = this.#opts.voice,
427
- inputAudioFormat = this.#opts.inputAudioFormat,
428
- outputAudioFormat = this.#opts.outputAudioFormat,
429
- inputAudioTranscription = this.#opts.inputAudioTranscription,
430
- turnDetection = this.#opts.turnDetection,
431
- temperature = this.#opts.temperature,
432
- maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
433
- toolChoice = "auto",
434
- selectedTools = Object.keys(this.#fncCtx || {})
435
- }) {
436
- this.#opts = {
437
- modalities,
438
- instructions,
439
- voice,
440
- inputAudioFormat,
441
- outputAudioFormat,
442
- inputAudioTranscription,
443
- turnDetection,
444
- temperature,
445
- maxResponseOutputTokens,
446
- model: this.#opts.model,
447
- apiKey: this.#opts.apiKey,
448
- baseURL: this.#opts.baseURL,
449
- isAzure: this.#opts.isAzure,
450
- apiVersion: this.#opts.apiVersion,
451
- entraToken: this.#opts.entraToken
452
- };
453
- const tools = this.#fncCtx ? Object.entries(this.#fncCtx).filter(([name]) => selectedTools.includes(name)).map(([name, func]) => ({
454
- type: "function",
455
- name,
456
- description: func.description,
457
- parameters: (
458
- // don't format parameters if they are raw openai params
459
- func.parameters.type == "object" ? func.parameters : import_agents.llm.oaiParams(func.parameters)
460
- )
461
- })) : [];
462
- const sessionUpdateEvent = {
463
- type: "session.update",
464
- session: {
465
- modalities: this.#opts.modalities,
466
- instructions: this.#opts.instructions,
467
- voice: this.#opts.voice,
468
- input_audio_format: this.#opts.inputAudioFormat,
469
- output_audio_format: this.#opts.outputAudioFormat,
470
- input_audio_transcription: this.#opts.inputAudioTranscription,
471
- turn_detection: this.#opts.turnDetection,
472
- temperature: this.#opts.temperature,
473
- max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity ? "inf" : this.#opts.maxResponseOutputTokens,
474
- tools,
475
- tool_choice: toolChoice
476
- }
492
+ async createWsConn() {
493
+ const headers = {
494
+ "User-Agent": "LiveKit-Agents-JS"
477
495
  };
478
- if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
479
- sessionUpdateEvent.session.max_response_output_tokens = void 0;
480
- }
481
- this.queueMsg(sessionUpdateEvent);
482
- }
483
- /** Create an empty audio message with the given duration. */
484
- #createEmptyUserAudioMessage(duration) {
485
- const samples = duration * api_proto.SAMPLE_RATE;
486
- return new import_agents.llm.ChatMessage({
487
- role: import_agents.llm.ChatRole.USER,
488
- content: {
489
- frame: new import_rtc_node.AudioFrame(
490
- new Int16Array(samples * api_proto.NUM_CHANNELS),
491
- api_proto.SAMPLE_RATE,
492
- api_proto.NUM_CHANNELS,
493
- samples
494
- )
496
+ if (this.oaiRealtimeModel._options.isAzure) {
497
+ if (this.oaiRealtimeModel._options.entraToken) {
498
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.entraToken}`;
499
+ } else if (this.oaiRealtimeModel._options.apiKey) {
500
+ headers["api-key"] = this.oaiRealtimeModel._options.apiKey;
501
+ } else {
502
+ throw new Error("Microsoft API key or entraToken is required");
495
503
  }
496
- });
497
- }
498
- /**
499
- * Try to recover from a text response to audio mode.
500
- *
501
- * @remarks
502
- * Sometimes the OpenAI Realtime API returns text instead of audio responses.
503
- * This method tries to recover from this by requesting a new response after deleting the text
504
- * response and creating an empty user audio message.
505
- */
506
- recoverFromTextResponse(itemId) {
507
- if (itemId) {
508
- this.conversation.item.delete(itemId);
504
+ } else {
505
+ headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
506
+ headers["OpenAI-Beta"] = "realtime=v1";
509
507
  }
510
- this.conversation.item.create(this.#createEmptyUserAudioMessage(1));
511
- this.response.create();
508
+ const url = processBaseURL({
509
+ baseURL: this.oaiRealtimeModel._options.baseURL,
510
+ model: this.oaiRealtimeModel._options.model,
511
+ isAzure: this.oaiRealtimeModel._options.isAzure,
512
+ apiVersion: this.oaiRealtimeModel._options.apiVersion,
513
+ azureDeployment: this.oaiRealtimeModel._options.azureDeployment
514
+ });
515
+ this.#logger.debug(`Connecting to OpenAI Realtime API at ${url}`);
516
+ return new Promise((resolve, reject) => {
517
+ const ws = new import_ws.WebSocket(url, { headers });
518
+ let waiting = true;
519
+ const timeout = setTimeout(() => {
520
+ ws.close();
521
+ reject(new Error("WebSocket connection timeout"));
522
+ }, this.oaiRealtimeModel._options.connOptions.timeoutMs);
523
+ ws.once("open", () => {
524
+ if (!waiting) return;
525
+ waiting = false;
526
+ clearTimeout(timeout);
527
+ resolve(ws);
528
+ });
529
+ ws.once("close", () => {
530
+ if (!waiting) return;
531
+ waiting = false;
532
+ clearTimeout(timeout);
533
+ reject(new Error("OpenAI Realtime API connection closed"));
534
+ });
535
+ });
512
536
  }
513
- #start() {
514
- return new Promise(async (resolve, reject) => {
515
- const headers = {
516
- "User-Agent": "LiveKit-Agents-JS"
517
- };
518
- if (this.#opts.isAzure) {
519
- if (this.#opts.entraToken) {
520
- headers.Authorization = `Bearer ${this.#opts.entraToken}`;
521
- } else if (this.#opts.apiKey) {
522
- headers["api-key"] = this.#opts.apiKey;
523
- } else {
524
- reject(new Error("Microsoft API key or entraToken is required"));
525
- return;
526
- }
527
- } else {
528
- headers.Authorization = `Bearer ${this.#opts.apiKey}`;
529
- headers["OpenAI-Beta"] = "realtime=v1";
530
- }
531
- const url = new URL([this.#opts.baseURL, "realtime"].join("/"));
532
- if (url.protocol === "https:") {
533
- url.protocol = "wss:";
534
- }
535
- const queryParams = {};
536
- if (this.#opts.isAzure) {
537
- queryParams["api-version"] = this.#opts.apiVersion ?? "2024-10-01-preview";
538
- queryParams["deployment"] = this.#opts.model;
539
- } else {
540
- queryParams["model"] = this.#opts.model;
541
- }
542
- for (const [key, value] of Object.entries(queryParams)) {
543
- url.searchParams.set(key, value);
537
+ async #mainTask() {
538
+ let reconnecting = false;
539
+ let numRetries = 0;
540
+ let wsConn = null;
541
+ const maxRetries = this.oaiRealtimeModel._options.connOptions.maxRetry;
542
+ const reconnect = async () => {
543
+ this.#logger.debug(
544
+ {
545
+ maxSessionDuration: this.oaiRealtimeModel._options.maxSessionDuration
546
+ },
547
+ "Reconnecting to OpenAI Realtime API"
548
+ );
549
+ const events = [];
550
+ events.push(this.createSessionUpdateEvent());
551
+ if (Object.keys(this._tools).length > 0) {
552
+ events.push(this.createToolsUpdateEvent(this._tools));
544
553
  }
545
- console.debug("Connecting to OpenAI Realtime API at ", url.toString());
546
- this.#ws = new import_ws.WebSocket(url.toString(), {
547
- headers
554
+ const chatCtx = this.chatCtx.copy({
555
+ excludeFunctionCall: true,
556
+ excludeInstructions: true,
557
+ excludeEmptyMessage: true
548
558
  });
549
- this.#ws.onerror = (error) => {
550
- reject(new Error("OpenAI Realtime WebSocket error: " + error.message));
551
- };
552
- await (0, import_node_events.once)(this.#ws, "open");
553
- this.#closing = false;
554
- this.#ws.onmessage = (message) => {
555
- const event = JSON.parse(message.data);
556
- this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
557
- switch (event.type) {
558
- case "error":
559
- this.#handleError(event);
560
- break;
561
- case "session.created":
562
- this.#handleSessionCreated(event);
563
- break;
564
- case "session.updated":
565
- this.#handleSessionUpdated(event);
566
- break;
567
- case "conversation.created":
568
- this.#handleConversationCreated(event);
569
- break;
570
- case "input_audio_buffer.committed":
571
- this.#handleInputAudioBufferCommitted(event);
572
- break;
573
- case "input_audio_buffer.cleared":
574
- this.#handleInputAudioBufferCleared(event);
575
- break;
576
- case "input_audio_buffer.speech_started":
577
- this.#handleInputAudioBufferSpeechStarted(event);
578
- break;
579
- case "input_audio_buffer.speech_stopped":
580
- this.#handleInputAudioBufferSpeechStopped(event);
581
- break;
582
- case "conversation.item.created":
583
- this.#handleConversationItemCreated(event);
584
- break;
585
- case "conversation.item.input_audio_transcription.completed":
586
- this.#handleConversationItemInputAudioTranscriptionCompleted(event);
587
- break;
588
- case "conversation.item.input_audio_transcription.failed":
589
- this.#handleConversationItemInputAudioTranscriptionFailed(event);
590
- break;
591
- case "conversation.item.truncated":
592
- this.#handleConversationItemTruncated(event);
593
- break;
594
- case "conversation.item.deleted":
595
- this.#handleConversationItemDeleted(event);
596
- break;
597
- case "response.created":
598
- this.#handleResponseCreated(event);
599
- break;
600
- case "response.done":
601
- this.#handleResponseDone(event);
602
- break;
603
- case "response.output_item.added":
604
- this.#handleResponseOutputItemAdded(event);
605
- break;
606
- case "response.output_item.done":
607
- this.#handleResponseOutputItemDone(event);
608
- break;
609
- case "response.content_part.added":
610
- this.#handleResponseContentPartAdded(event);
611
- break;
612
- case "response.content_part.done":
613
- this.#handleResponseContentPartDone(event);
614
- break;
615
- case "response.text.delta":
616
- this.#handleResponseTextDelta(event);
617
- break;
618
- case "response.text.done":
619
- this.#handleResponseTextDone(event);
620
- break;
621
- case "response.audio_transcript.delta":
622
- this.#handleResponseAudioTranscriptDelta(event);
623
- break;
624
- case "response.audio_transcript.done":
625
- this.#handleResponseAudioTranscriptDone(event);
626
- break;
627
- case "response.audio.delta":
628
- this.#handleResponseAudioDelta(event);
629
- break;
630
- case "response.audio.done":
631
- this.#handleResponseAudioDone(event);
632
- break;
633
- case "response.function_call_arguments.delta":
634
- this.#handleResponseFunctionCallArgumentsDelta(event);
635
- break;
636
- case "response.function_call_arguments.done":
637
- this.#handleResponseFunctionCallArgumentsDone(event);
638
- break;
639
- case "rate_limits.updated":
640
- this.#handleRateLimitsUpdated(event);
641
- break;
559
+ const oldChatCtx = this.remoteChatCtx;
560
+ this.remoteChatCtx = new import_agents.llm.RemoteChatContext();
561
+ events.push(...this.createChatCtxUpdateEvents(chatCtx));
562
+ try {
563
+ for (const ev of events) {
564
+ this.emit("openai_client_event_queued", ev);
565
+ wsConn.send(JSON.stringify(ev));
642
566
  }
643
- };
644
- const sendTask = async () => {
645
- while (this.#ws && !this.#closing && this.#ws.readyState === import_ws.WebSocket.OPEN) {
646
- try {
647
- const event = await this.#sendQueue.get();
648
- if (event.type !== "input_audio_buffer.append") {
649
- this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
650
- }
651
- this.#ws.send(JSON.stringify(event));
652
- } catch (error) {
653
- this.#logger.error("Error sending event:", error);
654
- }
567
+ } catch (error) {
568
+ this.remoteChatCtx = oldChatCtx;
569
+ throw new import_agents.APIConnectionError({
570
+ message: "Failed to send message to OpenAI Realtime API during session re-connection"
571
+ });
572
+ }
573
+ this.#logger.debug("Reconnected to OpenAI Realtime API");
574
+ this.emit("session_reconnected", {});
575
+ };
576
+ reconnecting = false;
577
+ while (!this.#closed) {
578
+ this.#logger.debug("Creating WebSocket connection to OpenAI Realtime API");
579
+ wsConn = await this.createWsConn();
580
+ try {
581
+ if (reconnecting) {
582
+ await reconnect();
583
+ numRetries = 0;
655
584
  }
656
- };
657
- sendTask();
658
- this.#ws.onclose = () => {
659
- if (this.#expiresAt && Date.now() >= this.#expiresAt * 1e3) {
660
- this.#closing = true;
585
+ await this.runWs(wsConn);
586
+ } catch (error) {
587
+ if (!(0, import_agents.isAPIError)(error)) {
588
+ this.emitError({ error, recoverable: false });
589
+ throw error;
661
590
  }
662
- if (!this.#closing) {
663
- reject(new Error("OpenAI Realtime connection closed unexpectedly"));
591
+ if (maxRetries === 0 || !error.retryable) {
592
+ this.emitError({ error, recoverable: false });
593
+ throw error;
664
594
  }
665
- this.#ws = null;
666
- resolve();
667
- };
595
+ if (numRetries === maxRetries) {
596
+ this.emitError({ error, recoverable: false });
597
+ throw new import_agents.APIConnectionError({
598
+ message: `OpenAI Realtime API connection failed after ${numRetries} attempts`,
599
+ options: {
600
+ body: error,
601
+ retryable: false
602
+ }
603
+ });
604
+ }
605
+ this.emitError({ error, recoverable: true });
606
+ const retryInterval = numRetries === 0 ? DEFAULT_FIRST_RETRY_INTERVAL_MS : this.oaiRealtimeModel._options.connOptions.retryIntervalMs;
607
+ this.#logger.warn(
608
+ {
609
+ attempt: numRetries,
610
+ maxRetries,
611
+ error
612
+ },
613
+ `OpenAI Realtime API connection failed, retrying in ${retryInterval / 1e3}s`
614
+ );
615
+ await (0, import_agents.delay)(retryInterval);
616
+ numRetries++;
617
+ }
618
+ reconnecting = true;
619
+ }
620
+ }
621
+ async runWs(wsConn) {
622
+ const forwardEvents = async (signal) => {
623
+ while (!this.#closed && wsConn.readyState === import_ws.WebSocket.OPEN && !signal.aborted) {
624
+ try {
625
+ const event = await this.messageChannel.get();
626
+ if (signal.aborted) {
627
+ break;
628
+ }
629
+ if (event.type !== "input_audio_buffer.append") {
630
+ this.#logger.debug(`(client) -> ${JSON.stringify(this.#loggableEvent(event))}`);
631
+ }
632
+ this.emit("openai_client_event_queued", event);
633
+ wsConn.send(JSON.stringify(event));
634
+ } catch (error) {
635
+ break;
636
+ }
637
+ }
638
+ wsConn.close();
639
+ };
640
+ const wsCloseFuture = new import_agents.Future();
641
+ wsConn.onerror = (error) => {
642
+ wsCloseFuture.resolve(new import_agents.APIConnectionError({ message: error.message }));
643
+ };
644
+ wsConn.onclose = () => {
645
+ wsCloseFuture.resolve();
646
+ };
647
+ wsConn.onmessage = (message) => {
648
+ const event = JSON.parse(message.data);
649
+ this.emit("openai_server_event_received", event);
650
+ this.#logger.debug(`(server) <- ${JSON.stringify(this.#loggableEvent(event))}`);
651
+ switch (event.type) {
652
+ case "input_audio_buffer.speech_started":
653
+ this.handleInputAudioBufferSpeechStarted(event);
654
+ break;
655
+ case "input_audio_buffer.speech_stopped":
656
+ this.handleInputAudioBufferSpeechStopped(event);
657
+ break;
658
+ case "response.created":
659
+ this.handleResponseCreated(event);
660
+ break;
661
+ case "response.output_item.added":
662
+ this.handleResponseOutputItemAdded(event);
663
+ break;
664
+ case "conversation.item.created":
665
+ this.handleConversationItemCreated(event);
666
+ break;
667
+ case "conversation.item.deleted":
668
+ this.handleConversationItemDeleted(event);
669
+ break;
670
+ case "conversation.item.input_audio_transcription.completed":
671
+ this.handleConversationItemInputAudioTranscriptionCompleted(event);
672
+ break;
673
+ case "conversation.item.input_audio_transcription.failed":
674
+ this.handleConversationItemInputAudioTranscriptionFailed(event);
675
+ break;
676
+ case "response.content_part.added":
677
+ this.handleResponseContentPartAdded(event);
678
+ break;
679
+ case "response.content_part.done":
680
+ this.handleResponseContentPartDone(event);
681
+ break;
682
+ case "response.audio_transcript.delta":
683
+ this.handleResponseAudioTranscriptDelta(event);
684
+ break;
685
+ case "response.audio.delta":
686
+ this.handleResponseAudioDelta(event);
687
+ break;
688
+ case "response.audio_transcript.done":
689
+ this.handleResponseAudioTranscriptDone(event);
690
+ break;
691
+ case "response.audio.done":
692
+ this.handleResponseAudioDone(event);
693
+ break;
694
+ case "response.output_item.done":
695
+ this.handleResponseOutputItemDone(event);
696
+ break;
697
+ case "response.done":
698
+ this.handleResponseDone(event);
699
+ break;
700
+ case "error":
701
+ this.handleError(event);
702
+ break;
703
+ default:
704
+ this.#logger.debug(`unhandled event: ${event.type}`);
705
+ break;
706
+ }
707
+ };
708
+ const sendTask = import_agents.Task.from(({ signal }) => forwardEvents(signal));
709
+ const wsTask = import_agents.Task.from(({ signal }) => {
710
+ const abortPromise = new Promise((resolve) => {
711
+ signal.addEventListener("abort", () => {
712
+ resolve();
713
+ });
714
+ });
715
+ return Promise.race([wsCloseFuture.await, abortPromise]);
668
716
  });
717
+ const waitReconnectTask = import_agents.Task.from(async ({ signal }) => {
718
+ await (0, import_agents.delay)(this.oaiRealtimeModel._options.maxSessionDuration, { signal });
719
+ return new import_agents.APIConnectionError({
720
+ message: "OpenAI Realtime API connection timeout"
721
+ });
722
+ });
723
+ try {
724
+ const result = await Promise.race([wsTask.result, sendTask.result, waitReconnectTask.result]);
725
+ if (waitReconnectTask.done && this.currentGeneration) {
726
+ await this.currentGeneration._doneFut.await;
727
+ }
728
+ if (result instanceof Error) {
729
+ throw result;
730
+ }
731
+ } finally {
732
+ await (0, import_agents.cancelAndWait)([wsTask, sendTask, waitReconnectTask], 2e3);
733
+ wsConn.close();
734
+ }
669
735
  }
670
736
  async close() {
671
- if (!this.#ws) return;
672
- this.#closing = true;
673
- this.#ws.close();
737
+ super.close();
738
+ this.#closed = true;
674
739
  await this.#task;
675
740
  }
676
- #getContent(ptr) {
677
- const response = this.#pendingResponses[ptr.response_id];
678
- const output = response.output[ptr.output_index];
679
- const content = output.content[ptr.content_index];
680
- return content;
741
+ handleInputAudioBufferSpeechStarted(_event) {
742
+ this.emit("input_speech_started", {});
681
743
  }
682
- #handleError(event) {
683
- this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
744
+ handleInputAudioBufferSpeechStopped(_event) {
745
+ this.emit("input_speech_stopped", {
746
+ userTranscriptionEnabled: this.oaiRealtimeModel._options.inputAudioTranscription !== null
747
+ });
684
748
  }
685
- #handleSessionCreated(event) {
686
- this.#sessionId = event.session.id;
687
- this.#expiresAt = event.session.expires_at;
688
- this.#logger = this.#logger.child({ sessionId: this.#sessionId });
749
+ handleResponseCreated(event) {
750
+ if (!event.response.id) {
751
+ throw new Error("response.id is missing");
752
+ }
753
+ this.currentGeneration = {
754
+ messageChannel: import_agents.stream.createStreamChannel(),
755
+ functionChannel: import_agents.stream.createStreamChannel(),
756
+ messages: /* @__PURE__ */ new Map(),
757
+ _doneFut: new import_agents.Future(),
758
+ _createdTimestamp: Date.now()
759
+ };
760
+ if (!event.response.metadata || !event.response.metadata.client_event_id) return;
761
+ const handle = this.responseCreatedFutures[event.response.metadata.client_event_id];
762
+ if (handle) {
763
+ delete this.responseCreatedFutures[event.response.metadata.client_event_id];
764
+ this.responseCreatedFutures[event.response.id] = handle;
765
+ }
766
+ this.emit("generation_created", {
767
+ messageStream: this.currentGeneration.messageChannel.stream(),
768
+ functionStream: this.currentGeneration.functionChannel.stream(),
769
+ userInitiated: false
770
+ });
689
771
  }
690
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
691
- #handleSessionUpdated(event) {
772
+ handleResponseOutputItemAdded(event) {
773
+ if (!this.currentGeneration) {
774
+ throw new Error("currentGeneration is not set");
775
+ }
776
+ if (!event.item.type) {
777
+ throw new Error("item.type is not set");
778
+ }
779
+ if (!event.response_id) {
780
+ throw new Error("response_id is not set");
781
+ }
782
+ const itemType = event.item.type;
783
+ const responseId = event.response_id;
784
+ if (itemType !== "message") {
785
+ this.emitGenerationEvent(responseId);
786
+ this.textModeRecoveryRetries = 0;
787
+ return;
788
+ }
692
789
  }
693
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
694
- #handleConversationCreated(event) {
790
+ handleConversationItemCreated(event) {
791
+ if (!event.item.id) {
792
+ throw new Error("item.id is not set");
793
+ }
794
+ try {
795
+ this.remoteChatCtx.insert(event.previous_item_id, openAIItemToLivekitItem(event.item));
796
+ } catch (error) {
797
+ this.#logger.error({ error, itemId: event.item.id }, "failed to insert conversation item");
798
+ }
799
+ const fut = this.itemCreateFutures[event.item.id];
800
+ if (fut) {
801
+ fut.resolve();
802
+ delete this.itemCreateFutures[event.item.id];
803
+ }
695
804
  }
696
- #handleInputAudioBufferCommitted(event) {
697
- this.emit("input_speech_committed", {
698
- itemId: event.item_id
805
+ handleConversationItemDeleted(event) {
806
+ if (!event.item_id) {
807
+ throw new Error("item_id is not set");
808
+ }
809
+ try {
810
+ this.remoteChatCtx.delete(event.item_id);
811
+ } catch (error) {
812
+ this.#logger.error({ error, itemId: event.item_id }, "failed to delete conversation item");
813
+ }
814
+ const fut = this.itemDeleteFutures[event.item_id];
815
+ if (fut) {
816
+ fut.resolve();
817
+ delete this.itemDeleteFutures[event.item_id];
818
+ }
819
+ }
820
+ handleConversationItemInputAudioTranscriptionCompleted(event) {
821
+ const remoteItem = this.remoteChatCtx.get(event.item_id);
822
+ if (!remoteItem) {
823
+ return;
824
+ }
825
+ const item = remoteItem.item;
826
+ if (item instanceof import_agents.llm.ChatMessage) {
827
+ item.content.push(event.transcript);
828
+ } else {
829
+ throw new Error("item is not a chat message");
830
+ }
831
+ this.emit("input_audio_transcription_completed", {
832
+ itemId: event.item_id,
833
+ transcript: event.transcript,
834
+ isFinal: true
699
835
  });
700
836
  }
701
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
702
- #handleInputAudioBufferCleared(event) {
837
+ handleConversationItemInputAudioTranscriptionFailed(event) {
838
+ this.#logger.error(
839
+ { error: event.error },
840
+ "OpenAI Realtime API failed to transcribe input audio"
841
+ );
703
842
  }
704
- #handleInputAudioBufferSpeechStarted(event) {
705
- this.emit("input_speech_started", {
706
- itemId: event.item_id
707
- });
843
+ handleResponseContentPartAdded(event) {
844
+ if (!this.currentGeneration) {
845
+ throw new Error("currentGeneration is not set");
846
+ }
847
+ const itemId = event.item_id;
848
+ const itemType = event.part.type;
849
+ const responseId = event.response_id;
850
+ if (itemType === "audio") {
851
+ this.emitGenerationEvent(responseId);
852
+ if (this.textModeRecoveryRetries > 0) {
853
+ this.#logger.info(
854
+ { retries: this.textModeRecoveryRetries },
855
+ "recovered from text-only response"
856
+ );
857
+ this.textModeRecoveryRetries = 0;
858
+ }
859
+ const itemGeneration = {
860
+ messageId: itemId,
861
+ textChannel: import_agents.stream.createStreamChannel(),
862
+ audioChannel: import_agents.stream.createStreamChannel(),
863
+ audioTranscript: ""
864
+ };
865
+ this.currentGeneration.messageChannel.write({
866
+ messageId: itemId,
867
+ textStream: itemGeneration.textChannel.stream(),
868
+ audioStream: itemGeneration.audioChannel.stream()
869
+ });
870
+ this.currentGeneration.messages.set(itemId, itemGeneration);
871
+ this.currentGeneration._firstTokenTimestamp = Date.now();
872
+ return;
873
+ } else {
874
+ this.interrupt();
875
+ if (this.textModeRecoveryRetries === 0) {
876
+ this.#logger.warn({ responseId }, "received text-only response from OpenAI Realtime API");
877
+ }
878
+ }
708
879
  }
709
- #handleInputAudioBufferSpeechStopped(event) {
710
- this.emit("input_speech_stopped");
880
+ handleResponseContentPartDone(event) {
881
+ if (event.part.type !== "text") {
882
+ return;
883
+ }
884
+ if (!this.currentGeneration) {
885
+ throw new Error("currentGeneration is not set");
886
+ }
711
887
  }
712
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
713
- #handleConversationItemCreated(event) {
888
+ handleResponseAudioTranscriptDelta(event) {
889
+ if (!this.currentGeneration) {
890
+ throw new Error("currentGeneration is not set");
891
+ }
892
+ const itemId = event.item_id;
893
+ const delta = event.delta;
894
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
895
+ if (!itemGeneration) {
896
+ throw new Error("itemGeneration is not set");
897
+ } else {
898
+ itemGeneration.textChannel.write(delta);
899
+ itemGeneration.audioTranscript += delta;
900
+ }
714
901
  }
715
- #handleConversationItemInputAudioTranscriptionCompleted(event) {
716
- const transcript = event.transcript;
717
- this.emit("input_speech_transcription_completed", {
718
- itemId: event.item_id,
719
- transcript
720
- });
902
+ handleResponseAudioDelta(event) {
903
+ if (!this.currentGeneration) {
904
+ throw new Error("currentGeneration is not set");
905
+ }
906
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
907
+ if (!itemGeneration) {
908
+ throw new Error("itemGeneration is not set");
909
+ }
910
+ const binaryString = atob(event.delta);
911
+ const len = binaryString.length;
912
+ const bytes = new Uint8Array(len);
913
+ for (let i = 0; i < len; i++) {
914
+ bytes[i] = binaryString.charCodeAt(i);
915
+ }
916
+ itemGeneration.audioChannel.write(
917
+ new import_rtc_node.AudioFrame(
918
+ new Int16Array(bytes.buffer),
919
+ api_proto.SAMPLE_RATE,
920
+ api_proto.NUM_CHANNELS,
921
+ bytes.length / 2
922
+ )
923
+ );
721
924
  }
722
- #handleConversationItemInputAudioTranscriptionFailed(event) {
723
- const error = event.error;
724
- this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
725
- this.emit("input_speech_transcription_failed", {
726
- itemId: event.item_id,
727
- message: error.message
728
- });
925
+ handleResponseAudioTranscriptDone(_event) {
926
+ if (!this.currentGeneration) {
927
+ throw new Error("currentGeneration is not set");
928
+ }
729
929
  }
730
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
731
- #handleConversationItemTruncated(event) {
732
- }
733
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
734
- #handleConversationItemDeleted(event) {
735
- }
736
- #handleResponseCreated(responseCreated) {
737
- const response = responseCreated.response;
738
- const doneFut = new import_agents.Future();
739
- const newResponse = {
740
- id: response.id,
741
- status: response.status,
742
- statusDetails: response.status_details,
743
- usage: null,
744
- output: [],
745
- doneFut,
746
- createdTimestamp: Date.now()
747
- };
748
- this.#pendingResponses[newResponse.id] = newResponse;
749
- this.emit("response_created", newResponse);
750
- }
751
- #handleResponseDone(event) {
752
- const responseData = event.response;
753
- const responseId = responseData.id;
754
- const response = this.#pendingResponses[responseId];
755
- response.status = responseData.status;
756
- response.statusDetails = responseData.status_details;
757
- response.usage = responseData.usage ?? null;
758
- this.#pendingResponses[responseId] = response;
759
- response.doneFut.resolve();
760
- let metricsError;
761
- let cancelled = false;
762
- switch (response.status) {
763
- case "failed": {
764
- if (response.statusDetails.type !== "failed") break;
765
- const err = response.statusDetails.error;
766
- metricsError = new import_agents.metrics.MultimodalLLMError({
767
- type: response.statusDetails.type,
768
- code: err == null ? void 0 : err.code,
769
- message: err == null ? void 0 : err.message
770
- });
771
- this.#logger.child({ code: err == null ? void 0 : err.code, error: err == null ? void 0 : err.message }).error("response generation failed");
772
- break;
773
- }
774
- case "incomplete": {
775
- if (response.statusDetails.type !== "incomplete") break;
776
- const reason = response.statusDetails.reason;
777
- metricsError = new import_agents.metrics.MultimodalLLMError({
778
- type: response.statusDetails.type,
779
- reason
780
- });
781
- this.#logger.child({ reason }).error("response generation incomplete");
782
- break;
930
+ handleResponseAudioDone(_event) {
931
+ if (!this.currentGeneration) {
932
+ throw new Error("currentGeneration is not set");
933
+ }
934
+ }
935
+ handleResponseOutputItemDone(event) {
936
+ if (!this.currentGeneration) {
937
+ throw new Error("currentGeneration is not set");
938
+ }
939
+ const itemId = event.item.id;
940
+ const itemType = event.item.type;
941
+ if (itemType === "function_call") {
942
+ const item = event.item;
943
+ if (!item.call_id || !item.name || !item.arguments) {
944
+ throw new Error("item is not a function call");
783
945
  }
784
- case "cancelled": {
785
- cancelled = true;
786
- break;
946
+ this.currentGeneration.functionChannel.write({
947
+ callId: item.call_id,
948
+ name: item.name,
949
+ args: item.arguments
950
+ });
951
+ } else if (itemType === "message") {
952
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
953
+ if (!itemGeneration) {
954
+ return;
787
955
  }
956
+ itemGeneration.textChannel.close();
957
+ itemGeneration.audioChannel.close();
788
958
  }
789
- this.emit("response_done", response);
790
- let ttft;
791
- if (response.firstTokenTimestamp) {
792
- ttft = response.firstTokenTimestamp - response.createdTimestamp;
959
+ }
960
+ handleResponseDone(_event) {
961
+ var _a, _b, _c, _d, _e, _f, _g, _h, _i, _j, _k, _l;
962
+ if (!this.currentGeneration) {
963
+ return;
793
964
  }
794
- const duration = Date.now() - response.createdTimestamp;
795
- const usage = response.usage;
796
- const metric = {
797
- timestamp: response.createdTimestamp,
798
- requestId: response.id,
965
+ const createdTimestamp = this.currentGeneration._createdTimestamp;
966
+ const firstTokenTimestamp = this.currentGeneration._firstTokenTimestamp;
967
+ this.#logger.debug(
968
+ {
969
+ messageCount: this.currentGeneration.messages.size
970
+ },
971
+ "Closing generation channels in handleResponseDone"
972
+ );
973
+ for (const generation of this.currentGeneration.messages.values()) {
974
+ generation.textChannel.close();
975
+ generation.audioChannel.close();
976
+ }
977
+ this.currentGeneration.functionChannel.close();
978
+ this.currentGeneration.messageChannel.close();
979
+ for (const itemId of this.currentGeneration.messages.keys()) {
980
+ const remoteItem = this.remoteChatCtx.get(itemId);
981
+ if (remoteItem && remoteItem.item instanceof import_agents.llm.ChatMessage) {
982
+ remoteItem.item.content.push(this.currentGeneration.messages.get(itemId).audioTranscript);
983
+ }
984
+ }
985
+ this.currentGeneration._doneFut.resolve();
986
+ this.currentGeneration = void 0;
987
+ const usage = _event.response.usage;
988
+ const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
989
+ const duration = (Date.now() - createdTimestamp) / 1e3;
990
+ const realtimeMetrics = {
991
+ type: "realtime_model_metrics",
992
+ timestamp: createdTimestamp / 1e3,
993
+ // Convert to seconds
994
+ requestId: _event.response.id || "",
799
995
  ttft,
800
996
  duration,
801
- cancelled,
802
- label: this.constructor.name,
803
- completionTokens: (usage == null ? void 0 : usage.output_tokens) || 0,
804
- promptTokens: (usage == null ? void 0 : usage.input_tokens) || 0,
805
- totalTokens: (usage == null ? void 0 : usage.total_tokens) || 0,
806
- tokensPerSecond: ((usage == null ? void 0 : usage.output_tokens) || 0) / duration * 1e3,
807
- error: metricsError,
997
+ cancelled: _event.response.status === "cancelled",
998
+ label: "openai_realtime",
999
+ inputTokens: (usage == null ? void 0 : usage.input_tokens) ?? 0,
1000
+ outputTokens: (usage == null ? void 0 : usage.output_tokens) ?? 0,
1001
+ totalTokens: (usage == null ? void 0 : usage.total_tokens) ?? 0,
1002
+ tokensPerSecond: duration > 0 ? ((usage == null ? void 0 : usage.output_tokens) ?? 0) / duration : 0,
808
1003
  inputTokenDetails: {
809
- cachedTokens: (usage == null ? void 0 : usage.input_token_details.cached_tokens) || 0,
810
- textTokens: (usage == null ? void 0 : usage.input_token_details.text_tokens) || 0,
811
- audioTokens: (usage == null ? void 0 : usage.input_token_details.audio_tokens) || 0
1004
+ audioTokens: ((_a = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _a.audio_tokens) ?? 0,
1005
+ textTokens: ((_b = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _b.text_tokens) ?? 0,
1006
+ imageTokens: 0,
1007
+ // Not supported yet
1008
+ cachedTokens: ((_c = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _c.cached_tokens) ?? 0,
1009
+ cachedTokensDetails: ((_d = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _d.cached_tokens_details) ? {
1010
+ audioTokens: ((_f = (_e = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _e.cached_tokens_details) == null ? void 0 : _f.audio_tokens) ?? 0,
1011
+ textTokens: ((_h = (_g = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _g.cached_tokens_details) == null ? void 0 : _h.text_tokens) ?? 0,
1012
+ imageTokens: ((_j = (_i = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _i.cached_tokens_details) == null ? void 0 : _j.image_tokens) ?? 0
1013
+ } : void 0
812
1014
  },
813
1015
  outputTokenDetails: {
814
- textTokens: (usage == null ? void 0 : usage.output_token_details.text_tokens) || 0,
815
- audioTokens: (usage == null ? void 0 : usage.output_token_details.audio_tokens) || 0
1016
+ textTokens: ((_k = usage == null ? void 0 : usage.output_token_details) == null ? void 0 : _k.text_tokens) ?? 0,
1017
+ audioTokens: ((_l = usage == null ? void 0 : usage.output_token_details) == null ? void 0 : _l.audio_tokens) ?? 0,
1018
+ imageTokens: 0
816
1019
  }
817
1020
  };
818
- this.emit("metrics_collected", metric);
1021
+ this.emit("metrics_collected", realtimeMetrics);
819
1022
  }
820
- #handleResponseOutputItemAdded(event) {
821
- const responseId = event.response_id;
822
- const response = this.#pendingResponses[responseId];
823
- const itemData = event.item;
824
- if (itemData.type !== "message" && itemData.type !== "function_call") {
825
- throw new Error(`Unexpected item type: ${itemData.type}`);
826
- }
827
- let role;
828
- if (itemData.type === "function_call") {
829
- role = "assistant";
830
- } else {
831
- role = itemData.role;
832
- }
833
- const newOutput = {
834
- responseId,
835
- itemId: itemData.id,
836
- outputIndex: event.output_index,
837
- type: itemData.type,
838
- role,
839
- content: [],
840
- doneFut: new import_agents.Future()
841
- };
842
- response == null ? void 0 : response.output.push(newOutput);
843
- this.emit("response_output_added", newOutput);
1023
+ handleError(event) {
1024
+ if (event.error.message.startsWith("Cancellation failed")) {
1025
+ return;
1026
+ }
1027
+ this.#logger.error({ error: event.error }, "OpenAI Realtime API returned an error");
1028
+ this.emitError({
1029
+ error: new import_agents.APIError(event.error.message, {
1030
+ body: event.error,
1031
+ retryable: true
1032
+ }),
1033
+ recoverable: true
1034
+ });
844
1035
  }
845
- #handleResponseOutputItemDone(event) {
846
- const responseId = event.response_id;
847
- const response = this.#pendingResponses[responseId];
848
- const outputIndex = event.output_index;
849
- const output = response.output[outputIndex];
850
- if ((output == null ? void 0 : output.type) === "function_call") {
851
- if (!this.#fncCtx) {
852
- this.#logger.error("function call received but no fncCtx is available");
853
- return;
854
- }
855
- const item = event.item;
856
- if (item.type !== "function_call") {
857
- throw new Error("Expected function_call item");
858
- }
859
- const func = this.#fncCtx[item.name];
860
- if (!func) {
861
- this.#logger.error(`no function with name ${item.name} in fncCtx`);
862
- return;
863
- }
864
- this.emit("function_call_started", {
865
- callId: item.call_id
866
- });
867
- const parsedArgs = JSON.parse(item.arguments);
868
- this.#logger.debug(
869
- `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`
870
- );
871
- func.execute(parsedArgs).then(
872
- (content) => {
873
- this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
874
- this.emit("function_call_completed", {
875
- callId: item.call_id
876
- });
877
- this.conversation.item.create(
878
- import_agents.llm.ChatMessage.createToolFromFunctionResult({
879
- name: item.name,
880
- toolCallId: item.call_id,
881
- result: content
882
- }),
883
- output.itemId
884
- );
885
- this.response.create();
886
- },
887
- (error) => {
888
- this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
889
- this.emit("function_call_failed", {
890
- callId: item.call_id
891
- });
892
- }
893
- );
1036
+ emitError({ error, recoverable }) {
1037
+ this.emit("error", {
1038
+ timestamp: Date.now(),
1039
+ // TODO(brian): add label
1040
+ label: "",
1041
+ error,
1042
+ recoverable
1043
+ });
1044
+ }
1045
// Generator method: yields the given frame exactly once, unchanged.
// No resampling is performed in this body.
+ *resampleAudio(frame) {
1046
+ yield frame;
1047
+ }
1048
// Sends a "response.create" event and returns the handle associated with it.
//
// Parameters (single destructured object):
//   userInitiated - when truthy, the handle is stored in
//                   this.responseCreatedFutures under the generated event id,
//                   and that id is attached as response.metadata.client_event_id.
//   instructions  - when truthy, copied into the outgoing response payload.
//   oldHandle     - when truthy, reused instead of creating a new
//                   CreateResponseHandle; its instructions are overwritten
//                   only if new instructions were given.
// Returns: the reused or newly created handle.
+ createResponse({
1049
+ userInitiated,
1050
+ instructions,
1051
+ oldHandle
1052
+ }) {
1053
+ const handle = oldHandle || new CreateResponseHandle({ instructions });
1054
+ if (oldHandle && instructions) {
1055
+ handle.instructions = instructions;
894
1056
  }
895
- output == null ? void 0 : output.doneFut.resolve();
896
- this.emit("response_output_done", output);
1057
// The event id doubles as the lookup key for the stored handle.
+ const eventId = (0, import_agents.shortuuid)("response_create_");
1058
+ if (userInitiated) {
1059
+ this.responseCreatedFutures[eventId] = handle;
1060
+ }
1061
+ const response = {};
1062
+ if (instructions) response.instructions = instructions;
1063
+ if (userInitiated) response.metadata = { client_event_id: eventId };
1064
+ this.sendEvent({
1065
+ type: "response.create",
1066
+ event_id: eventId,
1067
// An empty payload is sent as undefined rather than as {}.
+ response: Object.keys(response).length > 0 ? response : void 0
1068
+ });
1069
+ return handle;
897
1070
  }
898
- #handleResponseContentPartAdded(event) {
899
- const responseId = event.response_id;
900
- const response = this.#pendingResponses[responseId];
901
- const outputIndex = event.output_index;
902
- const output = response.output[outputIndex];
903
- const textStream = new import_agents.AsyncIterableQueue();
904
- const audioStream = new import_agents.AsyncIterableQueue();
905
- const newContent = {
906
- responseId,
907
- itemId: event.item_id,
908
- outputIndex,
909
- contentIndex: event.content_index,
910
- text: "",
911
- audio: [],
912
- textStream,
913
- audioStream,
914
- toolCalls: [],
915
- contentType: event.part.type
1071
// Builds the generation event for the current generation and emits it as
// "generation_created".
//
// Throws Error("currentGeneration is not set") when this.currentGeneration is
// falsy. The event exposes streams obtained from the current generation's
// message and function channels and starts with userInitiated = false.
//
// responseId is used as the key into this.responseCreatedFutures. If a handle
// is stored under it, the entry is deleted, the event is marked userInitiated,
// and the handle's doneFut is resolved with the event unless it is already
// done, in which case a warning is logged instead.
+ emitGenerationEvent(responseId) {
1072
+ if (!this.currentGeneration) {
1073
+ throw new Error("currentGeneration is not set");
1074
+ }
1075
+ const generation_ev = {
1076
+ messageStream: this.currentGeneration.messageChannel.stream(),
1077
+ functionStream: this.currentGeneration.functionChannel.stream(),
1078
+ userInitiated: false
916
1079
  };
917
- output == null ? void 0 : output.content.push(newContent);
918
- response.firstTokenTimestamp = Date.now();
919
- this.emit("response_content_added", newContent);
920
- }
921
- #handleResponseContentPartDone(event) {
922
- const content = this.#getContent(event);
923
- this.emit("response_content_done", content);
924
- }
925
- #handleResponseTextDelta(event) {
926
- this.emit("response_text_delta", event);
927
- }
928
- #handleResponseTextDone(event) {
929
- const content = this.#getContent(event);
930
- content.text = event.text;
931
- this.emit("response_text_done", event);
932
- }
933
- #handleResponseAudioTranscriptDelta(event) {
934
- const content = this.#getContent(event);
935
- const transcript = event.delta;
936
- content.text += transcript;
937
- content.textStream.put(transcript);
938
- }
939
- #handleResponseAudioTranscriptDone(event) {
940
- const content = this.#getContent(event);
941
- content.textStream.close();
942
- }
943
- #handleResponseAudioDelta(event) {
944
- const content = this.#getContent(event);
945
- const data = Buffer.from(event.delta, "base64");
946
- const audio = new import_rtc_node.AudioFrame(
947
- new Int16Array(data.buffer),
948
- api_proto.SAMPLE_RATE,
949
- api_proto.NUM_CHANNELS,
950
- data.length / 2
951
- );
952
- content.audio.push(audio);
953
- content.audioStream.put(audio);
1080
+ const handle = this.responseCreatedFutures[responseId];
1081
+ if (handle) {
1082
+ delete this.responseCreatedFutures[responseId];
1083
+ generation_ev.userInitiated = true;
1084
+ if (handle.doneFut.done) {
1085
+ this.#logger.warn({ responseId }, "response received after timeout");
1086
+ } else {
1087
+ handle.doneFut.resolve(generation_ev);
1088
+ }
1089
+ }
1090
+ this.#logger.debug({ responseId }, "Emitting generation_created event");
1091
+ this.emit("generation_created", generation_ev);
954
1092
  }
955
- #handleResponseAudioDone(event) {
956
- const content = this.#getContent(event);
957
- content.audioStream.close();
1093
+ }
1094
/**
 * Converts a LiveKit chat item into the item shape sent to the OpenAI
 * Realtime API.
 *
 * - "function_call" and "function_call_output" items are mapped field by
 *   field (`callId` -> `call_id`, `args` -> `arguments`).
 * - "message" items: the "developer" role is sent as "system". String content
 *   becomes a "text" part for assistant messages and an "input_text" part for
 *   every other role. Image content is not forwarded. Audio content is only
 *   forwarded for user messages, base64-encoded as an "input_audio" part.
 *
 * @param item - LiveKit chat item; `item.type` selects the conversion.
 * @returns The converted item, or `undefined` when `item.type` is not one of
 *   the three handled types.
 */
function livekitItemToOpenAIItem(item) {
  switch (item.type) {
    case "function_call":
      return {
        id: item.id,
        type: "function_call",
        call_id: item.callId,
        name: item.name,
        arguments: item.args
      };
    case "function_call_output":
      return {
        id: item.id,
        type: "function_call_output",
        call_id: item.callId,
        output: item.output
      };
    // Braces give the case its own scope so the declarations below do not
    // leak into the other clauses of the switch.
    case "message": {
      const role = item.role === "developer" ? "system" : item.role;
      const contentList = [];
      for (const c of item.content) {
        if (typeof c === "string") {
          contentList.push({
            type: role === "assistant" ? "text" : "input_text",
            text: c
          });
        } else if (c.type === "audio_content" && role === "user") {
          const encodedAudio = Buffer.from(
            (0, import_rtc_node.combineAudioFrames)(c.frame).data
          ).toString("base64");
          contentList.push({
            type: "input_audio",
            audio: encodedAudio
          });
        }
        // Everything else (image content, non-user audio) is intentionally dropped.
      }
      return {
        id: item.id,
        type: "message",
        role,
        content: contentList
      };
    }
    default:
      // Unknown item types keep producing `undefined`, now explicitly.
      return undefined;
  }
}
1140
/**
 * Converts an item received from the OpenAI Realtime API into the matching
 * LiveKit chat item.
 *
 * - "function_call" -> `llm.FunctionCall.create(...)`
 * - "function_call_output" -> `llm.FunctionCallOutput.create(...)` with
 *   `isError: false`
 * - "message" -> `llm.ChatMessage.create(...)` whose content holds the text of
 *   every "text" / "input_text" part; other part types are ignored.
 *
 * A message whose `content` is missing, or that contains nullish parts, yields
 * an empty or shorter content list instead of a TypeError.
 *
 * @param item - Item from the Realtime API; `item.id` must be set.
 * @returns The LiveKit item, or `undefined` when `item.type` is not one of the
 *   three handled types.
 * @throws {Error} "item.id is not set" when `item.id` is falsy.
 */
function openAIItemToLivekitItem(item) {
  if (!item.id) {
    throw new Error("item.id is not set");
  }
  switch (item.type) {
    case "function_call":
      return import_agents.llm.FunctionCall.create({
        id: item.id,
        callId: item.call_id,
        name: item.name,
        args: item.arguments
      });
    case "function_call_output":
      return import_agents.llm.FunctionCallOutput.create({
        id: item.id,
        callId: item.call_id,
        output: item.output,
        isError: false
      });
    // Braces give the case its own scope for the declarations below.
    case "message": {
      // Normalise content to a list: missing -> [], single part -> [part].
      let parts;
      if (item.content == null) {
        parts = [];
      } else if (Array.isArray(item.content)) {
        parts = item.content;
      } else {
        parts = [item.content];
      }
      const content = [];
      for (const part of parts) {
        if (part == null) {
          continue;
        }
        if (part.type === "text" || part.type === "input_text") {
          content.push(part.text);
        }
      }
      return import_agents.llm.ChatMessage.create({
        id: item.id,
        role: item.role,
        content
      });
    }
    default:
      // Unknown item types keep producing `undefined`, now explicitly.
      return undefined;
  }
}
1174
/**
 * Creates a user chat message containing a single frame of silent audio.
 *
 * The frame holds `durationSeconds * SAMPLE_RATE` samples per channel of
 * zeroed 16-bit PCM. The buffer is sized in samples rather than bytes, so the
 * frame really lasts `durationSeconds`; sizing it in bytes gave a frame of
 * half that length, and an odd byte count made `Int16Array` throw.
 *
 * @param {number} [durationSeconds=2] - Length of the silent audio in seconds.
 *   Fractional values are rounded to the nearest whole sample.
 * @returns A `ChatMessage` with role "user", an id generated from
 *   `MOCK_AUDIO_ID_PREFIX`, and one "audio_content" part.
 */
function createMockAudioItem(durationSeconds = 2) {
  const samplesPerChannel = Math.round(durationSeconds * SAMPLE_RATE);
  // Int16Array is zero-initialised, i.e. silence.
  const silence = new Int16Array(samplesPerChannel * NUM_CHANNELS);
  return import_agents.llm.ChatMessage.create({
    id: (0, import_agents.shortuuid)(MOCK_AUDIO_ID_PREFIX),
    role: "user",
    content: [
      {
        type: "audio_content",
        frame: [
          new import_rtc_node.AudioFrame(
            silence,
            SAMPLE_RATE,
            NUM_CHANNELS,
            samplesPerChannel
          )
        ]
      }
    ]
  });
}
1194
// Maps a tool-choice value to the string this module passes on.
//   - a string is returned as-is;
//   - an object whose `type` is "function" yields `toolChoice.function.name`;
//   - anything else (including null/undefined) yields "auto".
+ function toOaiToolChoice(toolChoice) {
1195
+ if (typeof toolChoice === "string") {
1196
+ return toolChoice;
962
1197
  }
963
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
964
- #handleRateLimitsUpdated(event) {
1198
+ if ((toolChoice == null ? void 0 : toolChoice.type) === "function") {
1199
// NOTE(review): assumes `toolChoice.function` is present whenever type is "function" — confirm against callers.
+ return toolChoice.function.name;
965
1200
  }
1201
+ return "auto";
966
1202
  }
967
1203
  // Annotate the CommonJS export names for ESM import in node:
968
1204
  0 && (module.exports = {