@livekit/agents-plugin-openai 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,53 +1,41 @@
1
- var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
2
- if (kind === "m") throw new TypeError("Private method is not writable");
3
- if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a setter");
4
- if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot write private member to an object whose class did not declare it");
5
- return (kind === "a" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;
6
- };
7
- var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
8
- if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a getter");
9
- if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot read private member from an object whose class did not declare it");
10
- return kind === "m" ? f : kind === "a" ? f.call(receiver) : f ? f.value : state.get(receiver);
11
- };
12
- var _InputAudioBuffer_session, _ConversationItem_session, _Conversation_session, _Response_session, _RealtimeModel_defaultOpts, _RealtimeModel_sessions, _RealtimeSession_instances, _RealtimeSession_fncCtx, _RealtimeSession_opts, _RealtimeSession_pendingResponses, _RealtimeSession_sessionId, _RealtimeSession_ws, _RealtimeSession_expiresAt, _RealtimeSession_logger, _RealtimeSession_task, _RealtimeSession_closing, _RealtimeSession_sendQueue, _RealtimeSession_loggableEvent, _RealtimeSession_start, _RealtimeSession_getContent, _RealtimeSession_handleError, _RealtimeSession_handleSessionCreated, _RealtimeSession_handleSessionUpdated, _RealtimeSession_handleConversationCreated, _RealtimeSession_handleInputAudioBufferCommitted, _RealtimeSession_handleInputAudioBufferCleared, _RealtimeSession_handleInputAudioBufferSpeechStarted, _RealtimeSession_handleInputAudioBufferSpeechStopped, _RealtimeSession_handleConversationItemCreated, _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted, _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed, _RealtimeSession_handleConversationItemTruncated, _RealtimeSession_handleConversationItemDeleted, _RealtimeSession_handleResponseCreated, _RealtimeSession_handleResponseDone, _RealtimeSession_handleResponseOutputItemAdded, _RealtimeSession_handleResponseOutputItemDone, _RealtimeSession_handleResponseContentPartAdded, _RealtimeSession_handleResponseContentPartDone, _RealtimeSession_handleResponseTextDelta, _RealtimeSession_handleResponseTextDone, _RealtimeSession_handleResponseAudioTranscriptDelta, _RealtimeSession_handleResponseAudioTranscriptDone, _RealtimeSession_handleResponseAudioDelta, _RealtimeSession_handleResponseAudioDone, _RealtimeSession_handleResponseFunctionCallArgumentsDelta, _RealtimeSession_handleResponseFunctionCallArgumentsDone, _RealtimeSession_handleRateLimitsUpdated;
13
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
14
2
  //
15
3
  // SPDX-License-Identifier: Apache-2.0
16
- import { AsyncIterableQueue, Future, Queue, llm, log, multimodal } from '@livekit/agents';
4
+ import { AsyncIterableQueue, Future, Queue, llm, log, mergeFrames, multimodal, } from '@livekit/agents';
17
5
  import { AudioFrame } from '@livekit/rtc-node';
18
- import { once } from 'events';
6
+ import { once } from 'node:events';
19
7
  import { WebSocket } from 'ws';
20
8
  import * as api_proto from './api_proto.js';
21
9
  class InputAudioBuffer {
10
+ #session;
22
11
  constructor(session) {
23
- _InputAudioBuffer_session.set(this, void 0);
24
- __classPrivateFieldSet(this, _InputAudioBuffer_session, session, "f");
12
+ this.#session = session;
25
13
  }
26
14
  append(frame) {
27
- __classPrivateFieldGet(this, _InputAudioBuffer_session, "f").queueMsg({
15
+ this.#session.queueMsg({
28
16
  type: 'input_audio_buffer.append',
29
17
  audio: Buffer.from(frame.data.buffer).toString('base64'),
30
18
  });
31
19
  }
32
20
  clear() {
33
- __classPrivateFieldGet(this, _InputAudioBuffer_session, "f").queueMsg({
21
+ this.#session.queueMsg({
34
22
  type: 'input_audio_buffer.clear',
35
23
  });
36
24
  }
37
25
  commit() {
38
- __classPrivateFieldGet(this, _InputAudioBuffer_session, "f").queueMsg({
26
+ this.#session.queueMsg({
39
27
  type: 'input_audio_buffer.commit',
40
28
  });
41
29
  }
42
30
  }
43
- _InputAudioBuffer_session = new WeakMap();
44
31
  class ConversationItem {
32
+ #session;
33
+ #logger = log();
45
34
  constructor(session) {
46
- _ConversationItem_session.set(this, void 0);
47
- __classPrivateFieldSet(this, _ConversationItem_session, session, "f");
35
+ this.#session = session;
48
36
  }
49
37
  truncate(itemId, contentIndex, audioEnd) {
50
- __classPrivateFieldGet(this, _ConversationItem_session, "f").queueMsg({
38
+ this.#session.queueMsg({
51
39
  type: 'conversation.item.truncate',
52
40
  item_id: itemId,
53
41
  content_index: contentIndex,
@@ -55,48 +43,161 @@ class ConversationItem {
55
43
  });
56
44
  }
57
45
  delete(itemId) {
58
- __classPrivateFieldGet(this, _ConversationItem_session, "f").queueMsg({
46
+ this.#session.queueMsg({
59
47
  type: 'conversation.item.delete',
60
48
  item_id: itemId,
61
49
  });
62
50
  }
63
- create(item, previousItemId) {
64
- __classPrivateFieldGet(this, _ConversationItem_session, "f").queueMsg({
65
- type: 'conversation.item.create',
66
- item,
67
- previous_item_id: previousItemId,
68
- });
51
+ create(message, previousItemId) {
52
+ if (!message.content) {
53
+ return;
54
+ }
55
+ let event;
56
+ if (message.toolCallId) {
57
+ if (typeof message.content !== 'string') {
58
+ throw new TypeError('message.content must be a string');
59
+ }
60
+ event = {
61
+ type: 'conversation.item.create',
62
+ previous_item_id: previousItemId,
63
+ item: {
64
+ type: 'function_call_output',
65
+ call_id: message.toolCallId,
66
+ output: message.content,
67
+ },
68
+ };
69
+ }
70
+ else {
71
+ let content = message.content;
72
+ if (!Array.isArray(content)) {
73
+ content = [content];
74
+ }
75
+ if (message.role === llm.ChatRole.USER) {
76
+ const contents = [];
77
+ for (const c of content) {
78
+ if (typeof c === 'string') {
79
+ contents.push({
80
+ type: 'input_text',
81
+ text: c,
82
+ });
83
+ }
84
+ else if (
85
+ // typescript type guard for determining ChatAudio vs ChatImage
86
+ ((c) => {
87
+ return c.frame !== undefined;
88
+ })(c)) {
89
+ contents.push({
90
+ type: 'input_audio',
91
+ audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'),
92
+ });
93
+ }
94
+ }
95
+ event = {
96
+ type: 'conversation.item.create',
97
+ previous_item_id: previousItemId,
98
+ item: {
99
+ type: 'message',
100
+ role: 'user',
101
+ content: contents,
102
+ },
103
+ };
104
+ }
105
+ else if (message.role === llm.ChatRole.ASSISTANT) {
106
+ const contents = [];
107
+ for (const c of content) {
108
+ if (typeof c === 'string') {
109
+ contents.push({
110
+ type: 'text',
111
+ text: c,
112
+ });
113
+ }
114
+ else if (
115
+ // typescript type guard for determining ChatAudio vs ChatImage
116
+ ((c) => {
117
+ return c.frame !== undefined;
118
+ })(c)) {
119
+ this.#logger.warn('audio content in assistant message is not supported');
120
+ }
121
+ }
122
+ event = {
123
+ type: 'conversation.item.create',
124
+ previous_item_id: previousItemId,
125
+ item: {
126
+ type: 'message',
127
+ role: 'assistant',
128
+ content: contents,
129
+ },
130
+ };
131
+ }
132
+ else if (message.role === llm.ChatRole.SYSTEM) {
133
+ const contents = [];
134
+ for (const c of content) {
135
+ if (typeof c === 'string') {
136
+ contents.push({
137
+ type: 'input_text',
138
+ text: c,
139
+ });
140
+ }
141
+ else if (
142
+ // typescript type guard for determining ChatAudio vs ChatImage
143
+ ((c) => {
144
+ return c.frame !== undefined;
145
+ })(c)) {
146
+ this.#logger.warn('audio content in system message is not supported');
147
+ }
148
+ }
149
+ event = {
150
+ type: 'conversation.item.create',
151
+ previous_item_id: previousItemId,
152
+ item: {
153
+ type: 'message',
154
+ role: 'system',
155
+ content: contents,
156
+ },
157
+ };
158
+ }
159
+ else {
160
+ this.#logger
161
+ .child({ message })
162
+ .warn('chat message is not supported inside the realtime API');
163
+ return;
164
+ }
165
+ }
166
+ this.#session.queueMsg(event);
69
167
  }
70
168
  }
71
- _ConversationItem_session = new WeakMap();
72
169
  class Conversation {
170
+ #session;
73
171
  constructor(session) {
74
- _Conversation_session.set(this, void 0);
75
- __classPrivateFieldSet(this, _Conversation_session, session, "f");
172
+ this.#session = session;
76
173
  }
77
174
  get item() {
78
- return new ConversationItem(__classPrivateFieldGet(this, _Conversation_session, "f"));
175
+ return new ConversationItem(this.#session);
79
176
  }
80
177
  }
81
- _Conversation_session = new WeakMap();
82
178
  class Response {
179
+ #session;
83
180
  constructor(session) {
84
- _Response_session.set(this, void 0);
85
- __classPrivateFieldSet(this, _Response_session, session, "f");
181
+ this.#session = session;
86
182
  }
87
183
  create() {
88
- __classPrivateFieldGet(this, _Response_session, "f").queueMsg({
184
+ this.#session.queueMsg({
89
185
  type: 'response.create',
90
186
  });
91
187
  }
92
188
  cancel() {
93
- __classPrivateFieldGet(this, _Response_session, "f").queueMsg({
189
+ this.#session.queueMsg({
94
190
  type: 'response.cancel',
95
191
  });
96
192
  }
97
193
  }
98
- _Response_session = new WeakMap();
99
194
  export class RealtimeModel extends multimodal.RealtimeModel {
195
+ sampleRate = api_proto.SAMPLE_RATE;
196
+ numChannels = api_proto.NUM_CHANNELS;
197
+ inFrameSize = api_proto.IN_FRAME_SIZE;
198
+ outFrameSize = api_proto.OUT_FRAME_SIZE;
199
+ #defaultOpts;
200
+ #sessions = [];
100
201
  static withAzure({ baseURL, azureDeployment, apiVersion = '2024-10-01-preview', apiKey = undefined, entraToken = undefined, instructions = '', modalities = ['text', 'audio'], voice = 'alloy', inputAudioFormat = 'pcm16', outputAudioFormat = 'pcm16', inputAudioTranscription = { model: 'whisper-1' }, turnDetection = { type: 'server_vad' }, temperature = 0.8, maxResponseOutputTokens = Infinity, }) {
101
202
  return new RealtimeModel({
102
203
  isAzure: true,
@@ -120,16 +221,10 @@ export class RealtimeModel extends multimodal.RealtimeModel {
120
221
  // used for microsoft
121
222
  isAzure = false, apiVersion = undefined, entraToken = undefined, }) {
122
223
  super();
123
- this.sampleRate = api_proto.SAMPLE_RATE;
124
- this.numChannels = api_proto.NUM_CHANNELS;
125
- this.inFrameSize = api_proto.IN_FRAME_SIZE;
126
- this.outFrameSize = api_proto.OUT_FRAME_SIZE;
127
- _RealtimeModel_defaultOpts.set(this, void 0);
128
- _RealtimeModel_sessions.set(this, []);
129
224
  if (apiKey === '') {
130
225
  throw new Error('OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable');
131
226
  }
132
- __classPrivateFieldSet(this, _RealtimeModel_defaultOpts, {
227
+ this.#defaultOpts = {
133
228
  modalities,
134
229
  instructions,
135
230
  voice,
@@ -145,12 +240,12 @@ export class RealtimeModel extends multimodal.RealtimeModel {
145
240
  isAzure,
146
241
  apiVersion,
147
242
  entraToken,
148
- }, "f");
243
+ };
149
244
  }
150
245
  get sessions() {
151
- return __classPrivateFieldGet(this, _RealtimeModel_sessions, "f");
246
+ return this.#sessions;
152
247
  }
153
- session({ fncCtx, modalities = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").modalities, instructions = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").instructions, voice = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").voice, inputAudioFormat = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").inputAudioFormat, outputAudioFormat = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").outputAudioFormat, inputAudioTranscription = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").inputAudioTranscription, turnDetection = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").turnDetection, temperature = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").temperature, maxResponseOutputTokens = __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").maxResponseOutputTokens, }) {
248
+ session({ fncCtx, chatCtx, modalities = this.#defaultOpts.modalities, instructions = this.#defaultOpts.instructions, voice = this.#defaultOpts.voice, inputAudioFormat = this.#defaultOpts.inputAudioFormat, outputAudioFormat = this.#defaultOpts.outputAudioFormat, inputAudioTranscription = this.#defaultOpts.inputAudioTranscription, turnDetection = this.#defaultOpts.turnDetection, temperature = this.#defaultOpts.temperature, maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens, }) {
154
249
  const opts = {
155
250
  modalities,
156
251
  instructions,
@@ -161,57 +256,63 @@ export class RealtimeModel extends multimodal.RealtimeModel {
161
256
  turnDetection,
162
257
  temperature,
163
258
  maxResponseOutputTokens,
164
- model: __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").model,
165
- apiKey: __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").apiKey,
166
- baseURL: __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").baseURL,
167
- isAzure: __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").isAzure,
168
- apiVersion: __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").apiVersion,
169
- entraToken: __classPrivateFieldGet(this, _RealtimeModel_defaultOpts, "f").entraToken,
259
+ model: this.#defaultOpts.model,
260
+ apiKey: this.#defaultOpts.apiKey,
261
+ baseURL: this.#defaultOpts.baseURL,
262
+ isAzure: this.#defaultOpts.isAzure,
263
+ apiVersion: this.#defaultOpts.apiVersion,
264
+ entraToken: this.#defaultOpts.entraToken,
170
265
  };
171
- const newSession = new RealtimeSession(opts, fncCtx);
172
- __classPrivateFieldGet(this, _RealtimeModel_sessions, "f").push(newSession);
266
+ const newSession = new RealtimeSession(opts, {
267
+ chatCtx: chatCtx || new llm.ChatContext(),
268
+ fncCtx,
269
+ });
270
+ this.#sessions.push(newSession);
173
271
  return newSession;
174
272
  }
175
273
  async close() {
176
- await Promise.allSettled(__classPrivateFieldGet(this, _RealtimeModel_sessions, "f").map((session) => session.close()));
274
+ await Promise.allSettled(this.#sessions.map((session) => session.close()));
177
275
  }
178
276
  }
179
- _RealtimeModel_defaultOpts = new WeakMap(), _RealtimeModel_sessions = new WeakMap();
180
277
  export class RealtimeSession extends multimodal.RealtimeSession {
181
- constructor(opts, fncCtx) {
278
+ #chatCtx = undefined;
279
+ #fncCtx = undefined;
280
+ #opts;
281
+ #pendingResponses = {};
282
+ #sessionId = 'not-connected';
283
+ #ws = null;
284
+ #expiresAt = null;
285
+ #logger = log();
286
+ #task;
287
+ #closing = true;
288
+ #sendQueue = new Queue();
289
+ constructor(opts, { fncCtx, chatCtx }) {
182
290
  super();
183
- _RealtimeSession_instances.add(this);
184
- _RealtimeSession_fncCtx.set(this, undefined);
185
- _RealtimeSession_opts.set(this, void 0);
186
- _RealtimeSession_pendingResponses.set(this, {});
187
- _RealtimeSession_sessionId.set(this, 'not-connected');
188
- _RealtimeSession_ws.set(this, null);
189
- _RealtimeSession_expiresAt.set(this, null);
190
- _RealtimeSession_logger.set(this, log());
191
- _RealtimeSession_task.set(this, void 0);
192
- _RealtimeSession_closing.set(this, true);
193
- _RealtimeSession_sendQueue.set(this, new Queue());
194
- __classPrivateFieldSet(this, _RealtimeSession_opts, opts, "f");
195
- __classPrivateFieldSet(this, _RealtimeSession_fncCtx, fncCtx, "f");
196
- __classPrivateFieldSet(this, _RealtimeSession_task, __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_start).call(this), "f");
291
+ this.#opts = opts;
292
+ this.#chatCtx = chatCtx;
293
+ this.#fncCtx = fncCtx;
294
+ this.#task = this.#start();
197
295
  this.sessionUpdate({
198
- modalities: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").modalities,
199
- instructions: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").instructions,
200
- voice: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").voice,
201
- inputAudioFormat: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").inputAudioFormat,
202
- outputAudioFormat: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").outputAudioFormat,
203
- inputAudioTranscription: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").inputAudioTranscription,
204
- turnDetection: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").turnDetection,
205
- temperature: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").temperature,
206
- maxResponseOutputTokens: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").maxResponseOutputTokens,
296
+ modalities: this.#opts.modalities,
297
+ instructions: this.#opts.instructions,
298
+ voice: this.#opts.voice,
299
+ inputAudioFormat: this.#opts.inputAudioFormat,
300
+ outputAudioFormat: this.#opts.outputAudioFormat,
301
+ inputAudioTranscription: this.#opts.inputAudioTranscription,
302
+ turnDetection: this.#opts.turnDetection,
303
+ temperature: this.#opts.temperature,
304
+ maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
207
305
  toolChoice: 'auto',
208
306
  });
209
307
  }
308
+ get chatCtx() {
309
+ return this.#chatCtx;
310
+ }
210
311
  get fncCtx() {
211
- return __classPrivateFieldGet(this, _RealtimeSession_fncCtx, "f");
312
+ return this.#fncCtx;
212
313
  }
213
314
  set fncCtx(ctx) {
214
- __classPrivateFieldSet(this, _RealtimeSession_fncCtx, ctx, "f");
315
+ this.#fncCtx = ctx;
215
316
  }
216
317
  get conversation() {
217
318
  return new Conversation(this);
@@ -223,16 +324,37 @@ export class RealtimeSession extends multimodal.RealtimeSession {
223
324
  return new Response(this);
224
325
  }
225
326
  get expiration() {
226
- if (!__classPrivateFieldGet(this, _RealtimeSession_expiresAt, "f")) {
327
+ if (!this.#expiresAt) {
227
328
  throw new Error('session not started');
228
329
  }
229
- return __classPrivateFieldGet(this, _RealtimeSession_expiresAt, "f") * 1000;
330
+ return this.#expiresAt * 1000;
230
331
  }
231
332
  queueMsg(command) {
232
- __classPrivateFieldGet(this, _RealtimeSession_sendQueue, "f").put(command);
333
+ this.#sendQueue.put(command);
334
+ }
335
+ /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
336
+ /// with large amounts of base64 audio data.
337
+ #loggableEvent(event, maxLength = 30) {
338
+ const untypedEvent = {};
339
+ for (const [key, value] of Object.entries(event)) {
340
+ if (value !== undefined) {
341
+ untypedEvent[key] = value;
342
+ }
343
+ }
344
+ if (untypedEvent.audio && typeof untypedEvent.audio === 'string') {
345
+ const truncatedData = untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? '…' : '');
346
+ return { ...untypedEvent, audio: truncatedData };
347
+ }
348
+ if (untypedEvent.delta &&
349
+ typeof untypedEvent.delta === 'string' &&
350
+ event.type === 'response.audio.delta') {
351
+ const truncatedDelta = untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? '…' : '');
352
+ return { ...untypedEvent, delta: truncatedDelta };
353
+ }
354
+ return untypedEvent;
233
355
  }
234
- sessionUpdate({ modalities = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").modalities, instructions = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").instructions, voice = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").voice, inputAudioFormat = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").inputAudioFormat, outputAudioFormat = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").outputAudioFormat, inputAudioTranscription = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").inputAudioTranscription, turnDetection = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").turnDetection, temperature = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").temperature, maxResponseOutputTokens = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").maxResponseOutputTokens, toolChoice = 'auto', }) {
235
- __classPrivateFieldSet(this, _RealtimeSession_opts, {
356
+ sessionUpdate({ modalities = this.#opts.modalities, instructions = this.#opts.instructions, voice = this.#opts.voice, inputAudioFormat = this.#opts.inputAudioFormat, outputAudioFormat = this.#opts.outputAudioFormat, inputAudioTranscription = this.#opts.inputAudioTranscription, turnDetection = this.#opts.turnDetection, temperature = this.#opts.temperature, maxResponseOutputTokens = this.#opts.maxResponseOutputTokens, toolChoice = 'auto', }) {
357
+ this.#opts = {
236
358
  modalities,
237
359
  instructions,
238
360
  voice,
@@ -242,15 +364,15 @@ export class RealtimeSession extends multimodal.RealtimeSession {
242
364
  turnDetection,
243
365
  temperature,
244
366
  maxResponseOutputTokens,
245
- model: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").model,
246
- apiKey: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").apiKey,
247
- baseURL: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").baseURL,
248
- isAzure: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").isAzure,
249
- apiVersion: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").apiVersion,
250
- entraToken: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").entraToken,
251
- }, "f");
252
- const tools = __classPrivateFieldGet(this, _RealtimeSession_fncCtx, "f")
253
- ? Object.entries(__classPrivateFieldGet(this, _RealtimeSession_fncCtx, "f")).map(([name, func]) => ({
367
+ model: this.#opts.model,
368
+ apiKey: this.#opts.apiKey,
369
+ baseURL: this.#opts.baseURL,
370
+ isAzure: this.#opts.isAzure,
371
+ apiVersion: this.#opts.apiVersion,
372
+ entraToken: this.#opts.entraToken,
373
+ };
374
+ const tools = this.#fncCtx
375
+ ? Object.entries(this.#fncCtx).map(([name, func]) => ({
254
376
  type: 'function',
255
377
  name,
256
378
  description: func.description,
@@ -260,386 +382,408 @@ export class RealtimeSession extends multimodal.RealtimeSession {
260
382
  const sessionUpdateEvent = {
261
383
  type: 'session.update',
262
384
  session: {
263
- modalities: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").modalities,
264
- instructions: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").instructions,
265
- voice: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").voice,
266
- input_audio_format: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").inputAudioFormat,
267
- output_audio_format: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").outputAudioFormat,
268
- input_audio_transcription: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").inputAudioTranscription,
269
- turn_detection: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").turnDetection,
270
- temperature: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").temperature,
271
- max_response_output_tokens: __classPrivateFieldGet(this, _RealtimeSession_opts, "f").maxResponseOutputTokens === Infinity
385
+ modalities: this.#opts.modalities,
386
+ instructions: this.#opts.instructions,
387
+ voice: this.#opts.voice,
388
+ input_audio_format: this.#opts.inputAudioFormat,
389
+ output_audio_format: this.#opts.outputAudioFormat,
390
+ input_audio_transcription: this.#opts.inputAudioTranscription,
391
+ turn_detection: this.#opts.turnDetection,
392
+ temperature: this.#opts.temperature,
393
+ max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity
272
394
  ? 'inf'
273
- : __classPrivateFieldGet(this, _RealtimeSession_opts, "f").maxResponseOutputTokens,
395
+ : this.#opts.maxResponseOutputTokens,
274
396
  tools,
275
397
  tool_choice: toolChoice,
276
398
  },
277
399
  };
278
- if (__classPrivateFieldGet(this, _RealtimeSession_opts, "f").isAzure && __classPrivateFieldGet(this, _RealtimeSession_opts, "f").maxResponseOutputTokens === Infinity) {
400
+ if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
279
401
  // microsoft doesn't support inf for max_response_output_tokens, but accepts no args
280
402
  sessionUpdateEvent.session.max_response_output_tokens = undefined;
281
403
  }
282
404
  this.queueMsg(sessionUpdateEvent);
283
405
  }
406
+ #start() {
407
+ return new Promise(async (resolve, reject) => {
408
+ const headers = {
409
+ 'User-Agent': 'LiveKit-Agents-JS',
410
+ };
411
+ if (this.#opts.isAzure) {
412
+ // Microsoft API has two ways of authentication
413
+ // 1. Entra token set as `Bearer` token
414
+ // 2. API key set as `api_key` header (also accepts query string)
415
+ if (this.#opts.entraToken) {
416
+ headers.Authorization = `Bearer ${this.#opts.entraToken}`;
417
+ }
418
+ else if (this.#opts.apiKey) {
419
+ headers['api-key'] = this.#opts.apiKey;
420
+ }
421
+ else {
422
+ reject(new Error('Microsoft API key or entraToken is required'));
423
+ return;
424
+ }
425
+ }
426
+ else {
427
+ headers.Authorization = `Bearer ${this.#opts.apiKey}`;
428
+ headers['OpenAI-Beta'] = 'realtime=v1';
429
+ }
430
+ const url = new URL([this.#opts.baseURL, 'realtime'].join('/'));
431
+ if (url.protocol === 'https:') {
432
+ url.protocol = 'wss:';
433
+ }
434
+ // Construct query parameters
435
+ const queryParams = {};
436
+ if (this.#opts.isAzure) {
437
+ queryParams['api-version'] = '2024-10-01-preview';
438
+ queryParams['deployment'] = this.#opts.model;
439
+ }
440
+ else {
441
+ queryParams['model'] = this.#opts.model;
442
+ }
443
+ for (const [key, value] of Object.entries(queryParams)) {
444
+ url.searchParams.set(key, value);
445
+ }
446
+ console.debug('Connecting to OpenAI Realtime API at ', url.toString());
447
+ this.#ws = new WebSocket(url.toString(), {
448
+ headers: headers,
449
+ });
450
+ this.#ws.onerror = (error) => {
451
+ reject(new Error('OpenAI Realtime WebSocket error: ' + error.message));
452
+ };
453
+ await once(this.#ws, 'open');
454
+ this.#closing = false;
455
+ this.#ws.onmessage = (message) => {
456
+ const event = JSON.parse(message.data);
457
+ this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
458
+ switch (event.type) {
459
+ case 'error':
460
+ this.#handleError(event);
461
+ break;
462
+ case 'session.created':
463
+ this.#handleSessionCreated(event);
464
+ break;
465
+ case 'session.updated':
466
+ this.#handleSessionUpdated(event);
467
+ break;
468
+ case 'conversation.created':
469
+ this.#handleConversationCreated(event);
470
+ break;
471
+ case 'input_audio_buffer.committed':
472
+ this.#handleInputAudioBufferCommitted(event);
473
+ break;
474
+ case 'input_audio_buffer.cleared':
475
+ this.#handleInputAudioBufferCleared(event);
476
+ break;
477
+ case 'input_audio_buffer.speech_started':
478
+ this.#handleInputAudioBufferSpeechStarted(event);
479
+ break;
480
+ case 'input_audio_buffer.speech_stopped':
481
+ this.#handleInputAudioBufferSpeechStopped(event);
482
+ break;
483
+ case 'conversation.item.created':
484
+ this.#handleConversationItemCreated(event);
485
+ break;
486
+ case 'conversation.item.input_audio_transcription.completed':
487
+ this.#handleConversationItemInputAudioTranscriptionCompleted(event);
488
+ break;
489
+ case 'conversation.item.input_audio_transcription.failed':
490
+ this.#handleConversationItemInputAudioTranscriptionFailed(event);
491
+ break;
492
+ case 'conversation.item.truncated':
493
+ this.#handleConversationItemTruncated(event);
494
+ break;
495
+ case 'conversation.item.deleted':
496
+ this.#handleConversationItemDeleted(event);
497
+ break;
498
+ case 'response.created':
499
+ this.#handleResponseCreated(event);
500
+ break;
501
+ case 'response.done':
502
+ this.#handleResponseDone(event);
503
+ break;
504
+ case 'response.output_item.added':
505
+ this.#handleResponseOutputItemAdded(event);
506
+ break;
507
+ case 'response.output_item.done':
508
+ this.#handleResponseOutputItemDone(event);
509
+ break;
510
+ case 'response.content_part.added':
511
+ this.#handleResponseContentPartAdded(event);
512
+ break;
513
+ case 'response.content_part.done':
514
+ this.#handleResponseContentPartDone(event);
515
+ break;
516
+ case 'response.text.delta':
517
+ this.#handleResponseTextDelta(event);
518
+ break;
519
+ case 'response.text.done':
520
+ this.#handleResponseTextDone(event);
521
+ break;
522
+ case 'response.audio_transcript.delta':
523
+ this.#handleResponseAudioTranscriptDelta(event);
524
+ break;
525
+ case 'response.audio_transcript.done':
526
+ this.#handleResponseAudioTranscriptDone(event);
527
+ break;
528
+ case 'response.audio.delta':
529
+ this.#handleResponseAudioDelta(event);
530
+ break;
531
+ case 'response.audio.done':
532
+ this.#handleResponseAudioDone(event);
533
+ break;
534
+ case 'response.function_call_arguments.delta':
535
+ this.#handleResponseFunctionCallArgumentsDelta(event);
536
+ break;
537
+ case 'response.function_call_arguments.done':
538
+ this.#handleResponseFunctionCallArgumentsDone(event);
539
+ break;
540
+ case 'rate_limits.updated':
541
+ this.#handleRateLimitsUpdated(event);
542
+ break;
543
+ }
544
+ };
545
+ const sendTask = async () => {
546
+ while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
547
+ try {
548
+ const event = await this.#sendQueue.get();
549
+ if (event.type !== 'input_audio_buffer.append') {
550
+ this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
551
+ }
552
+ this.#ws.send(JSON.stringify(event));
553
+ }
554
+ catch (error) {
555
+ this.#logger.error('Error sending event:', error);
556
+ }
557
+ }
558
+ };
559
+ sendTask();
560
+ this.#ws.onclose = () => {
561
+ if (this.#expiresAt && Date.now() >= this.#expiresAt * 1000) {
562
+ this.#closing = true;
563
+ }
564
+ if (!this.#closing) {
565
+ reject(new Error('OpenAI Realtime connection closed unexpectedly'));
566
+ }
567
+ this.#ws = null;
568
+ resolve();
569
+ };
570
+ });
571
+ }
284
572
  async close() {
285
- if (!__classPrivateFieldGet(this, _RealtimeSession_ws, "f"))
573
+ if (!this.#ws)
286
574
  return;
287
- __classPrivateFieldSet(this, _RealtimeSession_closing, true, "f");
288
- __classPrivateFieldGet(this, _RealtimeSession_ws, "f").close();
289
- await __classPrivateFieldGet(this, _RealtimeSession_task, "f");
575
+ this.#closing = true;
576
+ this.#ws.close();
577
+ await this.#task;
578
+ }
579
+ #getContent(ptr) {
580
+ const response = this.#pendingResponses[ptr.response_id];
581
+ const output = response.output[ptr.output_index];
582
+ const content = output.content[ptr.content_index];
583
+ return content;
584
+ }
585
+ #handleError(event) {
586
+ this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
587
+ }
588
+ #handleSessionCreated(event) {
589
+ this.#sessionId = event.session.id;
590
+ this.#expiresAt = event.session.expires_at;
591
+ this.#logger = this.#logger.child({ sessionId: this.#sessionId });
592
+ }
593
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
594
+ #handleSessionUpdated(event) { }
595
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
596
+ #handleConversationCreated(event) { }
597
+ #handleInputAudioBufferCommitted(event) {
598
+ this.emit('input_speech_committed', {
599
+ itemId: event.item_id,
600
+ });
290
601
  }
291
- }
292
- _RealtimeSession_fncCtx = new WeakMap(), _RealtimeSession_opts = new WeakMap(), _RealtimeSession_pendingResponses = new WeakMap(), _RealtimeSession_sessionId = new WeakMap(), _RealtimeSession_ws = new WeakMap(), _RealtimeSession_expiresAt = new WeakMap(), _RealtimeSession_logger = new WeakMap(), _RealtimeSession_task = new WeakMap(), _RealtimeSession_closing = new WeakMap(), _RealtimeSession_sendQueue = new WeakMap(), _RealtimeSession_instances = new WeakSet(), _RealtimeSession_loggableEvent = function _RealtimeSession_loggableEvent(event, maxLength = 30) {
293
- const untypedEvent = {};
294
- for (const [key, value] of Object.entries(event)) {
295
- if (value !== undefined) {
296
- untypedEvent[key] = value;
297
- }
602
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
603
+ #handleInputAudioBufferCleared(event) { }
604
+ #handleInputAudioBufferSpeechStarted(
605
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
606
+ event) {
607
+ this.emit('input_speech_started', {
608
+ itemId: event.item_id,
609
+ });
298
610
  }
299
- if (untypedEvent.audio && typeof untypedEvent.audio === 'string') {
300
- const truncatedData = untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? '…' : '');
301
- return Object.assign(Object.assign({}, untypedEvent), { audio: truncatedData });
302
- }
303
- if (untypedEvent.delta &&
304
- typeof untypedEvent.delta === 'string' &&
305
- event.type === 'response.audio.delta') {
306
- const truncatedDelta = untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? '…' : '');
307
- return Object.assign(Object.assign({}, untypedEvent), { delta: truncatedDelta });
308
- }
309
- return untypedEvent;
310
- }, _RealtimeSession_start = function _RealtimeSession_start() {
311
- return new Promise(async (resolve, reject) => {
312
- const headers = {
313
- 'User-Agent': 'LiveKit-Agents-JS',
611
+ #handleInputAudioBufferSpeechStopped(
612
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
613
+ event) {
614
+ this.emit('input_speech_stopped');
615
+ }
616
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
617
+ #handleConversationItemCreated(event) { }
618
+ #handleConversationItemInputAudioTranscriptionCompleted(event) {
619
+ const transcript = event.transcript;
620
+ this.emit('input_speech_transcription_completed', {
621
+ itemId: event.item_id,
622
+ transcript: transcript,
623
+ });
624
+ }
625
+ #handleConversationItemInputAudioTranscriptionFailed(event) {
626
+ const error = event.error;
627
+ this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
628
+ this.emit('input_speech_transcription_failed', {
629
+ itemId: event.item_id,
630
+ message: error.message,
631
+ });
632
+ }
633
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
634
+ #handleConversationItemTruncated(event) { }
635
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
636
+ #handleConversationItemDeleted(event) { }
637
+ #handleResponseCreated(responseCreated) {
638
+ const response = responseCreated.response;
639
+ const doneFut = new Future();
640
+ const newResponse = {
641
+ id: response.id,
642
+ status: response.status,
643
+ statusDetails: response.status_details,
644
+ usage: null,
645
+ output: [],
646
+ doneFut: doneFut,
314
647
  };
315
- if (__classPrivateFieldGet(this, _RealtimeSession_opts, "f").isAzure) {
316
- // Microsoft API has two ways of authentication
317
- // 1. Entra token set as `Bearer` token
318
- // 2. API key set as `api_key` header (also accepts query string)
319
- if (__classPrivateFieldGet(this, _RealtimeSession_opts, "f").entraToken) {
320
- headers.Authorization = `Bearer ${__classPrivateFieldGet(this, _RealtimeSession_opts, "f").entraToken}`;
321
- }
322
- else if (__classPrivateFieldGet(this, _RealtimeSession_opts, "f").apiKey) {
323
- headers['api-key'] = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").apiKey;
324
- }
325
- else {
326
- reject(new Error('Microsoft API key or entraToken is required'));
327
- return;
328
- }
329
- }
330
- else {
331
- headers.Authorization = `Bearer ${__classPrivateFieldGet(this, _RealtimeSession_opts, "f").apiKey}`;
332
- headers['OpenAI-Beta'] = 'realtime=v1';
648
+ this.#pendingResponses[newResponse.id] = newResponse;
649
+ this.emit('response_created', newResponse);
650
+ }
651
+ #handleResponseDone(event) {
652
+ const responseData = event.response;
653
+ const responseId = responseData.id;
654
+ const response = this.#pendingResponses[responseId];
655
+ response.status = responseData.status;
656
+ response.statusDetails = responseData.status_details;
657
+ response.usage = responseData.usage;
658
+ this.#pendingResponses[responseId] = response;
659
+ response.doneFut.resolve();
660
+ this.emit('response_done', response);
661
+ }
662
+ #handleResponseOutputItemAdded(event) {
663
+ const responseId = event.response_id;
664
+ const response = this.#pendingResponses[responseId];
665
+ const itemData = event.item;
666
+ if (itemData.type !== 'message' && itemData.type !== 'function_call') {
667
+ throw new Error(`Unexpected item type: ${itemData.type}`);
333
668
  }
334
- const url = new URL([__classPrivateFieldGet(this, _RealtimeSession_opts, "f").baseURL, 'realtime'].join('/'));
335
- if (url.protocol === 'https:') {
336
- url.protocol = 'wss:';
337
- }
338
- // Construct query parameters
339
- const queryParams = {};
340
- if (__classPrivateFieldGet(this, _RealtimeSession_opts, "f").isAzure) {
341
- queryParams['api-version'] = '2024-10-01-preview';
342
- queryParams['deployment'] = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").model;
669
+ let role;
670
+ if (itemData.type === 'function_call') {
671
+ role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
343
672
  }
344
673
  else {
345
- queryParams['model'] = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").model;
346
- }
347
- for (const [key, value] of Object.entries(queryParams)) {
348
- url.searchParams.set(key, value);
674
+ role = itemData.role;
349
675
  }
350
- console.debug('Connecting to OpenAI Realtime API at ', url.toString());
351
- __classPrivateFieldSet(this, _RealtimeSession_ws, new WebSocket(url.toString(), {
352
- headers: headers,
353
- }), "f");
354
- __classPrivateFieldGet(this, _RealtimeSession_ws, "f").onerror = (error) => {
355
- reject(new Error('OpenAI Realtime WebSocket error: ' + error.message));
356
- };
357
- await once(__classPrivateFieldGet(this, _RealtimeSession_ws, "f"), 'open');
358
- __classPrivateFieldSet(this, _RealtimeSession_closing, false, "f");
359
- __classPrivateFieldGet(this, _RealtimeSession_ws, "f").onmessage = (message) => {
360
- const event = JSON.parse(message.data);
361
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").debug(`<- ${JSON.stringify(__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_loggableEvent).call(this, event))}`);
362
- switch (event.type) {
363
- case 'error':
364
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleError).call(this, event);
365
- break;
366
- case 'session.created':
367
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleSessionCreated).call(this, event);
368
- break;
369
- case 'session.updated':
370
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleSessionUpdated).call(this, event);
371
- break;
372
- case 'conversation.created':
373
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationCreated).call(this, event);
374
- break;
375
- case 'input_audio_buffer.committed':
376
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferCommitted).call(this, event);
377
- break;
378
- case 'input_audio_buffer.cleared':
379
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferCleared).call(this, event);
380
- break;
381
- case 'input_audio_buffer.speech_started':
382
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferSpeechStarted).call(this, event);
383
- break;
384
- case 'input_audio_buffer.speech_stopped':
385
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferSpeechStopped).call(this, event);
386
- break;
387
- case 'conversation.item.created':
388
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemCreated).call(this, event);
389
- break;
390
- case 'conversation.item.input_audio_transcription.completed':
391
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted).call(this, event);
392
- break;
393
- case 'conversation.item.input_audio_transcription.failed':
394
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed).call(this, event);
395
- break;
396
- case 'conversation.item.truncated':
397
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemTruncated).call(this, event);
398
- break;
399
- case 'conversation.item.deleted':
400
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemDeleted).call(this, event);
401
- break;
402
- case 'response.created':
403
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseCreated).call(this, event);
404
- break;
405
- case 'response.done':
406
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseDone).call(this, event);
407
- break;
408
- case 'response.output_item.added':
409
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseOutputItemAdded).call(this, event);
410
- break;
411
- case 'response.output_item.done':
412
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseOutputItemDone).call(this, event);
413
- break;
414
- case 'response.content_part.added':
415
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseContentPartAdded).call(this, event);
416
- break;
417
- case 'response.content_part.done':
418
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseContentPartDone).call(this, event);
419
- break;
420
- case 'response.text.delta':
421
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseTextDelta).call(this, event);
422
- break;
423
- case 'response.text.done':
424
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseTextDone).call(this, event);
425
- break;
426
- case 'response.audio_transcript.delta':
427
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioTranscriptDelta).call(this, event);
428
- break;
429
- case 'response.audio_transcript.done':
430
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioTranscriptDone).call(this, event);
431
- break;
432
- case 'response.audio.delta':
433
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioDelta).call(this, event);
434
- break;
435
- case 'response.audio.done':
436
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioDone).call(this, event);
437
- break;
438
- case 'response.function_call_arguments.delta':
439
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseFunctionCallArgumentsDelta).call(this, event);
440
- break;
441
- case 'response.function_call_arguments.done':
442
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseFunctionCallArgumentsDone).call(this, event);
443
- break;
444
- case 'rate_limits.updated':
445
- __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleRateLimitsUpdated).call(this, event);
446
- break;
447
- }
676
+ const newOutput = {
677
+ responseId: responseId,
678
+ itemId: itemData.id,
679
+ outputIndex: event.output_index,
680
+ type: itemData.type,
681
+ role: role,
682
+ content: [],
683
+ doneFut: new Future(),
448
684
  };
449
- const sendTask = async () => {
450
- while (__classPrivateFieldGet(this, _RealtimeSession_ws, "f") && !__classPrivateFieldGet(this, _RealtimeSession_closing, "f") && __classPrivateFieldGet(this, _RealtimeSession_ws, "f").readyState === WebSocket.OPEN) {
451
- try {
452
- const event = await __classPrivateFieldGet(this, _RealtimeSession_sendQueue, "f").get();
453
- if (event.type !== 'input_audio_buffer.append') {
454
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").debug(`-> ${JSON.stringify(__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_loggableEvent).call(this, event))}`);
455
- }
456
- __classPrivateFieldGet(this, _RealtimeSession_ws, "f").send(JSON.stringify(event));
457
- }
458
- catch (error) {
459
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").error('Error sending event:', error);
460
- }
461
- }
462
- };
463
- sendTask();
464
- __classPrivateFieldGet(this, _RealtimeSession_ws, "f").onclose = () => {
465
- if (__classPrivateFieldGet(this, _RealtimeSession_expiresAt, "f") && Date.now() >= __classPrivateFieldGet(this, _RealtimeSession_expiresAt, "f") * 1000) {
466
- __classPrivateFieldSet(this, _RealtimeSession_closing, true, "f");
685
+ response.output.push(newOutput);
686
+ this.emit('response_output_added', newOutput);
687
+ }
688
+ #handleResponseOutputItemDone(event) {
689
+ const responseId = event.response_id;
690
+ const response = this.#pendingResponses[responseId];
691
+ const outputIndex = event.output_index;
692
+ const output = response.output[outputIndex];
693
+ if (output.type === 'function_call') {
694
+ if (!this.#fncCtx) {
695
+ this.#logger.error('function call received but no fncCtx is available');
696
+ return;
467
697
  }
468
- if (!__classPrivateFieldGet(this, _RealtimeSession_closing, "f")) {
469
- reject(new Error('OpenAI Realtime connection closed unexpectedly'));
698
+ // parse the arguments and call the function inside the fnc_ctx
699
+ const item = event.item;
700
+ if (item.type !== 'function_call') {
701
+ throw new Error('Expected function_call item');
470
702
  }
471
- __classPrivateFieldSet(this, _RealtimeSession_ws, null, "f");
472
- resolve();
473
- };
474
- });
475
- }, _RealtimeSession_getContent = function _RealtimeSession_getContent(ptr) {
476
- const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[ptr.response_id];
477
- const output = response.output[ptr.output_index];
478
- const content = output.content[ptr.content_index];
479
- return content;
480
- }, _RealtimeSession_handleError = function _RealtimeSession_handleError(event) {
481
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
482
- }, _RealtimeSession_handleSessionCreated = function _RealtimeSession_handleSessionCreated(event) {
483
- __classPrivateFieldSet(this, _RealtimeSession_sessionId, event.session.id, "f");
484
- __classPrivateFieldSet(this, _RealtimeSession_expiresAt, event.session.expires_at, "f");
485
- __classPrivateFieldSet(this, _RealtimeSession_logger, __classPrivateFieldGet(this, _RealtimeSession_logger, "f").child({ sessionId: __classPrivateFieldGet(this, _RealtimeSession_sessionId, "f") }), "f");
486
- }, _RealtimeSession_handleSessionUpdated = function _RealtimeSession_handleSessionUpdated(event) { }, _RealtimeSession_handleConversationCreated = function _RealtimeSession_handleConversationCreated(event) { }, _RealtimeSession_handleInputAudioBufferCommitted = function _RealtimeSession_handleInputAudioBufferCommitted(event) {
487
- this.emit('input_speech_committed', {
488
- itemId: event.item_id,
489
- });
490
- }, _RealtimeSession_handleInputAudioBufferCleared = function _RealtimeSession_handleInputAudioBufferCleared(event) { }, _RealtimeSession_handleInputAudioBufferSpeechStarted = function _RealtimeSession_handleInputAudioBufferSpeechStarted(
491
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
492
- event) {
493
- this.emit('input_speech_started', {
494
- itemId: event.item_id,
495
- });
496
- }, _RealtimeSession_handleInputAudioBufferSpeechStopped = function _RealtimeSession_handleInputAudioBufferSpeechStopped(
497
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
498
- event) {
499
- this.emit('input_speech_stopped');
500
- }, _RealtimeSession_handleConversationItemCreated = function _RealtimeSession_handleConversationItemCreated(event) { }, _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted = function _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted(event) {
501
- const transcript = event.transcript;
502
- this.emit('input_speech_transcription_completed', {
503
- itemId: event.item_id,
504
- transcript: transcript,
505
- });
506
- }, _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed = function _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed(event) {
507
- const error = event.error;
508
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
509
- this.emit('input_speech_transcription_failed', {
510
- itemId: event.item_id,
511
- message: error.message,
512
- });
513
- }, _RealtimeSession_handleConversationItemTruncated = function _RealtimeSession_handleConversationItemTruncated(event) { }, _RealtimeSession_handleConversationItemDeleted = function _RealtimeSession_handleConversationItemDeleted(event) { }, _RealtimeSession_handleResponseCreated = function _RealtimeSession_handleResponseCreated(responseCreated) {
514
- const response = responseCreated.response;
515
- const doneFut = new Future();
516
- const newResponse = {
517
- id: response.id,
518
- status: response.status,
519
- statusDetails: response.status_details,
520
- output: [],
521
- doneFut: doneFut,
522
- };
523
- __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[newResponse.id] = newResponse;
524
- this.emit('response_created', newResponse);
525
- }, _RealtimeSession_handleResponseDone = function _RealtimeSession_handleResponseDone(event) {
526
- const responseData = event.response;
527
- const responseId = responseData.id;
528
- const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId];
529
- response.status = responseData.status;
530
- response.statusDetails = responseData.status_details;
531
- __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId] = response;
532
- response.doneFut.resolve();
533
- this.emit('response_done', response);
534
- }, _RealtimeSession_handleResponseOutputItemAdded = function _RealtimeSession_handleResponseOutputItemAdded(event) {
535
- const responseId = event.response_id;
536
- const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId];
537
- const itemData = event.item;
538
- if (itemData.type !== 'message' && itemData.type !== 'function_call') {
539
- throw new Error(`Unexpected item type: ${itemData.type}`);
540
- }
541
- let role;
542
- if (itemData.type === 'function_call') {
543
- role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
544
- }
545
- else {
546
- role = itemData.role;
547
- }
548
- const newOutput = {
549
- responseId: responseId,
550
- itemId: itemData.id,
551
- outputIndex: event.output_index,
552
- type: itemData.type,
553
- role: role,
554
- content: [],
555
- doneFut: new Future(),
556
- };
557
- response.output.push(newOutput);
558
- this.emit('response_output_added', newOutput);
559
- }, _RealtimeSession_handleResponseOutputItemDone = function _RealtimeSession_handleResponseOutputItemDone(event) {
560
- const responseId = event.response_id;
561
- const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId];
562
- const outputIndex = event.output_index;
563
- const output = response.output[outputIndex];
564
- if (output.type === 'function_call') {
565
- if (!__classPrivateFieldGet(this, _RealtimeSession_fncCtx, "f")) {
566
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").error('function call received but no fncCtx is available');
567
- return;
568
- }
569
- // parse the arguments and call the function inside the fnc_ctx
570
- const item = event.item;
571
- if (item.type !== 'function_call') {
572
- throw new Error('Expected function_call item');
573
- }
574
- this.emit('function_call_started', {
575
- callId: item.call_id,
576
- });
577
- const parsedArgs = JSON.parse(item.arguments);
578
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").debug(`[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`);
579
- __classPrivateFieldGet(this, _RealtimeSession_fncCtx, "f")[item.name].execute(parsedArgs).then((content) => {
580
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
581
- this.emit('function_call_completed', {
703
+ this.emit('function_call_started', {
582
704
  callId: item.call_id,
583
705
  });
584
- this.conversation.item.create({
585
- type: 'function_call_output',
586
- call_id: item.call_id,
587
- output: content,
588
- }, output.itemId);
589
- this.response.create();
590
- }, (error) => {
591
- __classPrivateFieldGet(this, _RealtimeSession_logger, "f").error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
592
- // TODO: send it back up as failed?
593
- this.emit('function_call_failed', {
594
- callId: item.call_id,
706
+ const parsedArgs = JSON.parse(item.arguments);
707
+ this.#logger.debug(`[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`);
708
+ this.#fncCtx[item.name].execute(parsedArgs).then((content) => {
709
+ this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
710
+ this.emit('function_call_completed', {
711
+ callId: item.call_id,
712
+ });
713
+ this.conversation.item.create(llm.ChatMessage.createToolFromFunctionResult({
714
+ name: item.name,
715
+ toolCallId: item.call_id,
716
+ result: content,
717
+ }), output.itemId);
718
+ this.response.create();
719
+ }, (error) => {
720
+ this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
721
+ // TODO: send it back up as failed?
722
+ this.emit('function_call_failed', {
723
+ callId: item.call_id,
724
+ });
595
725
  });
596
- });
597
- }
598
- output.doneFut.resolve();
599
- this.emit('response_output_done', output);
600
- }, _RealtimeSession_handleResponseContentPartAdded = function _RealtimeSession_handleResponseContentPartAdded(event) {
601
- const responseId = event.response_id;
602
- const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId];
603
- const outputIndex = event.output_index;
604
- const output = response.output[outputIndex];
605
- const textStream = new AsyncIterableQueue();
606
- const audioStream = new AsyncIterableQueue();
607
- const newContent = {
608
- responseId: responseId,
609
- itemId: event.item_id,
610
- outputIndex: outputIndex,
611
- contentIndex: event.content_index,
612
- text: '',
613
- audio: [],
614
- textStream: textStream,
615
- audioStream: audioStream,
616
- toolCalls: [],
617
- };
618
- output.content.push(newContent);
619
- this.emit('response_content_added', newContent);
620
- }, _RealtimeSession_handleResponseContentPartDone = function _RealtimeSession_handleResponseContentPartDone(event) {
621
- const content = __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_getContent).call(this, event);
622
- this.emit('response_content_done', content);
623
- }, _RealtimeSession_handleResponseTextDelta = function _RealtimeSession_handleResponseTextDelta(event) { }, _RealtimeSession_handleResponseTextDone = function _RealtimeSession_handleResponseTextDone(event) { }, _RealtimeSession_handleResponseAudioTranscriptDelta = function _RealtimeSession_handleResponseAudioTranscriptDelta(event) {
624
- const content = __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_getContent).call(this, event);
625
- const transcript = event.delta;
626
- content.text += transcript;
627
- content.textStream.put(transcript);
628
- }, _RealtimeSession_handleResponseAudioTranscriptDone = function _RealtimeSession_handleResponseAudioTranscriptDone(event) {
629
- const content = __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_getContent).call(this, event);
630
- content.textStream.close();
631
- }, _RealtimeSession_handleResponseAudioDelta = function _RealtimeSession_handleResponseAudioDelta(event) {
632
- const content = __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_getContent).call(this, event);
633
- const data = Buffer.from(event.delta, 'base64');
634
- const audio = new AudioFrame(new Int16Array(data.buffer), api_proto.SAMPLE_RATE, api_proto.NUM_CHANNELS, data.length / 2);
635
- content.audio.push(audio);
636
- content.audioStream.put(audio);
637
- }, _RealtimeSession_handleResponseAudioDone = function _RealtimeSession_handleResponseAudioDone(event) {
638
- const content = __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_getContent).call(this, event);
639
- content.audioStream.close();
640
- }, _RealtimeSession_handleResponseFunctionCallArgumentsDelta = function _RealtimeSession_handleResponseFunctionCallArgumentsDelta(
641
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
642
- event) { }, _RealtimeSession_handleResponseFunctionCallArgumentsDone = function _RealtimeSession_handleResponseFunctionCallArgumentsDone(
643
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
644
- event) { }, _RealtimeSession_handleRateLimitsUpdated = function _RealtimeSession_handleRateLimitsUpdated(event) { };
726
+ }
727
+ output.doneFut.resolve();
728
+ this.emit('response_output_done', output);
729
+ }
730
+ #handleResponseContentPartAdded(event) {
731
+ const responseId = event.response_id;
732
+ const response = this.#pendingResponses[responseId];
733
+ const outputIndex = event.output_index;
734
+ const output = response.output[outputIndex];
735
+ const textStream = new AsyncIterableQueue();
736
+ const audioStream = new AsyncIterableQueue();
737
+ const newContent = {
738
+ responseId: responseId,
739
+ itemId: event.item_id,
740
+ outputIndex: outputIndex,
741
+ contentIndex: event.content_index,
742
+ text: '',
743
+ audio: [],
744
+ textStream: textStream,
745
+ audioStream: audioStream,
746
+ toolCalls: [],
747
+ };
748
+ output.content.push(newContent);
749
+ this.emit('response_content_added', newContent);
750
+ }
751
+ #handleResponseContentPartDone(event) {
752
+ const content = this.#getContent(event);
753
+ this.emit('response_content_done', content);
754
+ }
755
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
756
+ #handleResponseTextDelta(event) { }
757
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
758
+ #handleResponseTextDone(event) { }
759
+ #handleResponseAudioTranscriptDelta(event) {
760
+ const content = this.#getContent(event);
761
+ const transcript = event.delta;
762
+ content.text += transcript;
763
+ content.textStream.put(transcript);
764
+ }
765
+ #handleResponseAudioTranscriptDone(event) {
766
+ const content = this.#getContent(event);
767
+ content.textStream.close();
768
+ }
769
+ #handleResponseAudioDelta(event) {
770
+ const content = this.#getContent(event);
771
+ const data = Buffer.from(event.delta, 'base64');
772
+ const audio = new AudioFrame(new Int16Array(data.buffer), api_proto.SAMPLE_RATE, api_proto.NUM_CHANNELS, data.length / 2);
773
+ content.audio.push(audio);
774
+ content.audioStream.put(audio);
775
+ }
776
+ #handleResponseAudioDone(event) {
777
+ const content = this.#getContent(event);
778
+ content.audioStream.close();
779
+ }
780
+ #handleResponseFunctionCallArgumentsDelta(
781
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
782
+ event) { }
783
+ #handleResponseFunctionCallArgumentsDone(
784
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
785
+ event) { }
786
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
787
+ #handleRateLimitsUpdated(event) { }
788
+ }
645
789
  //# sourceMappingURL=realtime_model.js.map