@livekit/agents-plugin-openai 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +18 -0
  2. package/dist/index.cjs +55 -0
  3. package/dist/index.cjs.map +1 -0
  4. package/dist/index.js +13 -8
  5. package/dist/index.js.map +1 -1
  6. package/dist/llm.cjs +502 -0
  7. package/dist/llm.cjs.map +1 -0
  8. package/dist/llm.js +435 -424
  9. package/dist/llm.js.map +1 -1
  10. package/dist/models.cjs +17 -0
  11. package/dist/models.cjs.map +1 -0
  12. package/dist/models.js +0 -4
  13. package/dist/models.js.map +1 -1
  14. package/dist/realtime/api_proto.cjs +41 -0
  15. package/dist/realtime/api_proto.cjs.map +1 -0
  16. package/dist/realtime/api_proto.d.ts +1 -1
  17. package/dist/realtime/api_proto.d.ts.map +1 -1
  18. package/dist/realtime/api_proto.js +12 -8
  19. package/dist/realtime/api_proto.js.map +1 -1
  20. package/dist/realtime/index.cjs +25 -0
  21. package/dist/realtime/index.cjs.map +1 -0
  22. package/dist/realtime/index.js +2 -5
  23. package/dist/realtime/index.js.map +1 -1
  24. package/dist/realtime/realtime_model.cjs +878 -0
  25. package/dist/realtime/realtime_model.cjs.map +1 -0
  26. package/dist/realtime/realtime_model.js +828 -777
  27. package/dist/realtime/realtime_model.js.map +1 -1
  28. package/dist/stt.cjs +130 -0
  29. package/dist/stt.cjs.map +1 -0
  30. package/dist/stt.js +99 -102
  31. package/dist/stt.js.map +1 -1
  32. package/dist/tts.cjs +100 -0
  33. package/dist/tts.cjs.map +1 -0
  34. package/dist/tts.d.ts +1 -1
  35. package/dist/tts.d.ts.map +1 -1
  36. package/dist/tts.js +67 -65
  37. package/dist/tts.js.map +1 -1
  38. package/package.json +23 -7
  39. package/src/realtime/api_proto.ts +10 -1
  40. package/src/tts.ts +2 -1
  41. package/.turbo/turbo-build.log +0 -4
  42. package/CHANGELOG.md +0 -148
  43. package/api-extractor.json +0 -20
  44. package/tsconfig.json +0 -16
  45. package/tsconfig.tsbuildinfo +0 -1
@@ -1,800 +1,851 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- import { AsyncIterableQueue, Future, Queue, llm, log, mergeFrames, multimodal, } from '@livekit/agents';
5
- import { AudioFrame } from '@livekit/rtc-node';
6
- import { once } from 'node:events';
7
- import { WebSocket } from 'ws';
8
- import * as api_proto from './api_proto.js';
1
+ import {
2
+ AsyncIterableQueue,
3
+ Future,
4
+ Queue,
5
+ llm,
6
+ log,
7
+ mergeFrames,
8
+ multimodal
9
+ } from "@livekit/agents";
10
+ import { AudioFrame } from "@livekit/rtc-node";
11
+ import { once } from "node:events";
12
+ import { WebSocket } from "ws";
13
+ import * as api_proto from "./api_proto.js";
9
14
  class InputAudioBuffer {
10
- #session;
11
- constructor(session) {
12
- this.#session = session;
13
- }
14
- append(frame) {
15
- this.#session.queueMsg({
16
- type: 'input_audio_buffer.append',
17
- audio: Buffer.from(frame.data.buffer).toString('base64'),
18
- });
19
- }
20
- clear() {
21
- this.#session.queueMsg({
22
- type: 'input_audio_buffer.clear',
23
- });
24
- }
25
- commit() {
26
- this.#session.queueMsg({
27
- type: 'input_audio_buffer.commit',
28
- });
29
- }
15
+ #session;
16
+ constructor(session) {
17
+ this.#session = session;
18
+ }
19
+ append(frame) {
20
+ this.#session.queueMsg({
21
+ type: "input_audio_buffer.append",
22
+ audio: Buffer.from(frame.data.buffer).toString("base64")
23
+ });
24
+ }
25
+ clear() {
26
+ this.#session.queueMsg({
27
+ type: "input_audio_buffer.clear"
28
+ });
29
+ }
30
+ commit() {
31
+ this.#session.queueMsg({
32
+ type: "input_audio_buffer.commit"
33
+ });
34
+ }
30
35
  }
31
36
  class ConversationItem {
32
- #session;
33
- #logger = log();
34
- constructor(session) {
35
- this.#session = session;
36
- }
37
- truncate(itemId, contentIndex, audioEnd) {
38
- this.#session.queueMsg({
39
- type: 'conversation.item.truncate',
40
- item_id: itemId,
41
- content_index: contentIndex,
42
- audio_end_ms: audioEnd,
43
- });
44
- }
45
- delete(itemId) {
46
- this.#session.queueMsg({
47
- type: 'conversation.item.delete',
48
- item_id: itemId,
49
- });
50
- }
51
- create(message, previousItemId) {
52
- if (!message.content) {
53
- return;
37
+ #session;
38
+ #logger = log();
39
+ constructor(session) {
40
+ this.#session = session;
41
+ }
42
+ truncate(itemId, contentIndex, audioEnd) {
43
+ this.#session.queueMsg({
44
+ type: "conversation.item.truncate",
45
+ item_id: itemId,
46
+ content_index: contentIndex,
47
+ audio_end_ms: audioEnd
48
+ });
49
+ }
50
+ delete(itemId) {
51
+ this.#session.queueMsg({
52
+ type: "conversation.item.delete",
53
+ item_id: itemId
54
+ });
55
+ }
56
+ create(message, previousItemId) {
57
+ if (!message.content) {
58
+ return;
59
+ }
60
+ let event;
61
+ if (message.toolCallId) {
62
+ if (typeof message.content !== "string") {
63
+ throw new TypeError("message.content must be a string");
64
+ }
65
+ event = {
66
+ type: "conversation.item.create",
67
+ previous_item_id: previousItemId,
68
+ item: {
69
+ type: "function_call_output",
70
+ call_id: message.toolCallId,
71
+ output: message.content
54
72
  }
55
- let event;
56
- if (message.toolCallId) {
57
- if (typeof message.content !== 'string') {
58
- throw new TypeError('message.content must be a string');
59
- }
60
- event = {
61
- type: 'conversation.item.create',
62
- previous_item_id: previousItemId,
63
- item: {
64
- type: 'function_call_output',
65
- call_id: message.toolCallId,
66
- output: message.content,
67
- },
68
- };
73
+ };
74
+ } else {
75
+ let content = message.content;
76
+ if (!Array.isArray(content)) {
77
+ content = [content];
78
+ }
79
+ if (message.role === llm.ChatRole.USER) {
80
+ const contents = [];
81
+ for (const c of content) {
82
+ if (typeof c === "string") {
83
+ contents.push({
84
+ type: "input_text",
85
+ text: c
86
+ });
87
+ } else if (
88
+ // typescript type guard for determining ChatAudio vs ChatImage
89
+ ((c2) => {
90
+ return c2.frame !== void 0;
91
+ })(c)
92
+ ) {
93
+ contents.push({
94
+ type: "input_audio",
95
+ audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString("base64")
96
+ });
97
+ }
69
98
  }
70
- else {
71
- let content = message.content;
72
- if (!Array.isArray(content)) {
73
- content = [content];
74
- }
75
- if (message.role === llm.ChatRole.USER) {
76
- const contents = [];
77
- for (const c of content) {
78
- if (typeof c === 'string') {
79
- contents.push({
80
- type: 'input_text',
81
- text: c,
82
- });
83
- }
84
- else if (
85
- // typescript type guard for determining ChatAudio vs ChatImage
86
- ((c) => {
87
- return c.frame !== undefined;
88
- })(c)) {
89
- contents.push({
90
- type: 'input_audio',
91
- audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'),
92
- });
93
- }
94
- }
95
- event = {
96
- type: 'conversation.item.create',
97
- previous_item_id: previousItemId,
98
- item: {
99
- type: 'message',
100
- role: 'user',
101
- content: contents,
102
- },
103
- };
104
- }
105
- else if (message.role === llm.ChatRole.ASSISTANT) {
106
- const contents = [];
107
- for (const c of content) {
108
- if (typeof c === 'string') {
109
- contents.push({
110
- type: 'text',
111
- text: c,
112
- });
113
- }
114
- else if (
115
- // typescript type guard for determining ChatAudio vs ChatImage
116
- ((c) => {
117
- return c.frame !== undefined;
118
- })(c)) {
119
- this.#logger.warn('audio content in assistant message is not supported');
120
- }
121
- }
122
- event = {
123
- type: 'conversation.item.create',
124
- previous_item_id: previousItemId,
125
- item: {
126
- type: 'message',
127
- role: 'assistant',
128
- content: contents,
129
- },
130
- };
131
- }
132
- else if (message.role === llm.ChatRole.SYSTEM) {
133
- const contents = [];
134
- for (const c of content) {
135
- if (typeof c === 'string') {
136
- contents.push({
137
- type: 'input_text',
138
- text: c,
139
- });
140
- }
141
- else if (
142
- // typescript type guard for determining ChatAudio vs ChatImage
143
- ((c) => {
144
- return c.frame !== undefined;
145
- })(c)) {
146
- this.#logger.warn('audio content in system message is not supported');
147
- }
148
- }
149
- event = {
150
- type: 'conversation.item.create',
151
- previous_item_id: previousItemId,
152
- item: {
153
- type: 'message',
154
- role: 'system',
155
- content: contents,
156
- },
157
- };
158
- }
159
- else {
160
- this.#logger
161
- .child({ message })
162
- .warn('chat message is not supported inside the realtime API');
163
- return;
164
- }
99
+ event = {
100
+ type: "conversation.item.create",
101
+ previous_item_id: previousItemId,
102
+ item: {
103
+ type: "message",
104
+ role: "user",
105
+ content: contents
106
+ }
107
+ };
108
+ } else if (message.role === llm.ChatRole.ASSISTANT) {
109
+ const contents = [];
110
+ for (const c of content) {
111
+ if (typeof c === "string") {
112
+ contents.push({
113
+ type: "text",
114
+ text: c
115
+ });
116
+ } else if (
117
+ // typescript type guard for determining ChatAudio vs ChatImage
118
+ ((c2) => {
119
+ return c2.frame !== void 0;
120
+ })(c)
121
+ ) {
122
+ this.#logger.warn("audio content in assistant message is not supported");
123
+ }
165
124
  }
166
- this.#session.queueMsg(event);
125
+ event = {
126
+ type: "conversation.item.create",
127
+ previous_item_id: previousItemId,
128
+ item: {
129
+ type: "message",
130
+ role: "assistant",
131
+ content: contents
132
+ }
133
+ };
134
+ } else if (message.role === llm.ChatRole.SYSTEM) {
135
+ const contents = [];
136
+ for (const c of content) {
137
+ if (typeof c === "string") {
138
+ contents.push({
139
+ type: "input_text",
140
+ text: c
141
+ });
142
+ } else if (
143
+ // typescript type guard for determining ChatAudio vs ChatImage
144
+ ((c2) => {
145
+ return c2.frame !== void 0;
146
+ })(c)
147
+ ) {
148
+ this.#logger.warn("audio content in system message is not supported");
149
+ }
150
+ }
151
+ event = {
152
+ type: "conversation.item.create",
153
+ previous_item_id: previousItemId,
154
+ item: {
155
+ type: "message",
156
+ role: "system",
157
+ content: contents
158
+ }
159
+ };
160
+ } else {
161
+ this.#logger.child({ message }).warn("chat message is not supported inside the realtime API");
162
+ return;
163
+ }
167
164
  }
165
+ this.#session.queueMsg(event);
166
+ }
168
167
  }
169
168
  class Conversation {
170
- #session;
171
- constructor(session) {
172
- this.#session = session;
173
- }
174
- get item() {
175
- return new ConversationItem(this.#session);
176
- }
169
+ #session;
170
+ constructor(session) {
171
+ this.#session = session;
172
+ }
173
+ get item() {
174
+ return new ConversationItem(this.#session);
175
+ }
177
176
  }
178
177
  class Response {
179
- #session;
180
- constructor(session) {
181
- this.#session = session;
182
- }
183
- create() {
184
- this.#session.queueMsg({
185
- type: 'response.create',
186
- });
187
- }
188
- cancel() {
189
- this.#session.queueMsg({
190
- type: 'response.cancel',
191
- });
192
- }
178
+ #session;
179
+ constructor(session) {
180
+ this.#session = session;
181
+ }
182
+ create() {
183
+ this.#session.queueMsg({
184
+ type: "response.create"
185
+ });
186
+ }
187
+ cancel() {
188
+ this.#session.queueMsg({
189
+ type: "response.cancel"
190
+ });
191
+ }
193
192
  }
194
- export class RealtimeModel extends multimodal.RealtimeModel {
195
- sampleRate = api_proto.SAMPLE_RATE;
196
- numChannels = api_proto.NUM_CHANNELS;
197
- inFrameSize = api_proto.IN_FRAME_SIZE;
198
- outFrameSize = api_proto.OUT_FRAME_SIZE;
199
- #defaultOpts;
200
- #sessions = [];
201
- static withAzure({ baseURL, azureDeployment, apiVersion = '2024-10-01-preview', apiKey = undefined, entraToken = undefined, instructions = '', modalities = ['text', 'audio'], voice = 'alloy', inputAudioFormat = 'pcm16', outputAudioFormat = 'pcm16', inputAudioTranscription = { model: 'whisper-1' }, turnDetection = { type: 'server_vad' }, temperature = 0.8, maxResponseOutputTokens = Infinity, }) {
202
- return new RealtimeModel({
203
- isAzure: true,
204
- baseURL: new URL('openai', baseURL).toString(),
205
- model: azureDeployment,
206
- apiVersion,
207
- apiKey,
208
- entraToken,
209
- instructions,
210
- modalities,
211
- voice,
212
- inputAudioFormat,
213
- outputAudioFormat,
214
- inputAudioTranscription,
215
- turnDetection,
216
- temperature,
217
- maxResponseOutputTokens,
218
- });
219
- }
220
- constructor({ modalities = ['text', 'audio'], instructions = '', voice = 'alloy', inputAudioFormat = 'pcm16', outputAudioFormat = 'pcm16', inputAudioTranscription = { model: 'whisper-1' }, turnDetection = { type: 'server_vad' }, temperature = 0.8, maxResponseOutputTokens = Infinity, model = 'gpt-4o-realtime-preview-2024-10-01', apiKey = process.env.OPENAI_API_KEY || '', baseURL = api_proto.BASE_URL,
193
+ class RealtimeModel extends multimodal.RealtimeModel {
194
+ sampleRate = api_proto.SAMPLE_RATE;
195
+ numChannels = api_proto.NUM_CHANNELS;
196
+ inFrameSize = api_proto.IN_FRAME_SIZE;
197
+ outFrameSize = api_proto.OUT_FRAME_SIZE;
198
+ #defaultOpts;
199
+ #sessions = [];
200
+ static withAzure({
201
+ baseURL,
202
+ azureDeployment,
203
+ apiVersion = "2024-10-01-preview",
204
+ apiKey = void 0,
205
+ entraToken = void 0,
206
+ instructions = "",
207
+ modalities = ["text", "audio"],
208
+ voice = "alloy",
209
+ inputAudioFormat = "pcm16",
210
+ outputAudioFormat = "pcm16",
211
+ inputAudioTranscription = { model: "whisper-1" },
212
+ turnDetection = { type: "server_vad" },
213
+ temperature = 0.8,
214
+ maxResponseOutputTokens = Infinity
215
+ }) {
216
+ return new RealtimeModel({
217
+ isAzure: true,
218
+ baseURL: new URL("openai", baseURL).toString(),
219
+ model: azureDeployment,
220
+ apiVersion,
221
+ apiKey,
222
+ entraToken,
223
+ instructions,
224
+ modalities,
225
+ voice,
226
+ inputAudioFormat,
227
+ outputAudioFormat,
228
+ inputAudioTranscription,
229
+ turnDetection,
230
+ temperature,
231
+ maxResponseOutputTokens
232
+ });
233
+ }
234
+ constructor({
235
+ modalities = ["text", "audio"],
236
+ instructions = "",
237
+ voice = "alloy",
238
+ inputAudioFormat = "pcm16",
239
+ outputAudioFormat = "pcm16",
240
+ inputAudioTranscription = { model: "whisper-1" },
241
+ turnDetection = { type: "server_vad" },
242
+ temperature = 0.8,
243
+ maxResponseOutputTokens = Infinity,
244
+ model = "gpt-4o-realtime-preview-2024-10-01",
245
+ apiKey = process.env.OPENAI_API_KEY || "",
246
+ baseURL = api_proto.BASE_URL,
221
247
  // used for microsoft
222
- isAzure = false, apiVersion = undefined, entraToken = undefined, }) {
223
- super();
224
- if (apiKey === '') {
225
- throw new Error('OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable');
226
- }
227
- this.#defaultOpts = {
228
- modalities,
229
- instructions,
230
- voice,
231
- inputAudioFormat,
232
- outputAudioFormat,
233
- inputAudioTranscription,
234
- turnDetection,
235
- temperature,
236
- maxResponseOutputTokens,
237
- model,
238
- apiKey,
239
- baseURL,
240
- isAzure,
241
- apiVersion,
242
- entraToken,
243
- };
244
- }
245
- get sessions() {
246
- return this.#sessions;
247
- }
248
- session({ fncCtx, chatCtx, modalities = this.#defaultOpts.modalities, instructions = this.#defaultOpts.instructions, voice = this.#defaultOpts.voice, inputAudioFormat = this.#defaultOpts.inputAudioFormat, outputAudioFormat = this.#defaultOpts.outputAudioFormat, inputAudioTranscription = this.#defaultOpts.inputAudioTranscription, turnDetection = this.#defaultOpts.turnDetection, temperature = this.#defaultOpts.temperature, maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens, }) {
249
- const opts = {
250
- modalities,
251
- instructions,
252
- voice,
253
- inputAudioFormat,
254
- outputAudioFormat,
255
- inputAudioTranscription,
256
- turnDetection,
257
- temperature,
258
- maxResponseOutputTokens,
259
- model: this.#defaultOpts.model,
260
- apiKey: this.#defaultOpts.apiKey,
261
- baseURL: this.#defaultOpts.baseURL,
262
- isAzure: this.#defaultOpts.isAzure,
263
- apiVersion: this.#defaultOpts.apiVersion,
264
- entraToken: this.#defaultOpts.entraToken,
265
- };
266
- const newSession = new RealtimeSession(opts, {
267
- chatCtx: chatCtx || new llm.ChatContext(),
268
- fncCtx,
269
- });
270
- this.#sessions.push(newSession);
271
- return newSession;
272
- }
273
- async close() {
274
- await Promise.allSettled(this.#sessions.map((session) => session.close()));
275
- }
248
+ isAzure = false,
249
+ apiVersion = void 0,
250
+ entraToken = void 0
251
+ }) {
252
+ super();
253
+ if (apiKey === "") {
254
+ throw new Error(
255
+ "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
256
+ );
257
+ }
258
+ this.#defaultOpts = {
259
+ modalities,
260
+ instructions,
261
+ voice,
262
+ inputAudioFormat,
263
+ outputAudioFormat,
264
+ inputAudioTranscription,
265
+ turnDetection,
266
+ temperature,
267
+ maxResponseOutputTokens,
268
+ model,
269
+ apiKey,
270
+ baseURL,
271
+ isAzure,
272
+ apiVersion,
273
+ entraToken
274
+ };
275
+ }
276
+ get sessions() {
277
+ return this.#sessions;
278
+ }
279
+ session({
280
+ fncCtx,
281
+ chatCtx,
282
+ modalities = this.#defaultOpts.modalities,
283
+ instructions = this.#defaultOpts.instructions,
284
+ voice = this.#defaultOpts.voice,
285
+ inputAudioFormat = this.#defaultOpts.inputAudioFormat,
286
+ outputAudioFormat = this.#defaultOpts.outputAudioFormat,
287
+ inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
288
+ turnDetection = this.#defaultOpts.turnDetection,
289
+ temperature = this.#defaultOpts.temperature,
290
+ maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens
291
+ }) {
292
+ const opts = {
293
+ modalities,
294
+ instructions,
295
+ voice,
296
+ inputAudioFormat,
297
+ outputAudioFormat,
298
+ inputAudioTranscription,
299
+ turnDetection,
300
+ temperature,
301
+ maxResponseOutputTokens,
302
+ model: this.#defaultOpts.model,
303
+ apiKey: this.#defaultOpts.apiKey,
304
+ baseURL: this.#defaultOpts.baseURL,
305
+ isAzure: this.#defaultOpts.isAzure,
306
+ apiVersion: this.#defaultOpts.apiVersion,
307
+ entraToken: this.#defaultOpts.entraToken
308
+ };
309
+ const newSession = new RealtimeSession(opts, {
310
+ chatCtx: chatCtx || new llm.ChatContext(),
311
+ fncCtx
312
+ });
313
+ this.#sessions.push(newSession);
314
+ return newSession;
315
+ }
316
+ async close() {
317
+ await Promise.allSettled(this.#sessions.map((session) => session.close()));
318
+ }
276
319
  }
277
- export class RealtimeSession extends multimodal.RealtimeSession {
278
- #chatCtx = undefined;
279
- #fncCtx = undefined;
280
- #opts;
281
- #pendingResponses = {};
282
- #sessionId = 'not-connected';
283
- #ws = null;
284
- #expiresAt = null;
285
- #logger = log();
286
- #task;
287
- #closing = true;
288
- #sendQueue = new Queue();
289
- constructor(opts, { fncCtx, chatCtx }) {
290
- super();
291
- this.#opts = opts;
292
- this.#chatCtx = chatCtx;
293
- this.#fncCtx = fncCtx;
294
- this.#task = this.#start();
295
- this.sessionUpdate({
296
- modalities: this.#opts.modalities,
297
- instructions: this.#opts.instructions,
298
- voice: this.#opts.voice,
299
- inputAudioFormat: this.#opts.inputAudioFormat,
300
- outputAudioFormat: this.#opts.outputAudioFormat,
301
- inputAudioTranscription: this.#opts.inputAudioTranscription,
302
- turnDetection: this.#opts.turnDetection,
303
- temperature: this.#opts.temperature,
304
- maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
305
- toolChoice: 'auto',
306
- });
307
- }
308
- get chatCtx() {
309
- return this.#chatCtx;
310
- }
311
- get fncCtx() {
312
- return this.#fncCtx;
313
- }
314
- set fncCtx(ctx) {
315
- this.#fncCtx = ctx;
316
- }
317
- get conversation() {
318
- return new Conversation(this);
319
- }
320
- get inputAudioBuffer() {
321
- return new InputAudioBuffer(this);
322
- }
323
- get response() {
324
- return new Response(this);
325
- }
326
- get expiration() {
327
- if (!this.#expiresAt) {
328
- throw new Error('session not started');
320
+ class RealtimeSession extends multimodal.RealtimeSession {
321
+ #chatCtx = void 0;
322
+ #fncCtx = void 0;
323
+ #opts;
324
+ #pendingResponses = {};
325
+ #sessionId = "not-connected";
326
+ #ws = null;
327
+ #expiresAt = null;
328
+ #logger = log();
329
+ #task;
330
+ #closing = true;
331
+ #sendQueue = new Queue();
332
+ constructor(opts, { fncCtx, chatCtx }) {
333
+ super();
334
+ this.#opts = opts;
335
+ this.#chatCtx = chatCtx;
336
+ this.#fncCtx = fncCtx;
337
+ this.#task = this.#start();
338
+ this.sessionUpdate({
339
+ modalities: this.#opts.modalities,
340
+ instructions: this.#opts.instructions,
341
+ voice: this.#opts.voice,
342
+ inputAudioFormat: this.#opts.inputAudioFormat,
343
+ outputAudioFormat: this.#opts.outputAudioFormat,
344
+ inputAudioTranscription: this.#opts.inputAudioTranscription,
345
+ turnDetection: this.#opts.turnDetection,
346
+ temperature: this.#opts.temperature,
347
+ maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
348
+ toolChoice: "auto"
349
+ });
350
+ }
351
+ get chatCtx() {
352
+ return this.#chatCtx;
353
+ }
354
+ get fncCtx() {
355
+ return this.#fncCtx;
356
+ }
357
+ set fncCtx(ctx) {
358
+ this.#fncCtx = ctx;
359
+ }
360
+ get conversation() {
361
+ return new Conversation(this);
362
+ }
363
+ get inputAudioBuffer() {
364
+ return new InputAudioBuffer(this);
365
+ }
366
+ get response() {
367
+ return new Response(this);
368
+ }
369
+ get expiration() {
370
+ if (!this.#expiresAt) {
371
+ throw new Error("session not started");
372
+ }
373
+ return this.#expiresAt * 1e3;
374
+ }
375
+ queueMsg(command) {
376
+ this.#sendQueue.put(command);
377
+ }
378
+ /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
379
+ /// with large amounts of base64 audio data.
380
+ #loggableEvent(event, maxLength = 30) {
381
+ const untypedEvent = {};
382
+ for (const [key, value] of Object.entries(event)) {
383
+ if (value !== void 0) {
384
+ untypedEvent[key] = value;
385
+ }
386
+ }
387
+ if (untypedEvent.audio && typeof untypedEvent.audio === "string") {
388
+ const truncatedData = untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? "\u2026" : "");
389
+ return { ...untypedEvent, audio: truncatedData };
390
+ }
391
+ if (untypedEvent.delta && typeof untypedEvent.delta === "string" && event.type === "response.audio.delta") {
392
+ const truncatedDelta = untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? "\u2026" : "");
393
+ return { ...untypedEvent, delta: truncatedDelta };
394
+ }
395
+ return untypedEvent;
396
+ }
397
+ sessionUpdate({
398
+ modalities = this.#opts.modalities,
399
+ instructions = this.#opts.instructions,
400
+ voice = this.#opts.voice,
401
+ inputAudioFormat = this.#opts.inputAudioFormat,
402
+ outputAudioFormat = this.#opts.outputAudioFormat,
403
+ inputAudioTranscription = this.#opts.inputAudioTranscription,
404
+ turnDetection = this.#opts.turnDetection,
405
+ temperature = this.#opts.temperature,
406
+ maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
407
+ toolChoice = "auto"
408
+ }) {
409
+ this.#opts = {
410
+ modalities,
411
+ instructions,
412
+ voice,
413
+ inputAudioFormat,
414
+ outputAudioFormat,
415
+ inputAudioTranscription,
416
+ turnDetection,
417
+ temperature,
418
+ maxResponseOutputTokens,
419
+ model: this.#opts.model,
420
+ apiKey: this.#opts.apiKey,
421
+ baseURL: this.#opts.baseURL,
422
+ isAzure: this.#opts.isAzure,
423
+ apiVersion: this.#opts.apiVersion,
424
+ entraToken: this.#opts.entraToken
425
+ };
426
+ const tools = this.#fncCtx ? Object.entries(this.#fncCtx).map(([name, func]) => ({
427
+ type: "function",
428
+ name,
429
+ description: func.description,
430
+ parameters: (
431
+ // don't format parameters if they are raw openai params
432
+ func.parameters.type == "object" ? func.parameters : llm.oaiParams(func.parameters)
433
+ )
434
+ })) : [];
435
+ const sessionUpdateEvent = {
436
+ type: "session.update",
437
+ session: {
438
+ modalities: this.#opts.modalities,
439
+ instructions: this.#opts.instructions,
440
+ voice: this.#opts.voice,
441
+ input_audio_format: this.#opts.inputAudioFormat,
442
+ output_audio_format: this.#opts.outputAudioFormat,
443
+ input_audio_transcription: this.#opts.inputAudioTranscription,
444
+ turn_detection: this.#opts.turnDetection,
445
+ temperature: this.#opts.temperature,
446
+ max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity ? "inf" : this.#opts.maxResponseOutputTokens,
447
+ tools,
448
+ tool_choice: toolChoice
449
+ }
450
+ };
451
+ if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
452
+ sessionUpdateEvent.session.max_response_output_tokens = void 0;
453
+ }
454
+ this.queueMsg(sessionUpdateEvent);
455
+ }
456
+ #start() {
457
+ return new Promise(async (resolve, reject) => {
458
+ const headers = {
459
+ "User-Agent": "LiveKit-Agents-JS"
460
+ };
461
+ if (this.#opts.isAzure) {
462
+ if (this.#opts.entraToken) {
463
+ headers.Authorization = `Bearer ${this.#opts.entraToken}`;
464
+ } else if (this.#opts.apiKey) {
465
+ headers["api-key"] = this.#opts.apiKey;
466
+ } else {
467
+ reject(new Error("Microsoft API key or entraToken is required"));
468
+ return;
329
469
  }
330
- return this.#expiresAt * 1000;
331
- }
332
- queueMsg(command) {
333
- this.#sendQueue.put(command);
334
- }
335
- /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
336
- /// with large amounts of base64 audio data.
337
- #loggableEvent(event, maxLength = 30) {
338
- const untypedEvent = {};
339
- for (const [key, value] of Object.entries(event)) {
340
- if (value !== undefined) {
341
- untypedEvent[key] = value;
342
- }
470
+ } else {
471
+ headers.Authorization = `Bearer ${this.#opts.apiKey}`;
472
+ headers["OpenAI-Beta"] = "realtime=v1";
473
+ }
474
+ const url = new URL([this.#opts.baseURL, "realtime"].join("/"));
475
+ if (url.protocol === "https:") {
476
+ url.protocol = "wss:";
477
+ }
478
+ const queryParams = {};
479
+ if (this.#opts.isAzure) {
480
+ queryParams["api-version"] = "2024-10-01-preview";
481
+ queryParams["deployment"] = this.#opts.model;
482
+ } else {
483
+ queryParams["model"] = this.#opts.model;
484
+ }
485
+ for (const [key, value] of Object.entries(queryParams)) {
486
+ url.searchParams.set(key, value);
487
+ }
488
+ console.debug("Connecting to OpenAI Realtime API at ", url.toString());
489
+ this.#ws = new WebSocket(url.toString(), {
490
+ headers
491
+ });
492
+ this.#ws.onerror = (error) => {
493
+ reject(new Error("OpenAI Realtime WebSocket error: " + error.message));
494
+ };
495
+ await once(this.#ws, "open");
496
+ this.#closing = false;
497
+ this.#ws.onmessage = (message) => {
498
+ const event = JSON.parse(message.data);
499
+ this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
500
+ switch (event.type) {
501
+ case "error":
502
+ this.#handleError(event);
503
+ break;
504
+ case "session.created":
505
+ this.#handleSessionCreated(event);
506
+ break;
507
+ case "session.updated":
508
+ this.#handleSessionUpdated(event);
509
+ break;
510
+ case "conversation.created":
511
+ this.#handleConversationCreated(event);
512
+ break;
513
+ case "input_audio_buffer.committed":
514
+ this.#handleInputAudioBufferCommitted(event);
515
+ break;
516
+ case "input_audio_buffer.cleared":
517
+ this.#handleInputAudioBufferCleared(event);
518
+ break;
519
+ case "input_audio_buffer.speech_started":
520
+ this.#handleInputAudioBufferSpeechStarted(event);
521
+ break;
522
+ case "input_audio_buffer.speech_stopped":
523
+ this.#handleInputAudioBufferSpeechStopped(event);
524
+ break;
525
+ case "conversation.item.created":
526
+ this.#handleConversationItemCreated(event);
527
+ break;
528
+ case "conversation.item.input_audio_transcription.completed":
529
+ this.#handleConversationItemInputAudioTranscriptionCompleted(event);
530
+ break;
531
+ case "conversation.item.input_audio_transcription.failed":
532
+ this.#handleConversationItemInputAudioTranscriptionFailed(event);
533
+ break;
534
+ case "conversation.item.truncated":
535
+ this.#handleConversationItemTruncated(event);
536
+ break;
537
+ case "conversation.item.deleted":
538
+ this.#handleConversationItemDeleted(event);
539
+ break;
540
+ case "response.created":
541
+ this.#handleResponseCreated(event);
542
+ break;
543
+ case "response.done":
544
+ this.#handleResponseDone(event);
545
+ break;
546
+ case "response.output_item.added":
547
+ this.#handleResponseOutputItemAdded(event);
548
+ break;
549
+ case "response.output_item.done":
550
+ this.#handleResponseOutputItemDone(event);
551
+ break;
552
+ case "response.content_part.added":
553
+ this.#handleResponseContentPartAdded(event);
554
+ break;
555
+ case "response.content_part.done":
556
+ this.#handleResponseContentPartDone(event);
557
+ break;
558
+ case "response.text.delta":
559
+ this.#handleResponseTextDelta(event);
560
+ break;
561
+ case "response.text.done":
562
+ this.#handleResponseTextDone(event);
563
+ break;
564
+ case "response.audio_transcript.delta":
565
+ this.#handleResponseAudioTranscriptDelta(event);
566
+ break;
567
+ case "response.audio_transcript.done":
568
+ this.#handleResponseAudioTranscriptDone(event);
569
+ break;
570
+ case "response.audio.delta":
571
+ this.#handleResponseAudioDelta(event);
572
+ break;
573
+ case "response.audio.done":
574
+ this.#handleResponseAudioDone(event);
575
+ break;
576
+ case "response.function_call_arguments.delta":
577
+ this.#handleResponseFunctionCallArgumentsDelta(event);
578
+ break;
579
+ case "response.function_call_arguments.done":
580
+ this.#handleResponseFunctionCallArgumentsDone(event);
581
+ break;
582
+ case "rate_limits.updated":
583
+ this.#handleRateLimitsUpdated(event);
584
+ break;
343
585
  }
344
- if (untypedEvent.audio && typeof untypedEvent.audio === 'string') {
345
- const truncatedData = untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? '…' : '');
346
- return { ...untypedEvent, audio: truncatedData };
347
- }
348
- if (untypedEvent.delta &&
349
- typeof untypedEvent.delta === 'string' &&
350
- event.type === 'response.audio.delta') {
351
- const truncatedDelta = untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? '…' : '');
352
- return { ...untypedEvent, delta: truncatedDelta };
353
- }
354
- return untypedEvent;
355
- }
356
- sessionUpdate({ modalities = this.#opts.modalities, instructions = this.#opts.instructions, voice = this.#opts.voice, inputAudioFormat = this.#opts.inputAudioFormat, outputAudioFormat = this.#opts.outputAudioFormat, inputAudioTranscription = this.#opts.inputAudioTranscription, turnDetection = this.#opts.turnDetection, temperature = this.#opts.temperature, maxResponseOutputTokens = this.#opts.maxResponseOutputTokens, toolChoice = 'auto', }) {
357
- this.#opts = {
358
- modalities,
359
- instructions,
360
- voice,
361
- inputAudioFormat,
362
- outputAudioFormat,
363
- inputAudioTranscription,
364
- turnDetection,
365
- temperature,
366
- maxResponseOutputTokens,
367
- model: this.#opts.model,
368
- apiKey: this.#opts.apiKey,
369
- baseURL: this.#opts.baseURL,
370
- isAzure: this.#opts.isAzure,
371
- apiVersion: this.#opts.apiVersion,
372
- entraToken: this.#opts.entraToken,
373
- };
374
- const tools = this.#fncCtx
375
- ? Object.entries(this.#fncCtx).map(([name, func]) => ({
376
- type: 'function',
377
- name,
378
- description: func.description,
379
- parameters:
380
- // don't format parameters if they are raw openai params
381
- func.parameters.type == 'object'
382
- ? func.parameters
383
- : llm.oaiParams(func.parameters),
384
- }))
385
- : [];
386
- const sessionUpdateEvent = {
387
- type: 'session.update',
388
- session: {
389
- modalities: this.#opts.modalities,
390
- instructions: this.#opts.instructions,
391
- voice: this.#opts.voice,
392
- input_audio_format: this.#opts.inputAudioFormat,
393
- output_audio_format: this.#opts.outputAudioFormat,
394
- input_audio_transcription: this.#opts.inputAudioTranscription,
395
- turn_detection: this.#opts.turnDetection,
396
- temperature: this.#opts.temperature,
397
- max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity
398
- ? 'inf'
399
- : this.#opts.maxResponseOutputTokens,
400
- tools,
401
- tool_choice: toolChoice,
402
- },
403
- };
404
- if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
405
- // microsoft doesn't support inf for max_response_output_tokens, but accepts no args
406
- sessionUpdateEvent.session.max_response_output_tokens = undefined;
407
- }
408
- this.queueMsg(sessionUpdateEvent);
409
- }
410
- #start() {
411
- return new Promise(async (resolve, reject) => {
412
- const headers = {
413
- 'User-Agent': 'LiveKit-Agents-JS',
414
- };
415
- if (this.#opts.isAzure) {
416
- // Microsoft API has two ways of authentication
417
- // 1. Entra token set as `Bearer` token
418
- // 2. API key set as `api_key` header (also accepts query string)
419
- if (this.#opts.entraToken) {
420
- headers.Authorization = `Bearer ${this.#opts.entraToken}`;
421
- }
422
- else if (this.#opts.apiKey) {
423
- headers['api-key'] = this.#opts.apiKey;
424
- }
425
- else {
426
- reject(new Error('Microsoft API key or entraToken is required'));
427
- return;
428
- }
429
- }
430
- else {
431
- headers.Authorization = `Bearer ${this.#opts.apiKey}`;
432
- headers['OpenAI-Beta'] = 'realtime=v1';
433
- }
434
- const url = new URL([this.#opts.baseURL, 'realtime'].join('/'));
435
- if (url.protocol === 'https:') {
436
- url.protocol = 'wss:';
437
- }
438
- // Construct query parameters
439
- const queryParams = {};
440
- if (this.#opts.isAzure) {
441
- queryParams['api-version'] = '2024-10-01-preview';
442
- queryParams['deployment'] = this.#opts.model;
443
- }
444
- else {
445
- queryParams['model'] = this.#opts.model;
586
+ };
587
+ const sendTask = async () => {
588
+ while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
589
+ try {
590
+ const event = await this.#sendQueue.get();
591
+ if (event.type !== "input_audio_buffer.append") {
592
+ this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
446
593
  }
447
- for (const [key, value] of Object.entries(queryParams)) {
448
- url.searchParams.set(key, value);
449
- }
450
- console.debug('Connecting to OpenAI Realtime API at ', url.toString());
451
- this.#ws = new WebSocket(url.toString(), {
452
- headers: headers,
453
- });
454
- this.#ws.onerror = (error) => {
455
- reject(new Error('OpenAI Realtime WebSocket error: ' + error.message));
456
- };
457
- await once(this.#ws, 'open');
458
- this.#closing = false;
459
- this.#ws.onmessage = (message) => {
460
- const event = JSON.parse(message.data);
461
- this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
462
- switch (event.type) {
463
- case 'error':
464
- this.#handleError(event);
465
- break;
466
- case 'session.created':
467
- this.#handleSessionCreated(event);
468
- break;
469
- case 'session.updated':
470
- this.#handleSessionUpdated(event);
471
- break;
472
- case 'conversation.created':
473
- this.#handleConversationCreated(event);
474
- break;
475
- case 'input_audio_buffer.committed':
476
- this.#handleInputAudioBufferCommitted(event);
477
- break;
478
- case 'input_audio_buffer.cleared':
479
- this.#handleInputAudioBufferCleared(event);
480
- break;
481
- case 'input_audio_buffer.speech_started':
482
- this.#handleInputAudioBufferSpeechStarted(event);
483
- break;
484
- case 'input_audio_buffer.speech_stopped':
485
- this.#handleInputAudioBufferSpeechStopped(event);
486
- break;
487
- case 'conversation.item.created':
488
- this.#handleConversationItemCreated(event);
489
- break;
490
- case 'conversation.item.input_audio_transcription.completed':
491
- this.#handleConversationItemInputAudioTranscriptionCompleted(event);
492
- break;
493
- case 'conversation.item.input_audio_transcription.failed':
494
- this.#handleConversationItemInputAudioTranscriptionFailed(event);
495
- break;
496
- case 'conversation.item.truncated':
497
- this.#handleConversationItemTruncated(event);
498
- break;
499
- case 'conversation.item.deleted':
500
- this.#handleConversationItemDeleted(event);
501
- break;
502
- case 'response.created':
503
- this.#handleResponseCreated(event);
504
- break;
505
- case 'response.done':
506
- this.#handleResponseDone(event);
507
- break;
508
- case 'response.output_item.added':
509
- this.#handleResponseOutputItemAdded(event);
510
- break;
511
- case 'response.output_item.done':
512
- this.#handleResponseOutputItemDone(event);
513
- break;
514
- case 'response.content_part.added':
515
- this.#handleResponseContentPartAdded(event);
516
- break;
517
- case 'response.content_part.done':
518
- this.#handleResponseContentPartDone(event);
519
- break;
520
- case 'response.text.delta':
521
- this.#handleResponseTextDelta(event);
522
- break;
523
- case 'response.text.done':
524
- this.#handleResponseTextDone(event);
525
- break;
526
- case 'response.audio_transcript.delta':
527
- this.#handleResponseAudioTranscriptDelta(event);
528
- break;
529
- case 'response.audio_transcript.done':
530
- this.#handleResponseAudioTranscriptDone(event);
531
- break;
532
- case 'response.audio.delta':
533
- this.#handleResponseAudioDelta(event);
534
- break;
535
- case 'response.audio.done':
536
- this.#handleResponseAudioDone(event);
537
- break;
538
- case 'response.function_call_arguments.delta':
539
- this.#handleResponseFunctionCallArgumentsDelta(event);
540
- break;
541
- case 'response.function_call_arguments.done':
542
- this.#handleResponseFunctionCallArgumentsDone(event);
543
- break;
544
- case 'rate_limits.updated':
545
- this.#handleRateLimitsUpdated(event);
546
- break;
547
- }
548
- };
549
- const sendTask = async () => {
550
- while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
551
- try {
552
- const event = await this.#sendQueue.get();
553
- if (event.type !== 'input_audio_buffer.append') {
554
- this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
555
- }
556
- this.#ws.send(JSON.stringify(event));
557
- }
558
- catch (error) {
559
- this.#logger.error('Error sending event:', error);
560
- }
561
- }
562
- };
563
- sendTask();
564
- this.#ws.onclose = () => {
565
- if (this.#expiresAt && Date.now() >= this.#expiresAt * 1000) {
566
- this.#closing = true;
567
- }
568
- if (!this.#closing) {
569
- reject(new Error('OpenAI Realtime connection closed unexpectedly'));
570
- }
571
- this.#ws = null;
572
- resolve();
573
- };
574
- });
575
- }
576
- async close() {
577
- if (!this.#ws)
578
- return;
579
- this.#closing = true;
580
- this.#ws.close();
581
- await this.#task;
582
- }
583
- #getContent(ptr) {
584
- const response = this.#pendingResponses[ptr.response_id];
585
- const output = response.output[ptr.output_index];
586
- const content = output.content[ptr.content_index];
587
- return content;
588
- }
589
- #handleError(event) {
590
- this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
591
- }
592
- #handleSessionCreated(event) {
593
- this.#sessionId = event.session.id;
594
- this.#expiresAt = event.session.expires_at;
595
- this.#logger = this.#logger.child({ sessionId: this.#sessionId });
596
- }
597
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
598
- #handleSessionUpdated(event) { }
599
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
600
- #handleConversationCreated(event) { }
601
- #handleInputAudioBufferCommitted(event) {
602
- this.emit('input_speech_committed', {
603
- itemId: event.item_id,
604
- });
605
- }
606
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
607
- #handleInputAudioBufferCleared(event) { }
608
- #handleInputAudioBufferSpeechStarted(
609
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
610
- event) {
611
- this.emit('input_speech_started', {
612
- itemId: event.item_id,
613
- });
614
- }
615
- #handleInputAudioBufferSpeechStopped(
616
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
617
- event) {
618
- this.emit('input_speech_stopped');
619
- }
620
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
621
- #handleConversationItemCreated(event) { }
622
- #handleConversationItemInputAudioTranscriptionCompleted(event) {
623
- const transcript = event.transcript;
624
- this.emit('input_speech_transcription_completed', {
625
- itemId: event.item_id,
626
- transcript: transcript,
627
- });
628
- }
629
- #handleConversationItemInputAudioTranscriptionFailed(event) {
630
- const error = event.error;
631
- this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
632
- this.emit('input_speech_transcription_failed', {
633
- itemId: event.item_id,
634
- message: error.message,
635
- });
636
- }
637
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
638
- #handleConversationItemTruncated(event) { }
639
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
640
- #handleConversationItemDeleted(event) { }
641
- #handleResponseCreated(responseCreated) {
642
- const response = responseCreated.response;
643
- const doneFut = new Future();
644
- const newResponse = {
645
- id: response.id,
646
- status: response.status,
647
- statusDetails: response.status_details,
648
- usage: null,
649
- output: [],
650
- doneFut: doneFut,
651
- };
652
- this.#pendingResponses[newResponse.id] = newResponse;
653
- this.emit('response_created', newResponse);
654
- }
655
- #handleResponseDone(event) {
656
- const responseData = event.response;
657
- const responseId = responseData.id;
658
- const response = this.#pendingResponses[responseId];
659
- response.status = responseData.status;
660
- response.statusDetails = responseData.status_details;
661
- response.usage = responseData.usage ?? null;
662
- this.#pendingResponses[responseId] = response;
663
- response.doneFut.resolve();
664
- this.emit('response_done', response);
665
- }
666
- #handleResponseOutputItemAdded(event) {
667
- const responseId = event.response_id;
668
- const response = this.#pendingResponses[responseId];
669
- const itemData = event.item;
670
- if (itemData.type !== 'message' && itemData.type !== 'function_call') {
671
- throw new Error(`Unexpected item type: ${itemData.type}`);
594
+ this.#ws.send(JSON.stringify(event));
595
+ } catch (error) {
596
+ this.#logger.error("Error sending event:", error);
597
+ }
672
598
  }
673
- let role;
674
- if (itemData.type === 'function_call') {
675
- role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
599
+ };
600
+ sendTask();
601
+ this.#ws.onclose = () => {
602
+ if (this.#expiresAt && Date.now() >= this.#expiresAt * 1e3) {
603
+ this.#closing = true;
676
604
  }
677
- else {
678
- role = itemData.role;
605
+ if (!this.#closing) {
606
+ reject(new Error("OpenAI Realtime connection closed unexpectedly"));
679
607
  }
680
- const newOutput = {
681
- responseId: responseId,
682
- itemId: itemData.id,
683
- outputIndex: event.output_index,
684
- type: itemData.type,
685
- role: role,
686
- content: [],
687
- doneFut: new Future(),
688
- };
689
- response?.output.push(newOutput);
690
- this.emit('response_output_added', newOutput);
691
- }
692
- #handleResponseOutputItemDone(event) {
693
- const responseId = event.response_id;
694
- const response = this.#pendingResponses[responseId];
695
- const outputIndex = event.output_index;
696
- const output = response.output[outputIndex];
697
- if (output?.type === 'function_call') {
698
- if (!this.#fncCtx) {
699
- this.#logger.error('function call received but no fncCtx is available');
700
- return;
701
- }
702
- // parse the arguments and call the function inside the fnc_ctx
703
- const item = event.item;
704
- if (item.type !== 'function_call') {
705
- throw new Error('Expected function_call item');
706
- }
707
- const func = this.#fncCtx[item.name];
708
- if (!func) {
709
- this.#logger.error(`no function with name ${item.name} in fncCtx`);
710
- return;
711
- }
712
- this.emit('function_call_started', {
713
- callId: item.call_id,
714
- });
715
- const parsedArgs = JSON.parse(item.arguments);
716
- this.#logger.debug(`[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`);
717
- func.execute(parsedArgs).then((content) => {
718
- this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
719
- this.emit('function_call_completed', {
720
- callId: item.call_id,
721
- });
722
- this.conversation.item.create(llm.ChatMessage.createToolFromFunctionResult({
723
- name: item.name,
724
- toolCallId: item.call_id,
725
- result: content,
726
- }), output.itemId);
727
- this.response.create();
728
- }, (error) => {
729
- this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
730
- // TODO: send it back up as failed?
731
- this.emit('function_call_failed', {
732
- callId: item.call_id,
733
- });
734
- });
608
+ this.#ws = null;
609
+ resolve();
610
+ };
611
+ });
612
+ }
613
+ async close() {
614
+ if (!this.#ws) return;
615
+ this.#closing = true;
616
+ this.#ws.close();
617
+ await this.#task;
618
+ }
619
+ #getContent(ptr) {
620
+ const response = this.#pendingResponses[ptr.response_id];
621
+ const output = response.output[ptr.output_index];
622
+ const content = output.content[ptr.content_index];
623
+ return content;
624
+ }
625
+ #handleError(event) {
626
+ this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
627
+ }
628
+ #handleSessionCreated(event) {
629
+ this.#sessionId = event.session.id;
630
+ this.#expiresAt = event.session.expires_at;
631
+ this.#logger = this.#logger.child({ sessionId: this.#sessionId });
632
+ }
633
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
634
+ #handleSessionUpdated(event) {
635
+ }
636
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
637
+ #handleConversationCreated(event) {
638
+ }
639
+ #handleInputAudioBufferCommitted(event) {
640
+ this.emit("input_speech_committed", {
641
+ itemId: event.item_id
642
+ });
643
+ }
644
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
645
+ #handleInputAudioBufferCleared(event) {
646
+ }
647
+ #handleInputAudioBufferSpeechStarted(event) {
648
+ this.emit("input_speech_started", {
649
+ itemId: event.item_id
650
+ });
651
+ }
652
+ #handleInputAudioBufferSpeechStopped(event) {
653
+ this.emit("input_speech_stopped");
654
+ }
655
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
656
+ #handleConversationItemCreated(event) {
657
+ }
658
+ #handleConversationItemInputAudioTranscriptionCompleted(event) {
659
+ const transcript = event.transcript;
660
+ this.emit("input_speech_transcription_completed", {
661
+ itemId: event.item_id,
662
+ transcript
663
+ });
664
+ }
665
+ #handleConversationItemInputAudioTranscriptionFailed(event) {
666
+ const error = event.error;
667
+ this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
668
+ this.emit("input_speech_transcription_failed", {
669
+ itemId: event.item_id,
670
+ message: error.message
671
+ });
672
+ }
673
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
674
+ #handleConversationItemTruncated(event) {
675
+ }
676
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
677
+ #handleConversationItemDeleted(event) {
678
+ }
679
+ #handleResponseCreated(responseCreated) {
680
+ const response = responseCreated.response;
681
+ const doneFut = new Future();
682
+ const newResponse = {
683
+ id: response.id,
684
+ status: response.status,
685
+ statusDetails: response.status_details,
686
+ usage: null,
687
+ output: [],
688
+ doneFut
689
+ };
690
+ this.#pendingResponses[newResponse.id] = newResponse;
691
+ this.emit("response_created", newResponse);
692
+ }
693
+ #handleResponseDone(event) {
694
+ const responseData = event.response;
695
+ const responseId = responseData.id;
696
+ const response = this.#pendingResponses[responseId];
697
+ response.status = responseData.status;
698
+ response.statusDetails = responseData.status_details;
699
+ response.usage = responseData.usage ?? null;
700
+ this.#pendingResponses[responseId] = response;
701
+ response.doneFut.resolve();
702
+ this.emit("response_done", response);
703
+ }
704
+ #handleResponseOutputItemAdded(event) {
705
+ const responseId = event.response_id;
706
+ const response = this.#pendingResponses[responseId];
707
+ const itemData = event.item;
708
+ if (itemData.type !== "message" && itemData.type !== "function_call") {
709
+ throw new Error(`Unexpected item type: ${itemData.type}`);
710
+ }
711
+ let role;
712
+ if (itemData.type === "function_call") {
713
+ role = "assistant";
714
+ } else {
715
+ role = itemData.role;
716
+ }
717
+ const newOutput = {
718
+ responseId,
719
+ itemId: itemData.id,
720
+ outputIndex: event.output_index,
721
+ type: itemData.type,
722
+ role,
723
+ content: [],
724
+ doneFut: new Future()
725
+ };
726
+ response == null ? void 0 : response.output.push(newOutput);
727
+ this.emit("response_output_added", newOutput);
728
+ }
729
+ #handleResponseOutputItemDone(event) {
730
+ const responseId = event.response_id;
731
+ const response = this.#pendingResponses[responseId];
732
+ const outputIndex = event.output_index;
733
+ const output = response.output[outputIndex];
734
+ if ((output == null ? void 0 : output.type) === "function_call") {
735
+ if (!this.#fncCtx) {
736
+ this.#logger.error("function call received but no fncCtx is available");
737
+ return;
738
+ }
739
+ const item = event.item;
740
+ if (item.type !== "function_call") {
741
+ throw new Error("Expected function_call item");
742
+ }
743
+ const func = this.#fncCtx[item.name];
744
+ if (!func) {
745
+ this.#logger.error(`no function with name ${item.name} in fncCtx`);
746
+ return;
747
+ }
748
+ this.emit("function_call_started", {
749
+ callId: item.call_id
750
+ });
751
+ const parsedArgs = JSON.parse(item.arguments);
752
+ this.#logger.debug(
753
+ `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`
754
+ );
755
+ func.execute(parsedArgs).then(
756
+ (content) => {
757
+ this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
758
+ this.emit("function_call_completed", {
759
+ callId: item.call_id
760
+ });
761
+ this.conversation.item.create(
762
+ llm.ChatMessage.createToolFromFunctionResult({
763
+ name: item.name,
764
+ toolCallId: item.call_id,
765
+ result: content
766
+ }),
767
+ output.itemId
768
+ );
769
+ this.response.create();
770
+ },
771
+ (error) => {
772
+ this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
773
+ this.emit("function_call_failed", {
774
+ callId: item.call_id
775
+ });
735
776
  }
736
- output?.doneFut.resolve();
737
- this.emit('response_output_done', output);
738
- }
739
- #handleResponseContentPartAdded(event) {
740
- const responseId = event.response_id;
741
- const response = this.#pendingResponses[responseId];
742
- const outputIndex = event.output_index;
743
- const output = response.output[outputIndex];
744
- const textStream = new AsyncIterableQueue();
745
- const audioStream = new AsyncIterableQueue();
746
- const newContent = {
747
- responseId: responseId,
748
- itemId: event.item_id,
749
- outputIndex: outputIndex,
750
- contentIndex: event.content_index,
751
- text: '',
752
- audio: [],
753
- textStream: textStream,
754
- audioStream: audioStream,
755
- toolCalls: [],
756
- };
757
- output?.content.push(newContent);
758
- this.emit('response_content_added', newContent);
759
- }
760
- #handleResponseContentPartDone(event) {
761
- const content = this.#getContent(event);
762
- this.emit('response_content_done', content);
763
- }
764
- #handleResponseTextDelta(event) {
765
- this.emit('response_text_delta', event);
766
- }
767
- #handleResponseTextDone(event) {
768
- this.emit('response_text_done', event);
769
- }
770
- #handleResponseAudioTranscriptDelta(event) {
771
- const content = this.#getContent(event);
772
- const transcript = event.delta;
773
- content.text += transcript;
774
- content.textStream.put(transcript);
775
- }
776
- #handleResponseAudioTranscriptDone(event) {
777
- const content = this.#getContent(event);
778
- content.textStream.close();
779
- }
780
- #handleResponseAudioDelta(event) {
781
- const content = this.#getContent(event);
782
- const data = Buffer.from(event.delta, 'base64');
783
- const audio = new AudioFrame(new Int16Array(data.buffer), api_proto.SAMPLE_RATE, api_proto.NUM_CHANNELS, data.length / 2);
784
- content.audio.push(audio);
785
- content.audioStream.put(audio);
786
- }
787
- #handleResponseAudioDone(event) {
788
- const content = this.#getContent(event);
789
- content.audioStream.close();
790
- }
791
- #handleResponseFunctionCallArgumentsDelta(
792
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
793
- event) { }
794
- #handleResponseFunctionCallArgumentsDone(
795
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
796
- event) { }
797
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
798
- #handleRateLimitsUpdated(event) { }
777
+ );
778
+ }
779
+ output == null ? void 0 : output.doneFut.resolve();
780
+ this.emit("response_output_done", output);
781
+ }
782
+ #handleResponseContentPartAdded(event) {
783
+ const responseId = event.response_id;
784
+ const response = this.#pendingResponses[responseId];
785
+ const outputIndex = event.output_index;
786
+ const output = response.output[outputIndex];
787
+ const textStream = new AsyncIterableQueue();
788
+ const audioStream = new AsyncIterableQueue();
789
+ const newContent = {
790
+ responseId,
791
+ itemId: event.item_id,
792
+ outputIndex,
793
+ contentIndex: event.content_index,
794
+ text: "",
795
+ audio: [],
796
+ textStream,
797
+ audioStream,
798
+ toolCalls: []
799
+ };
800
+ output == null ? void 0 : output.content.push(newContent);
801
+ this.emit("response_content_added", newContent);
802
+ }
803
+ #handleResponseContentPartDone(event) {
804
+ const content = this.#getContent(event);
805
+ this.emit("response_content_done", content);
806
+ }
807
+ #handleResponseTextDelta(event) {
808
+ this.emit("response_text_delta", event);
809
+ }
810
+ #handleResponseTextDone(event) {
811
+ this.emit("response_text_done", event);
812
+ }
813
+ #handleResponseAudioTranscriptDelta(event) {
814
+ const content = this.#getContent(event);
815
+ const transcript = event.delta;
816
+ content.text += transcript;
817
+ content.textStream.put(transcript);
818
+ }
819
+ #handleResponseAudioTranscriptDone(event) {
820
+ const content = this.#getContent(event);
821
+ content.textStream.close();
822
+ }
823
+ #handleResponseAudioDelta(event) {
824
+ const content = this.#getContent(event);
825
+ const data = Buffer.from(event.delta, "base64");
826
+ const audio = new AudioFrame(
827
+ new Int16Array(data.buffer),
828
+ api_proto.SAMPLE_RATE,
829
+ api_proto.NUM_CHANNELS,
830
+ data.length / 2
831
+ );
832
+ content.audio.push(audio);
833
+ content.audioStream.put(audio);
834
+ }
835
+ #handleResponseAudioDone(event) {
836
+ const content = this.#getContent(event);
837
+ content.audioStream.close();
838
+ }
839
+ #handleResponseFunctionCallArgumentsDelta(event) {
840
+ }
841
+ #handleResponseFunctionCallArgumentsDone(event) {
842
+ }
843
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
844
+ #handleRateLimitsUpdated(event) {
845
+ }
799
846
  }
847
+ export {
848
+ RealtimeModel,
849
+ RealtimeSession
850
+ };
800
851
  //# sourceMappingURL=realtime_model.js.map