@livekit/agents-plugin-openai 0.6.1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -0
- package/dist/index.cjs +55 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +13 -8
- package/dist/index.js.map +1 -1
- package/dist/llm.cjs +506 -0
- package/dist/llm.cjs.map +1 -0
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +438 -423
- package/dist/llm.js.map +1 -1
- package/dist/llm.test.cjs +8 -0
- package/dist/llm.test.cjs.map +1 -0
- package/dist/llm.test.d.ts +2 -0
- package/dist/llm.test.d.ts.map +1 -0
- package/dist/llm.test.js +7 -0
- package/dist/llm.test.js.map +1 -0
- package/dist/models.cjs +17 -0
- package/dist/models.cjs.map +1 -0
- package/dist/models.js +0 -4
- package/dist/models.js.map +1 -1
- package/dist/realtime/api_proto.cjs +41 -0
- package/dist/realtime/api_proto.cjs.map +1 -0
- package/dist/realtime/api_proto.js +12 -8
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/index.cjs +25 -0
- package/dist/realtime/index.cjs.map +1 -0
- package/dist/realtime/index.js +2 -5
- package/dist/realtime/index.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +878 -0
- package/dist/realtime/realtime_model.cjs.map +1 -0
- package/dist/realtime/realtime_model.js +828 -777
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/stt.cjs +130 -0
- package/dist/stt.cjs.map +1 -0
- package/dist/stt.js +99 -102
- package/dist/stt.js.map +1 -1
- package/dist/stt.test.cjs +9 -0
- package/dist/stt.test.cjs.map +1 -0
- package/dist/stt.test.d.ts +2 -0
- package/dist/stt.test.d.ts.map +1 -0
- package/dist/stt.test.js +8 -0
- package/dist/stt.test.js.map +1 -0
- package/dist/tts.cjs +100 -0
- package/dist/tts.cjs.map +1 -0
- package/dist/tts.d.ts +1 -1
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +67 -65
- package/dist/tts.js.map +1 -1
- package/dist/tts.test.cjs +9 -0
- package/dist/tts.test.cjs.map +1 -0
- package/dist/tts.test.d.ts +2 -0
- package/dist/tts.test.d.ts.map +1 -0
- package/dist/tts.test.js +8 -0
- package/dist/tts.test.js.map +1 -0
- package/package.json +20 -8
- package/src/llm.test.ts +10 -0
- package/src/llm.ts +7 -2
- package/src/stt.test.ts +11 -0
- package/src/tts.test.ts +11 -0
- package/src/tts.ts +2 -1
|
@@ -0,0 +1,878 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
// Public surface of this module: RealtimeModel and RealtimeSession.
var realtime_model_exports = {};
__export(realtime_model_exports, {
  RealtimeModel: () => RealtimeModel,
  RealtimeSession: () => RealtimeSession
});
module.exports = __toCommonJS(realtime_model_exports);
// Framework utilities (log, llm, multimodal, Queue, Future, mergeFrames).
var import_agents = require("@livekit/agents");
var import_rtc_node = require("@livekit/rtc-node");
// `once` is used to await the WebSocket "open" event in RealtimeSession#start.
var import_node_events = require("node:events");
var import_ws = require("ws");
// Protocol constants (BASE_URL, SAMPLE_RATE, frame sizes) and event typings.
var api_proto = __toESM(require("./api_proto.cjs"), 1);
|
|
40
|
+
/**
 * Thin wrapper over a RealtimeSession that emits the
 * `input_audio_buffer.*` client events (append / clear / commit).
 */
class InputAudioBuffer {
  #session;

  /** @param session - session whose outgoing queue receives the events */
  constructor(session) {
    this.#session = session;
  }

  /** Appends one audio frame, base64-encoding its raw PCM bytes. */
  append(frame) {
    const audio = Buffer.from(frame.data.buffer).toString("base64");
    this.#session.queueMsg({ type: "input_audio_buffer.append", audio });
  }

  /** Discards audio buffered on the server that has not been committed yet. */
  clear() {
    this.#session.queueMsg({ type: "input_audio_buffer.clear" });
  }

  /** Commits the buffered audio as user input. */
  commit() {
    this.#session.queueMsg({ type: "input_audio_buffer.commit" });
  }
}
|
|
62
|
+
/**
 * Wrapper over a RealtimeSession that emits the `conversation.item.*`
 * client events: truncate, delete, and create (from a ChatMessage).
 */
class ConversationItem {
  #session;
  #logger = (0, import_agents.log)();
  constructor(session) {
    this.#session = session;
  }
  // Truncates the audio of a previous assistant item at `audioEnd` milliseconds.
  truncate(itemId, contentIndex, audioEnd) {
    this.#session.queueMsg({
      type: "conversation.item.truncate",
      item_id: itemId,
      content_index: contentIndex,
      audio_end_ms: audioEnd
    });
  }
  // Removes an item from the server-side conversation history.
  delete(itemId) {
    this.#session.queueMsg({
      type: "conversation.item.delete",
      item_id: itemId
    });
  }
  /**
   * Converts a ChatMessage into a `conversation.item.create` event and queues it.
   * A message with `toolCallId` becomes a function_call_output item; otherwise
   * the role (USER/ASSISTANT/SYSTEM) selects the content encoding. Messages
   * with no content are ignored; unsupported roles are logged and dropped.
   */
  create(message, previousItemId) {
    if (!message.content) {
      return;
    }
    let event;
    if (message.toolCallId) {
      // Tool results must be plain strings for function_call_output.
      if (typeof message.content !== "string") {
        throw new TypeError("message.content must be a string");
      }
      event = {
        type: "conversation.item.create",
        previous_item_id: previousItemId,
        item: {
          type: "function_call_output",
          call_id: message.toolCallId,
          output: message.content
        }
      };
    } else {
      // Normalize single content values to an array.
      let content = message.content;
      if (!Array.isArray(content)) {
        content = [content];
      }
      if (message.role === import_agents.llm.ChatRole.USER) {
        const contents = [];
        for (const c of content) {
          if (typeof c === "string") {
            contents.push({
              type: "input_text",
              text: c
            });
          } else if (
            // typescript type guard for determining ChatAudio vs ChatImage
            ((c2) => {
              return c2.frame !== void 0;
            })(c)
          ) {
            contents.push({
              type: "input_audio",
              audio: Buffer.from((0, import_agents.mergeFrames)(c.frame).data.buffer).toString("base64")
            });
          }
          // NOTE(review): content that is neither a string nor ChatAudio
          // (e.g. ChatImage) is silently dropped here — confirm intended.
        }
        event = {
          type: "conversation.item.create",
          previous_item_id: previousItemId,
          item: {
            type: "message",
            role: "user",
            content: contents
          }
        };
      } else if (message.role === import_agents.llm.ChatRole.ASSISTANT) {
        const contents = [];
        for (const c of content) {
          if (typeof c === "string") {
            // Assistant text uses "text" (not "input_text").
            contents.push({
              type: "text",
              text: c
            });
          } else if (
            // typescript type guard for determining ChatAudio vs ChatImage
            ((c2) => {
              return c2.frame !== void 0;
            })(c)
          ) {
            this.#logger.warn("audio content in assistant message is not supported");
          }
        }
        event = {
          type: "conversation.item.create",
          previous_item_id: previousItemId,
          item: {
            type: "message",
            role: "assistant",
            content: contents
          }
        };
      } else if (message.role === import_agents.llm.ChatRole.SYSTEM) {
        const contents = [];
        for (const c of content) {
          if (typeof c === "string") {
            contents.push({
              type: "input_text",
              text: c
            });
          } else if (
            // typescript type guard for determining ChatAudio vs ChatImage
            ((c2) => {
              return c2.frame !== void 0;
            })(c)
          ) {
            this.#logger.warn("audio content in system message is not supported");
          }
        }
        event = {
          type: "conversation.item.create",
          previous_item_id: previousItemId,
          item: {
            type: "message",
            role: "system",
            content: contents
          }
        };
      } else {
        // Any other role cannot be represented in the realtime API.
        this.#logger.child({ message }).warn("chat message is not supported inside the realtime API");
        return;
      }
    }
    this.#session.queueMsg(event);
  }
}
|
|
194
|
+
/**
 * Namespace object mirroring the realtime API's `conversation` surface;
 * `session.conversation.item` yields a ConversationItem bound to the session.
 */
class Conversation {
  #session;
  constructor(session) {
    this.#session = session;
  }
  // A fresh ConversationItem wrapper on each access (stateless).
  get item() {
    return new ConversationItem(this.#session);
  }
}
|
|
203
|
+
/**
 * Thin wrapper over a RealtimeSession that emits the `response.*`
 * client events (create / cancel).
 */
class Response {
  #session;

  /** @param session - session whose outgoing queue receives the events */
  constructor(session) {
    this.#session = session;
  }

  /** Asks the server to start generating a response. */
  create() {
    this.#queue("response.create");
  }

  /** Cancels the in-progress response, if any. */
  cancel() {
    this.#queue("response.cancel");
  }

  // Queues a bare { type } client event on the session.
  #queue(type) {
    this.#session.queueMsg({ type });
  }
}
|
|
219
|
+
/**
 * Factory for OpenAI realtime sessions. Holds the default session options
 * (model, credentials, audio formats, turn detection, ...) and tracks every
 * RealtimeSession created through session() so close() can shut them down.
 */
class RealtimeModel extends import_agents.multimodal.RealtimeModel {
  // Fixed audio parameters of the realtime protocol, from api_proto.
  sampleRate = api_proto.SAMPLE_RATE;
  numChannels = api_proto.NUM_CHANNELS;
  inFrameSize = api_proto.IN_FRAME_SIZE;
  outFrameSize = api_proto.OUT_FRAME_SIZE;
  // Options captured at construction; session() may override per-session values.
  #defaultOpts;
  // All sessions created by session(); closed together in close().
  #sessions = [];
  /**
   * Creates a RealtimeModel targeting an Azure OpenAI deployment.
   * Authentication uses `entraToken` or `apiKey` (validated at connect time).
   */
  static withAzure({
    baseURL,
    azureDeployment,
    apiVersion = "2024-10-01-preview",
    apiKey = void 0,
    entraToken = void 0,
    instructions = "",
    modalities = ["text", "audio"],
    voice = "alloy",
    inputAudioFormat = "pcm16",
    outputAudioFormat = "pcm16",
    inputAudioTranscription = { model: "whisper-1" },
    turnDetection = { type: "server_vad" },
    temperature = 0.8,
    maxResponseOutputTokens = Infinity
  }) {
    return new RealtimeModel({
      isAzure: true,
      // NOTE(review): URL resolution drops baseURL's last path segment unless
      // it ends with "/" — confirm callers pass a trailing slash.
      baseURL: new URL("openai", baseURL).toString(),
      model: azureDeployment,
      apiVersion,
      apiKey,
      entraToken,
      instructions,
      modalities,
      voice,
      inputAudioFormat,
      outputAudioFormat,
      inputAudioTranscription,
      turnDetection,
      temperature,
      maxResponseOutputTokens
    });
  }
  /**
   * @throws {Error} when no API key is provided or found in OPENAI_API_KEY,
   *   unless this is an Azure configuration authenticated via entraToken
   *   (the Azure connect path in RealtimeSession accepts entraToken alone).
   */
  constructor({
    modalities = ["text", "audio"],
    instructions = "",
    voice = "alloy",
    inputAudioFormat = "pcm16",
    outputAudioFormat = "pcm16",
    inputAudioTranscription = { model: "whisper-1" },
    turnDetection = { type: "server_vad" },
    temperature = 0.8,
    maxResponseOutputTokens = Infinity,
    model = "gpt-4o-realtime-preview-2024-10-01",
    apiKey = process.env.OPENAI_API_KEY || "",
    baseURL = api_proto.BASE_URL,
    // used for microsoft
    isAzure = false,
    apiVersion = void 0,
    entraToken = void 0
  }) {
    super();
    // Fix: an Azure deployment authenticated with an Entra token needs no
    // OpenAI API key — the previous unconditional check rejected that setup.
    if (apiKey === "" && !(isAzure && entraToken)) {
      throw new Error(
        "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
      );
    }
    this.#defaultOpts = {
      modalities,
      instructions,
      voice,
      inputAudioFormat,
      outputAudioFormat,
      inputAudioTranscription,
      turnDetection,
      temperature,
      maxResponseOutputTokens,
      model,
      apiKey,
      baseURL,
      isAzure,
      apiVersion,
      entraToken
    };
  }
  /** All sessions created so far (live view, not a copy). */
  get sessions() {
    return this.#sessions;
  }
  /**
   * Opens a new RealtimeSession. Session-level options default to the values
   * given at construction; connection-level options (model, credentials,
   * endpoint) always come from the constructor.
   */
  session({
    fncCtx,
    chatCtx,
    modalities = this.#defaultOpts.modalities,
    instructions = this.#defaultOpts.instructions,
    voice = this.#defaultOpts.voice,
    inputAudioFormat = this.#defaultOpts.inputAudioFormat,
    outputAudioFormat = this.#defaultOpts.outputAudioFormat,
    inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
    turnDetection = this.#defaultOpts.turnDetection,
    temperature = this.#defaultOpts.temperature,
    maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens
  }) {
    const opts = {
      modalities,
      instructions,
      voice,
      inputAudioFormat,
      outputAudioFormat,
      inputAudioTranscription,
      turnDetection,
      temperature,
      maxResponseOutputTokens,
      model: this.#defaultOpts.model,
      apiKey: this.#defaultOpts.apiKey,
      baseURL: this.#defaultOpts.baseURL,
      isAzure: this.#defaultOpts.isAzure,
      apiVersion: this.#defaultOpts.apiVersion,
      entraToken: this.#defaultOpts.entraToken
    };
    const newSession = new RealtimeSession(opts, {
      chatCtx: chatCtx || new import_agents.llm.ChatContext(),
      fncCtx
    });
    this.#sessions.push(newSession);
    return newSession;
  }
  /** Closes every session; failures in one close do not block the others. */
  async close() {
    await Promise.allSettled(this.#sessions.map((session) => session.close()));
  }
}
|
|
346
|
+
class RealtimeSession extends import_agents.multimodal.RealtimeSession {
|
|
347
|
+
// Chat context this session was opened with.
#chatCtx = void 0;
// Function context whose entries are advertised as tools (may be replaced via the setter).
#fncCtx = void 0;
// Merged connection + session options; updated by sessionUpdate().
#opts;
// Responses created by the server, keyed by response id, until response.done.
#pendingResponses = {};
// Server-assigned id, set on session.created.
#sessionId = "not-connected";
// WebSocket to the realtime endpoint; null until/after the connection.
#ws = null;
// Session expiry (epoch seconds) from session.created; null before connect.
#expiresAt = null;
#logger = (0, import_agents.log)();
// Promise returned by #start(); resolves when the socket closes.
#task;
// Starts true so a close before the socket opens is not treated as unexpected.
#closing = true;
// Outgoing client events, drained by the send loop in #start().
#sendQueue = new import_agents.Queue();
constructor(opts, { fncCtx, chatCtx }) {
  super();
  this.#opts = opts;
  this.#chatCtx = chatCtx;
  this.#fncCtx = fncCtx;
  // Kick off the WebSocket connection, then queue the initial session.update
  // reflecting the configured options.
  this.#task = this.#start();
  this.sessionUpdate({
    modalities: this.#opts.modalities,
    instructions: this.#opts.instructions,
    voice: this.#opts.voice,
    inputAudioFormat: this.#opts.inputAudioFormat,
    outputAudioFormat: this.#opts.outputAudioFormat,
    inputAudioTranscription: this.#opts.inputAudioTranscription,
    turnDetection: this.#opts.turnDetection,
    temperature: this.#opts.temperature,
    maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
    toolChoice: "auto"
  });
}
|
|
377
|
+
/** Chat context supplied when the session was created. */
get chatCtx() {
  return this.#chatCtx;
}
/** Current function context (tools); set to update which functions are callable. */
get fncCtx() {
  return this.#fncCtx;
}
set fncCtx(ctx) {
  this.#fncCtx = ctx;
}
/** `conversation.*` client-event surface (fresh stateless wrapper per access). */
get conversation() {
  return new Conversation(this);
}
/** `input_audio_buffer.*` client-event surface. */
get inputAudioBuffer() {
  return new InputAudioBuffer(this);
}
/** `response.*` client-event surface. */
get response() {
  return new Response(this);
}
/**
 * Session expiry in epoch milliseconds.
 * @throws {Error} if called before session.created has been received.
 */
get expiration() {
  if (!this.#expiresAt) {
    throw new Error("session not started");
  }
  // Server reports seconds; convert to milliseconds.
  return this.#expiresAt * 1e3;
}
/** Enqueues a client event for the send loop to transmit. */
queueMsg(command) {
  this.#sendQueue.put(command);
}
|
|
404
|
+
/// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
|
|
405
|
+
/// with large amounts of base64 audio data.
|
|
406
|
+
#loggableEvent(event, maxLength = 30) {
|
|
407
|
+
const untypedEvent = {};
|
|
408
|
+
for (const [key, value] of Object.entries(event)) {
|
|
409
|
+
if (value !== void 0) {
|
|
410
|
+
untypedEvent[key] = value;
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
if (untypedEvent.audio && typeof untypedEvent.audio === "string") {
|
|
414
|
+
const truncatedData = untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? "\u2026" : "");
|
|
415
|
+
return { ...untypedEvent, audio: truncatedData };
|
|
416
|
+
}
|
|
417
|
+
if (untypedEvent.delta && typeof untypedEvent.delta === "string" && event.type === "response.audio.delta") {
|
|
418
|
+
const truncatedDelta = untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? "\u2026" : "");
|
|
419
|
+
return { ...untypedEvent, delta: truncatedDelta };
|
|
420
|
+
}
|
|
421
|
+
return untypedEvent;
|
|
422
|
+
}
|
|
423
|
+
/**
 * Merges the given overrides into the stored session options and queues a
 * `session.update` event reflecting them, including the tool list derived
 * from the current fncCtx. Connection-level settings (model, credentials,
 * endpoint) are carried over unchanged.
 */
sessionUpdate({
  modalities = this.#opts.modalities,
  instructions = this.#opts.instructions,
  voice = this.#opts.voice,
  inputAudioFormat = this.#opts.inputAudioFormat,
  outputAudioFormat = this.#opts.outputAudioFormat,
  inputAudioTranscription = this.#opts.inputAudioTranscription,
  turnDetection = this.#opts.turnDetection,
  temperature = this.#opts.temperature,
  maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
  toolChoice = "auto"
}) {
  this.#opts = {
    modalities,
    instructions,
    voice,
    inputAudioFormat,
    outputAudioFormat,
    inputAudioTranscription,
    turnDetection,
    temperature,
    maxResponseOutputTokens,
    model: this.#opts.model,
    apiKey: this.#opts.apiKey,
    baseURL: this.#opts.baseURL,
    isAzure: this.#opts.isAzure,
    apiVersion: this.#opts.apiVersion,
    entraToken: this.#opts.entraToken
  };
  // Advertise each function in fncCtx as a realtime "function" tool.
  const tools = this.#fncCtx ? Object.entries(this.#fncCtx).map(([name, func]) => ({
    type: "function",
    name,
    description: func.description,
    parameters: (
      // don't format parameters if they are raw openai params
      func.parameters.type == "object" ? func.parameters : import_agents.llm.oaiParams(func.parameters)
    )
  })) : [];
  const sessionUpdateEvent = {
    type: "session.update",
    session: {
      modalities: this.#opts.modalities,
      instructions: this.#opts.instructions,
      voice: this.#opts.voice,
      input_audio_format: this.#opts.inputAudioFormat,
      output_audio_format: this.#opts.outputAudioFormat,
      input_audio_transcription: this.#opts.inputAudioTranscription,
      turn_detection: this.#opts.turnDetection,
      temperature: this.#opts.temperature,
      // The wire protocol spells "no limit" as the string "inf".
      max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity ? "inf" : this.#opts.maxResponseOutputTokens,
      tools,
      tool_choice: toolChoice
    }
  };
  // NOTE(review): for Azure the field is omitted entirely instead of sending
  // "inf" — presumably the Azure API rejects that sentinel; confirm.
  if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
    sessionUpdateEvent.session.max_response_output_tokens = void 0;
  }
  this.queueMsg(sessionUpdateEvent);
}
|
|
482
|
+
#start() {
|
|
483
|
+
return new Promise(async (resolve, reject) => {
|
|
484
|
+
const headers = {
|
|
485
|
+
"User-Agent": "LiveKit-Agents-JS"
|
|
486
|
+
};
|
|
487
|
+
if (this.#opts.isAzure) {
|
|
488
|
+
if (this.#opts.entraToken) {
|
|
489
|
+
headers.Authorization = `Bearer ${this.#opts.entraToken}`;
|
|
490
|
+
} else if (this.#opts.apiKey) {
|
|
491
|
+
headers["api-key"] = this.#opts.apiKey;
|
|
492
|
+
} else {
|
|
493
|
+
reject(new Error("Microsoft API key or entraToken is required"));
|
|
494
|
+
return;
|
|
495
|
+
}
|
|
496
|
+
} else {
|
|
497
|
+
headers.Authorization = `Bearer ${this.#opts.apiKey}`;
|
|
498
|
+
headers["OpenAI-Beta"] = "realtime=v1";
|
|
499
|
+
}
|
|
500
|
+
const url = new URL([this.#opts.baseURL, "realtime"].join("/"));
|
|
501
|
+
if (url.protocol === "https:") {
|
|
502
|
+
url.protocol = "wss:";
|
|
503
|
+
}
|
|
504
|
+
const queryParams = {};
|
|
505
|
+
if (this.#opts.isAzure) {
|
|
506
|
+
queryParams["api-version"] = "2024-10-01-preview";
|
|
507
|
+
queryParams["deployment"] = this.#opts.model;
|
|
508
|
+
} else {
|
|
509
|
+
queryParams["model"] = this.#opts.model;
|
|
510
|
+
}
|
|
511
|
+
for (const [key, value] of Object.entries(queryParams)) {
|
|
512
|
+
url.searchParams.set(key, value);
|
|
513
|
+
}
|
|
514
|
+
console.debug("Connecting to OpenAI Realtime API at ", url.toString());
|
|
515
|
+
this.#ws = new import_ws.WebSocket(url.toString(), {
|
|
516
|
+
headers
|
|
517
|
+
});
|
|
518
|
+
this.#ws.onerror = (error) => {
|
|
519
|
+
reject(new Error("OpenAI Realtime WebSocket error: " + error.message));
|
|
520
|
+
};
|
|
521
|
+
await (0, import_node_events.once)(this.#ws, "open");
|
|
522
|
+
this.#closing = false;
|
|
523
|
+
this.#ws.onmessage = (message) => {
|
|
524
|
+
const event = JSON.parse(message.data);
|
|
525
|
+
this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
526
|
+
switch (event.type) {
|
|
527
|
+
case "error":
|
|
528
|
+
this.#handleError(event);
|
|
529
|
+
break;
|
|
530
|
+
case "session.created":
|
|
531
|
+
this.#handleSessionCreated(event);
|
|
532
|
+
break;
|
|
533
|
+
case "session.updated":
|
|
534
|
+
this.#handleSessionUpdated(event);
|
|
535
|
+
break;
|
|
536
|
+
case "conversation.created":
|
|
537
|
+
this.#handleConversationCreated(event);
|
|
538
|
+
break;
|
|
539
|
+
case "input_audio_buffer.committed":
|
|
540
|
+
this.#handleInputAudioBufferCommitted(event);
|
|
541
|
+
break;
|
|
542
|
+
case "input_audio_buffer.cleared":
|
|
543
|
+
this.#handleInputAudioBufferCleared(event);
|
|
544
|
+
break;
|
|
545
|
+
case "input_audio_buffer.speech_started":
|
|
546
|
+
this.#handleInputAudioBufferSpeechStarted(event);
|
|
547
|
+
break;
|
|
548
|
+
case "input_audio_buffer.speech_stopped":
|
|
549
|
+
this.#handleInputAudioBufferSpeechStopped(event);
|
|
550
|
+
break;
|
|
551
|
+
case "conversation.item.created":
|
|
552
|
+
this.#handleConversationItemCreated(event);
|
|
553
|
+
break;
|
|
554
|
+
case "conversation.item.input_audio_transcription.completed":
|
|
555
|
+
this.#handleConversationItemInputAudioTranscriptionCompleted(event);
|
|
556
|
+
break;
|
|
557
|
+
case "conversation.item.input_audio_transcription.failed":
|
|
558
|
+
this.#handleConversationItemInputAudioTranscriptionFailed(event);
|
|
559
|
+
break;
|
|
560
|
+
case "conversation.item.truncated":
|
|
561
|
+
this.#handleConversationItemTruncated(event);
|
|
562
|
+
break;
|
|
563
|
+
case "conversation.item.deleted":
|
|
564
|
+
this.#handleConversationItemDeleted(event);
|
|
565
|
+
break;
|
|
566
|
+
case "response.created":
|
|
567
|
+
this.#handleResponseCreated(event);
|
|
568
|
+
break;
|
|
569
|
+
case "response.done":
|
|
570
|
+
this.#handleResponseDone(event);
|
|
571
|
+
break;
|
|
572
|
+
case "response.output_item.added":
|
|
573
|
+
this.#handleResponseOutputItemAdded(event);
|
|
574
|
+
break;
|
|
575
|
+
case "response.output_item.done":
|
|
576
|
+
this.#handleResponseOutputItemDone(event);
|
|
577
|
+
break;
|
|
578
|
+
case "response.content_part.added":
|
|
579
|
+
this.#handleResponseContentPartAdded(event);
|
|
580
|
+
break;
|
|
581
|
+
case "response.content_part.done":
|
|
582
|
+
this.#handleResponseContentPartDone(event);
|
|
583
|
+
break;
|
|
584
|
+
case "response.text.delta":
|
|
585
|
+
this.#handleResponseTextDelta(event);
|
|
586
|
+
break;
|
|
587
|
+
case "response.text.done":
|
|
588
|
+
this.#handleResponseTextDone(event);
|
|
589
|
+
break;
|
|
590
|
+
case "response.audio_transcript.delta":
|
|
591
|
+
this.#handleResponseAudioTranscriptDelta(event);
|
|
592
|
+
break;
|
|
593
|
+
case "response.audio_transcript.done":
|
|
594
|
+
this.#handleResponseAudioTranscriptDone(event);
|
|
595
|
+
break;
|
|
596
|
+
case "response.audio.delta":
|
|
597
|
+
this.#handleResponseAudioDelta(event);
|
|
598
|
+
break;
|
|
599
|
+
case "response.audio.done":
|
|
600
|
+
this.#handleResponseAudioDone(event);
|
|
601
|
+
break;
|
|
602
|
+
case "response.function_call_arguments.delta":
|
|
603
|
+
this.#handleResponseFunctionCallArgumentsDelta(event);
|
|
604
|
+
break;
|
|
605
|
+
case "response.function_call_arguments.done":
|
|
606
|
+
this.#handleResponseFunctionCallArgumentsDone(event);
|
|
607
|
+
break;
|
|
608
|
+
case "rate_limits.updated":
|
|
609
|
+
this.#handleRateLimitsUpdated(event);
|
|
610
|
+
break;
|
|
611
|
+
}
|
|
612
|
+
};
|
|
613
|
+
const sendTask = async () => {
|
|
614
|
+
while (this.#ws && !this.#closing && this.#ws.readyState === import_ws.WebSocket.OPEN) {
|
|
615
|
+
try {
|
|
616
|
+
const event = await this.#sendQueue.get();
|
|
617
|
+
if (event.type !== "input_audio_buffer.append") {
|
|
618
|
+
this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
619
|
+
}
|
|
620
|
+
this.#ws.send(JSON.stringify(event));
|
|
621
|
+
} catch (error) {
|
|
622
|
+
this.#logger.error("Error sending event:", error);
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
};
|
|
626
|
+
sendTask();
|
|
627
|
+
this.#ws.onclose = () => {
|
|
628
|
+
if (this.#expiresAt && Date.now() >= this.#expiresAt * 1e3) {
|
|
629
|
+
this.#closing = true;
|
|
630
|
+
}
|
|
631
|
+
if (!this.#closing) {
|
|
632
|
+
reject(new Error("OpenAI Realtime connection closed unexpectedly"));
|
|
633
|
+
}
|
|
634
|
+
this.#ws = null;
|
|
635
|
+
resolve();
|
|
636
|
+
};
|
|
637
|
+
});
|
|
638
|
+
}
|
|
639
|
+
/**
 * Closes the WebSocket (marking the shutdown as intentional so onclose does
 * not reject) and waits for the connection task to finish. No-op when the
 * socket is already gone.
 */
async close() {
  if (!this.#ws) return;
  this.#closing = true;
  this.#ws.close();
  await this.#task;
}
|
|
645
|
+
#getContent(ptr) {
|
|
646
|
+
const response = this.#pendingResponses[ptr.response_id];
|
|
647
|
+
const output = response.output[ptr.output_index];
|
|
648
|
+
const content = output.content[ptr.content_index];
|
|
649
|
+
return content;
|
|
650
|
+
}
|
|
651
|
+
// Logs server-reported errors; no recovery is attempted here.
#handleError(event) {
  this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
}
// Records the server-assigned session id and expiry, and tags the logger with the id.
#handleSessionCreated(event) {
  this.#sessionId = event.session.id;
  this.#expiresAt = event.session.expires_at;
  this.#logger = this.#logger.child({ sessionId: this.#sessionId });
}
// Intentionally a no-op: the session state is driven by our own session.update calls.
// eslint-disable-next-line @typescript-eslint/no-unused-vars
#handleSessionUpdated(event) {
}
// Intentionally a no-op.
// eslint-disable-next-line @typescript-eslint/no-unused-vars
#handleConversationCreated(event) {
}
#handleInputAudioBufferCommitted(event) {
  this.emit("input_speech_committed", {
    itemId: event.item_id
  });
}
// Intentionally a no-op.
// eslint-disable-next-line @typescript-eslint/no-unused-vars
#handleInputAudioBufferCleared(event) {
}
#handleInputAudioBufferSpeechStarted(event) {
  this.emit("input_speech_started", {
    itemId: event.item_id
  });
}
// NOTE(review): emitted without a payload, unlike input_speech_started —
// the event's item_id is discarded here; confirm intended.
#handleInputAudioBufferSpeechStopped(event) {
  this.emit("input_speech_stopped");
}
// Intentionally a no-op.
// eslint-disable-next-line @typescript-eslint/no-unused-vars
#handleConversationItemCreated(event) {
}
// Forwards the server-side transcription of committed user audio.
#handleConversationItemInputAudioTranscriptionCompleted(event) {
  const transcript = event.transcript;
  this.emit("input_speech_transcription_completed", {
    itemId: event.item_id,
    transcript
  });
}
// Logs and forwards transcription failures.
#handleConversationItemInputAudioTranscriptionFailed(event) {
  const error = event.error;
  this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
  this.emit("input_speech_transcription_failed", {
    itemId: event.item_id,
    message: error.message
  });
}
// Intentionally a no-op.
// eslint-disable-next-line @typescript-eslint/no-unused-vars
#handleConversationItemTruncated(event) {
}
// Intentionally a no-op.
// eslint-disable-next-line @typescript-eslint/no-unused-vars
#handleConversationItemDeleted(event) {
}
|
|
705
|
+
/**
 * Registers a new pending response keyed by its server id. The doneFut is
 * resolved by #handleResponseDone; output items are appended by
 * #handleResponseOutputItemAdded.
 */
#handleResponseCreated(responseCreated) {
  const response = responseCreated.response;
  const doneFut = new import_agents.Future();
  const newResponse = {
    id: response.id,
    status: response.status,
    statusDetails: response.status_details,
    usage: null,
    output: [],
    doneFut
  };
  this.#pendingResponses[newResponse.id] = newResponse;
  this.emit("response_created", newResponse);
}
|
|
719
|
+
#handleResponseDone(event) {
|
|
720
|
+
const responseData = event.response;
|
|
721
|
+
const responseId = responseData.id;
|
|
722
|
+
const response = this.#pendingResponses[responseId];
|
|
723
|
+
response.status = responseData.status;
|
|
724
|
+
response.statusDetails = responseData.status_details;
|
|
725
|
+
response.usage = responseData.usage ?? null;
|
|
726
|
+
this.#pendingResponses[responseId] = response;
|
|
727
|
+
response.doneFut.resolve();
|
|
728
|
+
this.emit("response_done", response);
|
|
729
|
+
}
|
|
730
|
+
/**
 * Appends a new output item (message or function_call) to its pending
 * response and emits "response_output_added".
 * @throws {Error} on an item type other than "message" or "function_call".
 */
#handleResponseOutputItemAdded(event) {
  const responseId = event.response_id;
  const response = this.#pendingResponses[responseId];
  const itemData = event.item;
  if (itemData.type !== "message" && itemData.type !== "function_call") {
    throw new Error(`Unexpected item type: ${itemData.type}`);
  }
  let role;
  // function_call items carry no role on the wire; they are assistant-originated.
  if (itemData.type === "function_call") {
    role = "assistant";
  } else {
    role = itemData.role;
  }
  const newOutput = {
    responseId,
    itemId: itemData.id,
    outputIndex: event.output_index,
    type: itemData.type,
    role,
    content: [],
    doneFut: new import_agents.Future()
  };
  // Guarded: the pending response may be missing if it was never registered.
  response == null ? void 0 : response.output.push(newOutput);
  this.emit("response_output_added", newOutput);
}
|
|
755
|
+
/**
 * Completes an output item. For function_call items, looks up the function in
 * fncCtx, executes it asynchronously, queues its result as a
 * function_call_output conversation item, and requests a follow-up response.
 * Always resolves the output's doneFut and emits "response_output_done".
 */
#handleResponseOutputItemDone(event) {
  const responseId = event.response_id;
  const response = this.#pendingResponses[responseId];
  const outputIndex = event.output_index;
  const output = response.output[outputIndex];
  if ((output == null ? void 0 : output.type) === "function_call") {
    if (!this.#fncCtx) {
      this.#logger.error("function call received but no fncCtx is available");
      return;
    }
    const item = event.item;
    if (item.type !== "function_call") {
      throw new Error("Expected function_call item");
    }
    const func = this.#fncCtx[item.name];
    if (!func) {
      this.#logger.error(`no function with name ${item.name} in fncCtx`);
      return;
    }
    this.emit("function_call_started", {
      callId: item.call_id
    });
    // NOTE(review): JSON.parse can throw on malformed arguments, and
    // `${parsedArgs}` renders as "[object Object]" in this log — confirm.
    const parsedArgs = JSON.parse(item.arguments);
    this.#logger.debug(
      `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`
    );
    // Deliberately not awaited: the result is fed back via the conversation.
    func.execute(parsedArgs).then(
      (content) => {
        this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
        this.emit("function_call_completed", {
          callId: item.call_id
        });
        // Queue the tool result right after the function_call item, then
        // ask the server to continue with a new response.
        this.conversation.item.create(
          import_agents.llm.ChatMessage.createToolFromFunctionResult({
            name: item.name,
            toolCallId: item.call_id,
            result: content
          }),
          output.itemId
        );
        this.response.create();
      },
      (error) => {
        // NOTE(review): on failure no function_call_output is queued, so the
        // server never receives a tool result for this call — confirm intended.
        this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
        this.emit("function_call_failed", {
          callId: item.call_id
        });
      }
    );
  }
  output == null ? void 0 : output.doneFut.resolve();
  this.emit("response_output_done", output);
}
|
|
808
|
+
#handleResponseContentPartAdded(event) {
|
|
809
|
+
const responseId = event.response_id;
|
|
810
|
+
const response = this.#pendingResponses[responseId];
|
|
811
|
+
const outputIndex = event.output_index;
|
|
812
|
+
const output = response.output[outputIndex];
|
|
813
|
+
const textStream = new import_agents.AsyncIterableQueue();
|
|
814
|
+
const audioStream = new import_agents.AsyncIterableQueue();
|
|
815
|
+
const newContent = {
|
|
816
|
+
responseId,
|
|
817
|
+
itemId: event.item_id,
|
|
818
|
+
outputIndex,
|
|
819
|
+
contentIndex: event.content_index,
|
|
820
|
+
text: "",
|
|
821
|
+
audio: [],
|
|
822
|
+
textStream,
|
|
823
|
+
audioStream,
|
|
824
|
+
toolCalls: []
|
|
825
|
+
};
|
|
826
|
+
output == null ? void 0 : output.content.push(newContent);
|
|
827
|
+
this.emit("response_content_added", newContent);
|
|
828
|
+
}
|
|
829
|
+
#handleResponseContentPartDone(event) {
|
|
830
|
+
const content = this.#getContent(event);
|
|
831
|
+
this.emit("response_content_done", content);
|
|
832
|
+
}
|
|
833
|
+
// Incremental text delta from the server; forwarded verbatim to listeners.
#handleResponseTextDelta(event) {
  this.emit("response_text_delta", event);
}
|
|
836
|
+
// Text generation for a content part completed; forwarded verbatim.
#handleResponseTextDone(event) {
  this.emit("response_text_done", event);
}
|
|
839
|
+
#handleResponseAudioTranscriptDelta(event) {
|
|
840
|
+
const content = this.#getContent(event);
|
|
841
|
+
const transcript = event.delta;
|
|
842
|
+
content.text += transcript;
|
|
843
|
+
content.textStream.put(transcript);
|
|
844
|
+
}
|
|
845
|
+
#handleResponseAudioTranscriptDone(event) {
|
|
846
|
+
const content = this.#getContent(event);
|
|
847
|
+
content.textStream.close();
|
|
848
|
+
}
|
|
849
|
+
#handleResponseAudioDelta(event) {
|
|
850
|
+
const content = this.#getContent(event);
|
|
851
|
+
const data = Buffer.from(event.delta, "base64");
|
|
852
|
+
const audio = new import_rtc_node.AudioFrame(
|
|
853
|
+
new Int16Array(data.buffer),
|
|
854
|
+
api_proto.SAMPLE_RATE,
|
|
855
|
+
api_proto.NUM_CHANNELS,
|
|
856
|
+
data.length / 2
|
|
857
|
+
);
|
|
858
|
+
content.audio.push(audio);
|
|
859
|
+
content.audioStream.put(audio);
|
|
860
|
+
}
|
|
861
|
+
#handleResponseAudioDone(event) {
|
|
862
|
+
const content = this.#getContent(event);
|
|
863
|
+
content.audioStream.close();
|
|
864
|
+
}
|
|
865
|
+
// Intentionally a no-op: the complete arguments string is read from the
// finished item in #handleResponseOutputItemDone, not assembled from deltas.
#handleResponseFunctionCallArgumentsDelta(event) {
}
|
|
867
|
+
// Intentionally a no-op: function dispatch happens when the whole output
// item completes (#handleResponseOutputItemDone), so this event is ignored.
#handleResponseFunctionCallArgumentsDone(event) {
}
|
|
869
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
#handleRateLimitsUpdated(event) {
  // No-op: rate-limit updates from the server are currently ignored.
}
|
|
872
|
+
}
|
|
873
|
+
// Annotate the CommonJS export names for ESM import in node:
// (The `0 &&` prefix keeps this assignment dead at runtime while letting
// static analyzers detect the named exports — presumably for Node's
// cjs-module-lexer; emitted by the bundler, do not edit by hand.)
0 && (module.exports = {
  RealtimeModel,
  RealtimeSession
});
//# sourceMappingURL=realtime_model.cjs.map
|