@livekit/agents-plugin-openai 0.9.2 → 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +16 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -4
- package/dist/index.d.ts +4 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -3
- package/dist/index.js.map +1 -1
- package/dist/llm.cjs +156 -188
- package/dist/llm.cjs.map +1 -1
- package/dist/llm.d.cts +27 -8
- package/dist/llm.d.ts +27 -8
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +164 -179
- package/dist/llm.js.map +1 -1
- package/dist/models.cjs +14 -0
- package/dist/models.cjs.map +1 -1
- package/dist/models.d.cts +11 -6
- package/dist/models.d.ts +11 -6
- package/dist/models.d.ts.map +1 -1
- package/dist/models.js +6 -0
- package/dist/models.js.map +1 -1
- package/dist/realtime/api_proto.cjs.map +1 -1
- package/dist/realtime/api_proto.d.cts +15 -0
- package/dist/realtime/api_proto.d.ts +15 -0
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +1057 -820
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.cts +126 -160
- package/dist/realtime/realtime_model.d.ts +126 -160
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +1067 -825
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/tts.cjs +5 -5
- package/dist/tts.cjs.map +1 -1
- package/dist/tts.d.cts +2 -1
- package/dist/tts.d.ts +2 -1
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +6 -6
- package/dist/tts.js.map +1 -1
- package/package.json +9 -7
- package/src/index.ts +19 -5
- package/src/llm.ts +227 -218
- package/src/models.ts +83 -5
- package/src/realtime/api_proto.ts +15 -1
- package/src/realtime/realtime_model.ts +1305 -996
- package/src/tts.ts +6 -6
|
@@ -33,373 +33,443 @@ __export(realtime_model_exports, {
|
|
|
33
33
|
});
|
|
34
34
|
module.exports = __toCommonJS(realtime_model_exports);
|
|
35
35
|
var import_agents = require("@livekit/agents");
|
|
36
|
+
var import_mutex = require("@livekit/mutex");
|
|
36
37
|
var import_rtc_node = require("@livekit/rtc-node");
|
|
37
|
-
var
|
|
38
|
+
var import_async = require("@std/async");
|
|
38
39
|
var import_ws = require("ws");
|
|
39
40
|
var api_proto = __toESM(require("./api_proto.cjs"), 1);
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
clear() {
|
|
52
|
-
this.#session.queueMsg({
|
|
53
|
-
type: "input_audio_buffer.clear"
|
|
54
|
-
});
|
|
55
|
-
}
|
|
56
|
-
commit() {
|
|
57
|
-
this.#session.queueMsg({
|
|
58
|
-
type: "input_audio_buffer.commit"
|
|
59
|
-
});
|
|
41
|
+
const SAMPLE_RATE = 24e3;
|
|
42
|
+
const NUM_CHANNELS = 1;
|
|
43
|
+
const BASE_URL = "https://api.openai.com/v1";
|
|
44
|
+
const MOCK_AUDIO_ID_PREFIX = "lk_mock_audio_item_";
|
|
45
|
+
class CreateResponseHandle {
|
|
46
|
+
instructions;
|
|
47
|
+
doneFut;
|
|
48
|
+
// TODO(shubhra): add timeout
|
|
49
|
+
constructor({ instructions }) {
|
|
50
|
+
this.instructions = instructions;
|
|
51
|
+
this.doneFut = new import_agents.Future();
|
|
60
52
|
}
|
|
61
53
|
}
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
54
|
+
const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
|
|
55
|
+
const DEFAULT_TEMPERATURE = 0.8;
|
|
56
|
+
const DEFAULT_TURN_DETECTION = {
|
|
57
|
+
type: "server_vad",
|
|
58
|
+
threshold: 0.5,
|
|
59
|
+
prefix_padding_ms: 300,
|
|
60
|
+
silence_duration_ms: 200,
|
|
61
|
+
create_response: true,
|
|
62
|
+
interrupt_response: true
|
|
63
|
+
};
|
|
64
|
+
const DEFAULT_INPUT_AUDIO_TRANSCRIPTION = {
|
|
65
|
+
model: "gpt-4o-mini-transcribe"
|
|
66
|
+
};
|
|
67
|
+
const DEFAULT_TOOL_CHOICE = "auto";
|
|
68
|
+
const DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS = "inf";
|
|
69
|
+
const AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION = {
|
|
70
|
+
model: "whisper-1"
|
|
71
|
+
};
|
|
72
|
+
const AZURE_DEFAULT_TURN_DETECTION = {
|
|
73
|
+
type: "server_vad",
|
|
74
|
+
threshold: 0.5,
|
|
75
|
+
prefix_padding_ms: 300,
|
|
76
|
+
silence_duration_ms: 200,
|
|
77
|
+
create_response: true
|
|
78
|
+
};
|
|
79
|
+
const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1e3;
|
|
80
|
+
const DEFAULT_REALTIME_MODEL_OPTIONS = {
|
|
81
|
+
model: "gpt-4o-realtime-preview",
|
|
82
|
+
voice: "alloy",
|
|
83
|
+
temperature: DEFAULT_TEMPERATURE,
|
|
84
|
+
inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
|
|
85
|
+
turnDetection: DEFAULT_TURN_DETECTION,
|
|
86
|
+
toolChoice: DEFAULT_TOOL_CHOICE,
|
|
87
|
+
maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
|
|
88
|
+
maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
|
|
89
|
+
connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS
|
|
90
|
+
};
|
|
91
|
+
class RealtimeModel extends import_agents.llm.RealtimeModel {
|
|
92
|
+
sampleRate = api_proto.SAMPLE_RATE;
|
|
93
|
+
numChannels = api_proto.NUM_CHANNELS;
|
|
94
|
+
inFrameSize = api_proto.IN_FRAME_SIZE;
|
|
95
|
+
outFrameSize = api_proto.OUT_FRAME_SIZE;
|
|
96
|
+
/* @internal */
|
|
97
|
+
_options;
|
|
98
|
+
constructor(options = {}) {
|
|
99
|
+
super({
|
|
100
|
+
messageTruncation: true,
|
|
101
|
+
turnDetection: options.turnDetection !== null,
|
|
102
|
+
userTranscription: options.inputAudioTranscription !== null,
|
|
103
|
+
autoToolReplyGeneration: false
|
|
80
104
|
});
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
105
|
+
const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
|
|
106
|
+
if (options.apiKey === "" && !isAzure) {
|
|
107
|
+
throw new Error(
|
|
108
|
+
"OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
|
|
109
|
+
);
|
|
85
110
|
}
|
|
86
|
-
|
|
87
|
-
if (
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
}
|
|
99
|
-
};
|
|
100
|
-
} else {
|
|
101
|
-
let content = message.content;
|
|
102
|
-
if (!Array.isArray(content)) {
|
|
103
|
-
content = [content];
|
|
104
|
-
}
|
|
105
|
-
if (message.role === import_agents.llm.ChatRole.USER) {
|
|
106
|
-
const contents = [];
|
|
107
|
-
for (const c of content) {
|
|
108
|
-
if (typeof c === "string") {
|
|
109
|
-
contents.push({
|
|
110
|
-
type: "input_text",
|
|
111
|
-
text: c
|
|
112
|
-
});
|
|
113
|
-
} else if (
|
|
114
|
-
// typescript type guard for determining ChatAudio vs ChatImage
|
|
115
|
-
((c2) => {
|
|
116
|
-
return c2.frame !== void 0;
|
|
117
|
-
})(c)
|
|
118
|
-
) {
|
|
119
|
-
contents.push({
|
|
120
|
-
type: "input_audio",
|
|
121
|
-
audio: Buffer.from((0, import_agents.mergeFrames)(c.frame).data.buffer).toString("base64")
|
|
122
|
-
});
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
event = {
|
|
126
|
-
type: "conversation.item.create",
|
|
127
|
-
previous_item_id: previousItemId,
|
|
128
|
-
item: {
|
|
129
|
-
type: "message",
|
|
130
|
-
role: "user",
|
|
131
|
-
content: contents
|
|
132
|
-
}
|
|
133
|
-
};
|
|
134
|
-
} else if (message.role === import_agents.llm.ChatRole.ASSISTANT) {
|
|
135
|
-
const contents = [];
|
|
136
|
-
for (const c of content) {
|
|
137
|
-
if (typeof c === "string") {
|
|
138
|
-
contents.push({
|
|
139
|
-
type: "text",
|
|
140
|
-
text: c
|
|
141
|
-
});
|
|
142
|
-
} else if (
|
|
143
|
-
// typescript type guard for determining ChatAudio vs ChatImage
|
|
144
|
-
((c2) => {
|
|
145
|
-
return c2.frame !== void 0;
|
|
146
|
-
})(c)
|
|
147
|
-
) {
|
|
148
|
-
this.#logger.warn("audio content in assistant message is not supported");
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
event = {
|
|
152
|
-
type: "conversation.item.create",
|
|
153
|
-
previous_item_id: previousItemId,
|
|
154
|
-
item: {
|
|
155
|
-
type: "message",
|
|
156
|
-
role: "assistant",
|
|
157
|
-
content: contents
|
|
158
|
-
}
|
|
159
|
-
};
|
|
160
|
-
} else if (message.role === import_agents.llm.ChatRole.SYSTEM) {
|
|
161
|
-
const contents = [];
|
|
162
|
-
for (const c of content) {
|
|
163
|
-
if (typeof c === "string") {
|
|
164
|
-
contents.push({
|
|
165
|
-
type: "input_text",
|
|
166
|
-
text: c
|
|
167
|
-
});
|
|
168
|
-
} else if (
|
|
169
|
-
// typescript type guard for determining ChatAudio vs ChatImage
|
|
170
|
-
((c2) => {
|
|
171
|
-
return c2.frame !== void 0;
|
|
172
|
-
})(c)
|
|
173
|
-
) {
|
|
174
|
-
this.#logger.warn("audio content in system message is not supported");
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
|
-
event = {
|
|
178
|
-
type: "conversation.item.create",
|
|
179
|
-
previous_item_id: previousItemId,
|
|
180
|
-
item: {
|
|
181
|
-
type: "message",
|
|
182
|
-
role: "system",
|
|
183
|
-
content: contents
|
|
184
|
-
}
|
|
185
|
-
};
|
|
186
|
-
} else {
|
|
187
|
-
this.#logger.child({ message }).warn("chat message is not supported inside the realtime API");
|
|
188
|
-
return;
|
|
111
|
+
const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
|
|
112
|
+
if (!apiKey && !isAzure) {
|
|
113
|
+
throw new Error(
|
|
114
|
+
"OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
|
|
115
|
+
);
|
|
116
|
+
}
|
|
117
|
+
if (!options.baseURL && isAzure) {
|
|
118
|
+
const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
|
|
119
|
+
if (!azureEndpoint) {
|
|
120
|
+
throw new Error(
|
|
121
|
+
"Missing Azure endpoint. Please pass base_url or set AZURE_OPENAI_ENDPOINT environment variable."
|
|
122
|
+
);
|
|
189
123
|
}
|
|
124
|
+
options.baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
|
|
190
125
|
}
|
|
191
|
-
this
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
get item() {
|
|
200
|
-
return new ConversationItem(this.#session);
|
|
201
|
-
}
|
|
202
|
-
}
|
|
203
|
-
class Response {
|
|
204
|
-
#session;
|
|
205
|
-
constructor(session) {
|
|
206
|
-
this.#session = session;
|
|
207
|
-
}
|
|
208
|
-
create() {
|
|
209
|
-
this.#session.queueMsg({
|
|
210
|
-
type: "response.create"
|
|
211
|
-
});
|
|
212
|
-
}
|
|
213
|
-
cancel() {
|
|
214
|
-
this.#session.queueMsg({
|
|
215
|
-
type: "response.cancel"
|
|
216
|
-
});
|
|
126
|
+
this._options = {
|
|
127
|
+
...DEFAULT_REALTIME_MODEL_OPTIONS,
|
|
128
|
+
...options,
|
|
129
|
+
baseURL: options.baseURL || BASE_URL,
|
|
130
|
+
apiKey,
|
|
131
|
+
isAzure,
|
|
132
|
+
model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model
|
|
133
|
+
};
|
|
217
134
|
}
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
135
|
+
/**
|
|
136
|
+
* Create a RealtimeModel instance configured for Azure OpenAI Service.
|
|
137
|
+
*
|
|
138
|
+
* @param azureDeployment - The name of your Azure OpenAI deployment.
|
|
139
|
+
* @param azureEndpoint - The endpoint URL for your Azure OpenAI resource. If undefined, will attempt to read from the environment variable AZURE_OPENAI_ENDPOINT.
|
|
140
|
+
* @param apiVersion - API version to use with Azure OpenAI Service. If undefined, will attempt to read from the environment variable OPENAI_API_VERSION.
|
|
141
|
+
* @param apiKey - Azure OpenAI API key. If undefined, will attempt to read from the environment variable AZURE_OPENAI_API_KEY.
|
|
142
|
+
* @param entraToken - Azure Entra authentication token. Required if not using API key authentication.
|
|
143
|
+
* @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
|
|
144
|
+
* @param voice - Voice setting for audio outputs. Defaults to "alloy".
|
|
145
|
+
* @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
|
|
146
|
+
* @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
|
|
147
|
+
* @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
|
|
148
|
+
* @param speed - Speed of the audio output. Defaults to 1.0.
|
|
149
|
+
* @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
|
|
150
|
+
* @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
|
|
151
|
+
*
|
|
152
|
+
* @returns A RealtimeModel instance configured for Azure OpenAI Service.
|
|
153
|
+
*
|
|
154
|
+
* @throws Error if required Azure parameters are missing or invalid.
|
|
155
|
+
*/
|
|
226
156
|
static withAzure({
|
|
227
|
-
baseURL,
|
|
228
157
|
azureDeployment,
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
voice = "alloy",
|
|
235
|
-
inputAudioFormat = "pcm16",
|
|
236
|
-
outputAudioFormat = "pcm16",
|
|
237
|
-
inputAudioTranscription = { model: "whisper-1" },
|
|
238
|
-
turnDetection = { type: "server_vad" },
|
|
239
|
-
temperature = 0.8,
|
|
240
|
-
maxResponseOutputTokens = Infinity
|
|
241
|
-
}) {
|
|
242
|
-
return new RealtimeModel({
|
|
243
|
-
isAzure: true,
|
|
244
|
-
baseURL: new URL("openai", baseURL).toString(),
|
|
245
|
-
model: azureDeployment,
|
|
246
|
-
apiVersion,
|
|
247
|
-
apiKey,
|
|
248
|
-
entraToken,
|
|
249
|
-
instructions,
|
|
250
|
-
modalities,
|
|
251
|
-
voice,
|
|
252
|
-
inputAudioFormat,
|
|
253
|
-
outputAudioFormat,
|
|
254
|
-
inputAudioTranscription,
|
|
255
|
-
turnDetection,
|
|
256
|
-
temperature,
|
|
257
|
-
maxResponseOutputTokens
|
|
258
|
-
});
|
|
259
|
-
}
|
|
260
|
-
constructor({
|
|
261
|
-
modalities = ["text", "audio"],
|
|
262
|
-
instructions = "",
|
|
158
|
+
azureEndpoint,
|
|
159
|
+
apiVersion,
|
|
160
|
+
apiKey,
|
|
161
|
+
entraToken,
|
|
162
|
+
baseURL,
|
|
263
163
|
voice = "alloy",
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
inputAudioTranscription = { model: "whisper-1" },
|
|
267
|
-
turnDetection = { type: "server_vad" },
|
|
164
|
+
inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
|
|
165
|
+
turnDetection = AZURE_DEFAULT_TURN_DETECTION,
|
|
268
166
|
temperature = 0.8,
|
|
269
|
-
|
|
270
|
-
model = "gpt-4o-realtime-preview-2024-10-01",
|
|
271
|
-
apiKey = process.env.OPENAI_API_KEY || "",
|
|
272
|
-
baseURL = api_proto.BASE_URL,
|
|
273
|
-
// used for microsoft
|
|
274
|
-
isAzure = false,
|
|
275
|
-
apiVersion = void 0,
|
|
276
|
-
entraToken = void 0
|
|
167
|
+
speed
|
|
277
168
|
}) {
|
|
278
|
-
|
|
279
|
-
if (apiKey
|
|
169
|
+
apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
|
|
170
|
+
if (!apiKey && !entraToken) {
|
|
280
171
|
throw new Error(
|
|
281
|
-
"
|
|
172
|
+
"Missing credentials. Please pass one of `apiKey`, `entraToken`, or the `AZURE_OPENAI_API_KEY` environment variable."
|
|
282
173
|
);
|
|
283
174
|
}
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
175
|
+
apiVersion = apiVersion || process.env.OPENAI_API_VERSION;
|
|
176
|
+
if (!apiVersion) {
|
|
177
|
+
throw new Error(
|
|
178
|
+
"Must provide either the `apiVersion` argument or the `OPENAI_API_VERSION` environment variable"
|
|
179
|
+
);
|
|
180
|
+
}
|
|
181
|
+
if (!baseURL) {
|
|
182
|
+
azureEndpoint = azureEndpoint || process.env.AZURE_OPENAI_ENDPOINT;
|
|
183
|
+
if (!azureEndpoint) {
|
|
184
|
+
throw new Error(
|
|
185
|
+
"Missing Azure endpoint. Please pass the `azure_endpoint` parameter or set the `AZURE_OPENAI_ENDPOINT` environment variable."
|
|
186
|
+
);
|
|
187
|
+
}
|
|
188
|
+
baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
|
|
189
|
+
}
|
|
190
|
+
return new RealtimeModel({
|
|
287
191
|
voice,
|
|
288
|
-
inputAudioFormat,
|
|
289
|
-
outputAudioFormat,
|
|
290
192
|
inputAudioTranscription,
|
|
291
193
|
turnDetection,
|
|
292
194
|
temperature,
|
|
293
|
-
|
|
294
|
-
model,
|
|
195
|
+
speed,
|
|
295
196
|
apiKey,
|
|
296
|
-
|
|
297
|
-
isAzure,
|
|
197
|
+
azureDeployment,
|
|
298
198
|
apiVersion,
|
|
299
|
-
entraToken
|
|
300
|
-
|
|
301
|
-
}
|
|
302
|
-
get sessions() {
|
|
303
|
-
return this.#sessions;
|
|
304
|
-
}
|
|
305
|
-
session({
|
|
306
|
-
fncCtx,
|
|
307
|
-
chatCtx,
|
|
308
|
-
modalities = this.#defaultOpts.modalities,
|
|
309
|
-
instructions = this.#defaultOpts.instructions,
|
|
310
|
-
voice = this.#defaultOpts.voice,
|
|
311
|
-
inputAudioFormat = this.#defaultOpts.inputAudioFormat,
|
|
312
|
-
outputAudioFormat = this.#defaultOpts.outputAudioFormat,
|
|
313
|
-
inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
|
|
314
|
-
turnDetection = this.#defaultOpts.turnDetection,
|
|
315
|
-
temperature = this.#defaultOpts.temperature,
|
|
316
|
-
maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens
|
|
317
|
-
}) {
|
|
318
|
-
const opts = {
|
|
319
|
-
modalities,
|
|
320
|
-
instructions,
|
|
321
|
-
voice,
|
|
322
|
-
inputAudioFormat,
|
|
323
|
-
outputAudioFormat,
|
|
324
|
-
inputAudioTranscription,
|
|
325
|
-
turnDetection,
|
|
326
|
-
temperature,
|
|
327
|
-
maxResponseOutputTokens,
|
|
328
|
-
model: this.#defaultOpts.model,
|
|
329
|
-
apiKey: this.#defaultOpts.apiKey,
|
|
330
|
-
baseURL: this.#defaultOpts.baseURL,
|
|
331
|
-
isAzure: this.#defaultOpts.isAzure,
|
|
332
|
-
apiVersion: this.#defaultOpts.apiVersion,
|
|
333
|
-
entraToken: this.#defaultOpts.entraToken
|
|
334
|
-
};
|
|
335
|
-
const newSession = new RealtimeSession(opts, {
|
|
336
|
-
chatCtx: chatCtx || new import_agents.llm.ChatContext(),
|
|
337
|
-
fncCtx
|
|
199
|
+
entraToken,
|
|
200
|
+
baseURL
|
|
338
201
|
});
|
|
339
|
-
|
|
340
|
-
|
|
202
|
+
}
|
|
203
|
+
session() {
|
|
204
|
+
return new RealtimeSession(this);
|
|
341
205
|
}
|
|
342
206
|
async close() {
|
|
343
|
-
|
|
207
|
+
return;
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
function processBaseURL({
|
|
211
|
+
baseURL,
|
|
212
|
+
model,
|
|
213
|
+
isAzure = false,
|
|
214
|
+
azureDeployment,
|
|
215
|
+
apiVersion
|
|
216
|
+
}) {
|
|
217
|
+
const url = new URL([baseURL, "realtime"].join("/"));
|
|
218
|
+
if (url.protocol === "https:") {
|
|
219
|
+
url.protocol = "wss:";
|
|
220
|
+
}
|
|
221
|
+
if (!url.pathname || ["", "/v1", "/openai"].includes(url.pathname.replace(/\/$/, ""))) {
|
|
222
|
+
url.pathname = url.pathname.replace(/\/$/, "") + "/realtime";
|
|
223
|
+
} else {
|
|
224
|
+
url.pathname = url.pathname.replace(/\/$/, "");
|
|
225
|
+
}
|
|
226
|
+
const queryParams = {};
|
|
227
|
+
if (isAzure) {
|
|
228
|
+
if (apiVersion) {
|
|
229
|
+
queryParams["api-version"] = apiVersion;
|
|
230
|
+
}
|
|
231
|
+
if (azureDeployment) {
|
|
232
|
+
queryParams["deployment"] = azureDeployment;
|
|
233
|
+
}
|
|
234
|
+
} else {
|
|
235
|
+
queryParams["model"] = model;
|
|
236
|
+
}
|
|
237
|
+
for (const [key, value] of Object.entries(queryParams)) {
|
|
238
|
+
url.searchParams.set(key, value);
|
|
344
239
|
}
|
|
240
|
+
return url.toString();
|
|
345
241
|
}
|
|
346
|
-
class RealtimeSession extends import_agents.
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
242
|
+
class RealtimeSession extends import_agents.llm.RealtimeSession {
|
|
243
|
+
_tools = {};
|
|
244
|
+
remoteChatCtx = new import_agents.llm.RemoteChatContext();
|
|
245
|
+
messageChannel = new import_agents.Queue();
|
|
246
|
+
inputResampler;
|
|
247
|
+
instructions;
|
|
248
|
+
oaiRealtimeModel;
|
|
249
|
+
currentGeneration;
|
|
250
|
+
responseCreatedFutures = {};
|
|
251
|
+
textModeRecoveryRetries = 0;
|
|
252
|
+
itemCreateFutures = {};
|
|
253
|
+
itemDeleteFutures = {};
|
|
254
|
+
updateChatCtxLock = new import_mutex.Mutex();
|
|
255
|
+
updateFuncCtxLock = new import_mutex.Mutex();
|
|
256
|
+
// 100ms chunks
|
|
257
|
+
bstream = new import_agents.AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, SAMPLE_RATE / 10);
|
|
258
|
+
pushedDurationMs = 0;
|
|
354
259
|
#logger = (0, import_agents.log)();
|
|
355
260
|
#task;
|
|
356
|
-
#
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
this.#
|
|
361
|
-
this
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
this.
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
261
|
+
#closed = false;
|
|
262
|
+
constructor(realtimeModel) {
|
|
263
|
+
super(realtimeModel);
|
|
264
|
+
this.oaiRealtimeModel = realtimeModel;
|
|
265
|
+
this.#task = this.#mainTask();
|
|
266
|
+
this.sendEvent(this.createSessionUpdateEvent());
|
|
267
|
+
}
|
|
268
|
+
sendEvent(command) {
|
|
269
|
+
this.messageChannel.put(command);
|
|
270
|
+
}
|
|
271
|
+
createSessionUpdateEvent() {
|
|
272
|
+
return {
|
|
273
|
+
type: "session.update",
|
|
274
|
+
session: {
|
|
275
|
+
model: this.oaiRealtimeModel._options.model,
|
|
276
|
+
voice: this.oaiRealtimeModel._options.voice,
|
|
277
|
+
input_audio_format: "pcm16",
|
|
278
|
+
output_audio_format: "pcm16",
|
|
279
|
+
modalities: ["text", "audio"],
|
|
280
|
+
turn_detection: this.oaiRealtimeModel._options.turnDetection,
|
|
281
|
+
input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
|
|
282
|
+
// TODO(shubhra): add inputAudioNoiseReduction
|
|
283
|
+
temperature: this.oaiRealtimeModel._options.temperature,
|
|
284
|
+
tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
|
|
285
|
+
max_response_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
|
|
286
|
+
// TODO(shubhra): add tracing options
|
|
287
|
+
instructions: this.instructions,
|
|
288
|
+
speed: this.oaiRealtimeModel._options.speed
|
|
289
|
+
}
|
|
290
|
+
};
|
|
376
291
|
}
|
|
377
292
|
get chatCtx() {
|
|
378
|
-
return this
|
|
293
|
+
return this.remoteChatCtx.toChatCtx();
|
|
379
294
|
}
|
|
380
|
-
get
|
|
381
|
-
return this
|
|
295
|
+
get tools() {
|
|
296
|
+
return { ...this._tools };
|
|
382
297
|
}
|
|
383
|
-
|
|
384
|
-
|
|
298
|
+
async updateChatCtx(_chatCtx) {
|
|
299
|
+
const unlock = await this.updateChatCtxLock.lock();
|
|
300
|
+
const events = this.createChatCtxUpdateEvents(_chatCtx);
|
|
301
|
+
const futures = [];
|
|
302
|
+
for (const event of events) {
|
|
303
|
+
const future = new import_agents.Future();
|
|
304
|
+
futures.push(future);
|
|
305
|
+
if (event.type === "conversation.item.create") {
|
|
306
|
+
this.itemCreateFutures[event.item.id] = future;
|
|
307
|
+
} else if (event.type == "conversation.item.delete") {
|
|
308
|
+
this.itemDeleteFutures[event.item_id] = future;
|
|
309
|
+
}
|
|
310
|
+
this.sendEvent(event);
|
|
311
|
+
}
|
|
312
|
+
if (futures.length === 0) {
|
|
313
|
+
unlock();
|
|
314
|
+
return;
|
|
315
|
+
}
|
|
316
|
+
try {
|
|
317
|
+
await Promise.race([
|
|
318
|
+
Promise.all(futures),
|
|
319
|
+
(0, import_async.delay)(5e3).then(() => {
|
|
320
|
+
throw new Error("Chat ctx update events timed out");
|
|
321
|
+
})
|
|
322
|
+
]);
|
|
323
|
+
} catch (e) {
|
|
324
|
+
this.#logger.error(e.message);
|
|
325
|
+
throw e;
|
|
326
|
+
} finally {
|
|
327
|
+
unlock();
|
|
328
|
+
}
|
|
385
329
|
}
|
|
386
|
-
|
|
387
|
-
|
|
330
|
+
createChatCtxUpdateEvents(chatCtx, addMockAudio = false) {
|
|
331
|
+
const newChatCtx = chatCtx.copy();
|
|
332
|
+
if (addMockAudio) {
|
|
333
|
+
newChatCtx.items.push(createMockAudioItem());
|
|
334
|
+
} else {
|
|
335
|
+
newChatCtx.items = newChatCtx.items.filter(
|
|
336
|
+
(item) => !item.id.startsWith(MOCK_AUDIO_ID_PREFIX)
|
|
337
|
+
);
|
|
338
|
+
}
|
|
339
|
+
const events = [];
|
|
340
|
+
const diffOps = import_agents.llm.computeChatCtxDiff(this.chatCtx, newChatCtx);
|
|
341
|
+
for (const op of diffOps.toRemove) {
|
|
342
|
+
events.push({
|
|
343
|
+
type: "conversation.item.delete",
|
|
344
|
+
item_id: op,
|
|
345
|
+
event_id: (0, import_agents.shortuuid)("chat_ctx_delete_")
|
|
346
|
+
});
|
|
347
|
+
}
|
|
348
|
+
for (const [previousId, id] of diffOps.toCreate) {
|
|
349
|
+
const chatItem = newChatCtx.getById(id);
|
|
350
|
+
if (!chatItem) {
|
|
351
|
+
throw new Error(`Chat item ${id} not found`);
|
|
352
|
+
}
|
|
353
|
+
events.push({
|
|
354
|
+
type: "conversation.item.create",
|
|
355
|
+
item: livekitItemToOpenAIItem(chatItem),
|
|
356
|
+
previous_item_id: previousId ?? void 0,
|
|
357
|
+
event_id: (0, import_agents.shortuuid)("chat_ctx_create_")
|
|
358
|
+
});
|
|
359
|
+
}
|
|
360
|
+
return events;
|
|
388
361
|
}
|
|
389
|
-
|
|
390
|
-
|
|
362
|
+
async updateTools(_tools) {
|
|
363
|
+
const unlock = await this.updateFuncCtxLock.lock();
|
|
364
|
+
const ev = this.createToolsUpdateEvent(_tools);
|
|
365
|
+
this.sendEvent(ev);
|
|
366
|
+
if (!ev.session.tools) {
|
|
367
|
+
throw new Error("Tools are missing in the session update event");
|
|
368
|
+
}
|
|
369
|
+
const retainedToolNames = new Set(ev.session.tools.map((tool) => tool.name));
|
|
370
|
+
const retainedTools = Object.fromEntries(
|
|
371
|
+
Object.entries(_tools).filter(
|
|
372
|
+
([name, tool]) => import_agents.llm.isFunctionTool(tool) && retainedToolNames.has(name)
|
|
373
|
+
)
|
|
374
|
+
);
|
|
375
|
+
this._tools = retainedTools;
|
|
376
|
+
unlock();
|
|
391
377
|
}
|
|
392
|
-
|
|
393
|
-
|
|
378
|
+
createToolsUpdateEvent(_tools) {
|
|
379
|
+
const oaiTools = [];
|
|
380
|
+
for (const [name, tool] of Object.entries(_tools)) {
|
|
381
|
+
if (!import_agents.llm.isFunctionTool(tool)) {
|
|
382
|
+
this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
|
|
383
|
+
continue;
|
|
384
|
+
}
|
|
385
|
+
const { parameters: toolParameters, description } = tool;
|
|
386
|
+
try {
|
|
387
|
+
const parameters = import_agents.llm.toJsonSchema(
|
|
388
|
+
toolParameters
|
|
389
|
+
);
|
|
390
|
+
oaiTools.push({
|
|
391
|
+
name,
|
|
392
|
+
description,
|
|
393
|
+
parameters,
|
|
394
|
+
type: "function"
|
|
395
|
+
});
|
|
396
|
+
} catch (e) {
|
|
397
|
+
this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
|
|
398
|
+
continue;
|
|
399
|
+
}
|
|
400
|
+
}
|
|
401
|
+
return {
|
|
402
|
+
type: "session.update",
|
|
403
|
+
session: {
|
|
404
|
+
model: this.oaiRealtimeModel._options.model,
|
|
405
|
+
tools: oaiTools
|
|
406
|
+
},
|
|
407
|
+
event_id: (0, import_agents.shortuuid)("tools_update_")
|
|
408
|
+
};
|
|
409
|
+
}
|
|
410
|
+
async updateInstructions(_instructions) {
|
|
411
|
+
const eventId = (0, import_agents.shortuuid)("instructions_update_");
|
|
412
|
+
this.sendEvent({
|
|
413
|
+
type: "session.update",
|
|
414
|
+
session: {
|
|
415
|
+
instructions: _instructions
|
|
416
|
+
},
|
|
417
|
+
event_id: eventId
|
|
418
|
+
});
|
|
419
|
+
this.instructions = _instructions;
|
|
420
|
+
}
|
|
421
|
+
updateOptions({ toolChoice }) {
|
|
422
|
+
const options = {};
|
|
423
|
+
this.oaiRealtimeModel._options.toolChoice = toolChoice;
|
|
424
|
+
options.tool_choice = toOaiToolChoice(toolChoice);
|
|
425
|
+
this.sendEvent({
|
|
426
|
+
type: "session.update",
|
|
427
|
+
session: options,
|
|
428
|
+
event_id: (0, import_agents.shortuuid)("options_update_")
|
|
429
|
+
});
|
|
430
|
+
}
|
|
431
|
+
pushAudio(frame) {
|
|
432
|
+
for (const f of this.resampleAudio(frame)) {
|
|
433
|
+
for (const nf of this.bstream.write(f.data.buffer)) {
|
|
434
|
+
this.sendEvent({
|
|
435
|
+
type: "input_audio_buffer.append",
|
|
436
|
+
audio: Buffer.from(nf.data.buffer).toString("base64")
|
|
437
|
+
});
|
|
438
|
+
this.pushedDurationMs += nf.samplesPerChannel / nf.sampleRate * 1e3;
|
|
439
|
+
}
|
|
440
|
+
}
|
|
394
441
|
}
|
|
395
|
-
|
|
396
|
-
if (
|
|
397
|
-
|
|
442
|
+
async commitAudio() {
|
|
443
|
+
if (this.pushedDurationMs > 100) {
|
|
444
|
+
this.sendEvent({
|
|
445
|
+
type: "input_audio_buffer.commit"
|
|
446
|
+
});
|
|
447
|
+
this.pushedDurationMs = 0;
|
|
398
448
|
}
|
|
399
|
-
return this.#expiresAt * 1e3;
|
|
400
449
|
}
|
|
401
|
-
|
|
402
|
-
this
|
|
450
|
+
async clearAudio() {
|
|
451
|
+
this.sendEvent({
|
|
452
|
+
type: "input_audio_buffer.clear"
|
|
453
|
+
});
|
|
454
|
+
this.pushedDurationMs = 0;
|
|
455
|
+
}
|
|
456
|
+
async generateReply(instructions) {
|
|
457
|
+
const handle = this.createResponse({ instructions, userInitiated: true });
|
|
458
|
+
this.textModeRecoveryRetries = 0;
|
|
459
|
+
return handle.doneFut.await;
|
|
460
|
+
}
|
|
461
|
+
async interrupt() {
|
|
462
|
+
this.sendEvent({
|
|
463
|
+
type: "response.cancel"
|
|
464
|
+
});
|
|
465
|
+
}
|
|
466
|
+
async truncate(_options) {
|
|
467
|
+
this.sendEvent({
|
|
468
|
+
type: "conversation.item.truncate",
|
|
469
|
+
content_index: 0,
|
|
470
|
+
item_id: _options.messageId,
|
|
471
|
+
audio_end_ms: _options.audioEndMs
|
|
472
|
+
});
|
|
403
473
|
}
|
|
404
474
|
/// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
|
|
405
475
|
/// with large amounts of base64 audio data.
|
|
@@ -420,549 +490,716 @@ class RealtimeSession extends import_agents.multimodal.RealtimeSession {
|
|
|
420
490
|
}
|
|
421
491
|
return untypedEvent;
|
|
422
492
|
}
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
voice = this.#opts.voice,
|
|
427
|
-
inputAudioFormat = this.#opts.inputAudioFormat,
|
|
428
|
-
outputAudioFormat = this.#opts.outputAudioFormat,
|
|
429
|
-
inputAudioTranscription = this.#opts.inputAudioTranscription,
|
|
430
|
-
turnDetection = this.#opts.turnDetection,
|
|
431
|
-
temperature = this.#opts.temperature,
|
|
432
|
-
maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
|
|
433
|
-
toolChoice = "auto",
|
|
434
|
-
selectedTools = Object.keys(this.#fncCtx || {})
|
|
435
|
-
}) {
|
|
436
|
-
this.#opts = {
|
|
437
|
-
modalities,
|
|
438
|
-
instructions,
|
|
439
|
-
voice,
|
|
440
|
-
inputAudioFormat,
|
|
441
|
-
outputAudioFormat,
|
|
442
|
-
inputAudioTranscription,
|
|
443
|
-
turnDetection,
|
|
444
|
-
temperature,
|
|
445
|
-
maxResponseOutputTokens,
|
|
446
|
-
model: this.#opts.model,
|
|
447
|
-
apiKey: this.#opts.apiKey,
|
|
448
|
-
baseURL: this.#opts.baseURL,
|
|
449
|
-
isAzure: this.#opts.isAzure,
|
|
450
|
-
apiVersion: this.#opts.apiVersion,
|
|
451
|
-
entraToken: this.#opts.entraToken
|
|
452
|
-
};
|
|
453
|
-
const tools = this.#fncCtx ? Object.entries(this.#fncCtx).filter(([name]) => selectedTools.includes(name)).map(([name, func]) => ({
|
|
454
|
-
type: "function",
|
|
455
|
-
name,
|
|
456
|
-
description: func.description,
|
|
457
|
-
parameters: (
|
|
458
|
-
// don't format parameters if they are raw openai params
|
|
459
|
-
func.parameters.type == "object" ? func.parameters : import_agents.llm.oaiParams(func.parameters)
|
|
460
|
-
)
|
|
461
|
-
})) : [];
|
|
462
|
-
const sessionUpdateEvent = {
|
|
463
|
-
type: "session.update",
|
|
464
|
-
session: {
|
|
465
|
-
modalities: this.#opts.modalities,
|
|
466
|
-
instructions: this.#opts.instructions,
|
|
467
|
-
voice: this.#opts.voice,
|
|
468
|
-
input_audio_format: this.#opts.inputAudioFormat,
|
|
469
|
-
output_audio_format: this.#opts.outputAudioFormat,
|
|
470
|
-
input_audio_transcription: this.#opts.inputAudioTranscription,
|
|
471
|
-
turn_detection: this.#opts.turnDetection,
|
|
472
|
-
temperature: this.#opts.temperature,
|
|
473
|
-
max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity ? "inf" : this.#opts.maxResponseOutputTokens,
|
|
474
|
-
tools,
|
|
475
|
-
tool_choice: toolChoice
|
|
476
|
-
}
|
|
493
|
+
async createWsConn() {
|
|
494
|
+
const headers = {
|
|
495
|
+
"User-Agent": "LiveKit-Agents-JS"
|
|
477
496
|
};
|
|
478
|
-
if (this
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
const samples = duration * api_proto.SAMPLE_RATE;
|
|
486
|
-
return new import_agents.llm.ChatMessage({
|
|
487
|
-
role: import_agents.llm.ChatRole.USER,
|
|
488
|
-
content: {
|
|
489
|
-
frame: new import_rtc_node.AudioFrame(
|
|
490
|
-
new Int16Array(samples * api_proto.NUM_CHANNELS),
|
|
491
|
-
api_proto.SAMPLE_RATE,
|
|
492
|
-
api_proto.NUM_CHANNELS,
|
|
493
|
-
samples
|
|
494
|
-
)
|
|
497
|
+
if (this.oaiRealtimeModel._options.isAzure) {
|
|
498
|
+
if (this.oaiRealtimeModel._options.entraToken) {
|
|
499
|
+
headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.entraToken}`;
|
|
500
|
+
} else if (this.oaiRealtimeModel._options.apiKey) {
|
|
501
|
+
headers["api-key"] = this.oaiRealtimeModel._options.apiKey;
|
|
502
|
+
} else {
|
|
503
|
+
throw new Error("Microsoft API key or entraToken is required");
|
|
495
504
|
}
|
|
496
|
-
}
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
* Try to recover from a text response to audio mode.
|
|
500
|
-
*
|
|
501
|
-
* @remarks
|
|
502
|
-
* Sometimes the OpenAI Realtime API returns text instead of audio responses.
|
|
503
|
-
* This method tries to recover from this by requesting a new response after deleting the text
|
|
504
|
-
* response and creating an empty user audio message.
|
|
505
|
-
*/
|
|
506
|
-
recoverFromTextResponse(itemId) {
|
|
507
|
-
if (itemId) {
|
|
508
|
-
this.conversation.item.delete(itemId);
|
|
505
|
+
} else {
|
|
506
|
+
headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
|
|
507
|
+
headers["OpenAI-Beta"] = "realtime=v1";
|
|
509
508
|
}
|
|
510
|
-
|
|
511
|
-
|
|
509
|
+
const url = processBaseURL({
|
|
510
|
+
baseURL: this.oaiRealtimeModel._options.baseURL,
|
|
511
|
+
model: this.oaiRealtimeModel._options.model,
|
|
512
|
+
isAzure: this.oaiRealtimeModel._options.isAzure,
|
|
513
|
+
apiVersion: this.oaiRealtimeModel._options.apiVersion,
|
|
514
|
+
azureDeployment: this.oaiRealtimeModel._options.azureDeployment
|
|
515
|
+
});
|
|
516
|
+
this.#logger.debug(`Connecting to OpenAI Realtime API at ${url}`);
|
|
517
|
+
return new Promise((resolve, reject) => {
|
|
518
|
+
const ws = new import_ws.WebSocket(url, { headers });
|
|
519
|
+
let waiting = true;
|
|
520
|
+
const timeout = setTimeout(() => {
|
|
521
|
+
ws.close();
|
|
522
|
+
reject(new Error("WebSocket connection timeout"));
|
|
523
|
+
}, this.oaiRealtimeModel._options.connOptions.timeoutMs);
|
|
524
|
+
ws.once("open", () => {
|
|
525
|
+
if (!waiting) return;
|
|
526
|
+
waiting = false;
|
|
527
|
+
clearTimeout(timeout);
|
|
528
|
+
resolve(ws);
|
|
529
|
+
});
|
|
530
|
+
ws.once("close", () => {
|
|
531
|
+
if (!waiting) return;
|
|
532
|
+
waiting = false;
|
|
533
|
+
clearTimeout(timeout);
|
|
534
|
+
reject(new Error("OpenAI Realtime API connection closed"));
|
|
535
|
+
});
|
|
536
|
+
});
|
|
512
537
|
}
|
|
513
|
-
#
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
headers["OpenAI-Beta"] = "realtime=v1";
|
|
530
|
-
}
|
|
531
|
-
const url = new URL([this.#opts.baseURL, "realtime"].join("/"));
|
|
532
|
-
if (url.protocol === "https:") {
|
|
533
|
-
url.protocol = "wss:";
|
|
534
|
-
}
|
|
535
|
-
const queryParams = {};
|
|
536
|
-
if (this.#opts.isAzure) {
|
|
537
|
-
queryParams["api-version"] = this.#opts.apiVersion ?? "2024-10-01-preview";
|
|
538
|
-
queryParams["deployment"] = this.#opts.model;
|
|
539
|
-
} else {
|
|
540
|
-
queryParams["model"] = this.#opts.model;
|
|
541
|
-
}
|
|
542
|
-
for (const [key, value] of Object.entries(queryParams)) {
|
|
543
|
-
url.searchParams.set(key, value);
|
|
538
|
+
async #mainTask() {
|
|
539
|
+
let reconnecting = false;
|
|
540
|
+
let numRetries = 0;
|
|
541
|
+
let wsConn = null;
|
|
542
|
+
const maxRetries = this.oaiRealtimeModel._options.connOptions.maxRetry;
|
|
543
|
+
const reconnect = async () => {
|
|
544
|
+
this.#logger.debug(
|
|
545
|
+
{
|
|
546
|
+
maxSessionDuration: this.oaiRealtimeModel._options.maxSessionDuration
|
|
547
|
+
},
|
|
548
|
+
"Reconnecting to OpenAI Realtime API"
|
|
549
|
+
);
|
|
550
|
+
const events = [];
|
|
551
|
+
events.push(this.createSessionUpdateEvent());
|
|
552
|
+
if (Object.keys(this._tools).length > 0) {
|
|
553
|
+
events.push(this.createToolsUpdateEvent(this._tools));
|
|
544
554
|
}
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
555
|
+
const chatCtx = this.chatCtx.copy({
|
|
556
|
+
excludeFunctionCall: true,
|
|
557
|
+
excludeInstructions: true,
|
|
558
|
+
excludeEmptyMessage: true
|
|
548
559
|
});
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
557
|
-
switch (event.type) {
|
|
558
|
-
case "error":
|
|
559
|
-
this.#handleError(event);
|
|
560
|
-
break;
|
|
561
|
-
case "session.created":
|
|
562
|
-
this.#handleSessionCreated(event);
|
|
563
|
-
break;
|
|
564
|
-
case "session.updated":
|
|
565
|
-
this.#handleSessionUpdated(event);
|
|
566
|
-
break;
|
|
567
|
-
case "conversation.created":
|
|
568
|
-
this.#handleConversationCreated(event);
|
|
569
|
-
break;
|
|
570
|
-
case "input_audio_buffer.committed":
|
|
571
|
-
this.#handleInputAudioBufferCommitted(event);
|
|
572
|
-
break;
|
|
573
|
-
case "input_audio_buffer.cleared":
|
|
574
|
-
this.#handleInputAudioBufferCleared(event);
|
|
575
|
-
break;
|
|
576
|
-
case "input_audio_buffer.speech_started":
|
|
577
|
-
this.#handleInputAudioBufferSpeechStarted(event);
|
|
578
|
-
break;
|
|
579
|
-
case "input_audio_buffer.speech_stopped":
|
|
580
|
-
this.#handleInputAudioBufferSpeechStopped(event);
|
|
581
|
-
break;
|
|
582
|
-
case "conversation.item.created":
|
|
583
|
-
this.#handleConversationItemCreated(event);
|
|
584
|
-
break;
|
|
585
|
-
case "conversation.item.input_audio_transcription.completed":
|
|
586
|
-
this.#handleConversationItemInputAudioTranscriptionCompleted(event);
|
|
587
|
-
break;
|
|
588
|
-
case "conversation.item.input_audio_transcription.failed":
|
|
589
|
-
this.#handleConversationItemInputAudioTranscriptionFailed(event);
|
|
590
|
-
break;
|
|
591
|
-
case "conversation.item.truncated":
|
|
592
|
-
this.#handleConversationItemTruncated(event);
|
|
593
|
-
break;
|
|
594
|
-
case "conversation.item.deleted":
|
|
595
|
-
this.#handleConversationItemDeleted(event);
|
|
596
|
-
break;
|
|
597
|
-
case "response.created":
|
|
598
|
-
this.#handleResponseCreated(event);
|
|
599
|
-
break;
|
|
600
|
-
case "response.done":
|
|
601
|
-
this.#handleResponseDone(event);
|
|
602
|
-
break;
|
|
603
|
-
case "response.output_item.added":
|
|
604
|
-
this.#handleResponseOutputItemAdded(event);
|
|
605
|
-
break;
|
|
606
|
-
case "response.output_item.done":
|
|
607
|
-
this.#handleResponseOutputItemDone(event);
|
|
608
|
-
break;
|
|
609
|
-
case "response.content_part.added":
|
|
610
|
-
this.#handleResponseContentPartAdded(event);
|
|
611
|
-
break;
|
|
612
|
-
case "response.content_part.done":
|
|
613
|
-
this.#handleResponseContentPartDone(event);
|
|
614
|
-
break;
|
|
615
|
-
case "response.text.delta":
|
|
616
|
-
this.#handleResponseTextDelta(event);
|
|
617
|
-
break;
|
|
618
|
-
case "response.text.done":
|
|
619
|
-
this.#handleResponseTextDone(event);
|
|
620
|
-
break;
|
|
621
|
-
case "response.audio_transcript.delta":
|
|
622
|
-
this.#handleResponseAudioTranscriptDelta(event);
|
|
623
|
-
break;
|
|
624
|
-
case "response.audio_transcript.done":
|
|
625
|
-
this.#handleResponseAudioTranscriptDone(event);
|
|
626
|
-
break;
|
|
627
|
-
case "response.audio.delta":
|
|
628
|
-
this.#handleResponseAudioDelta(event);
|
|
629
|
-
break;
|
|
630
|
-
case "response.audio.done":
|
|
631
|
-
this.#handleResponseAudioDone(event);
|
|
632
|
-
break;
|
|
633
|
-
case "response.function_call_arguments.delta":
|
|
634
|
-
this.#handleResponseFunctionCallArgumentsDelta(event);
|
|
635
|
-
break;
|
|
636
|
-
case "response.function_call_arguments.done":
|
|
637
|
-
this.#handleResponseFunctionCallArgumentsDone(event);
|
|
638
|
-
break;
|
|
639
|
-
case "rate_limits.updated":
|
|
640
|
-
this.#handleRateLimitsUpdated(event);
|
|
641
|
-
break;
|
|
560
|
+
const oldChatCtx = this.remoteChatCtx;
|
|
561
|
+
this.remoteChatCtx = new import_agents.llm.RemoteChatContext();
|
|
562
|
+
events.push(...this.createChatCtxUpdateEvents(chatCtx));
|
|
563
|
+
try {
|
|
564
|
+
for (const ev of events) {
|
|
565
|
+
this.emit("openai_client_event_queued", ev);
|
|
566
|
+
wsConn.send(JSON.stringify(ev));
|
|
642
567
|
}
|
|
643
|
-
}
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
568
|
+
} catch (error) {
|
|
569
|
+
this.remoteChatCtx = oldChatCtx;
|
|
570
|
+
throw new import_agents.APIConnectionError({
|
|
571
|
+
message: "Failed to send message to OpenAI Realtime API during session re-connection"
|
|
572
|
+
});
|
|
573
|
+
}
|
|
574
|
+
this.#logger.debug("Reconnected to OpenAI Realtime API");
|
|
575
|
+
this.emit("session_reconnected", {});
|
|
576
|
+
};
|
|
577
|
+
reconnecting = false;
|
|
578
|
+
while (!this.#closed) {
|
|
579
|
+
this.#logger.debug("Creating WebSocket connection to OpenAI Realtime API");
|
|
580
|
+
wsConn = await this.createWsConn();
|
|
581
|
+
try {
|
|
582
|
+
if (reconnecting) {
|
|
583
|
+
await reconnect();
|
|
584
|
+
numRetries = 0;
|
|
655
585
|
}
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
586
|
+
await this.runWs(wsConn);
|
|
587
|
+
} catch (error) {
|
|
588
|
+
if (!(0, import_agents.isAPIError)(error)) {
|
|
589
|
+
this.emitError({ error, recoverable: false });
|
|
590
|
+
throw error;
|
|
661
591
|
}
|
|
662
|
-
if (!
|
|
663
|
-
|
|
592
|
+
if (maxRetries === 0 || !error.retryable) {
|
|
593
|
+
this.emitError({ error, recoverable: false });
|
|
594
|
+
throw error;
|
|
664
595
|
}
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
596
|
+
if (numRetries === maxRetries) {
|
|
597
|
+
this.emitError({ error, recoverable: false });
|
|
598
|
+
throw new import_agents.APIConnectionError({
|
|
599
|
+
message: `OpenAI Realtime API connection failed after ${numRetries} attempts`,
|
|
600
|
+
options: {
|
|
601
|
+
body: error,
|
|
602
|
+
retryable: false
|
|
603
|
+
}
|
|
604
|
+
});
|
|
605
|
+
}
|
|
606
|
+
this.emitError({ error, recoverable: true });
|
|
607
|
+
const retryInterval = numRetries === 0 ? DEFAULT_FIRST_RETRY_INTERVAL_MS : this.oaiRealtimeModel._options.connOptions.retryIntervalMs;
|
|
608
|
+
this.#logger.warn(
|
|
609
|
+
{
|
|
610
|
+
attempt: numRetries,
|
|
611
|
+
maxRetries,
|
|
612
|
+
error
|
|
613
|
+
},
|
|
614
|
+
`OpenAI Realtime API connection failed, retrying in ${retryInterval / 1e3}s`
|
|
615
|
+
);
|
|
616
|
+
await (0, import_async.delay)(retryInterval);
|
|
617
|
+
numRetries++;
|
|
618
|
+
}
|
|
619
|
+
reconnecting = true;
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
async runWs(wsConn) {
|
|
623
|
+
const forwardEvents = async (signal) => {
|
|
624
|
+
while (!this.#closed && wsConn.readyState === import_ws.WebSocket.OPEN && !signal.aborted) {
|
|
625
|
+
try {
|
|
626
|
+
const event = await this.messageChannel.get();
|
|
627
|
+
if (signal.aborted) {
|
|
628
|
+
break;
|
|
629
|
+
}
|
|
630
|
+
if (event.type !== "input_audio_buffer.append") {
|
|
631
|
+
this.#logger.debug(`(client) -> ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
632
|
+
}
|
|
633
|
+
this.emit("openai_client_event_queued", event);
|
|
634
|
+
wsConn.send(JSON.stringify(event));
|
|
635
|
+
} catch (error) {
|
|
636
|
+
break;
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
wsConn.close();
|
|
640
|
+
};
|
|
641
|
+
const wsCloseFuture = new import_agents.Future();
|
|
642
|
+
wsConn.onerror = (error) => {
|
|
643
|
+
wsCloseFuture.resolve(new import_agents.APIConnectionError({ message: error.message }));
|
|
644
|
+
};
|
|
645
|
+
wsConn.onclose = () => {
|
|
646
|
+
wsCloseFuture.resolve();
|
|
647
|
+
};
|
|
648
|
+
wsConn.onmessage = (message) => {
|
|
649
|
+
const event = JSON.parse(message.data);
|
|
650
|
+
this.emit("openai_server_event_received", event);
|
|
651
|
+
this.#logger.debug(`(server) <- ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
652
|
+
switch (event.type) {
|
|
653
|
+
case "input_audio_buffer.speech_started":
|
|
654
|
+
this.handleInputAudioBufferSpeechStarted(event);
|
|
655
|
+
break;
|
|
656
|
+
case "input_audio_buffer.speech_stopped":
|
|
657
|
+
this.handleInputAudioBufferSpeechStopped(event);
|
|
658
|
+
break;
|
|
659
|
+
case "response.created":
|
|
660
|
+
this.handleResponseCreated(event);
|
|
661
|
+
break;
|
|
662
|
+
case "response.output_item.added":
|
|
663
|
+
this.handleResponseOutputItemAdded(event);
|
|
664
|
+
break;
|
|
665
|
+
case "conversation.item.created":
|
|
666
|
+
this.handleConversationItemCreated(event);
|
|
667
|
+
break;
|
|
668
|
+
case "conversation.item.deleted":
|
|
669
|
+
this.handleConversationItemDeleted(event);
|
|
670
|
+
break;
|
|
671
|
+
case "conversation.item.input_audio_transcription.completed":
|
|
672
|
+
this.handleConversationItemInputAudioTranscriptionCompleted(event);
|
|
673
|
+
break;
|
|
674
|
+
case "conversation.item.input_audio_transcription.failed":
|
|
675
|
+
this.handleConversationItemInputAudioTranscriptionFailed(event);
|
|
676
|
+
break;
|
|
677
|
+
case "response.content_part.added":
|
|
678
|
+
this.handleResponseContentPartAdded(event);
|
|
679
|
+
break;
|
|
680
|
+
case "response.content_part.done":
|
|
681
|
+
this.handleResponseContentPartDone(event);
|
|
682
|
+
break;
|
|
683
|
+
case "response.audio_transcript.delta":
|
|
684
|
+
this.handleResponseAudioTranscriptDelta(event);
|
|
685
|
+
break;
|
|
686
|
+
case "response.audio.delta":
|
|
687
|
+
this.handleResponseAudioDelta(event);
|
|
688
|
+
break;
|
|
689
|
+
case "response.audio_transcript.done":
|
|
690
|
+
this.handleResponseAudioTranscriptDone(event);
|
|
691
|
+
break;
|
|
692
|
+
case "response.audio.done":
|
|
693
|
+
this.handleResponseAudioDone(event);
|
|
694
|
+
break;
|
|
695
|
+
case "response.output_item.done":
|
|
696
|
+
this.handleResponseOutputItemDone(event);
|
|
697
|
+
break;
|
|
698
|
+
case "response.done":
|
|
699
|
+
this.handleResponseDone(event);
|
|
700
|
+
break;
|
|
701
|
+
case "error":
|
|
702
|
+
this.handleError(event);
|
|
703
|
+
break;
|
|
704
|
+
default:
|
|
705
|
+
this.#logger.debug(`unhandled event: ${event.type}`);
|
|
706
|
+
break;
|
|
707
|
+
}
|
|
708
|
+
};
|
|
709
|
+
const sendTask = import_agents.Task.from(({ signal }) => forwardEvents(signal));
|
|
710
|
+
const wsTask = import_agents.Task.from(({ signal }) => {
|
|
711
|
+
const abortPromise = new Promise((resolve) => {
|
|
712
|
+
signal.addEventListener("abort", () => {
|
|
713
|
+
resolve();
|
|
714
|
+
});
|
|
715
|
+
});
|
|
716
|
+
return Promise.race([wsCloseFuture.await, abortPromise]);
|
|
668
717
|
});
|
|
718
|
+
const waitReconnectTask = import_agents.Task.from(async ({ signal }) => {
|
|
719
|
+
await (0, import_async.delay)(this.oaiRealtimeModel._options.maxSessionDuration, { signal });
|
|
720
|
+
return new import_agents.APIConnectionError({
|
|
721
|
+
message: "OpenAI Realtime API connection timeout"
|
|
722
|
+
});
|
|
723
|
+
});
|
|
724
|
+
try {
|
|
725
|
+
const result = await Promise.race([wsTask.result, sendTask.result, waitReconnectTask.result]);
|
|
726
|
+
if (waitReconnectTask.done && this.currentGeneration) {
|
|
727
|
+
await this.currentGeneration._doneFut.await;
|
|
728
|
+
}
|
|
729
|
+
if (result instanceof Error) {
|
|
730
|
+
throw result;
|
|
731
|
+
}
|
|
732
|
+
} finally {
|
|
733
|
+
await (0, import_agents.cancelAndWait)([wsTask, sendTask, waitReconnectTask], 2e3);
|
|
734
|
+
wsConn.close();
|
|
735
|
+
}
|
|
669
736
|
}
|
|
670
737
|
async close() {
|
|
671
|
-
|
|
672
|
-
this.#
|
|
673
|
-
this.#ws.close();
|
|
738
|
+
super.close();
|
|
739
|
+
this.#closed = true;
|
|
674
740
|
await this.#task;
|
|
675
741
|
}
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
const output = response.output[ptr.output_index];
|
|
679
|
-
const content = output.content[ptr.content_index];
|
|
680
|
-
return content;
|
|
742
|
+
handleInputAudioBufferSpeechStarted(_event) {
|
|
743
|
+
this.emit("input_speech_started", {});
|
|
681
744
|
}
|
|
682
|
-
|
|
683
|
-
this
|
|
745
|
+
handleInputAudioBufferSpeechStopped(_event) {
|
|
746
|
+
this.emit("input_speech_stopped", {
|
|
747
|
+
userTranscriptionEnabled: this.oaiRealtimeModel._options.inputAudioTranscription !== null
|
|
748
|
+
});
|
|
684
749
|
}
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
750
|
+
handleResponseCreated(event) {
|
|
751
|
+
if (!event.response.id) {
|
|
752
|
+
throw new Error("response.id is missing");
|
|
753
|
+
}
|
|
754
|
+
this.currentGeneration = {
|
|
755
|
+
messageChannel: import_agents.stream.createStreamChannel(),
|
|
756
|
+
functionChannel: import_agents.stream.createStreamChannel(),
|
|
757
|
+
messages: /* @__PURE__ */ new Map(),
|
|
758
|
+
_doneFut: new import_agents.Future(),
|
|
759
|
+
_createdTimestamp: Date.now()
|
|
760
|
+
};
|
|
761
|
+
if (!event.response.metadata || !event.response.metadata.client_event_id) return;
|
|
762
|
+
const handle = this.responseCreatedFutures[event.response.metadata.client_event_id];
|
|
763
|
+
if (handle) {
|
|
764
|
+
delete this.responseCreatedFutures[event.response.metadata.client_event_id];
|
|
765
|
+
this.responseCreatedFutures[event.response.id] = handle;
|
|
766
|
+
}
|
|
767
|
+
this.emit("generation_created", {
|
|
768
|
+
messageStream: this.currentGeneration.messageChannel.stream(),
|
|
769
|
+
functionStream: this.currentGeneration.functionChannel.stream(),
|
|
770
|
+
userInitiated: false
|
|
771
|
+
});
|
|
689
772
|
}
|
|
690
|
-
|
|
691
|
-
|
|
773
|
+
handleResponseOutputItemAdded(event) {
|
|
774
|
+
if (!this.currentGeneration) {
|
|
775
|
+
throw new Error("currentGeneration is not set");
|
|
776
|
+
}
|
|
777
|
+
if (!event.item.type) {
|
|
778
|
+
throw new Error("item.type is not set");
|
|
779
|
+
}
|
|
780
|
+
if (!event.response_id) {
|
|
781
|
+
throw new Error("response_id is not set");
|
|
782
|
+
}
|
|
783
|
+
const itemType = event.item.type;
|
|
784
|
+
const responseId = event.response_id;
|
|
785
|
+
if (itemType !== "message") {
|
|
786
|
+
this.emitGenerationEvent(responseId);
|
|
787
|
+
this.textModeRecoveryRetries = 0;
|
|
788
|
+
return;
|
|
789
|
+
}
|
|
692
790
|
}
|
|
693
|
-
|
|
694
|
-
|
|
791
|
+
handleConversationItemCreated(event) {
|
|
792
|
+
if (!event.item.id) {
|
|
793
|
+
throw new Error("item.id is not set");
|
|
794
|
+
}
|
|
795
|
+
try {
|
|
796
|
+
this.remoteChatCtx.insert(event.previous_item_id, openAIItemToLivekitItem(event.item));
|
|
797
|
+
} catch (error) {
|
|
798
|
+
this.#logger.error({ error, itemId: event.item.id }, "failed to insert conversation item");
|
|
799
|
+
}
|
|
800
|
+
const fut = this.itemCreateFutures[event.item.id];
|
|
801
|
+
if (fut) {
|
|
802
|
+
fut.resolve();
|
|
803
|
+
delete this.itemCreateFutures[event.item.id];
|
|
804
|
+
}
|
|
695
805
|
}
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
806
|
+
handleConversationItemDeleted(event) {
|
|
807
|
+
if (!event.item_id) {
|
|
808
|
+
throw new Error("item_id is not set");
|
|
809
|
+
}
|
|
810
|
+
try {
|
|
811
|
+
this.remoteChatCtx.delete(event.item_id);
|
|
812
|
+
} catch (error) {
|
|
813
|
+
this.#logger.error({ error, itemId: event.item_id }, "failed to delete conversation item");
|
|
814
|
+
}
|
|
815
|
+
const fut = this.itemDeleteFutures[event.item_id];
|
|
816
|
+
if (fut) {
|
|
817
|
+
fut.resolve();
|
|
818
|
+
delete this.itemDeleteFutures[event.item_id];
|
|
819
|
+
}
|
|
820
|
+
}
|
|
821
|
+
handleConversationItemInputAudioTranscriptionCompleted(event) {
|
|
822
|
+
const remoteItem = this.remoteChatCtx.get(event.item_id);
|
|
823
|
+
if (!remoteItem) {
|
|
824
|
+
return;
|
|
825
|
+
}
|
|
826
|
+
const item = remoteItem.item;
|
|
827
|
+
if (item instanceof import_agents.llm.ChatMessage) {
|
|
828
|
+
item.content.push(event.transcript);
|
|
829
|
+
} else {
|
|
830
|
+
throw new Error("item is not a chat message");
|
|
831
|
+
}
|
|
832
|
+
this.emit("input_audio_transcription_completed", {
|
|
833
|
+
itemId: event.item_id,
|
|
834
|
+
transcript: event.transcript,
|
|
835
|
+
isFinal: true
|
|
699
836
|
});
|
|
700
837
|
}
|
|
701
|
-
|
|
702
|
-
|
|
838
|
+
handleConversationItemInputAudioTranscriptionFailed(event) {
|
|
839
|
+
this.#logger.error(
|
|
840
|
+
{ error: event.error },
|
|
841
|
+
"OpenAI Realtime API failed to transcribe input audio"
|
|
842
|
+
);
|
|
703
843
|
}
|
|
704
|
-
|
|
705
|
-
this.
|
|
706
|
-
|
|
707
|
-
}
|
|
844
|
+
handleResponseContentPartAdded(event) {
|
|
845
|
+
if (!this.currentGeneration) {
|
|
846
|
+
throw new Error("currentGeneration is not set");
|
|
847
|
+
}
|
|
848
|
+
const itemId = event.item_id;
|
|
849
|
+
const itemType = event.part.type;
|
|
850
|
+
const responseId = event.response_id;
|
|
851
|
+
if (itemType === "audio") {
|
|
852
|
+
this.emitGenerationEvent(responseId);
|
|
853
|
+
if (this.textModeRecoveryRetries > 0) {
|
|
854
|
+
this.#logger.info(
|
|
855
|
+
{ retries: this.textModeRecoveryRetries },
|
|
856
|
+
"recovered from text-only response"
|
|
857
|
+
);
|
|
858
|
+
this.textModeRecoveryRetries = 0;
|
|
859
|
+
}
|
|
860
|
+
const itemGeneration = {
|
|
861
|
+
messageId: itemId,
|
|
862
|
+
textChannel: import_agents.stream.createStreamChannel(),
|
|
863
|
+
audioChannel: import_agents.stream.createStreamChannel(),
|
|
864
|
+
audioTranscript: ""
|
|
865
|
+
};
|
|
866
|
+
this.currentGeneration.messageChannel.write({
|
|
867
|
+
messageId: itemId,
|
|
868
|
+
textStream: itemGeneration.textChannel.stream(),
|
|
869
|
+
audioStream: itemGeneration.audioChannel.stream()
|
|
870
|
+
});
|
|
871
|
+
this.currentGeneration.messages.set(itemId, itemGeneration);
|
|
872
|
+
this.currentGeneration._firstTokenTimestamp = Date.now();
|
|
873
|
+
return;
|
|
874
|
+
} else {
|
|
875
|
+
this.interrupt();
|
|
876
|
+
if (this.textModeRecoveryRetries === 0) {
|
|
877
|
+
this.#logger.warn({ responseId }, "received text-only response from OpenAI Realtime API");
|
|
878
|
+
}
|
|
879
|
+
}
|
|
708
880
|
}
|
|
709
|
-
|
|
710
|
-
|
|
881
|
+
handleResponseContentPartDone(event) {
|
|
882
|
+
if (event.part.type !== "text") {
|
|
883
|
+
return;
|
|
884
|
+
}
|
|
885
|
+
if (!this.currentGeneration) {
|
|
886
|
+
throw new Error("currentGeneration is not set");
|
|
887
|
+
}
|
|
711
888
|
}
|
|
712
|
-
|
|
713
|
-
|
|
889
|
+
handleResponseAudioTranscriptDelta(event) {
|
|
890
|
+
if (!this.currentGeneration) {
|
|
891
|
+
throw new Error("currentGeneration is not set");
|
|
892
|
+
}
|
|
893
|
+
const itemId = event.item_id;
|
|
894
|
+
const delta = event.delta;
|
|
895
|
+
const itemGeneration = this.currentGeneration.messages.get(itemId);
|
|
896
|
+
if (!itemGeneration) {
|
|
897
|
+
throw new Error("itemGeneration is not set");
|
|
898
|
+
} else {
|
|
899
|
+
itemGeneration.textChannel.write(delta);
|
|
900
|
+
itemGeneration.audioTranscript += delta;
|
|
901
|
+
}
|
|
714
902
|
}
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
903
|
+
handleResponseAudioDelta(event) {
|
|
904
|
+
if (!this.currentGeneration) {
|
|
905
|
+
throw new Error("currentGeneration is not set");
|
|
906
|
+
}
|
|
907
|
+
const itemGeneration = this.currentGeneration.messages.get(event.item_id);
|
|
908
|
+
if (!itemGeneration) {
|
|
909
|
+
throw new Error("itemGeneration is not set");
|
|
910
|
+
}
|
|
911
|
+
const binaryString = atob(event.delta);
|
|
912
|
+
const len = binaryString.length;
|
|
913
|
+
const bytes = new Uint8Array(len);
|
|
914
|
+
for (let i = 0; i < len; i++) {
|
|
915
|
+
bytes[i] = binaryString.charCodeAt(i);
|
|
916
|
+
}
|
|
917
|
+
itemGeneration.audioChannel.write(
|
|
918
|
+
new import_rtc_node.AudioFrame(
|
|
919
|
+
new Int16Array(bytes.buffer),
|
|
920
|
+
api_proto.SAMPLE_RATE,
|
|
921
|
+
api_proto.NUM_CHANNELS,
|
|
922
|
+
bytes.length / 2
|
|
923
|
+
)
|
|
924
|
+
);
|
|
721
925
|
}
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
itemId: event.item_id,
|
|
727
|
-
message: error.message
|
|
728
|
-
});
|
|
926
|
+
handleResponseAudioTranscriptDone(_event) {
|
|
927
|
+
if (!this.currentGeneration) {
|
|
928
|
+
throw new Error("currentGeneration is not set");
|
|
929
|
+
}
|
|
729
930
|
}
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
const
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
doneFut,
|
|
746
|
-
createdTimestamp: Date.now()
|
|
747
|
-
};
|
|
748
|
-
this.#pendingResponses[newResponse.id] = newResponse;
|
|
749
|
-
this.emit("response_created", newResponse);
|
|
750
|
-
}
|
|
751
|
-
#handleResponseDone(event) {
|
|
752
|
-
const responseData = event.response;
|
|
753
|
-
const responseId = responseData.id;
|
|
754
|
-
const response = this.#pendingResponses[responseId];
|
|
755
|
-
response.status = responseData.status;
|
|
756
|
-
response.statusDetails = responseData.status_details;
|
|
757
|
-
response.usage = responseData.usage ?? null;
|
|
758
|
-
this.#pendingResponses[responseId] = response;
|
|
759
|
-
response.doneFut.resolve();
|
|
760
|
-
let metricsError;
|
|
761
|
-
let cancelled = false;
|
|
762
|
-
switch (response.status) {
|
|
763
|
-
case "failed": {
|
|
764
|
-
if (response.statusDetails.type !== "failed") break;
|
|
765
|
-
const err = response.statusDetails.error;
|
|
766
|
-
metricsError = new import_agents.metrics.MultimodalLLMError({
|
|
767
|
-
type: response.statusDetails.type,
|
|
768
|
-
code: err == null ? void 0 : err.code,
|
|
769
|
-
message: err == null ? void 0 : err.message
|
|
770
|
-
});
|
|
771
|
-
this.#logger.child({ code: err == null ? void 0 : err.code, error: err == null ? void 0 : err.message }).error("response generation failed");
|
|
772
|
-
break;
|
|
773
|
-
}
|
|
774
|
-
case "incomplete": {
|
|
775
|
-
if (response.statusDetails.type !== "incomplete") break;
|
|
776
|
-
const reason = response.statusDetails.reason;
|
|
777
|
-
metricsError = new import_agents.metrics.MultimodalLLMError({
|
|
778
|
-
type: response.statusDetails.type,
|
|
779
|
-
reason
|
|
780
|
-
});
|
|
781
|
-
this.#logger.child({ reason }).error("response generation incomplete");
|
|
782
|
-
break;
|
|
931
|
+
handleResponseAudioDone(_event) {
|
|
932
|
+
if (!this.currentGeneration) {
|
|
933
|
+
throw new Error("currentGeneration is not set");
|
|
934
|
+
}
|
|
935
|
+
}
|
|
936
|
+
handleResponseOutputItemDone(event) {
|
|
937
|
+
if (!this.currentGeneration) {
|
|
938
|
+
throw new Error("currentGeneration is not set");
|
|
939
|
+
}
|
|
940
|
+
const itemId = event.item.id;
|
|
941
|
+
const itemType = event.item.type;
|
|
942
|
+
if (itemType === "function_call") {
|
|
943
|
+
const item = event.item;
|
|
944
|
+
if (!item.call_id || !item.name || !item.arguments) {
|
|
945
|
+
throw new Error("item is not a function call");
|
|
783
946
|
}
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
947
|
+
this.currentGeneration.functionChannel.write({
|
|
948
|
+
callId: item.call_id,
|
|
949
|
+
name: item.name,
|
|
950
|
+
args: item.arguments
|
|
951
|
+
});
|
|
952
|
+
} else if (itemType === "message") {
|
|
953
|
+
const itemGeneration = this.currentGeneration.messages.get(itemId);
|
|
954
|
+
if (!itemGeneration) {
|
|
955
|
+
return;
|
|
787
956
|
}
|
|
957
|
+
itemGeneration.textChannel.close();
|
|
958
|
+
itemGeneration.audioChannel.close();
|
|
788
959
|
}
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
960
|
+
}
|
|
961
|
+
handleResponseDone(_event) {
|
|
962
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _i, _j, _k, _l;
|
|
963
|
+
if (!this.currentGeneration) {
|
|
964
|
+
return;
|
|
793
965
|
}
|
|
794
|
-
const
|
|
795
|
-
const
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
966
|
+
const createdTimestamp = this.currentGeneration._createdTimestamp;
|
|
967
|
+
const firstTokenTimestamp = this.currentGeneration._firstTokenTimestamp;
|
|
968
|
+
this.#logger.debug(
|
|
969
|
+
{
|
|
970
|
+
messageCount: this.currentGeneration.messages.size
|
|
971
|
+
},
|
|
972
|
+
"Closing generation channels in handleResponseDone"
|
|
973
|
+
);
|
|
974
|
+
for (const generation of this.currentGeneration.messages.values()) {
|
|
975
|
+
generation.textChannel.close();
|
|
976
|
+
generation.audioChannel.close();
|
|
977
|
+
}
|
|
978
|
+
this.currentGeneration.functionChannel.close();
|
|
979
|
+
this.currentGeneration.messageChannel.close();
|
|
980
|
+
for (const itemId of this.currentGeneration.messages.keys()) {
|
|
981
|
+
const remoteItem = this.remoteChatCtx.get(itemId);
|
|
982
|
+
if (remoteItem && remoteItem.item instanceof import_agents.llm.ChatMessage) {
|
|
983
|
+
remoteItem.item.content.push(this.currentGeneration.messages.get(itemId).audioTranscript);
|
|
984
|
+
}
|
|
985
|
+
}
|
|
986
|
+
this.currentGeneration._doneFut.resolve();
|
|
987
|
+
this.currentGeneration = void 0;
|
|
988
|
+
const usage = _event.response.usage;
|
|
989
|
+
const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
|
|
990
|
+
const duration = (Date.now() - createdTimestamp) / 1e3;
|
|
991
|
+
const realtimeMetrics = {
|
|
992
|
+
type: "realtime_model_metrics",
|
|
993
|
+
timestamp: createdTimestamp / 1e3,
|
|
994
|
+
// Convert to seconds
|
|
995
|
+
requestId: _event.response.id || "",
|
|
799
996
|
ttft,
|
|
800
997
|
duration,
|
|
801
|
-
cancelled,
|
|
802
|
-
label:
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
totalTokens: (usage == null ? void 0 : usage.total_tokens)
|
|
806
|
-
tokensPerSecond: ((usage == null ? void 0 : usage.output_tokens)
|
|
807
|
-
error: metricsError,
|
|
998
|
+
cancelled: _event.response.status === "cancelled",
|
|
999
|
+
label: "openai_realtime",
|
|
1000
|
+
inputTokens: (usage == null ? void 0 : usage.input_tokens) ?? 0,
|
|
1001
|
+
outputTokens: (usage == null ? void 0 : usage.output_tokens) ?? 0,
|
|
1002
|
+
totalTokens: (usage == null ? void 0 : usage.total_tokens) ?? 0,
|
|
1003
|
+
tokensPerSecond: duration > 0 ? ((usage == null ? void 0 : usage.output_tokens) ?? 0) / duration : 0,
|
|
808
1004
|
inputTokenDetails: {
|
|
809
|
-
|
|
810
|
-
textTokens: (usage == null ? void 0 : usage.input_token_details.text_tokens)
|
|
811
|
-
|
|
1005
|
+
audioTokens: ((_a = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _a.audio_tokens) ?? 0,
|
|
1006
|
+
textTokens: ((_b = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _b.text_tokens) ?? 0,
|
|
1007
|
+
imageTokens: 0,
|
|
1008
|
+
// Not supported yet
|
|
1009
|
+
cachedTokens: ((_c = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _c.cached_tokens) ?? 0,
|
|
1010
|
+
cachedTokensDetails: ((_d = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _d.cached_tokens_details) ? {
|
|
1011
|
+
audioTokens: ((_f = (_e = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _e.cached_tokens_details) == null ? void 0 : _f.audio_tokens) ?? 0,
|
|
1012
|
+
textTokens: ((_h = (_g = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _g.cached_tokens_details) == null ? void 0 : _h.text_tokens) ?? 0,
|
|
1013
|
+
imageTokens: ((_j = (_i = usage == null ? void 0 : usage.input_token_details) == null ? void 0 : _i.cached_tokens_details) == null ? void 0 : _j.image_tokens) ?? 0
|
|
1014
|
+
} : void 0
|
|
812
1015
|
},
|
|
813
1016
|
outputTokenDetails: {
|
|
814
|
-
textTokens: (usage == null ? void 0 : usage.output_token_details.text_tokens)
|
|
815
|
-
audioTokens: (usage == null ? void 0 : usage.output_token_details.audio_tokens)
|
|
1017
|
+
textTokens: ((_k = usage == null ? void 0 : usage.output_token_details) == null ? void 0 : _k.text_tokens) ?? 0,
|
|
1018
|
+
audioTokens: ((_l = usage == null ? void 0 : usage.output_token_details) == null ? void 0 : _l.audio_tokens) ?? 0,
|
|
1019
|
+
imageTokens: 0
|
|
816
1020
|
}
|
|
817
1021
|
};
|
|
818
|
-
this.emit("metrics_collected",
|
|
1022
|
+
this.emit("metrics_collected", realtimeMetrics);
|
|
819
1023
|
}
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
}
|
|
833
|
-
const newOutput = {
|
|
834
|
-
responseId,
|
|
835
|
-
itemId: itemData.id,
|
|
836
|
-
outputIndex: event.output_index,
|
|
837
|
-
type: itemData.type,
|
|
838
|
-
role,
|
|
839
|
-
content: [],
|
|
840
|
-
doneFut: new import_agents.Future()
|
|
841
|
-
};
|
|
842
|
-
response == null ? void 0 : response.output.push(newOutput);
|
|
843
|
-
this.emit("response_output_added", newOutput);
|
|
1024
|
+
handleError(event) {
|
|
1025
|
+
if (event.error.message.startsWith("Cancellation failed")) {
|
|
1026
|
+
return;
|
|
1027
|
+
}
|
|
1028
|
+
this.#logger.error({ error: event.error }, "OpenAI Realtime API returned an error");
|
|
1029
|
+
this.emitError({
|
|
1030
|
+
error: new import_agents.APIError(event.error.message, {
|
|
1031
|
+
body: event.error,
|
|
1032
|
+
retryable: true
|
|
1033
|
+
}),
|
|
1034
|
+
recoverable: true
|
|
1035
|
+
});
|
|
844
1036
|
}
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
callId: item.call_id
|
|
866
|
-
});
|
|
867
|
-
const parsedArgs = JSON.parse(item.arguments);
|
|
868
|
-
this.#logger.debug(
|
|
869
|
-
`[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`
|
|
870
|
-
);
|
|
871
|
-
func.execute(parsedArgs).then(
|
|
872
|
-
(content) => {
|
|
873
|
-
this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
|
|
874
|
-
this.emit("function_call_completed", {
|
|
875
|
-
callId: item.call_id
|
|
876
|
-
});
|
|
877
|
-
this.conversation.item.create(
|
|
878
|
-
import_agents.llm.ChatMessage.createToolFromFunctionResult({
|
|
879
|
-
name: item.name,
|
|
880
|
-
toolCallId: item.call_id,
|
|
881
|
-
result: content
|
|
882
|
-
}),
|
|
883
|
-
output.itemId
|
|
884
|
-
);
|
|
885
|
-
this.response.create();
|
|
886
|
-
},
|
|
887
|
-
(error) => {
|
|
888
|
-
this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
|
|
889
|
-
this.emit("function_call_failed", {
|
|
890
|
-
callId: item.call_id
|
|
891
|
-
});
|
|
892
|
-
}
|
|
893
|
-
);
|
|
1037
|
+
emitError({ error, recoverable }) {
  // Surface a failure to listeners as a structured "error" event payload.
  const payload = {
    timestamp: Date.now(),
    // TODO(brian): add label
    label: "",
    error,
    recoverable
  };
  this.emit("error", payload);
}
|
|
1046
|
+
*resampleAudio(frame) {
|
|
1047
|
+
yield frame;
|
|
1048
|
+
}
|
|
1049
|
+
createResponse({
|
|
1050
|
+
userInitiated,
|
|
1051
|
+
instructions,
|
|
1052
|
+
oldHandle
|
|
1053
|
+
}) {
|
|
1054
|
+
const handle = oldHandle || new CreateResponseHandle({ instructions });
|
|
1055
|
+
if (oldHandle && instructions) {
|
|
1056
|
+
handle.instructions = instructions;
|
|
894
1057
|
}
|
|
895
|
-
|
|
896
|
-
|
|
1058
|
+
const eventId = (0, import_agents.shortuuid)("response_create_");
|
|
1059
|
+
if (userInitiated) {
|
|
1060
|
+
this.responseCreatedFutures[eventId] = handle;
|
|
1061
|
+
}
|
|
1062
|
+
const response = {};
|
|
1063
|
+
if (instructions) response.instructions = instructions;
|
|
1064
|
+
if (userInitiated) response.metadata = { client_event_id: eventId };
|
|
1065
|
+
this.sendEvent({
|
|
1066
|
+
type: "response.create",
|
|
1067
|
+
event_id: eventId,
|
|
1068
|
+
response: Object.keys(response).length > 0 ? response : void 0
|
|
1069
|
+
});
|
|
1070
|
+
return handle;
|
|
897
1071
|
}
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
const
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
responseId,
|
|
907
|
-
itemId: event.item_id,
|
|
908
|
-
outputIndex,
|
|
909
|
-
contentIndex: event.content_index,
|
|
910
|
-
text: "",
|
|
911
|
-
audio: [],
|
|
912
|
-
textStream,
|
|
913
|
-
audioStream,
|
|
914
|
-
toolCalls: [],
|
|
915
|
-
contentType: event.part.type
|
|
1072
|
+
emitGenerationEvent(responseId) {
|
|
1073
|
+
if (!this.currentGeneration) {
|
|
1074
|
+
throw new Error("currentGeneration is not set");
|
|
1075
|
+
}
|
|
1076
|
+
const generation_ev = {
|
|
1077
|
+
messageStream: this.currentGeneration.messageChannel.stream(),
|
|
1078
|
+
functionStream: this.currentGeneration.functionChannel.stream(),
|
|
1079
|
+
userInitiated: false
|
|
916
1080
|
};
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
const content = this.#getContent(event);
|
|
930
|
-
content.text = event.text;
|
|
931
|
-
this.emit("response_text_done", event);
|
|
932
|
-
}
|
|
933
|
-
#handleResponseAudioTranscriptDelta(event) {
|
|
934
|
-
const content = this.#getContent(event);
|
|
935
|
-
const transcript = event.delta;
|
|
936
|
-
content.text += transcript;
|
|
937
|
-
content.textStream.put(transcript);
|
|
938
|
-
}
|
|
939
|
-
#handleResponseAudioTranscriptDone(event) {
|
|
940
|
-
const content = this.#getContent(event);
|
|
941
|
-
content.textStream.close();
|
|
942
|
-
}
|
|
943
|
-
#handleResponseAudioDelta(event) {
|
|
944
|
-
const content = this.#getContent(event);
|
|
945
|
-
const data = Buffer.from(event.delta, "base64");
|
|
946
|
-
const audio = new import_rtc_node.AudioFrame(
|
|
947
|
-
new Int16Array(data.buffer),
|
|
948
|
-
api_proto.SAMPLE_RATE,
|
|
949
|
-
api_proto.NUM_CHANNELS,
|
|
950
|
-
data.length / 2
|
|
951
|
-
);
|
|
952
|
-
content.audio.push(audio);
|
|
953
|
-
content.audioStream.put(audio);
|
|
1081
|
+
const handle = this.responseCreatedFutures[responseId];
|
|
1082
|
+
if (handle) {
|
|
1083
|
+
delete this.responseCreatedFutures[responseId];
|
|
1084
|
+
generation_ev.userInitiated = true;
|
|
1085
|
+
if (handle.doneFut.done) {
|
|
1086
|
+
this.#logger.warn({ responseId }, "response received after timeout");
|
|
1087
|
+
} else {
|
|
1088
|
+
handle.doneFut.resolve(generation_ev);
|
|
1089
|
+
}
|
|
1090
|
+
}
|
|
1091
|
+
this.#logger.debug({ responseId }, "Emitting generation_created event");
|
|
1092
|
+
this.emit("generation_created", generation_ev);
|
|
954
1093
|
}
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
1094
|
+
}
|
|
1095
|
+
function livekitItemToOpenAIItem(item) {
  // Translate a LiveKit chat item into the wire shape expected by the OpenAI
  // Realtime API. Image content is dropped (not supported), and audio content
  // is only forwarded for user messages. Unknown item types yield undefined,
  // matching the original switch fall-through.
  if (item.type === "function_call") {
    return {
      id: item.id,
      type: "function_call",
      call_id: item.callId,
      name: item.name,
      arguments: item.args
    };
  }
  if (item.type === "function_call_output") {
    return {
      id: item.id,
      type: "function_call_output",
      call_id: item.callId,
      output: item.output
    };
  }
  if (item.type === "message") {
    // OpenAI has no "developer" role; fold it into "system".
    const role = item.role === "developer" ? "system" : item.role;
    const contentList = [];
    for (const part of item.content) {
      if (typeof part === "string") {
        // Assistant text uses "text"; every other role sends "input_text".
        contentList.push({
          type: role === "assistant" ? "text" : "input_text",
          text: part
        });
      } else if (part.type === "audio_content" && role === "user") {
        // Raw PCM frames are combined and base64-encoded for transport.
        const encodedAudio = Buffer.from((0, import_rtc_node.combineAudioFrames)(part.frame).data).toString("base64");
        contentList.push({
          type: "input_audio",
          audio: encodedAudio
        });
      }
      // image_content (and non-user audio) is intentionally skipped.
    }
    return {
      id: item.id,
      type: "message",
      role,
      content: contentList
    };
  }
}
|
|
1141
|
+
function openAIItemToLivekitItem(item) {
  // Convert an OpenAI Realtime conversation item back into a LiveKit chat
  // item. Only text parts of messages survive the round trip; audio/image
  // parts are dropped here. Unknown types yield undefined, matching the
  // original switch fall-through.
  if (!item.id) {
    throw new Error("item.id is not set");
  }
  if (item.type === "function_call") {
    return import_agents.llm.FunctionCall.create({
      id: item.id,
      callId: item.call_id,
      name: item.name,
      args: item.arguments
    });
  }
  if (item.type === "function_call_output") {
    return import_agents.llm.FunctionCallOutput.create({
      id: item.id,
      callId: item.call_id,
      output: item.output,
      isError: false
    });
  }
  if (item.type === "message") {
    // item.content may be a single part or an array; normalize first.
    const parts = Array.isArray(item.content) ? item.content : [item.content];
    const textParts = [];
    for (const part of parts) {
      if (part.type === "text" || part.type === "input_text") {
        textParts.push(part.text);
      }
    }
    return import_agents.llm.ChatMessage.create({
      id: item.id,
      role: item.role,
      content: textParts
    });
  }
}
|
|
1175
|
+
function createMockAudioItem(durationSeconds = 2) {
  // Build a silent user audio chat item of `durationSeconds` seconds, used as
  // placeholder audio input.
  //
  // PCM16 audio stores 2 bytes per sample, so `durationSeconds` seconds of
  // mono audio at SAMPLE_RATE needs durationSeconds * SAMPLE_RATE * 2 bytes.
  // The previous allocation of durationSeconds * SAMPLE_RATE bytes produced a
  // frame of only half the requested duration (samplesPerChannel below is
  // bytes / 2). NOTE(review): assumes NUM_CHANNELS is 1, as the original
  // samplesPerChannel computation did — confirm against api_proto.
  const audioData = Buffer.alloc(durationSeconds * SAMPLE_RATE * 2);
  return import_agents.llm.ChatMessage.create({
    id: (0, import_agents.shortuuid)(MOCK_AUDIO_ID_PREFIX),
    role: "user",
    content: [
      {
        type: "audio_content",
        frame: [
          new import_rtc_node.AudioFrame(
            new Int16Array(audioData.buffer),
            SAMPLE_RATE,
            NUM_CHANNELS,
            audioData.length / 2
          )
        ]
      }
    ]
  });
}
|
|
1195
|
+
function toOaiToolChoice(toolChoice) {
  // Map a LiveKit tool-choice value onto the OpenAI Realtime representation:
  // strings pass through unchanged, a specific function selection collapses
  // to its function name, and anything else (null/undefined/unknown) defaults
  // to "auto".
  if (typeof toolChoice === "string") {
    return toolChoice;
  }
  const isFunctionChoice = toolChoice?.type === "function";
  return isFunctionChoice ? toolChoice.function.name : "auto";
}
|
|
967
1204
|
// Annotate the CommonJS export names for ESM import in node:
|
|
968
1205
|
0 && (module.exports = {
|