@livekit/agents-plugin-openai 0.9.3 → 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +16 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -4
- package/dist/index.d.ts +4 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -3
- package/dist/index.js.map +1 -1
- package/dist/llm.cjs +156 -197
- package/dist/llm.cjs.map +1 -1
- package/dist/llm.d.cts +27 -8
- package/dist/llm.d.ts +27 -8
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +164 -188
- package/dist/llm.js.map +1 -1
- package/dist/models.cjs +14 -0
- package/dist/models.cjs.map +1 -1
- package/dist/models.d.cts +11 -6
- package/dist/models.d.ts +11 -6
- package/dist/models.d.ts.map +1 -1
- package/dist/models.js +6 -0
- package/dist/models.js.map +1 -1
- package/dist/realtime/api_proto.cjs.map +1 -1
- package/dist/realtime/api_proto.d.cts +15 -0
- package/dist/realtime/api_proto.d.ts +15 -0
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +1057 -820
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.cts +126 -160
- package/dist/realtime/realtime_model.d.ts +126 -160
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +1067 -825
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/tts.cjs +5 -5
- package/dist/tts.cjs.map +1 -1
- package/dist/tts.d.cts +2 -1
- package/dist/tts.d.ts +2 -1
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +6 -6
- package/dist/tts.js.map +1 -1
- package/package.json +9 -7
- package/src/index.ts +19 -5
- package/src/llm.ts +227 -228
- package/src/models.ts +83 -5
- package/src/realtime/api_proto.ts +15 -1
- package/src/realtime/realtime_model.ts +1305 -996
- package/src/tts.ts +6 -6
|
@@ -1,561 +1,644 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { metrics } from '@livekit/agents';
|
|
4
5
|
import {
|
|
5
|
-
|
|
6
|
+
type APIConnectOptions,
|
|
7
|
+
APIConnectionError,
|
|
8
|
+
APIError,
|
|
9
|
+
AudioByteStream,
|
|
10
|
+
DEFAULT_API_CONNECT_OPTIONS,
|
|
6
11
|
Future,
|
|
7
12
|
Queue,
|
|
13
|
+
Task,
|
|
14
|
+
cancelAndWait,
|
|
15
|
+
isAPIError,
|
|
8
16
|
llm,
|
|
9
17
|
log,
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
multimodal,
|
|
18
|
+
shortuuid,
|
|
19
|
+
stream,
|
|
13
20
|
} from '@livekit/agents';
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import {
|
|
21
|
+
import { Mutex } from '@livekit/mutex';
|
|
22
|
+
import type { AudioResampler } from '@livekit/rtc-node';
|
|
23
|
+
import { AudioFrame, combineAudioFrames } from '@livekit/rtc-node';
|
|
24
|
+
import { delay } from '@std/async';
|
|
25
|
+
import type { GenerationCreatedEvent } from 'agents/dist/llm/realtime.js';
|
|
26
|
+
import { type MessageEvent, WebSocket } from 'ws';
|
|
17
27
|
import * as api_proto from './api_proto.js';
|
|
18
28
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
29
|
+
const SAMPLE_RATE = 24000;
|
|
30
|
+
const NUM_CHANNELS = 1;
|
|
31
|
+
const BASE_URL = 'https://api.openai.com/v1';
|
|
32
|
+
|
|
33
|
+
const MOCK_AUDIO_ID_PREFIX = 'lk_mock_audio_item_';
|
|
34
|
+
|
|
35
|
+
interface RealtimeOptions {
|
|
36
|
+
model: api_proto.Model;
|
|
22
37
|
voice: api_proto.Voice;
|
|
23
|
-
inputAudioFormat: api_proto.AudioFormat;
|
|
24
|
-
outputAudioFormat: api_proto.AudioFormat;
|
|
25
|
-
inputAudioTranscription: api_proto.InputAudioTranscription | null;
|
|
26
|
-
turnDetection: api_proto.TurnDetectionType | null;
|
|
27
38
|
temperature: number;
|
|
28
|
-
|
|
29
|
-
|
|
39
|
+
toolChoice?: llm.ToolChoice;
|
|
40
|
+
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
|
|
41
|
+
// TODO(shubhra): add inputAudioNoiseReduction
|
|
42
|
+
turnDetection?: api_proto.TurnDetectionType | null;
|
|
43
|
+
maxResponseOutputTokens?: number | 'inf';
|
|
44
|
+
speed?: number;
|
|
45
|
+
// TODO(shubhra): add openai tracing options
|
|
30
46
|
apiKey?: string;
|
|
31
47
|
baseURL: string;
|
|
32
48
|
isAzure: boolean;
|
|
49
|
+
azureDeployment?: string;
|
|
33
50
|
entraToken?: string;
|
|
34
51
|
apiVersion?: string;
|
|
52
|
+
maxSessionDuration: number;
|
|
53
|
+
// reset the connection after this many milliseconds if provided
|
|
54
|
+
connOptions: APIConnectOptions;
|
|
35
55
|
}
|
|
36
56
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
output: RealtimeOutput[];
|
|
43
|
-
doneFut: Future;
|
|
44
|
-
createdTimestamp: number;
|
|
45
|
-
firstTokenTimestamp?: number;
|
|
57
|
+
interface MessageGeneration {
|
|
58
|
+
messageId: string;
|
|
59
|
+
textChannel: stream.StreamChannel<string>;
|
|
60
|
+
audioChannel: stream.StreamChannel<AudioFrame>;
|
|
61
|
+
audioTranscript: string;
|
|
46
62
|
}
|
|
47
63
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
64
|
+
interface ResponseGeneration {
|
|
65
|
+
messageChannel: stream.StreamChannel<llm.MessageGeneration>;
|
|
66
|
+
functionChannel: stream.StreamChannel<llm.FunctionCall>;
|
|
67
|
+
messages: Map<string, MessageGeneration>;
|
|
68
|
+
|
|
69
|
+
/** @internal */
|
|
70
|
+
_doneFut: Future;
|
|
71
|
+
/** @internal */
|
|
72
|
+
_createdTimestamp: number;
|
|
73
|
+
/** @internal */
|
|
74
|
+
_firstTokenTimestamp?: number;
|
|
56
75
|
}
|
|
57
76
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
audioStream: AsyncIterableQueue<AudioFrame>;
|
|
67
|
-
toolCalls: RealtimeToolCall[];
|
|
68
|
-
contentType: api_proto.Modality;
|
|
77
|
+
class CreateResponseHandle {
|
|
78
|
+
instructions?: string;
|
|
79
|
+
doneFut: Future<llm.GenerationCreatedEvent>;
|
|
80
|
+
// TODO(shubhra): add timeout
|
|
81
|
+
constructor({ instructions }: { instructions?: string }) {
|
|
82
|
+
this.instructions = instructions;
|
|
83
|
+
this.doneFut = new Future();
|
|
84
|
+
}
|
|
69
85
|
}
|
|
70
86
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
87
|
+
// default values taken from a "default" session of the OpenAI Realtime API
|
|
88
|
+
const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
|
|
89
|
+
const DEFAULT_TEMPERATURE = 0.8;
|
|
90
|
+
const DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
|
|
91
|
+
type: 'server_vad',
|
|
92
|
+
threshold: 0.5,
|
|
93
|
+
prefix_padding_ms: 300,
|
|
94
|
+
silence_duration_ms: 200,
|
|
95
|
+
create_response: true,
|
|
96
|
+
interrupt_response: true,
|
|
97
|
+
};
|
|
98
|
+
const DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
|
|
99
|
+
model: 'gpt-4o-mini-transcribe',
|
|
100
|
+
};
|
|
101
|
+
const DEFAULT_TOOL_CHOICE: llm.ToolChoice = 'auto';
|
|
102
|
+
const DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS: number | 'inf' = 'inf';
|
|
103
|
+
|
|
104
|
+
const AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
|
|
105
|
+
model: 'whisper-1',
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
const AZURE_DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
|
|
109
|
+
type: 'server_vad',
|
|
110
|
+
threshold: 0.5,
|
|
111
|
+
prefix_padding_ms: 300,
|
|
112
|
+
silence_duration_ms: 200,
|
|
113
|
+
create_response: true,
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1000; // 20 minutes
|
|
117
|
+
|
|
118
|
+
const DEFAULT_REALTIME_MODEL_OPTIONS = {
|
|
119
|
+
model: 'gpt-4o-realtime-preview',
|
|
120
|
+
voice: 'alloy',
|
|
121
|
+
temperature: DEFAULT_TEMPERATURE,
|
|
122
|
+
inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
|
|
123
|
+
turnDetection: DEFAULT_TURN_DETECTION,
|
|
124
|
+
toolChoice: DEFAULT_TOOL_CHOICE,
|
|
125
|
+
maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
|
|
126
|
+
maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
|
|
127
|
+
connOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
128
|
+
};
|
|
129
|
+
export class RealtimeModel extends llm.RealtimeModel {
|
|
130
|
+
sampleRate = api_proto.SAMPLE_RATE;
|
|
131
|
+
numChannels = api_proto.NUM_CHANNELS;
|
|
132
|
+
inFrameSize = api_proto.IN_FRAME_SIZE;
|
|
133
|
+
outFrameSize = api_proto.OUT_FRAME_SIZE;
|
|
76
134
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
transcript: string;
|
|
80
|
-
}
|
|
135
|
+
/** @internal */
|
|
136
|
+
_options: RealtimeOptions;
|
|
81
137
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
138
|
+
constructor(
|
|
139
|
+
options: {
|
|
140
|
+
model?: string;
|
|
141
|
+
voice?: string;
|
|
142
|
+
temperature?: number;
|
|
143
|
+
toolChoice?: llm.ToolChoice;
|
|
144
|
+
baseURL?: string;
|
|
145
|
+
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
|
|
146
|
+
// TODO(shubhra): add inputAudioNoiseReduction
|
|
147
|
+
turnDetection?: api_proto.TurnDetectionType | null;
|
|
148
|
+
speed?: number;
|
|
149
|
+
// TODO(shubhra): add openai tracing options
|
|
150
|
+
azureDeployment?: string;
|
|
151
|
+
apiKey?: string;
|
|
152
|
+
entraToken?: string;
|
|
153
|
+
apiVersion?: string;
|
|
154
|
+
maxSessionDuration?: number;
|
|
155
|
+
connOptions?: APIConnectOptions;
|
|
156
|
+
} = {},
|
|
157
|
+
) {
|
|
158
|
+
super({
|
|
159
|
+
messageTruncation: true,
|
|
160
|
+
turnDetection: options.turnDetection !== null,
|
|
161
|
+
userTranscription: options.inputAudioTranscription !== null,
|
|
162
|
+
autoToolReplyGeneration: false,
|
|
163
|
+
});
|
|
86
164
|
|
|
87
|
-
|
|
88
|
-
itemId: string;
|
|
89
|
-
}
|
|
165
|
+
const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
|
|
90
166
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
167
|
+
if (options.apiKey === '' && !isAzure) {
|
|
168
|
+
throw new Error(
|
|
169
|
+
'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
|
|
170
|
+
);
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
|
|
174
|
+
|
|
175
|
+
if (!apiKey && !isAzure) {
|
|
176
|
+
throw new Error(
|
|
177
|
+
'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
|
|
178
|
+
);
|
|
179
|
+
}
|
|
94
180
|
|
|
95
|
-
|
|
96
|
-
|
|
181
|
+
if (!options.baseURL && isAzure) {
|
|
182
|
+
const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
|
|
183
|
+
if (!azureEndpoint) {
|
|
184
|
+
throw new Error(
|
|
185
|
+
'Missing Azure endpoint. Please pass base_url or set AZURE_OPENAI_ENDPOINT environment variable.',
|
|
186
|
+
);
|
|
187
|
+
}
|
|
188
|
+
options.baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
|
|
189
|
+
}
|
|
97
190
|
|
|
98
|
-
|
|
99
|
-
|
|
191
|
+
this._options = {
|
|
192
|
+
...DEFAULT_REALTIME_MODEL_OPTIONS,
|
|
193
|
+
...options,
|
|
194
|
+
baseURL: options.baseURL || BASE_URL,
|
|
195
|
+
apiKey,
|
|
196
|
+
isAzure,
|
|
197
|
+
model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
|
|
198
|
+
};
|
|
100
199
|
}
|
|
101
200
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
201
|
+
/**
|
|
202
|
+
* Create a RealtimeModel instance configured for Azure OpenAI Service.
|
|
203
|
+
*
|
|
204
|
+
* @param azureDeployment - The name of your Azure OpenAI deployment.
|
|
205
|
+
* @param azureEndpoint - The endpoint URL for your Azure OpenAI resource. If undefined, will attempt to read from the environment variable AZURE_OPENAI_ENDPOINT.
|
|
206
|
+
* @param apiVersion - API version to use with Azure OpenAI Service. If undefined, will attempt to read from the environment variable OPENAI_API_VERSION.
|
|
207
|
+
* @param apiKey - Azure OpenAI API key. If undefined, will attempt to read from the environment variable AZURE_OPENAI_API_KEY.
|
|
208
|
+
* @param entraToken - Azure Entra authentication token. Required if not using API key authentication.
|
|
209
|
+
* @param baseURL - Base URL for the API endpoint. If undefined, constructed from `azureEndpoint`.
|
|
210
|
+
* @param voice - Voice setting for audio outputs. Defaults to "alloy".
|
|
211
|
+
* @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
|
|
212
|
+
* @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see AZURE_DEFAULT_TURN_DETECTION.
|
|
213
|
+
* @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
|
|
214
|
+
* @param speed - Speed of the audio output. Defaults to 1.0.
|
|
215
|
+
* @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
|
|
216
|
+
* @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
|
|
217
|
+
*
|
|
218
|
+
* @returns A RealtimeModel instance configured for Azure OpenAI Service.
|
|
219
|
+
*
|
|
220
|
+
* @throws Error if required Azure parameters are missing or invalid.
|
|
221
|
+
*/
|
|
222
|
+
static withAzure({
|
|
223
|
+
azureDeployment,
|
|
224
|
+
azureEndpoint,
|
|
225
|
+
apiVersion,
|
|
226
|
+
apiKey,
|
|
227
|
+
entraToken,
|
|
228
|
+
baseURL,
|
|
229
|
+
voice = 'alloy',
|
|
230
|
+
inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
|
|
231
|
+
turnDetection = AZURE_DEFAULT_TURN_DETECTION,
|
|
232
|
+
temperature = 0.8,
|
|
233
|
+
speed,
|
|
234
|
+
}: {
|
|
235
|
+
azureDeployment: string;
|
|
236
|
+
azureEndpoint?: string;
|
|
237
|
+
apiVersion?: string;
|
|
238
|
+
apiKey?: string;
|
|
239
|
+
entraToken?: string;
|
|
240
|
+
baseURL?: string;
|
|
241
|
+
voice?: string;
|
|
242
|
+
inputAudioTranscription?: api_proto.InputAudioTranscription;
|
|
243
|
+
// TODO(shubhra): add inputAudioNoiseReduction
|
|
244
|
+
turnDetection?: api_proto.TurnDetectionType;
|
|
245
|
+
temperature?: number;
|
|
246
|
+
speed?: number;
|
|
247
|
+
}) {
|
|
248
|
+
apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
|
|
249
|
+
if (!apiKey && !entraToken) {
|
|
250
|
+
throw new Error(
|
|
251
|
+
'Missing credentials. Please pass one of `apiKey`, `entraToken`, or the `AZURE_OPENAI_API_KEY` environment variable.',
|
|
252
|
+
);
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
apiVersion = apiVersion || process.env.OPENAI_API_VERSION;
|
|
256
|
+
if (!apiVersion) {
|
|
257
|
+
throw new Error(
|
|
258
|
+
'Must provide either the `apiVersion` argument or the `OPENAI_API_VERSION` environment variable',
|
|
259
|
+
);
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
if (!baseURL) {
|
|
263
|
+
azureEndpoint = azureEndpoint || process.env.AZURE_OPENAI_ENDPOINT;
|
|
264
|
+
if (!azureEndpoint) {
|
|
265
|
+
throw new Error(
|
|
266
|
+
'Missing Azure endpoint. Please pass the `azure_endpoint` parameter or set the `AZURE_OPENAI_ENDPOINT` environment variable.',
|
|
267
|
+
);
|
|
268
|
+
}
|
|
269
|
+
baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
return new RealtimeModel({
|
|
273
|
+
voice,
|
|
274
|
+
inputAudioTranscription,
|
|
275
|
+
turnDetection,
|
|
276
|
+
temperature,
|
|
277
|
+
speed,
|
|
278
|
+
apiKey,
|
|
279
|
+
azureDeployment,
|
|
280
|
+
apiVersion,
|
|
281
|
+
entraToken,
|
|
282
|
+
baseURL,
|
|
106
283
|
});
|
|
107
284
|
}
|
|
108
285
|
|
|
109
|
-
|
|
110
|
-
this
|
|
111
|
-
type: 'input_audio_buffer.clear',
|
|
112
|
-
});
|
|
286
|
+
session() {
|
|
287
|
+
return new RealtimeSession(this);
|
|
113
288
|
}
|
|
114
289
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
type: 'input_audio_buffer.commit',
|
|
118
|
-
});
|
|
290
|
+
async close() {
|
|
291
|
+
return;
|
|
119
292
|
}
|
|
120
293
|
}
|
|
121
294
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
295
|
+
function processBaseURL({
|
|
296
|
+
baseURL,
|
|
297
|
+
model,
|
|
298
|
+
isAzure = false,
|
|
299
|
+
azureDeployment,
|
|
300
|
+
apiVersion,
|
|
301
|
+
}: {
|
|
302
|
+
baseURL: string;
|
|
303
|
+
model: string;
|
|
304
|
+
isAzure: boolean;
|
|
305
|
+
azureDeployment?: string;
|
|
306
|
+
apiVersion?: string;
|
|
307
|
+
}): string {
|
|
308
|
+
const url = new URL([baseURL, 'realtime'].join('/'));
|
|
129
309
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
type: 'conversation.item.truncate',
|
|
133
|
-
item_id: itemId,
|
|
134
|
-
content_index: contentIndex,
|
|
135
|
-
audio_end_ms: audioEnd,
|
|
136
|
-
});
|
|
310
|
+
if (url.protocol === 'https:') {
|
|
311
|
+
url.protocol = 'wss:';
|
|
137
312
|
}
|
|
138
313
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
314
|
+
// ensure "/realtime" is appended if the path is empty, "/v1", or "/openai"
|
|
315
|
+
if (!url.pathname || ['', '/v1', '/openai'].includes(url.pathname.replace(/\/$/, ''))) {
|
|
316
|
+
url.pathname = url.pathname.replace(/\/$/, '') + '/realtime';
|
|
317
|
+
} else {
|
|
318
|
+
url.pathname = url.pathname.replace(/\/$/, '');
|
|
144
319
|
}
|
|
145
320
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
321
|
+
const queryParams: Record<string, string> = {};
|
|
322
|
+
if (isAzure) {
|
|
323
|
+
if (apiVersion) {
|
|
324
|
+
queryParams['api-version'] = apiVersion;
|
|
149
325
|
}
|
|
326
|
+
if (azureDeployment) {
|
|
327
|
+
queryParams['deployment'] = azureDeployment;
|
|
328
|
+
}
|
|
329
|
+
} else {
|
|
330
|
+
queryParams['model'] = model;
|
|
331
|
+
}
|
|
150
332
|
|
|
151
|
-
|
|
333
|
+
for (const [key, value] of Object.entries(queryParams)) {
|
|
334
|
+
url.searchParams.set(key, value);
|
|
335
|
+
}
|
|
152
336
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
throw new TypeError('message.content must be a string');
|
|
156
|
-
}
|
|
337
|
+
return url.toString();
|
|
338
|
+
}
|
|
157
339
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
340
|
+
/**
|
|
341
|
+
* A session for the OpenAI Realtime API.
|
|
342
|
+
*
|
|
343
|
+
* This class is used to interact with the OpenAI Realtime API.
|
|
344
|
+
* It is responsible for sending events to the OpenAI Realtime API and receiving events from it.
|
|
345
|
+
*
|
|
346
|
+
* It exposes two more events:
|
|
347
|
+
* - openai_server_event_received: expose the raw server events from the OpenAI Realtime API
|
|
348
|
+
* - openai_client_event_queued: expose the raw client events sent to the OpenAI Realtime API
|
|
349
|
+
*/
|
|
350
|
+
export class RealtimeSession extends llm.RealtimeSession {
|
|
351
|
+
private _tools: llm.ToolContext = {};
|
|
352
|
+
private remoteChatCtx: llm.RemoteChatContext = new llm.RemoteChatContext();
|
|
353
|
+
private messageChannel = new Queue<api_proto.ClientEvent>();
|
|
354
|
+
private inputResampler?: AudioResampler;
|
|
355
|
+
private instructions?: string;
|
|
356
|
+
private oaiRealtimeModel: RealtimeModel;
|
|
357
|
+
private currentGeneration?: ResponseGeneration;
|
|
358
|
+
private responseCreatedFutures: { [id: string]: CreateResponseHandle } = {};
|
|
359
|
+
|
|
360
|
+
private textModeRecoveryRetries: number = 0;
|
|
361
|
+
|
|
362
|
+
private itemCreateFutures: { [id: string]: Future } = {};
|
|
363
|
+
private itemDeleteFutures: { [id: string]: Future } = {};
|
|
364
|
+
|
|
365
|
+
private updateChatCtxLock = new Mutex();
|
|
366
|
+
private updateFuncCtxLock = new Mutex();
|
|
367
|
+
|
|
368
|
+
// 100ms chunks
|
|
369
|
+
private bstream = new AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, SAMPLE_RATE / 10);
|
|
370
|
+
|
|
371
|
+
private pushedDurationMs: number = 0;
|
|
172
372
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
if (typeof c === 'string') {
|
|
177
|
-
contents.push({
|
|
178
|
-
type: 'input_text',
|
|
179
|
-
text: c,
|
|
180
|
-
});
|
|
181
|
-
} else if (
|
|
182
|
-
// typescript type guard for determining ChatAudio vs ChatImage
|
|
183
|
-
((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
|
|
184
|
-
return (c as llm.ChatAudio).frame !== undefined;
|
|
185
|
-
})(c)
|
|
186
|
-
) {
|
|
187
|
-
contents.push({
|
|
188
|
-
type: 'input_audio',
|
|
189
|
-
audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'),
|
|
190
|
-
});
|
|
191
|
-
}
|
|
192
|
-
}
|
|
373
|
+
#logger = log();
|
|
374
|
+
#task: Promise<void>;
|
|
375
|
+
#closed = false;
|
|
193
376
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
previous_item_id: previousItemId,
|
|
197
|
-
item: {
|
|
198
|
-
type: 'message',
|
|
199
|
-
role: 'user',
|
|
200
|
-
content: contents,
|
|
201
|
-
},
|
|
202
|
-
};
|
|
203
|
-
} else if (message.role === llm.ChatRole.ASSISTANT) {
|
|
204
|
-
const contents: api_proto.TextContent[] = [];
|
|
205
|
-
for (const c of content) {
|
|
206
|
-
if (typeof c === 'string') {
|
|
207
|
-
contents.push({
|
|
208
|
-
type: 'text',
|
|
209
|
-
text: c,
|
|
210
|
-
});
|
|
211
|
-
} else if (
|
|
212
|
-
// typescript type guard for determining ChatAudio vs ChatImage
|
|
213
|
-
((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
|
|
214
|
-
return (c as llm.ChatAudio).frame !== undefined;
|
|
215
|
-
})(c)
|
|
216
|
-
) {
|
|
217
|
-
this.#logger.warn('audio content in assistant message is not supported');
|
|
218
|
-
}
|
|
219
|
-
}
|
|
377
|
+
constructor(realtimeModel: RealtimeModel) {
|
|
378
|
+
super(realtimeModel);
|
|
220
379
|
|
|
221
|
-
|
|
222
|
-
type: 'conversation.item.create',
|
|
223
|
-
previous_item_id: previousItemId,
|
|
224
|
-
item: {
|
|
225
|
-
type: 'message',
|
|
226
|
-
role: 'assistant',
|
|
227
|
-
content: contents,
|
|
228
|
-
},
|
|
229
|
-
};
|
|
230
|
-
} else if (message.role === llm.ChatRole.SYSTEM) {
|
|
231
|
-
const contents: api_proto.InputTextContent[] = [];
|
|
232
|
-
for (const c of content) {
|
|
233
|
-
if (typeof c === 'string') {
|
|
234
|
-
contents.push({
|
|
235
|
-
type: 'input_text',
|
|
236
|
-
text: c,
|
|
237
|
-
});
|
|
238
|
-
} else if (
|
|
239
|
-
// typescript type guard for determining ChatAudio vs ChatImage
|
|
240
|
-
((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
|
|
241
|
-
return (c as llm.ChatAudio).frame !== undefined;
|
|
242
|
-
})(c)
|
|
243
|
-
) {
|
|
244
|
-
this.#logger.warn('audio content in system message is not supported');
|
|
245
|
-
}
|
|
246
|
-
}
|
|
380
|
+
this.oaiRealtimeModel = realtimeModel;
|
|
247
381
|
|
|
248
|
-
|
|
249
|
-
type: 'conversation.item.create',
|
|
250
|
-
previous_item_id: previousItemId,
|
|
251
|
-
item: {
|
|
252
|
-
type: 'message',
|
|
253
|
-
role: 'system',
|
|
254
|
-
content: contents,
|
|
255
|
-
},
|
|
256
|
-
};
|
|
257
|
-
} else {
|
|
258
|
-
this.#logger
|
|
259
|
-
.child({ message })
|
|
260
|
-
.warn('chat message is not supported inside the realtime API');
|
|
261
|
-
return;
|
|
262
|
-
}
|
|
263
|
-
}
|
|
382
|
+
this.#task = this.#mainTask();
|
|
264
383
|
|
|
265
|
-
this
|
|
384
|
+
this.sendEvent(this.createSessionUpdateEvent());
|
|
266
385
|
}
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
class Conversation {
|
|
270
|
-
#session: RealtimeSession;
|
|
271
386
|
|
|
272
|
-
|
|
273
|
-
this
|
|
387
|
+
sendEvent(command: api_proto.ClientEvent): void {
|
|
388
|
+
this.messageChannel.put(command);
|
|
274
389
|
}
|
|
275
390
|
|
|
276
|
-
|
|
277
|
-
return
|
|
391
|
+
private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
|
|
392
|
+
return {
|
|
393
|
+
type: 'session.update',
|
|
394
|
+
session: {
|
|
395
|
+
model: this.oaiRealtimeModel._options.model,
|
|
396
|
+
voice: this.oaiRealtimeModel._options.voice,
|
|
397
|
+
input_audio_format: 'pcm16',
|
|
398
|
+
output_audio_format: 'pcm16',
|
|
399
|
+
modalities: ['text', 'audio'],
|
|
400
|
+
turn_detection: this.oaiRealtimeModel._options.turnDetection,
|
|
401
|
+
input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
|
|
402
|
+
// TODO(shubhra): add inputAudioNoiseReduction
|
|
403
|
+
temperature: this.oaiRealtimeModel._options.temperature,
|
|
404
|
+
tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
|
|
405
|
+
max_response_output_tokens:
|
|
406
|
+
this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity
|
|
407
|
+
? 'inf'
|
|
408
|
+
: this.oaiRealtimeModel._options.maxResponseOutputTokens,
|
|
409
|
+
// TODO(shubhra): add tracing options
|
|
410
|
+
instructions: this.instructions,
|
|
411
|
+
speed: this.oaiRealtimeModel._options.speed,
|
|
412
|
+
},
|
|
413
|
+
};
|
|
278
414
|
}
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
class Response {
|
|
282
|
-
#session: RealtimeSession;
|
|
283
415
|
|
|
284
|
-
|
|
285
|
-
this
|
|
416
|
+
get chatCtx() {
|
|
417
|
+
return this.remoteChatCtx.toChatCtx();
|
|
286
418
|
}
|
|
287
419
|
|
|
288
|
-
|
|
289
|
-
this
|
|
290
|
-
type: 'response.create',
|
|
291
|
-
});
|
|
420
|
+
get tools() {
|
|
421
|
+
return { ...this._tools } as llm.ToolContext;
|
|
292
422
|
}
|
|
293
423
|
|
|
294
|
-
|
|
295
|
-
this
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
}
|
|
299
|
-
}
|
|
424
|
+
async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
|
|
425
|
+
const unlock = await this.updateChatCtxLock.lock();
|
|
426
|
+
const events = this.createChatCtxUpdateEvents(_chatCtx);
|
|
427
|
+
const futures: Future<void>[] = [];
|
|
300
428
|
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
content_index: number;
|
|
305
|
-
}
|
|
429
|
+
for (const event of events) {
|
|
430
|
+
const future = new Future<void>();
|
|
431
|
+
futures.push(future);
|
|
306
432
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
433
|
+
if (event.type === 'conversation.item.create') {
|
|
434
|
+
this.itemCreateFutures[event.item.id] = future;
|
|
435
|
+
} else if (event.type == 'conversation.item.delete') {
|
|
436
|
+
this.itemDeleteFutures[event.item_id] = future;
|
|
437
|
+
}
|
|
312
438
|
|
|
313
|
-
|
|
314
|
-
|
|
439
|
+
this.sendEvent(event);
|
|
440
|
+
}
|
|
315
441
|
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
apiKey = undefined,
|
|
321
|
-
entraToken = undefined,
|
|
322
|
-
instructions = '',
|
|
323
|
-
modalities = ['text', 'audio'],
|
|
324
|
-
voice = 'alloy',
|
|
325
|
-
inputAudioFormat = 'pcm16',
|
|
326
|
-
outputAudioFormat = 'pcm16',
|
|
327
|
-
inputAudioTranscription = { model: 'whisper-1' },
|
|
328
|
-
turnDetection = { type: 'server_vad' },
|
|
329
|
-
temperature = 0.8,
|
|
330
|
-
maxResponseOutputTokens = Infinity,
|
|
331
|
-
}: {
|
|
332
|
-
baseURL: string;
|
|
333
|
-
azureDeployment: string;
|
|
334
|
-
apiVersion?: string;
|
|
335
|
-
apiKey?: string;
|
|
336
|
-
entraToken?: string;
|
|
337
|
-
instructions?: string;
|
|
338
|
-
modalities?: ['text', 'audio'] | ['text'];
|
|
339
|
-
voice?: api_proto.Voice;
|
|
340
|
-
inputAudioFormat?: api_proto.AudioFormat;
|
|
341
|
-
outputAudioFormat?: api_proto.AudioFormat;
|
|
342
|
-
inputAudioTranscription?: api_proto.InputAudioTranscription;
|
|
343
|
-
turnDetection?: api_proto.TurnDetectionType;
|
|
344
|
-
temperature?: number;
|
|
345
|
-
maxResponseOutputTokens?: number;
|
|
346
|
-
}) {
|
|
347
|
-
return new RealtimeModel({
|
|
348
|
-
isAzure: true,
|
|
349
|
-
baseURL: new URL('openai', baseURL).toString(),
|
|
350
|
-
model: azureDeployment,
|
|
351
|
-
apiVersion,
|
|
352
|
-
apiKey,
|
|
353
|
-
entraToken,
|
|
354
|
-
instructions,
|
|
355
|
-
modalities,
|
|
356
|
-
voice,
|
|
357
|
-
inputAudioFormat,
|
|
358
|
-
outputAudioFormat,
|
|
359
|
-
inputAudioTranscription,
|
|
360
|
-
turnDetection,
|
|
361
|
-
temperature,
|
|
362
|
-
maxResponseOutputTokens,
|
|
363
|
-
});
|
|
364
|
-
}
|
|
442
|
+
if (futures.length === 0) {
|
|
443
|
+
unlock();
|
|
444
|
+
return;
|
|
445
|
+
}
|
|
365
446
|
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
apiVersion = undefined,
|
|
382
|
-
entraToken = undefined,
|
|
383
|
-
}: {
|
|
384
|
-
modalities?: ['text', 'audio'] | ['text'];
|
|
385
|
-
instructions?: string;
|
|
386
|
-
voice?: api_proto.Voice;
|
|
387
|
-
inputAudioFormat?: api_proto.AudioFormat;
|
|
388
|
-
outputAudioFormat?: api_proto.AudioFormat;
|
|
389
|
-
inputAudioTranscription?: api_proto.InputAudioTranscription;
|
|
390
|
-
turnDetection?: api_proto.TurnDetectionType;
|
|
391
|
-
temperature?: number;
|
|
392
|
-
maxResponseOutputTokens?: number;
|
|
393
|
-
model?: api_proto.Model;
|
|
394
|
-
apiKey?: string;
|
|
395
|
-
baseURL?: string;
|
|
396
|
-
isAzure?: boolean;
|
|
397
|
-
apiVersion?: string;
|
|
398
|
-
entraToken?: string;
|
|
399
|
-
}) {
|
|
400
|
-
super();
|
|
447
|
+
try {
|
|
448
|
+
// wait for all futures to resolve, or fail once the 5-second timeout elapses
|
|
449
|
+
await Promise.race([
|
|
450
|
+
Promise.all(futures),
|
|
451
|
+
delay(5000).then(() => {
|
|
452
|
+
throw new Error('Chat ctx update events timed out');
|
|
453
|
+
}),
|
|
454
|
+
]);
|
|
455
|
+
} catch (e) {
|
|
456
|
+
this.#logger.error((e as Error).message);
|
|
457
|
+
throw e;
|
|
458
|
+
} finally {
|
|
459
|
+
unlock();
|
|
460
|
+
}
|
|
461
|
+
}
|
|
401
462
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
463
|
+
private createChatCtxUpdateEvents(
|
|
464
|
+
chatCtx: llm.ChatContext,
|
|
465
|
+
addMockAudio: boolean = false,
|
|
466
|
+
): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
|
|
467
|
+
const newChatCtx = chatCtx.copy();
|
|
468
|
+
if (addMockAudio) {
|
|
469
|
+
newChatCtx.items.push(createMockAudioItem());
|
|
470
|
+
} else {
|
|
471
|
+
// clean up existing mock audio items
|
|
472
|
+
newChatCtx.items = newChatCtx.items.filter(
|
|
473
|
+
(item) => !item.id.startsWith(MOCK_AUDIO_ID_PREFIX),
|
|
405
474
|
);
|
|
406
475
|
}
|
|
407
476
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
isAzure,
|
|
422
|
-
apiVersion,
|
|
423
|
-
entraToken,
|
|
424
|
-
};
|
|
425
|
-
}
|
|
477
|
+
const events: (
|
|
478
|
+
| api_proto.ConversationItemCreateEvent
|
|
479
|
+
| api_proto.ConversationItemDeleteEvent
|
|
480
|
+
)[] = [];
|
|
481
|
+
|
|
482
|
+
const diffOps = llm.computeChatCtxDiff(this.chatCtx, newChatCtx);
|
|
483
|
+
for (const op of diffOps.toRemove) {
|
|
484
|
+
events.push({
|
|
485
|
+
type: 'conversation.item.delete',
|
|
486
|
+
item_id: op,
|
|
487
|
+
event_id: shortuuid('chat_ctx_delete_'),
|
|
488
|
+
} as api_proto.ConversationItemDeleteEvent);
|
|
489
|
+
}
|
|
426
490
|
|
|
427
|
-
|
|
428
|
-
|
|
491
|
+
for (const [previousId, id] of diffOps.toCreate) {
|
|
492
|
+
const chatItem = newChatCtx.getById(id);
|
|
493
|
+
if (!chatItem) {
|
|
494
|
+
throw new Error(`Chat item ${id} not found`);
|
|
495
|
+
}
|
|
496
|
+
events.push({
|
|
497
|
+
type: 'conversation.item.create',
|
|
498
|
+
item: livekitItemToOpenAIItem(chatItem),
|
|
499
|
+
previous_item_id: previousId ?? undefined,
|
|
500
|
+
event_id: shortuuid('chat_ctx_create_'),
|
|
501
|
+
} as api_proto.ConversationItemCreateEvent);
|
|
502
|
+
}
|
|
503
|
+
return events;
|
|
429
504
|
}
|
|
430
505
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
instructions = this.#defaultOpts.instructions,
|
|
436
|
-
voice = this.#defaultOpts.voice,
|
|
437
|
-
inputAudioFormat = this.#defaultOpts.inputAudioFormat,
|
|
438
|
-
outputAudioFormat = this.#defaultOpts.outputAudioFormat,
|
|
439
|
-
inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
|
|
440
|
-
turnDetection = this.#defaultOpts.turnDetection,
|
|
441
|
-
temperature = this.#defaultOpts.temperature,
|
|
442
|
-
maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens,
|
|
443
|
-
}: {
|
|
444
|
-
fncCtx?: llm.FunctionContext;
|
|
445
|
-
chatCtx?: llm.ChatContext;
|
|
446
|
-
modalities?: ['text', 'audio'] | ['text'];
|
|
447
|
-
instructions?: string;
|
|
448
|
-
voice?: api_proto.Voice;
|
|
449
|
-
inputAudioFormat?: api_proto.AudioFormat;
|
|
450
|
-
outputAudioFormat?: api_proto.AudioFormat;
|
|
451
|
-
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
|
|
452
|
-
turnDetection?: api_proto.TurnDetectionType | null;
|
|
453
|
-
temperature?: number;
|
|
454
|
-
maxResponseOutputTokens?: number;
|
|
455
|
-
}): RealtimeSession {
|
|
456
|
-
const opts: ModelOptions = {
|
|
457
|
-
modalities,
|
|
458
|
-
instructions,
|
|
459
|
-
voice,
|
|
460
|
-
inputAudioFormat,
|
|
461
|
-
outputAudioFormat,
|
|
462
|
-
inputAudioTranscription,
|
|
463
|
-
turnDetection,
|
|
464
|
-
temperature,
|
|
465
|
-
maxResponseOutputTokens,
|
|
466
|
-
model: this.#defaultOpts.model,
|
|
467
|
-
apiKey: this.#defaultOpts.apiKey,
|
|
468
|
-
baseURL: this.#defaultOpts.baseURL,
|
|
469
|
-
isAzure: this.#defaultOpts.isAzure,
|
|
470
|
-
apiVersion: this.#defaultOpts.apiVersion,
|
|
471
|
-
entraToken: this.#defaultOpts.entraToken,
|
|
472
|
-
};
|
|
506
|
+
async updateTools(_tools: llm.ToolContext): Promise<void> {
|
|
507
|
+
const unlock = await this.updateFuncCtxLock.lock();
|
|
508
|
+
const ev = this.createToolsUpdateEvent(_tools);
|
|
509
|
+
this.sendEvent(ev);
|
|
473
510
|
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
});
|
|
478
|
-
this.#sessions.push(newSession);
|
|
479
|
-
return newSession;
|
|
480
|
-
}
|
|
511
|
+
if (!ev.session.tools) {
|
|
512
|
+
throw new Error('Tools are missing in the session update event');
|
|
513
|
+
}
|
|
481
514
|
|
|
482
|
-
|
|
483
|
-
|
|
515
|
+
// TODO(brian): these logics below are noops I think, leaving it here to keep
|
|
516
|
+
// parity with the python but we should remove them later
|
|
517
|
+
const retainedToolNames = new Set(ev.session.tools.map((tool) => tool.name));
|
|
518
|
+
const retainedTools = Object.fromEntries(
|
|
519
|
+
Object.entries(_tools).filter(
|
|
520
|
+
([name, tool]) => llm.isFunctionTool(tool) && retainedToolNames.has(name),
|
|
521
|
+
),
|
|
522
|
+
);
|
|
523
|
+
|
|
524
|
+
this._tools = retainedTools as llm.ToolContext;
|
|
525
|
+
|
|
526
|
+
unlock();
|
|
484
527
|
}
|
|
485
|
-
}
|
|
486
528
|
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
#fncCtx: llm.FunctionContext | undefined = undefined;
|
|
490
|
-
#opts: ModelOptions;
|
|
491
|
-
#pendingResponses: { [id: string]: RealtimeResponse } = {};
|
|
492
|
-
#sessionId = 'not-connected';
|
|
493
|
-
#ws: WebSocket | null = null;
|
|
494
|
-
#expiresAt: number | null = null;
|
|
495
|
-
#logger = log();
|
|
496
|
-
#task: Promise<void>;
|
|
497
|
-
#closing = true;
|
|
498
|
-
#sendQueue = new Queue<api_proto.ClientEvent>();
|
|
529
|
+
private createToolsUpdateEvent(_tools: llm.ToolContext): api_proto.SessionUpdateEvent {
|
|
530
|
+
const oaiTools: api_proto.Tool[] = [];
|
|
499
531
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
532
|
+
for (const [name, tool] of Object.entries(_tools)) {
|
|
533
|
+
if (!llm.isFunctionTool(tool)) {
|
|
534
|
+
this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
|
|
535
|
+
continue;
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
const { parameters: toolParameters, description } = tool;
|
|
539
|
+
try {
|
|
540
|
+
const parameters = llm.toJsonSchema(
|
|
541
|
+
toolParameters,
|
|
542
|
+
) as unknown as api_proto.Tool['parameters'];
|
|
543
|
+
|
|
544
|
+
oaiTools.push({
|
|
545
|
+
name,
|
|
546
|
+
description,
|
|
547
|
+
parameters: parameters,
|
|
548
|
+
type: 'function',
|
|
549
|
+
});
|
|
550
|
+
} catch (e) {
|
|
551
|
+
this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
|
|
552
|
+
continue;
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
return {
|
|
557
|
+
type: 'session.update',
|
|
558
|
+
session: {
|
|
559
|
+
model: this.oaiRealtimeModel._options.model,
|
|
560
|
+
tools: oaiTools,
|
|
561
|
+
},
|
|
562
|
+
event_id: shortuuid('tools_update_'),
|
|
563
|
+
};
|
|
524
564
|
}
|
|
525
565
|
|
|
526
|
-
|
|
527
|
-
|
|
566
|
+
async updateInstructions(_instructions: string): Promise<void> {
|
|
567
|
+
const eventId = shortuuid('instructions_update_');
|
|
568
|
+
this.sendEvent({
|
|
569
|
+
type: 'session.update',
|
|
570
|
+
session: {
|
|
571
|
+
instructions: _instructions,
|
|
572
|
+
},
|
|
573
|
+
event_id: eventId,
|
|
574
|
+
} as api_proto.SessionUpdateEvent);
|
|
575
|
+
this.instructions = _instructions;
|
|
528
576
|
}
|
|
529
577
|
|
|
530
|
-
|
|
531
|
-
|
|
578
|
+
updateOptions({ toolChoice }: { toolChoice?: llm.ToolChoice }): void {
|
|
579
|
+
const options: api_proto.SessionUpdateEvent['session'] = {};
|
|
580
|
+
|
|
581
|
+
this.oaiRealtimeModel._options.toolChoice = toolChoice;
|
|
582
|
+
options.tool_choice = toOaiToolChoice(toolChoice);
|
|
583
|
+
|
|
584
|
+
// TODO(brian): add other options here
|
|
585
|
+
|
|
586
|
+
this.sendEvent({
|
|
587
|
+
type: 'session.update',
|
|
588
|
+
session: options,
|
|
589
|
+
event_id: shortuuid('options_update_'),
|
|
590
|
+
});
|
|
532
591
|
}
|
|
533
592
|
|
|
534
|
-
|
|
535
|
-
this
|
|
593
|
+
pushAudio(frame: AudioFrame): void {
|
|
594
|
+
for (const f of this.resampleAudio(frame)) {
|
|
595
|
+
for (const nf of this.bstream.write(f.data.buffer)) {
|
|
596
|
+
this.sendEvent({
|
|
597
|
+
type: 'input_audio_buffer.append',
|
|
598
|
+
audio: Buffer.from(nf.data.buffer).toString('base64'),
|
|
599
|
+
} as api_proto.InputAudioBufferAppendEvent);
|
|
600
|
+
// TODO(AJS-102): use frame.durationMs once available in rtc-node
|
|
601
|
+
this.pushedDurationMs += (nf.samplesPerChannel / nf.sampleRate) * 1000;
|
|
602
|
+
}
|
|
603
|
+
}
|
|
536
604
|
}
|
|
537
605
|
|
|
538
|
-
|
|
539
|
-
|
|
606
|
+
async commitAudio(): Promise<void> {
|
|
607
|
+
if (this.pushedDurationMs > 100) {
|
|
608
|
+
// OpenAI requires at least 100ms of audio
|
|
609
|
+
this.sendEvent({
|
|
610
|
+
type: 'input_audio_buffer.commit',
|
|
611
|
+
} as api_proto.InputAudioBufferCommitEvent);
|
|
612
|
+
this.pushedDurationMs = 0;
|
|
613
|
+
}
|
|
540
614
|
}
|
|
541
615
|
|
|
542
|
-
|
|
543
|
-
|
|
616
|
+
async clearAudio(): Promise<void> {
|
|
617
|
+
this.sendEvent({
|
|
618
|
+
type: 'input_audio_buffer.clear',
|
|
619
|
+
} as api_proto.InputAudioBufferClearEvent);
|
|
620
|
+
this.pushedDurationMs = 0;
|
|
544
621
|
}
|
|
545
622
|
|
|
546
|
-
|
|
547
|
-
|
|
623
|
+
async generateReply(instructions?: string): Promise<llm.GenerationCreatedEvent> {
|
|
624
|
+
const handle = this.createResponse({ instructions, userInitiated: true });
|
|
625
|
+
this.textModeRecoveryRetries = 0;
|
|
626
|
+
return handle.doneFut.await;
|
|
548
627
|
}
|
|
549
628
|
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
}
|
|
554
|
-
return this.#expiresAt * 1000;
|
|
629
|
+
async interrupt(): Promise<void> {
|
|
630
|
+
this.sendEvent({
|
|
631
|
+
type: 'response.cancel',
|
|
632
|
+
} as api_proto.ResponseCancelEvent);
|
|
555
633
|
}
|
|
556
634
|
|
|
557
|
-
|
|
558
|
-
this
|
|
635
|
+
async truncate(_options: { messageId: string; audioEndMs: number }): Promise<void> {
|
|
636
|
+
this.sendEvent({
|
|
637
|
+
type: 'conversation.item.truncate',
|
|
638
|
+
content_index: 0,
|
|
639
|
+
item_id: _options.messageId,
|
|
640
|
+
audio_end_ms: _options.audioEndMs,
|
|
641
|
+
} as api_proto.ConversationItemTruncateEvent);
|
|
559
642
|
}
|
|
560
643
|
|
|
561
644
|
/// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
|
|
@@ -588,646 +671,872 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
588
671
|
return untypedEvent;
|
|
589
672
|
}
|
|
590
673
|
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
voice = this.#opts.voice,
|
|
595
|
-
inputAudioFormat = this.#opts.inputAudioFormat,
|
|
596
|
-
outputAudioFormat = this.#opts.outputAudioFormat,
|
|
597
|
-
inputAudioTranscription = this.#opts.inputAudioTranscription,
|
|
598
|
-
turnDetection = this.#opts.turnDetection,
|
|
599
|
-
temperature = this.#opts.temperature,
|
|
600
|
-
maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
|
|
601
|
-
toolChoice = 'auto',
|
|
602
|
-
selectedTools = Object.keys(this.#fncCtx || {}),
|
|
603
|
-
}: {
|
|
604
|
-
modalities: ['text', 'audio'] | ['text'];
|
|
605
|
-
instructions?: string;
|
|
606
|
-
voice?: api_proto.Voice;
|
|
607
|
-
inputAudioFormat?: api_proto.AudioFormat;
|
|
608
|
-
outputAudioFormat?: api_proto.AudioFormat;
|
|
609
|
-
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
|
|
610
|
-
turnDetection?: api_proto.TurnDetectionType | null;
|
|
611
|
-
temperature?: number;
|
|
612
|
-
maxResponseOutputTokens?: number;
|
|
613
|
-
toolChoice?: api_proto.ToolChoice;
|
|
614
|
-
selectedTools?: string[];
|
|
615
|
-
}) {
|
|
616
|
-
this.#opts = {
|
|
617
|
-
modalities,
|
|
618
|
-
instructions,
|
|
619
|
-
voice,
|
|
620
|
-
inputAudioFormat,
|
|
621
|
-
outputAudioFormat,
|
|
622
|
-
inputAudioTranscription,
|
|
623
|
-
turnDetection,
|
|
624
|
-
temperature,
|
|
625
|
-
maxResponseOutputTokens,
|
|
626
|
-
model: this.#opts.model,
|
|
627
|
-
apiKey: this.#opts.apiKey,
|
|
628
|
-
baseURL: this.#opts.baseURL,
|
|
629
|
-
isAzure: this.#opts.isAzure,
|
|
630
|
-
apiVersion: this.#opts.apiVersion,
|
|
631
|
-
entraToken: this.#opts.entraToken,
|
|
674
|
+
private async createWsConn(): Promise<WebSocket> {
|
|
675
|
+
const headers: Record<string, string> = {
|
|
676
|
+
'User-Agent': 'LiveKit-Agents-JS',
|
|
632
677
|
};
|
|
633
678
|
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
const sessionUpdateEvent: api_proto.SessionUpdateEvent = {
|
|
650
|
-
type: 'session.update',
|
|
651
|
-
session: {
|
|
652
|
-
modalities: this.#opts.modalities,
|
|
653
|
-
instructions: this.#opts.instructions,
|
|
654
|
-
voice: this.#opts.voice,
|
|
655
|
-
input_audio_format: this.#opts.inputAudioFormat,
|
|
656
|
-
output_audio_format: this.#opts.outputAudioFormat,
|
|
657
|
-
input_audio_transcription: this.#opts.inputAudioTranscription,
|
|
658
|
-
turn_detection: this.#opts.turnDetection,
|
|
659
|
-
temperature: this.#opts.temperature,
|
|
660
|
-
max_response_output_tokens:
|
|
661
|
-
this.#opts.maxResponseOutputTokens === Infinity
|
|
662
|
-
? 'inf'
|
|
663
|
-
: this.#opts.maxResponseOutputTokens,
|
|
664
|
-
tools,
|
|
665
|
-
tool_choice: toolChoice,
|
|
666
|
-
},
|
|
667
|
-
};
|
|
668
|
-
|
|
669
|
-
if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
|
|
670
|
-
// microsoft doesn't support inf for max_response_output_tokens, but accepts no args
|
|
671
|
-
sessionUpdateEvent.session.max_response_output_tokens = undefined;
|
|
672
|
-
}
|
|
679
|
+
if (this.oaiRealtimeModel._options.isAzure) {
|
|
680
|
+
// Microsoft API has two ways of authentication
|
|
681
|
+
// 1. Entra token set as `Bearer` token
|
|
682
|
+
// 2. API key set as `api_key` header (also accepts query string)
|
|
683
|
+
if (this.oaiRealtimeModel._options.entraToken) {
|
|
684
|
+
headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.entraToken}`;
|
|
685
|
+
} else if (this.oaiRealtimeModel._options.apiKey) {
|
|
686
|
+
headers['api-key'] = this.oaiRealtimeModel._options.apiKey;
|
|
687
|
+
} else {
|
|
688
|
+
throw new Error('Microsoft API key or entraToken is required');
|
|
689
|
+
}
|
|
690
|
+
} else {
|
|
691
|
+
headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
|
|
692
|
+
headers['OpenAI-Beta'] = 'realtime=v1';
|
|
693
|
+
}
|
|
673
694
|
|
|
674
|
-
|
|
675
|
-
|
|
695
|
+
const url = processBaseURL({
|
|
696
|
+
baseURL: this.oaiRealtimeModel._options.baseURL,
|
|
697
|
+
model: this.oaiRealtimeModel._options.model,
|
|
698
|
+
isAzure: this.oaiRealtimeModel._options.isAzure,
|
|
699
|
+
apiVersion: this.oaiRealtimeModel._options.apiVersion,
|
|
700
|
+
azureDeployment: this.oaiRealtimeModel._options.azureDeployment,
|
|
701
|
+
});
|
|
676
702
|
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
703
|
+
this.#logger.debug(`Connecting to OpenAI Realtime API at ${url}`);
|
|
704
|
+
|
|
705
|
+
return new Promise((resolve, reject) => {
|
|
706
|
+
const ws = new WebSocket(url, { headers });
|
|
707
|
+
let waiting = true;
|
|
708
|
+
|
|
709
|
+
const timeout = setTimeout(() => {
|
|
710
|
+
ws.close();
|
|
711
|
+
reject(new Error('WebSocket connection timeout'));
|
|
712
|
+
}, this.oaiRealtimeModel._options.connOptions.timeoutMs);
|
|
713
|
+
|
|
714
|
+
ws.once('open', () => {
|
|
715
|
+
if (!waiting) return;
|
|
716
|
+
waiting = false;
|
|
717
|
+
clearTimeout(timeout);
|
|
718
|
+
resolve(ws);
|
|
719
|
+
});
|
|
720
|
+
|
|
721
|
+
ws.once('close', () => {
|
|
722
|
+
if (!waiting) return;
|
|
723
|
+
waiting = false;
|
|
724
|
+
clearTimeout(timeout);
|
|
725
|
+
reject(new Error('OpenAI Realtime API connection closed'));
|
|
726
|
+
});
|
|
690
727
|
});
|
|
691
728
|
}
|
|
692
729
|
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
* This method tries to recover from this by requesting a new response after deleting the text
|
|
699
|
-
* response and creating an empty user audio message.
|
|
700
|
-
*/
|
|
701
|
-
recoverFromTextResponse(itemId: string) {
|
|
702
|
-
if (itemId) {
|
|
703
|
-
this.conversation.item.delete(itemId);
|
|
704
|
-
}
|
|
705
|
-
this.conversation.item.create(this.#createEmptyUserAudioMessage(1));
|
|
706
|
-
this.response.create();
|
|
707
|
-
}
|
|
730
|
+
async #mainTask(): Promise<void> {
|
|
731
|
+
let reconnecting = false;
|
|
732
|
+
let numRetries = 0;
|
|
733
|
+
let wsConn: WebSocket | null = null;
|
|
734
|
+
const maxRetries = this.oaiRealtimeModel._options.connOptions.maxRetry;
|
|
708
735
|
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
// 1. Entra token set as `Bearer` token
|
|
717
|
-
// 2. API key set as `api_key` header (also accepts query string)
|
|
718
|
-
if (this.#opts.entraToken) {
|
|
719
|
-
headers.Authorization = `Bearer ${this.#opts.entraToken}`;
|
|
720
|
-
} else if (this.#opts.apiKey) {
|
|
721
|
-
headers['api-key'] = this.#opts.apiKey;
|
|
722
|
-
} else {
|
|
723
|
-
reject(new Error('Microsoft API key or entraToken is required'));
|
|
724
|
-
return;
|
|
725
|
-
}
|
|
726
|
-
} else {
|
|
727
|
-
headers.Authorization = `Bearer ${this.#opts.apiKey}`;
|
|
728
|
-
headers['OpenAI-Beta'] = 'realtime=v1';
|
|
729
|
-
}
|
|
730
|
-
const url = new URL([this.#opts.baseURL, 'realtime'].join('/'));
|
|
731
|
-
if (url.protocol === 'https:') {
|
|
732
|
-
url.protocol = 'wss:';
|
|
733
|
-
}
|
|
736
|
+
const reconnect = async () => {
|
|
737
|
+
this.#logger.debug(
|
|
738
|
+
{
|
|
739
|
+
maxSessionDuration: this.oaiRealtimeModel._options.maxSessionDuration,
|
|
740
|
+
},
|
|
741
|
+
'Reconnecting to OpenAI Realtime API',
|
|
742
|
+
);
|
|
734
743
|
|
|
735
|
-
|
|
736
|
-
const queryParams: Record<string, string> = {};
|
|
737
|
-
if (this.#opts.isAzure) {
|
|
738
|
-
queryParams['api-version'] = this.#opts.apiVersion ?? '2024-10-01-preview';
|
|
739
|
-
queryParams['deployment'] = this.#opts.model;
|
|
740
|
-
} else {
|
|
741
|
-
queryParams['model'] = this.#opts.model;
|
|
742
|
-
}
|
|
744
|
+
const events: api_proto.ClientEvent[] = [];
|
|
743
745
|
|
|
744
|
-
|
|
745
|
-
|
|
746
|
+
// options and instructions
|
|
747
|
+
events.push(this.createSessionUpdateEvent());
|
|
748
|
+
|
|
749
|
+
// tools
|
|
750
|
+
if (Object.keys(this._tools).length > 0) {
|
|
751
|
+
events.push(this.createToolsUpdateEvent(this._tools));
|
|
746
752
|
}
|
|
747
753
|
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
754
|
+
// chat context
|
|
755
|
+
const chatCtx = this.chatCtx.copy({
|
|
756
|
+
excludeFunctionCall: true,
|
|
757
|
+
excludeInstructions: true,
|
|
758
|
+
excludeEmptyMessage: true,
|
|
751
759
|
});
|
|
752
760
|
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
await once(this.#ws, 'open');
|
|
758
|
-
this.#closing = false;
|
|
761
|
+
const oldChatCtx = this.remoteChatCtx;
|
|
762
|
+
this.remoteChatCtx = new llm.RemoteChatContext();
|
|
763
|
+
events.push(...this.createChatCtxUpdateEvents(chatCtx));
|
|
759
764
|
|
|
760
|
-
|
|
761
|
-
const
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
case 'error':
|
|
765
|
-
this.#handleError(event);
|
|
766
|
-
break;
|
|
767
|
-
case 'session.created':
|
|
768
|
-
this.#handleSessionCreated(event);
|
|
769
|
-
break;
|
|
770
|
-
case 'session.updated':
|
|
771
|
-
this.#handleSessionUpdated(event);
|
|
772
|
-
break;
|
|
773
|
-
case 'conversation.created':
|
|
774
|
-
this.#handleConversationCreated(event);
|
|
775
|
-
break;
|
|
776
|
-
case 'input_audio_buffer.committed':
|
|
777
|
-
this.#handleInputAudioBufferCommitted(event);
|
|
778
|
-
break;
|
|
779
|
-
case 'input_audio_buffer.cleared':
|
|
780
|
-
this.#handleInputAudioBufferCleared(event);
|
|
781
|
-
break;
|
|
782
|
-
case 'input_audio_buffer.speech_started':
|
|
783
|
-
this.#handleInputAudioBufferSpeechStarted(event);
|
|
784
|
-
break;
|
|
785
|
-
case 'input_audio_buffer.speech_stopped':
|
|
786
|
-
this.#handleInputAudioBufferSpeechStopped(event);
|
|
787
|
-
break;
|
|
788
|
-
case 'conversation.item.created':
|
|
789
|
-
this.#handleConversationItemCreated(event);
|
|
790
|
-
break;
|
|
791
|
-
case 'conversation.item.input_audio_transcription.completed':
|
|
792
|
-
this.#handleConversationItemInputAudioTranscriptionCompleted(event);
|
|
793
|
-
break;
|
|
794
|
-
case 'conversation.item.input_audio_transcription.failed':
|
|
795
|
-
this.#handleConversationItemInputAudioTranscriptionFailed(event);
|
|
796
|
-
break;
|
|
797
|
-
case 'conversation.item.truncated':
|
|
798
|
-
this.#handleConversationItemTruncated(event);
|
|
799
|
-
break;
|
|
800
|
-
case 'conversation.item.deleted':
|
|
801
|
-
this.#handleConversationItemDeleted(event);
|
|
802
|
-
break;
|
|
803
|
-
case 'response.created':
|
|
804
|
-
this.#handleResponseCreated(event);
|
|
805
|
-
break;
|
|
806
|
-
case 'response.done':
|
|
807
|
-
this.#handleResponseDone(event);
|
|
808
|
-
break;
|
|
809
|
-
case 'response.output_item.added':
|
|
810
|
-
this.#handleResponseOutputItemAdded(event);
|
|
811
|
-
break;
|
|
812
|
-
case 'response.output_item.done':
|
|
813
|
-
this.#handleResponseOutputItemDone(event);
|
|
814
|
-
break;
|
|
815
|
-
case 'response.content_part.added':
|
|
816
|
-
this.#handleResponseContentPartAdded(event);
|
|
817
|
-
break;
|
|
818
|
-
case 'response.content_part.done':
|
|
819
|
-
this.#handleResponseContentPartDone(event);
|
|
820
|
-
break;
|
|
821
|
-
case 'response.text.delta':
|
|
822
|
-
this.#handleResponseTextDelta(event);
|
|
823
|
-
break;
|
|
824
|
-
case 'response.text.done':
|
|
825
|
-
this.#handleResponseTextDone(event);
|
|
826
|
-
break;
|
|
827
|
-
case 'response.audio_transcript.delta':
|
|
828
|
-
this.#handleResponseAudioTranscriptDelta(event);
|
|
829
|
-
break;
|
|
830
|
-
case 'response.audio_transcript.done':
|
|
831
|
-
this.#handleResponseAudioTranscriptDone(event);
|
|
832
|
-
break;
|
|
833
|
-
case 'response.audio.delta':
|
|
834
|
-
this.#handleResponseAudioDelta(event);
|
|
835
|
-
break;
|
|
836
|
-
case 'response.audio.done':
|
|
837
|
-
this.#handleResponseAudioDone(event);
|
|
838
|
-
break;
|
|
839
|
-
case 'response.function_call_arguments.delta':
|
|
840
|
-
this.#handleResponseFunctionCallArgumentsDelta(event);
|
|
841
|
-
break;
|
|
842
|
-
case 'response.function_call_arguments.done':
|
|
843
|
-
this.#handleResponseFunctionCallArgumentsDone(event);
|
|
844
|
-
break;
|
|
845
|
-
case 'rate_limits.updated':
|
|
846
|
-
this.#handleRateLimitsUpdated(event);
|
|
847
|
-
break;
|
|
765
|
+
try {
|
|
766
|
+
for (const ev of events) {
|
|
767
|
+
this.emit('openai_client_event_queued', ev);
|
|
768
|
+
wsConn!.send(JSON.stringify(ev));
|
|
848
769
|
}
|
|
849
|
-
}
|
|
770
|
+
} catch (error) {
|
|
771
|
+
this.remoteChatCtx = oldChatCtx;
|
|
772
|
+
throw new APIConnectionError({
|
|
773
|
+
message: 'Failed to send message to OpenAI Realtime API during session re-connection',
|
|
774
|
+
});
|
|
775
|
+
}
|
|
850
776
|
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
777
|
+
this.#logger.debug('Reconnected to OpenAI Realtime API');
|
|
778
|
+
|
|
779
|
+
this.emit('session_reconnected', {} as llm.RealtimeSessionReconnectedEvent);
|
|
780
|
+
};
|
|
781
|
+
|
|
782
|
+
reconnecting = false;
|
|
783
|
+
while (!this.#closed) {
|
|
784
|
+
this.#logger.debug('Creating WebSocket connection to OpenAI Realtime API');
|
|
785
|
+
wsConn = await this.createWsConn();
|
|
786
|
+
|
|
787
|
+
try {
|
|
788
|
+
if (reconnecting) {
|
|
789
|
+
await reconnect();
|
|
790
|
+
numRetries = 0;
|
|
791
|
+
}
|
|
792
|
+
await this.runWs(wsConn);
|
|
793
|
+
} catch (error) {
|
|
794
|
+
if (!isAPIError(error)) {
|
|
795
|
+
this.emitError({ error: error as Error, recoverable: false });
|
|
796
|
+
throw error;
|
|
862
797
|
}
|
|
863
|
-
};
|
|
864
798
|
|
|
865
|
-
|
|
799
|
+
if (maxRetries === 0 || !error.retryable) {
|
|
800
|
+
this.emitError({ error: error as Error, recoverable: false });
|
|
801
|
+
throw error;
|
|
802
|
+
}
|
|
866
803
|
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
804
|
+
if (numRetries === maxRetries) {
|
|
805
|
+
this.emitError({ error: error as Error, recoverable: false });
|
|
806
|
+
throw new APIConnectionError({
|
|
807
|
+
message: `OpenAI Realtime API connection failed after ${numRetries} attempts`,
|
|
808
|
+
options: {
|
|
809
|
+
body: error,
|
|
810
|
+
retryable: false,
|
|
811
|
+
},
|
|
812
|
+
});
|
|
870
813
|
}
|
|
871
|
-
|
|
872
|
-
|
|
814
|
+
|
|
815
|
+
this.emitError({ error: error as Error, recoverable: true });
|
|
816
|
+
const retryInterval =
|
|
817
|
+
numRetries === 0
|
|
818
|
+
? DEFAULT_FIRST_RETRY_INTERVAL_MS
|
|
819
|
+
: this.oaiRealtimeModel._options.connOptions.retryIntervalMs;
|
|
820
|
+
this.#logger.warn(
|
|
821
|
+
{
|
|
822
|
+
attempt: numRetries,
|
|
823
|
+
maxRetries,
|
|
824
|
+
error,
|
|
825
|
+
},
|
|
826
|
+
`OpenAI Realtime API connection failed, retrying in ${retryInterval / 1000}s`,
|
|
827
|
+
);
|
|
828
|
+
|
|
829
|
+
await delay(retryInterval);
|
|
830
|
+
numRetries++;
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
reconnecting = true;
|
|
834
|
+
}
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
private async runWs(wsConn: WebSocket): Promise<void> {
|
|
838
|
+
const forwardEvents = async (signal: AbortSignal): Promise<void> => {
|
|
839
|
+
while (!this.#closed && wsConn.readyState === WebSocket.OPEN && !signal.aborted) {
|
|
840
|
+
try {
|
|
841
|
+
const event = await this.messageChannel.get();
|
|
842
|
+
if (signal.aborted) {
|
|
843
|
+
break;
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
if (event.type !== 'input_audio_buffer.append') {
|
|
847
|
+
this.#logger.debug(`(client) -> ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
848
|
+
}
|
|
849
|
+
|
|
850
|
+
this.emit('openai_client_event_queued', event);
|
|
851
|
+
wsConn.send(JSON.stringify(event));
|
|
852
|
+
} catch (error) {
|
|
853
|
+
break;
|
|
873
854
|
}
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
wsConn.close();
|
|
858
|
+
};
|
|
859
|
+
|
|
860
|
+
const wsCloseFuture = new Future<void | Error>();
|
|
861
|
+
|
|
862
|
+
wsConn.onerror = (error) => {
|
|
863
|
+
wsCloseFuture.resolve(new APIConnectionError({ message: error.message }));
|
|
864
|
+
};
|
|
865
|
+
wsConn.onclose = () => {
|
|
866
|
+
wsCloseFuture.resolve();
|
|
867
|
+
};
|
|
868
|
+
|
|
869
|
+
wsConn.onmessage = (message: MessageEvent) => {
|
|
870
|
+
const event: api_proto.ServerEvent = JSON.parse(message.data as string);
|
|
871
|
+
|
|
872
|
+
this.emit('openai_server_event_received', event);
|
|
873
|
+
this.#logger.debug(`(server) <- ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
874
|
+
|
|
875
|
+
switch (event.type) {
|
|
876
|
+
case 'input_audio_buffer.speech_started':
|
|
877
|
+
this.handleInputAudioBufferSpeechStarted(event);
|
|
878
|
+
break;
|
|
879
|
+
case 'input_audio_buffer.speech_stopped':
|
|
880
|
+
this.handleInputAudioBufferSpeechStopped(event);
|
|
881
|
+
break;
|
|
882
|
+
case 'response.created':
|
|
883
|
+
this.handleResponseCreated(event);
|
|
884
|
+
break;
|
|
885
|
+
case 'response.output_item.added':
|
|
886
|
+
this.handleResponseOutputItemAdded(event);
|
|
887
|
+
break;
|
|
888
|
+
case 'conversation.item.created':
|
|
889
|
+
this.handleConversationItemCreated(event);
|
|
890
|
+
break;
|
|
891
|
+
case 'conversation.item.deleted':
|
|
892
|
+
this.handleConversationItemDeleted(event);
|
|
893
|
+
break;
|
|
894
|
+
case 'conversation.item.input_audio_transcription.completed':
|
|
895
|
+
this.handleConversationItemInputAudioTranscriptionCompleted(event);
|
|
896
|
+
break;
|
|
897
|
+
case 'conversation.item.input_audio_transcription.failed':
|
|
898
|
+
this.handleConversationItemInputAudioTranscriptionFailed(event);
|
|
899
|
+
break;
|
|
900
|
+
case 'response.content_part.added':
|
|
901
|
+
this.handleResponseContentPartAdded(event);
|
|
902
|
+
break;
|
|
903
|
+
case 'response.content_part.done':
|
|
904
|
+
this.handleResponseContentPartDone(event);
|
|
905
|
+
break;
|
|
906
|
+
case 'response.audio_transcript.delta':
|
|
907
|
+
this.handleResponseAudioTranscriptDelta(event);
|
|
908
|
+
break;
|
|
909
|
+
case 'response.audio.delta':
|
|
910
|
+
this.handleResponseAudioDelta(event);
|
|
911
|
+
break;
|
|
912
|
+
case 'response.audio_transcript.done':
|
|
913
|
+
this.handleResponseAudioTranscriptDone(event);
|
|
914
|
+
break;
|
|
915
|
+
case 'response.audio.done':
|
|
916
|
+
this.handleResponseAudioDone(event);
|
|
917
|
+
break;
|
|
918
|
+
case 'response.output_item.done':
|
|
919
|
+
this.handleResponseOutputItemDone(event);
|
|
920
|
+
break;
|
|
921
|
+
case 'response.done':
|
|
922
|
+
this.handleResponseDone(event);
|
|
923
|
+
break;
|
|
924
|
+
case 'error':
|
|
925
|
+
this.handleError(event);
|
|
926
|
+
break;
|
|
927
|
+
default:
|
|
928
|
+
this.#logger.debug(`unhandled event: ${event.type}`);
|
|
929
|
+
break;
|
|
930
|
+
}
|
|
931
|
+
};
|
|
932
|
+
|
|
933
|
+
const sendTask = Task.from(({ signal }) => forwardEvents(signal));
|
|
934
|
+
|
|
935
|
+
const wsTask = Task.from(({ signal }) => {
|
|
936
|
+
const abortPromise = new Promise<void>((resolve) => {
|
|
937
|
+
signal.addEventListener('abort', () => {
|
|
938
|
+
resolve();
|
|
939
|
+
});
|
|
940
|
+
});
|
|
941
|
+
|
|
942
|
+
return Promise.race([wsCloseFuture.await, abortPromise]);
|
|
943
|
+
});
|
|
944
|
+
|
|
945
|
+
const waitReconnectTask = Task.from(async ({ signal }) => {
|
|
946
|
+
await delay(this.oaiRealtimeModel._options.maxSessionDuration, { signal });
|
|
947
|
+
return new APIConnectionError({
|
|
948
|
+
message: 'OpenAI Realtime API connection timeout',
|
|
949
|
+
});
|
|
877
950
|
});
|
|
951
|
+
|
|
952
|
+
try {
|
|
953
|
+
const result = await Promise.race([wsTask.result, sendTask.result, waitReconnectTask.result]);
|
|
954
|
+
|
|
955
|
+
if (waitReconnectTask.done && this.currentGeneration) {
|
|
956
|
+
await this.currentGeneration._doneFut.await;
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
if (result instanceof Error) {
|
|
960
|
+
throw result;
|
|
961
|
+
}
|
|
962
|
+
} finally {
|
|
963
|
+
await cancelAndWait([wsTask, sendTask, waitReconnectTask], 2000);
|
|
964
|
+
wsConn.close();
|
|
965
|
+
}
|
|
878
966
|
}
|
|
879
967
|
|
|
880
968
|
async close() {
|
|
881
|
-
|
|
882
|
-
this.#
|
|
883
|
-
this.#ws.close();
|
|
969
|
+
super.close();
|
|
970
|
+
this.#closed = true;
|
|
884
971
|
await this.#task;
|
|
885
972
|
}
|
|
886
973
|
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
return content;
|
|
974
|
+
private handleInputAudioBufferSpeechStarted(
|
|
975
|
+
_event: api_proto.InputAudioBufferSpeechStartedEvent,
|
|
976
|
+
): void {
|
|
977
|
+
this.emit('input_speech_started', {} as llm.InputSpeechStartedEvent);
|
|
892
978
|
}
|
|
893
979
|
|
|
894
|
-
|
|
895
|
-
|
|
980
|
+
private handleInputAudioBufferSpeechStopped(
|
|
981
|
+
_event: api_proto.InputAudioBufferSpeechStoppedEvent,
|
|
982
|
+
): void {
|
|
983
|
+
this.emit('input_speech_stopped', {
|
|
984
|
+
userTranscriptionEnabled: this.oaiRealtimeModel._options.inputAudioTranscription !== null,
|
|
985
|
+
} as llm.InputSpeechStoppedEvent);
|
|
896
986
|
}
|
|
897
987
|
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
988
|
+
private handleResponseCreated(event: api_proto.ResponseCreatedEvent): void {
|
|
989
|
+
if (!event.response.id) {
|
|
990
|
+
throw new Error('response.id is missing');
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
this.currentGeneration = {
|
|
994
|
+
messageChannel: stream.createStreamChannel<llm.MessageGeneration>(),
|
|
995
|
+
functionChannel: stream.createStreamChannel<llm.FunctionCall>(),
|
|
996
|
+
messages: new Map(),
|
|
997
|
+
_doneFut: new Future(),
|
|
998
|
+
_createdTimestamp: Date.now(),
|
|
999
|
+
};
|
|
903
1000
|
|
|
904
|
-
|
|
905
|
-
#handleSessionUpdated(event: api_proto.SessionUpdatedEvent): void {}
|
|
1001
|
+
if (!event.response.metadata || !event.response.metadata.client_event_id) return;
|
|
906
1002
|
|
|
907
|
-
|
|
908
|
-
|
|
1003
|
+
const handle = this.responseCreatedFutures[event.response.metadata.client_event_id];
|
|
1004
|
+
if (handle) {
|
|
1005
|
+
delete this.responseCreatedFutures[event.response.metadata.client_event_id];
|
|
909
1006
|
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
1007
|
+
// set key to the response id
|
|
1008
|
+
this.responseCreatedFutures[event.response.id] = handle;
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
// the generation_created event is emitted when
|
|
1012
|
+
// 1. the response is not a message on response.output_item.added event
|
|
1013
|
+
// 2. the content is audio on response.content_part.added event
|
|
1014
|
+
// will try to recover from text response on response.content_part.done event
|
|
1015
|
+
this.emit('generation_created', {
|
|
1016
|
+
messageStream: this.currentGeneration.messageChannel.stream(),
|
|
1017
|
+
functionStream: this.currentGeneration.functionChannel.stream(),
|
|
1018
|
+
userInitiated: false,
|
|
1019
|
+
} as GenerationCreatedEvent);
|
|
914
1020
|
}
|
|
915
1021
|
|
|
916
|
-
|
|
917
|
-
|
|
1022
|
+
private handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
|
|
1023
|
+
if (!this.currentGeneration) {
|
|
1024
|
+
throw new Error('currentGeneration is not set');
|
|
1025
|
+
}
|
|
918
1026
|
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
}
|
|
1027
|
+
if (!event.item.type) {
|
|
1028
|
+
throw new Error('item.type is not set');
|
|
1029
|
+
}
|
|
1030
|
+
|
|
1031
|
+
if (!event.response_id) {
|
|
1032
|
+
throw new Error('response_id is not set');
|
|
1033
|
+
}
|
|
1034
|
+
|
|
1035
|
+
const itemType = event.item.type;
|
|
1036
|
+
const responseId = event.response_id;
|
|
1037
|
+
|
|
1038
|
+
if (itemType !== 'message') {
|
|
1039
|
+
// emit immediately if it's not a message, otherwise wait response.content_part.added
|
|
1040
|
+
this.emitGenerationEvent(responseId);
|
|
1041
|
+
this.textModeRecoveryRetries = 0;
|
|
1042
|
+
return;
|
|
1043
|
+
}
|
|
926
1044
|
}
|
|
927
1045
|
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
1046
|
+
private handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {
|
|
1047
|
+
if (!event.item.id) {
|
|
1048
|
+
throw new Error('item.id is not set');
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
try {
|
|
1052
|
+
this.remoteChatCtx.insert(event.previous_item_id, openAIItemToLivekitItem(event.item));
|
|
1053
|
+
} catch (error) {
|
|
1054
|
+
this.#logger.error({ error, itemId: event.item.id }, 'failed to insert conversation item');
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
const fut = this.itemCreateFutures[event.item.id];
|
|
1058
|
+
if (fut) {
|
|
1059
|
+
fut.resolve();
|
|
1060
|
+
delete this.itemCreateFutures[event.item.id];
|
|
1061
|
+
}
|
|
933
1062
|
}
|
|
934
1063
|
|
|
935
|
-
|
|
936
|
-
|
|
1064
|
+
/**
 * Mirrors a server-side `conversation.item.deleted` event into the remote
 * chat context and resolves the pending delete future for that item, if any.
 *
 * Deletion failures are logged rather than rethrown, so the pending future
 * is still resolved afterwards.
 *
 * @throws Error when the event carries no `item_id`.
 */
private handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {
  const itemId = event.item_id;
  if (!itemId) {
    throw new Error('item_id is not set');
  }

  try {
    this.remoteChatCtx.delete(itemId);
  } catch (error) {
    this.#logger.error({ error, itemId }, 'failed to delete conversation item');
  }

  const pending = this.itemDeleteFutures[itemId];
  if (!pending) {
    return;
  }
  pending.resolve();
  delete this.itemDeleteFutures[itemId];
}
|
|
1081
|
+
|
|
1082
|
+
/**
 * Appends the finished input-audio transcript to the matching remote chat
 * message and emits `input_audio_transcription_completed`.
 *
 * Events for items that are not in the remote chat context are ignored.
 *
 * @throws Error when the matching remote item is not a chat message.
 */
private handleConversationItemInputAudioTranscriptionCompleted(
  event: api_proto.ConversationItemInputAudioTranscriptionCompletedEvent,
): void {
  const { item_id: itemId, transcript } = event;

  const remoteItem = this.remoteChatCtx.get(itemId);
  if (!remoteItem) {
    return;
  }

  const chatItem = remoteItem.item;
  if (!(chatItem instanceof llm.ChatMessage)) {
    throw new Error('item is not a chat message');
  }
  chatItem.content.push(transcript);

  this.emit('input_audio_transcription_completed', {
    itemId,
    transcript,
    isFinal: true,
  } as llm.InputTranscriptionCompleted);
}
|
|
947
1103
|
|
|
948
|
-
|
|
1104
|
+
/**
 * Logs an input-audio transcription failure reported by the server.
 * Nothing is emitted or rethrown.
 */
private handleConversationItemInputAudioTranscriptionFailed(
  event: api_proto.ConversationItemInputAudioTranscriptionFailedEvent,
): void {
  const { error } = event;
  this.#logger.error({ error }, 'OpenAI Realtime API failed to transcribe input audio');
}
|
|
958
1112
|
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
#handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {}
|
|
964
|
-
|
|
965
|
-
#handleResponseCreated(responseCreated: api_proto.ResponseCreatedEvent): void {
|
|
966
|
-
const response = responseCreated.response;
|
|
967
|
-
const doneFut = new Future();
|
|
968
|
-
const newResponse: RealtimeResponse = {
|
|
969
|
-
id: response.id,
|
|
970
|
-
status: response.status,
|
|
971
|
-
statusDetails: response.status_details,
|
|
972
|
-
usage: null,
|
|
973
|
-
output: [],
|
|
974
|
-
doneFut: doneFut,
|
|
975
|
-
createdTimestamp: Date.now(),
|
|
976
|
-
};
|
|
977
|
-
this.#pendingResponses[newResponse.id] = newResponse;
|
|
978
|
-
this.emit('response_created', newResponse);
|
|
979
|
-
}
|
|
1113
|
+
/**
 * Handles a content part being added to a response message.
 *
 * For an audio part: emits the generation event, opens text and audio
 * channels for the message, publishes them on the generation's message
 * channel, and records the first-token timestamp.
 * For any other part type: interrupts the session, and warns if no
 * text-mode recovery is already in progress.
 *
 * @throws Error when there is no current generation.
 */
private handleResponseContentPartAdded(event: api_proto.ResponseContentPartAddedEvent): void {
  if (!this.currentGeneration) {
    throw new Error('currentGeneration is not set');
  }

  const { item_id: itemId, response_id: responseId } = event;
  const partType = event.part.type;

  if (partType !== 'audio') {
    // text-only part: interrupt, and warn only on the first occurrence
    this.interrupt();
    if (this.textModeRecoveryRetries === 0) {
      this.#logger.warn({ responseId }, 'received text-only response from OpenAI Realtime API');
    }
    return;
  }

  this.emitGenerationEvent(responseId);

  const retries = this.textModeRecoveryRetries;
  if (retries > 0) {
    this.#logger.info({ retries }, 'recovered from text-only response');
    this.textModeRecoveryRetries = 0;
  }

  const itemGeneration: MessageGeneration = {
    messageId: itemId,
    textChannel: stream.createStreamChannel<string>(),
    audioChannel: stream.createStreamChannel<AudioFrame>(),
    audioTranscript: '',
  };

  this.currentGeneration.messageChannel.write({
    messageId: itemId,
    textStream: itemGeneration.textChannel.stream(),
    audioStream: itemGeneration.audioChannel.stream(),
  });

  this.currentGeneration.messages.set(itemId, itemGeneration);
  this.currentGeneration._firstTokenTimestamp = Date.now();
}
|
|
1023
1155
|
|
|
1024
|
-
|
|
1025
|
-
if (
|
|
1026
|
-
|
|
1156
|
+
/**
 * Handles completion of a response content part. Only text parts are
 * inspected, and for now the only effect is the generation sanity check.
 *
 * @throws Error when a text part completes while no generation is active.
 */
private handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
  const isTextPart = event.part.type === 'text';
  if (isTextPart && !this.currentGeneration) {
    throw new Error('currentGeneration is not set');
  }

  // TODO(shubhra): handle text mode recovery
}
|
|
1060
1167
|
|
|
1061
|
-
|
|
1062
|
-
|
|
1168
|
+
/**
 * Forwards an audio-transcript delta to the message's text channel and
 * appends it to the accumulated transcript for that message.
 *
 * @throws Error when there is no current generation, or no message
 *   generation is registered for the event's item id.
 */
private handleResponseAudioTranscriptDelta(
  event: api_proto.ResponseAudioTranscriptDeltaEvent,
): void {
  if (!this.currentGeneration) {
    throw new Error('currentGeneration is not set');
  }

  const { item_id: itemId, delta } = event;

  // TODO (shubhra): add timed string support

  const itemGeneration = this.currentGeneration.messages.get(itemId);
  if (!itemGeneration) {
    throw new Error('itemGeneration is not set');
  }

  itemGeneration.textChannel.write(delta);
  itemGeneration.audioTranscript += delta;
}
|
|
1071
1188
|
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
}
|
|
1081
|
-
|
|
1082
|
-
|
|
1189
|
+
/**
 * Decodes a base64 audio delta and writes it to the message's audio channel
 * as an `AudioFrame` of 16-bit samples.
 *
 * @throws Error when there is no current generation, or no message
 *   generation is registered for the event's item id.
 */
private handleResponseAudioDelta(event: api_proto.ResponseAudioDeltaEvent): void {
  if (!this.currentGeneration) {
    throw new Error('currentGeneration is not set');
  }

  const itemGeneration = this.currentGeneration.messages.get(event.item_id);
  if (!itemGeneration) {
    throw new Error('itemGeneration is not set');
  }

  // atob yields one character per decoded byte; map each back to its byte value
  const pcmBytes = Uint8Array.from(atob(event.delta), (ch) => ch.charCodeAt(0));

  // reinterpret the bytes as 16-bit samples (two bytes per sample)
  const frame = new AudioFrame(
    new Int16Array(pcmBytes.buffer),
    api_proto.SAMPLE_RATE,
    api_proto.NUM_CHANNELS,
    pcmBytes.length / 2,
  );
  itemGeneration.audioChannel.write(frame);
}
|
|
1084
1215
|
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1216
|
+
/**
 * Handles the end of an audio transcript. The event payload is unused; this
 * only checks that a generation is active.
 *
 * @throws Error when there is no current generation.
 */
private handleResponseAudioTranscriptDone(
  _event: api_proto.ResponseAudioTranscriptDoneEvent,
): void {
  if (this.currentGeneration) {
    return;
  }
  throw new Error('currentGeneration is not set');
}
|
|
1090
1223
|
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1224
|
+
/**
 * Handles the end of a response's audio. The event payload is unused; this
 * only checks that a generation is active.
 *
 * @throws Error when there is no current generation.
 */
private handleResponseAudioDone(_event: api_proto.ResponseAudioDoneEvent): void {
  if (this.currentGeneration) {
    return;
  }
  throw new Error('currentGeneration is not set');
}
|
|
1229
|
+
|
|
1230
|
+
/**
 * Handles completion of a response output item.
 *
 * A finished function call is written to the generation's function channel.
 * A finished message has its text and audio channels closed. Other item
 * types are ignored.
 *
 * @throws Error when there is no current generation, or when a function-call
 *   item is missing its call id, name or arguments.
 */
private handleResponseOutputItemDone(event: api_proto.ResponseOutputItemDoneEvent): void {
  if (!this.currentGeneration) {
    throw new Error('currentGeneration is not set');
  }

  const itemId = event.item.id;

  switch (event.item.type) {
    case 'function_call': {
      const item = event.item;
      const isComplete = item.call_id && item.name && item.arguments;
      if (!isComplete) {
        throw new Error('item is not a function call');
      }
      this.currentGeneration.functionChannel.write({
        callId: item.call_id,
        name: item.name,
        args: item.arguments,
      } as llm.FunctionCall);
      break;
    }
    case 'message': {
      // text response doesn't have itemGeneration
      const itemGeneration = this.currentGeneration.messages.get(itemId);
      if (itemGeneration) {
        itemGeneration.textChannel.close();
        itemGeneration.audioChannel.close();
      }
      break;
    }
  }
}
|
|
1107
1258
|
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1259
|
+
/**
 * Finalizes the current generation when the server reports `response.done`.
 *
 * Closes every per-message channel plus the function and message channels,
 * appends each message's accumulated audio transcript to the matching chat
 * message in the remote chat context, resolves the generation's done future,
 * clears `currentGeneration`, and emits `metrics_collected`.
 *
 * All time values in the emitted metrics (`timestamp`, `ttft`, `duration`)
 * are in seconds; `ttft` is -1 when no first-token timestamp was recorded.
 *
 * @param _event - The `response.done` event; its `response` supplies usage,
 *   id and status for the metrics.
 */
private handleResponseDone(_event: api_proto.ResponseDoneEvent): void {
  if (!this.currentGeneration) {
    // OpenAI has a race condition where we could receive response.done without any
    // previous response.created (This happens generally during interruption)
    return;
  }

  // Capture the timestamps now: currentGeneration is cleared before metrics are built.
  const createdTimestamp = this.currentGeneration._createdTimestamp;
  const firstTokenTimestamp = this.currentGeneration._firstTokenTimestamp;

  this.#logger.debug(
    {
      messageCount: this.currentGeneration.messages.size,
    },
    'Closing generation channels in handleResponseDone',
  );

  for (const generation of this.currentGeneration.messages.values()) {
    generation.textChannel.close();
    generation.audioChannel.close();
  }

  this.currentGeneration.functionChannel.close();
  this.currentGeneration.messageChannel.close();

  // Iterate entries so the generation is available without a non-null assertion.
  for (const [itemId, generation] of this.currentGeneration.messages) {
    const remoteItem = this.remoteChatCtx.get(itemId);
    if (remoteItem && remoteItem.item instanceof llm.ChatMessage) {
      remoteItem.item.content.push(generation.audioTranscript);
    }
  }

  this.currentGeneration._doneFut.resolve();
  this.currentGeneration = undefined;

  // Calculate and emit metrics
  const usage = _event.response.usage;
  // Timestamps come from Date.now() (milliseconds); convert to seconds so ttft
  // uses the same unit as timestamp and duration below.
  const ttft = firstTokenTimestamp ? (firstTokenTimestamp - createdTimestamp) / 1000 : -1;
  const duration = (Date.now() - createdTimestamp) / 1000; // Convert to seconds

  const realtimeMetrics: metrics.RealtimeModelMetrics = {
    type: 'realtime_model_metrics',
    timestamp: createdTimestamp / 1000, // Convert to seconds
    requestId: _event.response.id || '',
    ttft,
    duration,
    cancelled: _event.response.status === 'cancelled',
    label: 'openai_realtime',
    inputTokens: usage?.input_tokens ?? 0,
    outputTokens: usage?.output_tokens ?? 0,
    totalTokens: usage?.total_tokens ?? 0,
    // Guard against a zero-length duration to avoid dividing by zero.
    tokensPerSecond: duration > 0 ? (usage?.output_tokens ?? 0) / duration : 0,
    inputTokenDetails: {
      audioTokens: usage?.input_token_details?.audio_tokens ?? 0,
      textTokens: usage?.input_token_details?.text_tokens ?? 0,
      imageTokens: 0, // Not supported yet
      cachedTokens: usage?.input_token_details?.cached_tokens ?? 0,
      cachedTokensDetails: usage?.input_token_details?.cached_tokens_details
        ? {
            audioTokens: usage?.input_token_details?.cached_tokens_details?.audio_tokens ?? 0,
            textTokens: usage?.input_token_details?.cached_tokens_details?.text_tokens ?? 0,
            imageTokens: usage?.input_token_details?.cached_tokens_details?.image_tokens ?? 0,
          }
        : undefined,
    },
    outputTokenDetails: {
      textTokens: usage?.output_token_details?.text_tokens ?? 0,
      audioTokens: usage?.output_token_details?.audio_tokens ?? 0,
      imageTokens: 0,
    },
  };

  this.emit('metrics_collected', realtimeMetrics);
  // TODO(brian): handle response done but not complete
}
|
|
1173
1334
|
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1335
|
+
/**
 * Handles an `error` event from the Realtime API.
 *
 * Errors whose message starts with "Cancellation failed" are ignored. Every
 * other error is logged and re-emitted as a retryable, recoverable
 * `APIError`.
 */
private handleError(event: api_proto.ErrorEvent): void {
  const apiError = event.error;
  if (apiError.message.startsWith('Cancellation failed')) {
    return;
  }

  this.#logger.error({ error: apiError }, 'OpenAI Realtime API returned an error');

  const wrapped = new APIError(apiError.message, {
    body: apiError,
    retryable: true,
  });
  this.emitError({ error: wrapped, recoverable: true });

  // TODO(brian): set error for response future if it exists
}
|
|
1178
1351
|
|
|
1179
|
-
|
|
1180
|
-
|
|
1352
|
+
/**
 * Emits an `error` event describing a realtime model failure.
 *
 * An `'error'` event emitted with no listener attached makes `emit` throw,
 * so when nobody is listening the error is logged and dropped instead.
 *
 * @param error - The underlying error to report.
 * @param recoverable - Whether the session can continue after this error.
 */
private emitError({ error, recoverable }: { error: Error; recoverable: boolean }): void {
  // IMPORTANT: only emit error if there are listeners; otherwise emit will throw an error
  if (this.listenerCount('error') === 0) {
    this.#logger.warn(
      { error, recoverable },
      'no error listeners registered, dropping realtime model error event',
    );
    return;
  }

  this.emit('error', {
    timestamp: Date.now(),
    // TODO(brian): add label
    label: '',
    error,
    recoverable,
  } as llm.RealtimeModelError);
}
|
|
1182
1362
|
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
content.text = event.text;
|
|
1186
|
-
this.emit('response_text_done', event);
|
|
1363
|
+
/**
 * Pass-through generator: yields the given frame unchanged, exactly once.
 * No resampling is performed here.
 */
private *resampleAudio(frame: AudioFrame): Generator<AudioFrame> {
  yield* [frame];
}
|
|
1188
1366
|
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1367
|
+
/**
 * Sends a `response.create` event and returns the handle tracking it.
 *
 * An existing handle is reused when `oldHandle` is given (its instructions
 * are overwritten only when new ones are supplied); otherwise a new handle
 * is created. User-initiated requests are registered in
 * `responseCreatedFutures` under the generated event id, which is also sent
 * as `metadata.client_event_id`. The `response` payload is omitted when it
 * would be empty.
 *
 * @returns The handle associated with this request.
 */
private createResponse({
  userInitiated,
  instructions,
  oldHandle,
}: {
  userInitiated: boolean;
  instructions?: string;
  oldHandle?: CreateResponseHandle;
}): CreateResponseHandle {
  let handle: CreateResponseHandle;
  if (oldHandle) {
    handle = oldHandle;
    if (instructions) {
      handle.instructions = instructions;
    }
  } else {
    handle = new CreateResponseHandle({ instructions });
  }

  const eventId = shortuuid('response_create_');
  const response: api_proto.ResponseCreateEvent['response'] = {};

  if (instructions) {
    response.instructions = instructions;
  }
  if (userInitiated) {
    this.responseCreatedFutures[eventId] = handle;
    response.metadata = { client_event_id: eventId };
  }

  const hasOverrides = Object.keys(response).length > 0;
  this.sendEvent({
    type: 'response.create',
    event_id: eventId,
    response: hasOverrides ? response : undefined,
  });

  return handle;
}
|
|
1196
1398
|
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1399
|
+
/**
 * Builds and emits the `generation_created` event for the current
 * generation.
 *
 * If a handle is registered in `responseCreatedFutures` under `responseId`,
 * it is removed, the event is marked user-initiated, and the handle's future
 * is resolved with the event (or a warning is logged if it is already done).
 *
 * NOTE(review): createResponse registers handles under the client event id,
 * while this lookup uses the response id — confirm the keys line up.
 *
 * @throws Error when there is no current generation.
 */
private emitGenerationEvent(responseId: string): void {
  if (!this.currentGeneration) {
    throw new Error('currentGeneration is not set');
  }

  const { messageChannel, functionChannel } = this.currentGeneration;
  const generationEvent: llm.GenerationCreatedEvent = {
    messageStream: messageChannel.stream(),
    functionStream: functionChannel.stream(),
    userInitiated: false,
  };

  const pendingHandle = this.responseCreatedFutures[responseId];
  if (pendingHandle) {
    delete this.responseCreatedFutures[responseId];
    generationEvent.userInitiated = true;

    const alreadySettled = pendingHandle.doneFut.done;
    if (alreadySettled) {
      this.#logger.warn({ responseId }, 'response received after timeout');
    } else {
      pendingHandle.doneFut.resolve(generationEvent);
    }
  }

  this.#logger.debug({ responseId }, 'Emitting generation_created event');
  this.emit('generation_created', generationEvent);
}
|
|
1424
|
+
}
|
|
1201
1425
|
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1426
|
+
/**
 * Converts a LiveKit chat item into the OpenAI Realtime item shape.
 *
 * - Function calls and function-call outputs map field-for-field.
 * - Messages: the `developer` role is sent as `system`. String content
 *   becomes `text` for assistant messages and `input_text` otherwise. Audio
 *   content is forwarded only for user messages, as base64 of the raw sample
 *   bytes. Image content is skipped (not supported for now).
 *
 * @param item - The chat item to convert.
 * @returns The equivalent Realtime API item.
 */
function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
  switch (item.type) {
    case 'function_call':
      return {
        id: item.id,
        type: 'function_call',
        call_id: item.callId,
        name: item.name,
        arguments: item.args,
      } as api_proto.FunctionCallItem;
    case 'function_call_output':
      return {
        id: item.id,
        type: 'function_call_output',
        call_id: item.callId,
        output: item.output,
      } as api_proto.FunctionCallOutputItem;
    case 'message': {
      const role = item.role === 'developer' ? 'system' : item.role;
      const contentList: api_proto.Content[] = [];
      for (const c of item.content) {
        if (typeof c === 'string') {
          contentList.push({
            type: role === 'assistant' ? 'text' : 'input_text',
            text: c,
          } as api_proto.InputTextContent);
        } else if (c.type === 'image_content') {
          // not supported for now
          continue;
        } else if (c.type === 'audio_content') {
          if (role === 'user') {
            const samples = combineAudioFrames(c.frame).data;
            // Encode the underlying bytes. Buffer.from(typedArray) would copy each
            // 16-bit sample truncated to a single byte and corrupt the audio.
            const encodedAudio = Buffer.from(
              samples.buffer,
              samples.byteOffset,
              samples.byteLength,
            ).toString('base64');
            contentList.push({
              type: 'input_audio',
              audio: encodedAudio,
            } as api_proto.InputAudioContent);
          }
        }
      }
      return {
        id: item.id,
        type: 'message',
        role,
        content: contentList,
      } as api_proto.UserItem;
    }
  }
}
|
|
1212
1473
|
|
|
1213
|
-
|
|
1474
|
+
/**
 * Converts an OpenAI Realtime item into the matching LiveKit chat item.
 *
 * Function calls and function-call outputs map field-for-field (outputs are
 * always created with `isError: false`). For messages, only `text` and
 * `input_text` content parts are kept, as plain strings.
 *
 * @param item - The Realtime API item to convert.
 * @returns The equivalent chat item.
 * @throws Error when the item has no id.
 */
function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
  if (!item.id) {
    throw new Error('item.id is not set');
  }

  switch (item.type) {
    case 'function_call':
      return llm.FunctionCall.create({
        id: item.id,
        callId: item.call_id,
        name: item.name,
        args: item.arguments,
      });
    case 'function_call_output':
      return llm.FunctionCallOutput.create({
        id: item.id,
        callId: item.call_id,
        output: item.output,
        isError: false,
      });
    case 'message': {
      // item.content can be a single object or an array; normalize to array
      const parts = Array.isArray(item.content) ? item.content : [item.content];
      const content: llm.ChatContent[] = [];
      for (const part of parts) {
        if (part.type !== 'text' && part.type !== 'input_text') {
          continue;
        }
        content.push(part.text);
      }
      return llm.ChatMessage.create({
        id: item.id,
        role: item.role,
        content,
      });
    }
  }
}
|
|
1220
1510
|
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1511
|
+
/**
 * Builds a user chat message containing a single silent audio frame.
 *
 * The frame holds `durationSeconds * SAMPLE_RATE` zeroed 16-bit samples per
 * channel, so it actually lasts `durationSeconds` at `SAMPLE_RATE`. The
 * sample count is rounded so fractional durations cannot produce a partial
 * sample.
 *
 * @param durationSeconds - Length of the silent audio in seconds (default 2).
 * @returns A user `ChatMessage` whose id starts with `MOCK_AUDIO_ID_PREFIX`.
 */
function createMockAudioItem(durationSeconds: number = 2): llm.ChatMessage {
  // Size by sample count, not bytes: an Int16Array of N elements is N samples.
  const samplesPerChannel = Math.round(durationSeconds * SAMPLE_RATE);
  const silence = new Int16Array(samplesPerChannel * NUM_CHANNELS);

  return llm.ChatMessage.create({
    id: shortuuid(MOCK_AUDIO_ID_PREFIX),
    role: 'user',
    content: [
      {
        type: 'audio_content',
        frame: [new AudioFrame(silence, SAMPLE_RATE, NUM_CHANNELS, samplesPerChannel)],
      } as llm.AudioContent,
    ],
  });
}
|
|
1531
|
+
|
|
1532
|
+
/**
 * Maps a LiveKit tool choice onto the Realtime API tool-choice value.
 *
 * String choices pass through unchanged, a function choice is reduced to the
 * function's name, and anything else (including `undefined`) becomes `'auto'`.
 */
function toOaiToolChoice(toolChoice?: llm.ToolChoice): api_proto.ToolChoice {
  if (typeof toolChoice === 'string') {
    return toolChoice;
  }

  return toolChoice?.type === 'function' ? toolChoice.function.name : 'auto';
}
|