@livekit/agents-plugin-openai 0.3.5 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +22 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/llm.d.ts +195 -0
- package/dist/llm.d.ts.map +1 -0
- package/dist/llm.js +453 -0
- package/dist/llm.js.map +1 -0
- package/dist/models.d.ts +10 -0
- package/dist/models.d.ts.map +1 -0
- package/dist/models.js +5 -0
- package/dist/models.js.map +1 -0
- package/dist/realtime/api_proto.d.ts +1 -1
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/realtime_model.d.ts +8 -3
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +601 -459
- package/dist/realtime/realtime_model.js.map +1 -1
- package/package.json +5 -3
- package/src/index.ts +2 -0
- package/src/llm.ts +670 -0
- package/src/models.ts +107 -0
- package/src/realtime/api_proto.ts +1 -1
- package/src/realtime/realtime_model.ts +152 -15
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -1,53 +1,41 @@
|
|
|
1
|
-
var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
|
|
2
|
-
if (kind === "m") throw new TypeError("Private method is not writable");
|
|
3
|
-
if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a setter");
|
|
4
|
-
if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot write private member to an object whose class did not declare it");
|
|
5
|
-
return (kind === "a" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;
|
|
6
|
-
};
|
|
7
|
-
var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
|
|
8
|
-
if (kind === "a" && !f) throw new TypeError("Private accessor was defined without a getter");
|
|
9
|
-
if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError("Cannot read private member from an object whose class did not declare it");
|
|
10
|
-
return kind === "m" ? f : kind === "a" ? f.call(receiver) : f ? f.value : state.get(receiver);
|
|
11
|
-
};
|
|
12
|
-
var _InputAudioBuffer_session, _ConversationItem_session, _Conversation_session, _Response_session, _RealtimeModel_defaultOpts, _RealtimeModel_sessions, _RealtimeSession_instances, _RealtimeSession_fncCtx, _RealtimeSession_opts, _RealtimeSession_pendingResponses, _RealtimeSession_sessionId, _RealtimeSession_ws, _RealtimeSession_expiresAt, _RealtimeSession_logger, _RealtimeSession_task, _RealtimeSession_closing, _RealtimeSession_sendQueue, _RealtimeSession_loggableEvent, _RealtimeSession_start, _RealtimeSession_getContent, _RealtimeSession_handleError, _RealtimeSession_handleSessionCreated, _RealtimeSession_handleSessionUpdated, _RealtimeSession_handleConversationCreated, _RealtimeSession_handleInputAudioBufferCommitted, _RealtimeSession_handleInputAudioBufferCleared, _RealtimeSession_handleInputAudioBufferSpeechStarted, _RealtimeSession_handleInputAudioBufferSpeechStopped, _RealtimeSession_handleConversationItemCreated, _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted, _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed, _RealtimeSession_handleConversationItemTruncated, _RealtimeSession_handleConversationItemDeleted, _RealtimeSession_handleResponseCreated, _RealtimeSession_handleResponseDone, _RealtimeSession_handleResponseOutputItemAdded, _RealtimeSession_handleResponseOutputItemDone, _RealtimeSession_handleResponseContentPartAdded, _RealtimeSession_handleResponseContentPartDone, _RealtimeSession_handleResponseTextDelta, _RealtimeSession_handleResponseTextDone, _RealtimeSession_handleResponseAudioTranscriptDelta, _RealtimeSession_handleResponseAudioTranscriptDone, _RealtimeSession_handleResponseAudioDelta, _RealtimeSession_handleResponseAudioDone, _RealtimeSession_handleResponseFunctionCallArgumentsDelta, _RealtimeSession_handleResponseFunctionCallArgumentsDone, _RealtimeSession_handleRateLimitsUpdated;
|
|
13
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
14
2
|
//
|
|
15
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
16
|
-
import { AsyncIterableQueue, Future, Queue, llm, log, multimodal } from '@livekit/agents';
|
|
4
|
+
import { AsyncIterableQueue, Future, Queue, llm, log, mergeFrames, multimodal, } from '@livekit/agents';
|
|
17
5
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
18
|
-
import { once } from 'events';
|
|
6
|
+
import { once } from 'node:events';
|
|
19
7
|
import { WebSocket } from 'ws';
|
|
20
8
|
import * as api_proto from './api_proto.js';
|
|
21
9
|
class InputAudioBuffer {
|
|
10
|
+
#session;
|
|
22
11
|
constructor(session) {
|
|
23
|
-
|
|
24
|
-
__classPrivateFieldSet(this, _InputAudioBuffer_session, session, "f");
|
|
12
|
+
this.#session = session;
|
|
25
13
|
}
|
|
26
14
|
append(frame) {
|
|
27
|
-
|
|
15
|
+
this.#session.queueMsg({
|
|
28
16
|
type: 'input_audio_buffer.append',
|
|
29
17
|
audio: Buffer.from(frame.data.buffer).toString('base64'),
|
|
30
18
|
});
|
|
31
19
|
}
|
|
32
20
|
clear() {
|
|
33
|
-
|
|
21
|
+
this.#session.queueMsg({
|
|
34
22
|
type: 'input_audio_buffer.clear',
|
|
35
23
|
});
|
|
36
24
|
}
|
|
37
25
|
commit() {
|
|
38
|
-
|
|
26
|
+
this.#session.queueMsg({
|
|
39
27
|
type: 'input_audio_buffer.commit',
|
|
40
28
|
});
|
|
41
29
|
}
|
|
42
30
|
}
|
|
43
|
-
_InputAudioBuffer_session = new WeakMap();
|
|
44
31
|
class ConversationItem {
|
|
32
|
+
#session;
|
|
33
|
+
#logger = log();
|
|
45
34
|
constructor(session) {
|
|
46
|
-
|
|
47
|
-
__classPrivateFieldSet(this, _ConversationItem_session, session, "f");
|
|
35
|
+
this.#session = session;
|
|
48
36
|
}
|
|
49
37
|
truncate(itemId, contentIndex, audioEnd) {
|
|
50
|
-
|
|
38
|
+
this.#session.queueMsg({
|
|
51
39
|
type: 'conversation.item.truncate',
|
|
52
40
|
item_id: itemId,
|
|
53
41
|
content_index: contentIndex,
|
|
@@ -55,48 +43,161 @@ class ConversationItem {
|
|
|
55
43
|
});
|
|
56
44
|
}
|
|
57
45
|
delete(itemId) {
|
|
58
|
-
|
|
46
|
+
this.#session.queueMsg({
|
|
59
47
|
type: 'conversation.item.delete',
|
|
60
48
|
item_id: itemId,
|
|
61
49
|
});
|
|
62
50
|
}
|
|
63
|
-
create(
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
51
|
+
create(message, previousItemId) {
|
|
52
|
+
if (!message.content) {
|
|
53
|
+
return;
|
|
54
|
+
}
|
|
55
|
+
let event;
|
|
56
|
+
if (message.toolCallId) {
|
|
57
|
+
if (typeof message.content !== 'string') {
|
|
58
|
+
throw new TypeError('message.content must be a string');
|
|
59
|
+
}
|
|
60
|
+
event = {
|
|
61
|
+
type: 'conversation.item.create',
|
|
62
|
+
previous_item_id: previousItemId,
|
|
63
|
+
item: {
|
|
64
|
+
type: 'function_call_output',
|
|
65
|
+
call_id: message.toolCallId,
|
|
66
|
+
output: message.content,
|
|
67
|
+
},
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
else {
|
|
71
|
+
let content = message.content;
|
|
72
|
+
if (!Array.isArray(content)) {
|
|
73
|
+
content = [content];
|
|
74
|
+
}
|
|
75
|
+
if (message.role === llm.ChatRole.USER) {
|
|
76
|
+
const contents = [];
|
|
77
|
+
for (const c of content) {
|
|
78
|
+
if (typeof c === 'string') {
|
|
79
|
+
contents.push({
|
|
80
|
+
type: 'input_text',
|
|
81
|
+
text: c,
|
|
82
|
+
});
|
|
83
|
+
}
|
|
84
|
+
else if (
|
|
85
|
+
// typescript type guard for determining ChatAudio vs ChatImage
|
|
86
|
+
((c) => {
|
|
87
|
+
return c.frame !== undefined;
|
|
88
|
+
})(c)) {
|
|
89
|
+
contents.push({
|
|
90
|
+
type: 'input_audio',
|
|
91
|
+
audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'),
|
|
92
|
+
});
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
event = {
|
|
96
|
+
type: 'conversation.item.create',
|
|
97
|
+
previous_item_id: previousItemId,
|
|
98
|
+
item: {
|
|
99
|
+
type: 'message',
|
|
100
|
+
role: 'user',
|
|
101
|
+
content: contents,
|
|
102
|
+
},
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
else if (message.role === llm.ChatRole.ASSISTANT) {
|
|
106
|
+
const contents = [];
|
|
107
|
+
for (const c of content) {
|
|
108
|
+
if (typeof c === 'string') {
|
|
109
|
+
contents.push({
|
|
110
|
+
type: 'text',
|
|
111
|
+
text: c,
|
|
112
|
+
});
|
|
113
|
+
}
|
|
114
|
+
else if (
|
|
115
|
+
// typescript type guard for determining ChatAudio vs ChatImage
|
|
116
|
+
((c) => {
|
|
117
|
+
return c.frame !== undefined;
|
|
118
|
+
})(c)) {
|
|
119
|
+
this.#logger.warn('audio content in assistant message is not supported');
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
event = {
|
|
123
|
+
type: 'conversation.item.create',
|
|
124
|
+
previous_item_id: previousItemId,
|
|
125
|
+
item: {
|
|
126
|
+
type: 'message',
|
|
127
|
+
role: 'assistant',
|
|
128
|
+
content: contents,
|
|
129
|
+
},
|
|
130
|
+
};
|
|
131
|
+
}
|
|
132
|
+
else if (message.role === llm.ChatRole.SYSTEM) {
|
|
133
|
+
const contents = [];
|
|
134
|
+
for (const c of content) {
|
|
135
|
+
if (typeof c === 'string') {
|
|
136
|
+
contents.push({
|
|
137
|
+
type: 'input_text',
|
|
138
|
+
text: c,
|
|
139
|
+
});
|
|
140
|
+
}
|
|
141
|
+
else if (
|
|
142
|
+
// typescript type guard for determining ChatAudio vs ChatImage
|
|
143
|
+
((c) => {
|
|
144
|
+
return c.frame !== undefined;
|
|
145
|
+
})(c)) {
|
|
146
|
+
this.#logger.warn('audio content in system message is not supported');
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
event = {
|
|
150
|
+
type: 'conversation.item.create',
|
|
151
|
+
previous_item_id: previousItemId,
|
|
152
|
+
item: {
|
|
153
|
+
type: 'message',
|
|
154
|
+
role: 'system',
|
|
155
|
+
content: contents,
|
|
156
|
+
},
|
|
157
|
+
};
|
|
158
|
+
}
|
|
159
|
+
else {
|
|
160
|
+
this.#logger
|
|
161
|
+
.child({ message })
|
|
162
|
+
.warn('chat message is not supported inside the realtime API');
|
|
163
|
+
return;
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
this.#session.queueMsg(event);
|
|
69
167
|
}
|
|
70
168
|
}
|
|
71
|
-
_ConversationItem_session = new WeakMap();
|
|
72
169
|
class Conversation {
|
|
170
|
+
#session;
|
|
73
171
|
constructor(session) {
|
|
74
|
-
|
|
75
|
-
__classPrivateFieldSet(this, _Conversation_session, session, "f");
|
|
172
|
+
this.#session = session;
|
|
76
173
|
}
|
|
77
174
|
get item() {
|
|
78
|
-
return new ConversationItem(
|
|
175
|
+
return new ConversationItem(this.#session);
|
|
79
176
|
}
|
|
80
177
|
}
|
|
81
|
-
_Conversation_session = new WeakMap();
|
|
82
178
|
class Response {
|
|
179
|
+
#session;
|
|
83
180
|
constructor(session) {
|
|
84
|
-
|
|
85
|
-
__classPrivateFieldSet(this, _Response_session, session, "f");
|
|
181
|
+
this.#session = session;
|
|
86
182
|
}
|
|
87
183
|
create() {
|
|
88
|
-
|
|
184
|
+
this.#session.queueMsg({
|
|
89
185
|
type: 'response.create',
|
|
90
186
|
});
|
|
91
187
|
}
|
|
92
188
|
cancel() {
|
|
93
|
-
|
|
189
|
+
this.#session.queueMsg({
|
|
94
190
|
type: 'response.cancel',
|
|
95
191
|
});
|
|
96
192
|
}
|
|
97
193
|
}
|
|
98
|
-
_Response_session = new WeakMap();
|
|
99
194
|
export class RealtimeModel extends multimodal.RealtimeModel {
|
|
195
|
+
sampleRate = api_proto.SAMPLE_RATE;
|
|
196
|
+
numChannels = api_proto.NUM_CHANNELS;
|
|
197
|
+
inFrameSize = api_proto.IN_FRAME_SIZE;
|
|
198
|
+
outFrameSize = api_proto.OUT_FRAME_SIZE;
|
|
199
|
+
#defaultOpts;
|
|
200
|
+
#sessions = [];
|
|
100
201
|
static withAzure({ baseURL, azureDeployment, apiVersion = '2024-10-01-preview', apiKey = undefined, entraToken = undefined, instructions = '', modalities = ['text', 'audio'], voice = 'alloy', inputAudioFormat = 'pcm16', outputAudioFormat = 'pcm16', inputAudioTranscription = { model: 'whisper-1' }, turnDetection = { type: 'server_vad' }, temperature = 0.8, maxResponseOutputTokens = Infinity, }) {
|
|
101
202
|
return new RealtimeModel({
|
|
102
203
|
isAzure: true,
|
|
@@ -120,16 +221,10 @@ export class RealtimeModel extends multimodal.RealtimeModel {
|
|
|
120
221
|
// used for microsoft
|
|
121
222
|
isAzure = false, apiVersion = undefined, entraToken = undefined, }) {
|
|
122
223
|
super();
|
|
123
|
-
this.sampleRate = api_proto.SAMPLE_RATE;
|
|
124
|
-
this.numChannels = api_proto.NUM_CHANNELS;
|
|
125
|
-
this.inFrameSize = api_proto.IN_FRAME_SIZE;
|
|
126
|
-
this.outFrameSize = api_proto.OUT_FRAME_SIZE;
|
|
127
|
-
_RealtimeModel_defaultOpts.set(this, void 0);
|
|
128
|
-
_RealtimeModel_sessions.set(this, []);
|
|
129
224
|
if (apiKey === '') {
|
|
130
225
|
throw new Error('OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable');
|
|
131
226
|
}
|
|
132
|
-
|
|
227
|
+
this.#defaultOpts = {
|
|
133
228
|
modalities,
|
|
134
229
|
instructions,
|
|
135
230
|
voice,
|
|
@@ -145,12 +240,12 @@ export class RealtimeModel extends multimodal.RealtimeModel {
|
|
|
145
240
|
isAzure,
|
|
146
241
|
apiVersion,
|
|
147
242
|
entraToken,
|
|
148
|
-
}
|
|
243
|
+
};
|
|
149
244
|
}
|
|
150
245
|
get sessions() {
|
|
151
|
-
return
|
|
246
|
+
return this.#sessions;
|
|
152
247
|
}
|
|
153
|
-
session({ fncCtx, modalities =
|
|
248
|
+
session({ fncCtx, chatCtx, modalities = this.#defaultOpts.modalities, instructions = this.#defaultOpts.instructions, voice = this.#defaultOpts.voice, inputAudioFormat = this.#defaultOpts.inputAudioFormat, outputAudioFormat = this.#defaultOpts.outputAudioFormat, inputAudioTranscription = this.#defaultOpts.inputAudioTranscription, turnDetection = this.#defaultOpts.turnDetection, temperature = this.#defaultOpts.temperature, maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens, }) {
|
|
154
249
|
const opts = {
|
|
155
250
|
modalities,
|
|
156
251
|
instructions,
|
|
@@ -161,57 +256,63 @@ export class RealtimeModel extends multimodal.RealtimeModel {
|
|
|
161
256
|
turnDetection,
|
|
162
257
|
temperature,
|
|
163
258
|
maxResponseOutputTokens,
|
|
164
|
-
model:
|
|
165
|
-
apiKey:
|
|
166
|
-
baseURL:
|
|
167
|
-
isAzure:
|
|
168
|
-
apiVersion:
|
|
169
|
-
entraToken:
|
|
259
|
+
model: this.#defaultOpts.model,
|
|
260
|
+
apiKey: this.#defaultOpts.apiKey,
|
|
261
|
+
baseURL: this.#defaultOpts.baseURL,
|
|
262
|
+
isAzure: this.#defaultOpts.isAzure,
|
|
263
|
+
apiVersion: this.#defaultOpts.apiVersion,
|
|
264
|
+
entraToken: this.#defaultOpts.entraToken,
|
|
170
265
|
};
|
|
171
|
-
const newSession = new RealtimeSession(opts,
|
|
172
|
-
|
|
266
|
+
const newSession = new RealtimeSession(opts, {
|
|
267
|
+
chatCtx: chatCtx || new llm.ChatContext(),
|
|
268
|
+
fncCtx,
|
|
269
|
+
});
|
|
270
|
+
this.#sessions.push(newSession);
|
|
173
271
|
return newSession;
|
|
174
272
|
}
|
|
175
273
|
async close() {
|
|
176
|
-
await Promise.allSettled(
|
|
274
|
+
await Promise.allSettled(this.#sessions.map((session) => session.close()));
|
|
177
275
|
}
|
|
178
276
|
}
|
|
179
|
-
_RealtimeModel_defaultOpts = new WeakMap(), _RealtimeModel_sessions = new WeakMap();
|
|
180
277
|
export class RealtimeSession extends multimodal.RealtimeSession {
|
|
181
|
-
|
|
278
|
+
#chatCtx = undefined;
|
|
279
|
+
#fncCtx = undefined;
|
|
280
|
+
#opts;
|
|
281
|
+
#pendingResponses = {};
|
|
282
|
+
#sessionId = 'not-connected';
|
|
283
|
+
#ws = null;
|
|
284
|
+
#expiresAt = null;
|
|
285
|
+
#logger = log();
|
|
286
|
+
#task;
|
|
287
|
+
#closing = true;
|
|
288
|
+
#sendQueue = new Queue();
|
|
289
|
+
constructor(opts, { fncCtx, chatCtx }) {
|
|
182
290
|
super();
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
_RealtimeSession_sessionId.set(this, 'not-connected');
|
|
188
|
-
_RealtimeSession_ws.set(this, null);
|
|
189
|
-
_RealtimeSession_expiresAt.set(this, null);
|
|
190
|
-
_RealtimeSession_logger.set(this, log());
|
|
191
|
-
_RealtimeSession_task.set(this, void 0);
|
|
192
|
-
_RealtimeSession_closing.set(this, true);
|
|
193
|
-
_RealtimeSession_sendQueue.set(this, new Queue());
|
|
194
|
-
__classPrivateFieldSet(this, _RealtimeSession_opts, opts, "f");
|
|
195
|
-
__classPrivateFieldSet(this, _RealtimeSession_fncCtx, fncCtx, "f");
|
|
196
|
-
__classPrivateFieldSet(this, _RealtimeSession_task, __classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_start).call(this), "f");
|
|
291
|
+
this.#opts = opts;
|
|
292
|
+
this.#chatCtx = chatCtx;
|
|
293
|
+
this.#fncCtx = fncCtx;
|
|
294
|
+
this.#task = this.#start();
|
|
197
295
|
this.sessionUpdate({
|
|
198
|
-
modalities:
|
|
199
|
-
instructions:
|
|
200
|
-
voice:
|
|
201
|
-
inputAudioFormat:
|
|
202
|
-
outputAudioFormat:
|
|
203
|
-
inputAudioTranscription:
|
|
204
|
-
turnDetection:
|
|
205
|
-
temperature:
|
|
206
|
-
maxResponseOutputTokens:
|
|
296
|
+
modalities: this.#opts.modalities,
|
|
297
|
+
instructions: this.#opts.instructions,
|
|
298
|
+
voice: this.#opts.voice,
|
|
299
|
+
inputAudioFormat: this.#opts.inputAudioFormat,
|
|
300
|
+
outputAudioFormat: this.#opts.outputAudioFormat,
|
|
301
|
+
inputAudioTranscription: this.#opts.inputAudioTranscription,
|
|
302
|
+
turnDetection: this.#opts.turnDetection,
|
|
303
|
+
temperature: this.#opts.temperature,
|
|
304
|
+
maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
|
|
207
305
|
toolChoice: 'auto',
|
|
208
306
|
});
|
|
209
307
|
}
|
|
308
|
+
get chatCtx() {
|
|
309
|
+
return this.#chatCtx;
|
|
310
|
+
}
|
|
210
311
|
get fncCtx() {
|
|
211
|
-
return
|
|
312
|
+
return this.#fncCtx;
|
|
212
313
|
}
|
|
213
314
|
set fncCtx(ctx) {
|
|
214
|
-
|
|
315
|
+
this.#fncCtx = ctx;
|
|
215
316
|
}
|
|
216
317
|
get conversation() {
|
|
217
318
|
return new Conversation(this);
|
|
@@ -223,16 +324,37 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
223
324
|
return new Response(this);
|
|
224
325
|
}
|
|
225
326
|
get expiration() {
|
|
226
|
-
if (!
|
|
327
|
+
if (!this.#expiresAt) {
|
|
227
328
|
throw new Error('session not started');
|
|
228
329
|
}
|
|
229
|
-
return
|
|
330
|
+
return this.#expiresAt * 1000;
|
|
230
331
|
}
|
|
231
332
|
queueMsg(command) {
|
|
232
|
-
|
|
333
|
+
this.#sendQueue.put(command);
|
|
334
|
+
}
|
|
335
|
+
/// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
|
|
336
|
+
/// with large amounts of base64 audio data.
|
|
337
|
+
#loggableEvent(event, maxLength = 30) {
|
|
338
|
+
const untypedEvent = {};
|
|
339
|
+
for (const [key, value] of Object.entries(event)) {
|
|
340
|
+
if (value !== undefined) {
|
|
341
|
+
untypedEvent[key] = value;
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
if (untypedEvent.audio && typeof untypedEvent.audio === 'string') {
|
|
345
|
+
const truncatedData = untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? '…' : '');
|
|
346
|
+
return { ...untypedEvent, audio: truncatedData };
|
|
347
|
+
}
|
|
348
|
+
if (untypedEvent.delta &&
|
|
349
|
+
typeof untypedEvent.delta === 'string' &&
|
|
350
|
+
event.type === 'response.audio.delta') {
|
|
351
|
+
const truncatedDelta = untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? '…' : '');
|
|
352
|
+
return { ...untypedEvent, delta: truncatedDelta };
|
|
353
|
+
}
|
|
354
|
+
return untypedEvent;
|
|
233
355
|
}
|
|
234
|
-
sessionUpdate({ modalities =
|
|
235
|
-
|
|
356
|
+
sessionUpdate({ modalities = this.#opts.modalities, instructions = this.#opts.instructions, voice = this.#opts.voice, inputAudioFormat = this.#opts.inputAudioFormat, outputAudioFormat = this.#opts.outputAudioFormat, inputAudioTranscription = this.#opts.inputAudioTranscription, turnDetection = this.#opts.turnDetection, temperature = this.#opts.temperature, maxResponseOutputTokens = this.#opts.maxResponseOutputTokens, toolChoice = 'auto', }) {
|
|
357
|
+
this.#opts = {
|
|
236
358
|
modalities,
|
|
237
359
|
instructions,
|
|
238
360
|
voice,
|
|
@@ -242,15 +364,15 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
242
364
|
turnDetection,
|
|
243
365
|
temperature,
|
|
244
366
|
maxResponseOutputTokens,
|
|
245
|
-
model:
|
|
246
|
-
apiKey:
|
|
247
|
-
baseURL:
|
|
248
|
-
isAzure:
|
|
249
|
-
apiVersion:
|
|
250
|
-
entraToken:
|
|
251
|
-
}
|
|
252
|
-
const tools =
|
|
253
|
-
? Object.entries(
|
|
367
|
+
model: this.#opts.model,
|
|
368
|
+
apiKey: this.#opts.apiKey,
|
|
369
|
+
baseURL: this.#opts.baseURL,
|
|
370
|
+
isAzure: this.#opts.isAzure,
|
|
371
|
+
apiVersion: this.#opts.apiVersion,
|
|
372
|
+
entraToken: this.#opts.entraToken,
|
|
373
|
+
};
|
|
374
|
+
const tools = this.#fncCtx
|
|
375
|
+
? Object.entries(this.#fncCtx).map(([name, func]) => ({
|
|
254
376
|
type: 'function',
|
|
255
377
|
name,
|
|
256
378
|
description: func.description,
|
|
@@ -260,388 +382,408 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
260
382
|
const sessionUpdateEvent = {
|
|
261
383
|
type: 'session.update',
|
|
262
384
|
session: {
|
|
263
|
-
modalities:
|
|
264
|
-
instructions:
|
|
265
|
-
voice:
|
|
266
|
-
input_audio_format:
|
|
267
|
-
output_audio_format:
|
|
268
|
-
input_audio_transcription:
|
|
269
|
-
turn_detection:
|
|
270
|
-
temperature:
|
|
271
|
-
max_response_output_tokens:
|
|
385
|
+
modalities: this.#opts.modalities,
|
|
386
|
+
instructions: this.#opts.instructions,
|
|
387
|
+
voice: this.#opts.voice,
|
|
388
|
+
input_audio_format: this.#opts.inputAudioFormat,
|
|
389
|
+
output_audio_format: this.#opts.outputAudioFormat,
|
|
390
|
+
input_audio_transcription: this.#opts.inputAudioTranscription,
|
|
391
|
+
turn_detection: this.#opts.turnDetection,
|
|
392
|
+
temperature: this.#opts.temperature,
|
|
393
|
+
max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity
|
|
272
394
|
? 'inf'
|
|
273
|
-
:
|
|
395
|
+
: this.#opts.maxResponseOutputTokens,
|
|
274
396
|
tools,
|
|
275
397
|
tool_choice: toolChoice,
|
|
276
398
|
},
|
|
277
399
|
};
|
|
278
|
-
if (
|
|
400
|
+
if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
|
|
279
401
|
// microsoft doesn't support inf for max_response_output_tokens, but accepts no args
|
|
280
402
|
sessionUpdateEvent.session.max_response_output_tokens = undefined;
|
|
281
403
|
}
|
|
282
404
|
this.queueMsg(sessionUpdateEvent);
|
|
283
405
|
}
|
|
406
|
+
#start() {
|
|
407
|
+
return new Promise(async (resolve, reject) => {
|
|
408
|
+
const headers = {
|
|
409
|
+
'User-Agent': 'LiveKit-Agents-JS',
|
|
410
|
+
};
|
|
411
|
+
if (this.#opts.isAzure) {
|
|
412
|
+
// Microsoft API has two ways of authentication
|
|
413
|
+
// 1. Entra token set as `Bearer` token
|
|
414
|
+
// 2. API key set as `api_key` header (also accepts query string)
|
|
415
|
+
if (this.#opts.entraToken) {
|
|
416
|
+
headers.Authorization = `Bearer ${this.#opts.entraToken}`;
|
|
417
|
+
}
|
|
418
|
+
else if (this.#opts.apiKey) {
|
|
419
|
+
headers['api-key'] = this.#opts.apiKey;
|
|
420
|
+
}
|
|
421
|
+
else {
|
|
422
|
+
reject(new Error('Microsoft API key or entraToken is required'));
|
|
423
|
+
return;
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
else {
|
|
427
|
+
headers.Authorization = `Bearer ${this.#opts.apiKey}`;
|
|
428
|
+
headers['OpenAI-Beta'] = 'realtime=v1';
|
|
429
|
+
}
|
|
430
|
+
const url = new URL([this.#opts.baseURL, 'realtime'].join('/'));
|
|
431
|
+
if (url.protocol === 'https:') {
|
|
432
|
+
url.protocol = 'wss:';
|
|
433
|
+
}
|
|
434
|
+
// Construct query parameters
|
|
435
|
+
const queryParams = {};
|
|
436
|
+
if (this.#opts.isAzure) {
|
|
437
|
+
queryParams['api-version'] = '2024-10-01-preview';
|
|
438
|
+
queryParams['deployment'] = this.#opts.model;
|
|
439
|
+
}
|
|
440
|
+
else {
|
|
441
|
+
queryParams['model'] = this.#opts.model;
|
|
442
|
+
}
|
|
443
|
+
for (const [key, value] of Object.entries(queryParams)) {
|
|
444
|
+
url.searchParams.set(key, value);
|
|
445
|
+
}
|
|
446
|
+
console.debug('Connecting to OpenAI Realtime API at ', url.toString());
|
|
447
|
+
this.#ws = new WebSocket(url.toString(), {
|
|
448
|
+
headers: headers,
|
|
449
|
+
});
|
|
450
|
+
this.#ws.onerror = (error) => {
|
|
451
|
+
reject(new Error('OpenAI Realtime WebSocket error: ' + error.message));
|
|
452
|
+
};
|
|
453
|
+
await once(this.#ws, 'open');
|
|
454
|
+
this.#closing = false;
|
|
455
|
+
this.#ws.onmessage = (message) => {
|
|
456
|
+
const event = JSON.parse(message.data);
|
|
457
|
+
this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
458
|
+
switch (event.type) {
|
|
459
|
+
case 'error':
|
|
460
|
+
this.#handleError(event);
|
|
461
|
+
break;
|
|
462
|
+
case 'session.created':
|
|
463
|
+
this.#handleSessionCreated(event);
|
|
464
|
+
break;
|
|
465
|
+
case 'session.updated':
|
|
466
|
+
this.#handleSessionUpdated(event);
|
|
467
|
+
break;
|
|
468
|
+
case 'conversation.created':
|
|
469
|
+
this.#handleConversationCreated(event);
|
|
470
|
+
break;
|
|
471
|
+
case 'input_audio_buffer.committed':
|
|
472
|
+
this.#handleInputAudioBufferCommitted(event);
|
|
473
|
+
break;
|
|
474
|
+
case 'input_audio_buffer.cleared':
|
|
475
|
+
this.#handleInputAudioBufferCleared(event);
|
|
476
|
+
break;
|
|
477
|
+
case 'input_audio_buffer.speech_started':
|
|
478
|
+
this.#handleInputAudioBufferSpeechStarted(event);
|
|
479
|
+
break;
|
|
480
|
+
case 'input_audio_buffer.speech_stopped':
|
|
481
|
+
this.#handleInputAudioBufferSpeechStopped(event);
|
|
482
|
+
break;
|
|
483
|
+
case 'conversation.item.created':
|
|
484
|
+
this.#handleConversationItemCreated(event);
|
|
485
|
+
break;
|
|
486
|
+
case 'conversation.item.input_audio_transcription.completed':
|
|
487
|
+
this.#handleConversationItemInputAudioTranscriptionCompleted(event);
|
|
488
|
+
break;
|
|
489
|
+
case 'conversation.item.input_audio_transcription.failed':
|
|
490
|
+
this.#handleConversationItemInputAudioTranscriptionFailed(event);
|
|
491
|
+
break;
|
|
492
|
+
case 'conversation.item.truncated':
|
|
493
|
+
this.#handleConversationItemTruncated(event);
|
|
494
|
+
break;
|
|
495
|
+
case 'conversation.item.deleted':
|
|
496
|
+
this.#handleConversationItemDeleted(event);
|
|
497
|
+
break;
|
|
498
|
+
case 'response.created':
|
|
499
|
+
this.#handleResponseCreated(event);
|
|
500
|
+
break;
|
|
501
|
+
case 'response.done':
|
|
502
|
+
this.#handleResponseDone(event);
|
|
503
|
+
break;
|
|
504
|
+
case 'response.output_item.added':
|
|
505
|
+
this.#handleResponseOutputItemAdded(event);
|
|
506
|
+
break;
|
|
507
|
+
case 'response.output_item.done':
|
|
508
|
+
this.#handleResponseOutputItemDone(event);
|
|
509
|
+
break;
|
|
510
|
+
case 'response.content_part.added':
|
|
511
|
+
this.#handleResponseContentPartAdded(event);
|
|
512
|
+
break;
|
|
513
|
+
case 'response.content_part.done':
|
|
514
|
+
this.#handleResponseContentPartDone(event);
|
|
515
|
+
break;
|
|
516
|
+
case 'response.text.delta':
|
|
517
|
+
this.#handleResponseTextDelta(event);
|
|
518
|
+
break;
|
|
519
|
+
case 'response.text.done':
|
|
520
|
+
this.#handleResponseTextDone(event);
|
|
521
|
+
break;
|
|
522
|
+
case 'response.audio_transcript.delta':
|
|
523
|
+
this.#handleResponseAudioTranscriptDelta(event);
|
|
524
|
+
break;
|
|
525
|
+
case 'response.audio_transcript.done':
|
|
526
|
+
this.#handleResponseAudioTranscriptDone(event);
|
|
527
|
+
break;
|
|
528
|
+
case 'response.audio.delta':
|
|
529
|
+
this.#handleResponseAudioDelta(event);
|
|
530
|
+
break;
|
|
531
|
+
case 'response.audio.done':
|
|
532
|
+
this.#handleResponseAudioDone(event);
|
|
533
|
+
break;
|
|
534
|
+
case 'response.function_call_arguments.delta':
|
|
535
|
+
this.#handleResponseFunctionCallArgumentsDelta(event);
|
|
536
|
+
break;
|
|
537
|
+
case 'response.function_call_arguments.done':
|
|
538
|
+
this.#handleResponseFunctionCallArgumentsDone(event);
|
|
539
|
+
break;
|
|
540
|
+
case 'rate_limits.updated':
|
|
541
|
+
this.#handleRateLimitsUpdated(event);
|
|
542
|
+
break;
|
|
543
|
+
}
|
|
544
|
+
};
|
|
545
|
+
const sendTask = async () => {
|
|
546
|
+
while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
|
|
547
|
+
try {
|
|
548
|
+
const event = await this.#sendQueue.get();
|
|
549
|
+
if (event.type !== 'input_audio_buffer.append') {
|
|
550
|
+
this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
|
|
551
|
+
}
|
|
552
|
+
this.#ws.send(JSON.stringify(event));
|
|
553
|
+
}
|
|
554
|
+
catch (error) {
|
|
555
|
+
this.#logger.error('Error sending event:', error);
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
};
|
|
559
|
+
sendTask();
|
|
560
|
+
this.#ws.onclose = () => {
|
|
561
|
+
if (this.#expiresAt && Date.now() >= this.#expiresAt * 1000) {
|
|
562
|
+
this.#closing = true;
|
|
563
|
+
}
|
|
564
|
+
if (!this.#closing) {
|
|
565
|
+
reject(new Error('OpenAI Realtime connection closed unexpectedly'));
|
|
566
|
+
}
|
|
567
|
+
this.#ws = null;
|
|
568
|
+
resolve();
|
|
569
|
+
};
|
|
570
|
+
});
|
|
571
|
+
}
|
|
284
572
|
async close() {
|
|
285
|
-
if (!
|
|
573
|
+
if (!this.#ws)
|
|
286
574
|
return;
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
await
|
|
575
|
+
this.#closing = true;
|
|
576
|
+
this.#ws.close();
|
|
577
|
+
await this.#task;
|
|
578
|
+
}
|
|
579
|
+
#getContent(ptr) {
|
|
580
|
+
const response = this.#pendingResponses[ptr.response_id];
|
|
581
|
+
const output = response.output[ptr.output_index];
|
|
582
|
+
const content = output.content[ptr.content_index];
|
|
583
|
+
return content;
|
|
584
|
+
}
|
|
585
|
+
#handleError(event) {
|
|
586
|
+
this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
|
|
587
|
+
}
|
|
588
|
+
#handleSessionCreated(event) {
|
|
589
|
+
this.#sessionId = event.session.id;
|
|
590
|
+
this.#expiresAt = event.session.expires_at;
|
|
591
|
+
this.#logger = this.#logger.child({ sessionId: this.#sessionId });
|
|
592
|
+
}
|
|
593
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
594
|
+
#handleSessionUpdated(event) { }
|
|
595
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
596
|
+
#handleConversationCreated(event) { }
|
|
597
|
+
#handleInputAudioBufferCommitted(event) {
|
|
598
|
+
this.emit('input_speech_committed', {
|
|
599
|
+
itemId: event.item_id,
|
|
600
|
+
});
|
|
290
601
|
}
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
602
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
603
|
+
#handleInputAudioBufferCleared(event) { }
|
|
604
|
+
#handleInputAudioBufferSpeechStarted(
|
|
605
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
606
|
+
event) {
|
|
607
|
+
this.emit('input_speech_started', {
|
|
608
|
+
itemId: event.item_id,
|
|
609
|
+
});
|
|
298
610
|
}
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
611
|
+
#handleInputAudioBufferSpeechStopped(
|
|
612
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
613
|
+
event) {
|
|
614
|
+
this.emit('input_speech_stopped');
|
|
615
|
+
}
|
|
616
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
617
|
+
#handleConversationItemCreated(event) { }
|
|
618
|
+
#handleConversationItemInputAudioTranscriptionCompleted(event) {
|
|
619
|
+
const transcript = event.transcript;
|
|
620
|
+
this.emit('input_speech_transcription_completed', {
|
|
621
|
+
itemId: event.item_id,
|
|
622
|
+
transcript: transcript,
|
|
623
|
+
});
|
|
624
|
+
}
|
|
625
|
+
#handleConversationItemInputAudioTranscriptionFailed(event) {
|
|
626
|
+
const error = event.error;
|
|
627
|
+
this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
|
|
628
|
+
this.emit('input_speech_transcription_failed', {
|
|
629
|
+
itemId: event.item_id,
|
|
630
|
+
message: error.message,
|
|
631
|
+
});
|
|
632
|
+
}
|
|
633
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
634
|
+
#handleConversationItemTruncated(event) { }
|
|
635
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
636
|
+
#handleConversationItemDeleted(event) { }
|
|
637
|
+
#handleResponseCreated(responseCreated) {
|
|
638
|
+
const response = responseCreated.response;
|
|
639
|
+
const doneFut = new Future();
|
|
640
|
+
const newResponse = {
|
|
641
|
+
id: response.id,
|
|
642
|
+
status: response.status,
|
|
643
|
+
statusDetails: response.status_details,
|
|
644
|
+
usage: null,
|
|
645
|
+
output: [],
|
|
646
|
+
doneFut: doneFut,
|
|
314
647
|
};
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
648
|
+
this.#pendingResponses[newResponse.id] = newResponse;
|
|
649
|
+
this.emit('response_created', newResponse);
|
|
650
|
+
}
|
|
651
|
+
#handleResponseDone(event) {
|
|
652
|
+
const responseData = event.response;
|
|
653
|
+
const responseId = responseData.id;
|
|
654
|
+
const response = this.#pendingResponses[responseId];
|
|
655
|
+
response.status = responseData.status;
|
|
656
|
+
response.statusDetails = responseData.status_details;
|
|
657
|
+
response.usage = responseData.usage;
|
|
658
|
+
this.#pendingResponses[responseId] = response;
|
|
659
|
+
response.doneFut.resolve();
|
|
660
|
+
this.emit('response_done', response);
|
|
661
|
+
}
|
|
662
|
+
#handleResponseOutputItemAdded(event) {
|
|
663
|
+
const responseId = event.response_id;
|
|
664
|
+
const response = this.#pendingResponses[responseId];
|
|
665
|
+
const itemData = event.item;
|
|
666
|
+
if (itemData.type !== 'message' && itemData.type !== 'function_call') {
|
|
667
|
+
throw new Error(`Unexpected item type: ${itemData.type}`);
|
|
333
668
|
}
|
|
334
|
-
|
|
335
|
-
if (
|
|
336
|
-
|
|
337
|
-
}
|
|
338
|
-
// Construct query parameters
|
|
339
|
-
const queryParams = {};
|
|
340
|
-
if (__classPrivateFieldGet(this, _RealtimeSession_opts, "f").isAzure) {
|
|
341
|
-
queryParams['api-version'] = '2024-10-01-preview';
|
|
342
|
-
queryParams['deployment'] = __classPrivateFieldGet(this, _RealtimeSession_opts, "f").model;
|
|
669
|
+
let role;
|
|
670
|
+
if (itemData.type === 'function_call') {
|
|
671
|
+
role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
|
|
343
672
|
}
|
|
344
673
|
else {
|
|
345
|
-
|
|
346
|
-
}
|
|
347
|
-
for (const [key, value] of Object.entries(queryParams)) {
|
|
348
|
-
url.searchParams.set(key, value);
|
|
674
|
+
role = itemData.role;
|
|
349
675
|
}
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
__classPrivateFieldSet(this, _RealtimeSession_closing, false, "f");
|
|
359
|
-
__classPrivateFieldGet(this, _RealtimeSession_ws, "f").onmessage = (message) => {
|
|
360
|
-
const event = JSON.parse(message.data);
|
|
361
|
-
__classPrivateFieldGet(this, _RealtimeSession_logger, "f").debug(`<- ${JSON.stringify(__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_loggableEvent).call(this, event))}`);
|
|
362
|
-
switch (event.type) {
|
|
363
|
-
case 'error':
|
|
364
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleError).call(this, event);
|
|
365
|
-
break;
|
|
366
|
-
case 'session.created':
|
|
367
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleSessionCreated).call(this, event);
|
|
368
|
-
break;
|
|
369
|
-
case 'session.updated':
|
|
370
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleSessionUpdated).call(this, event);
|
|
371
|
-
break;
|
|
372
|
-
case 'conversation.created':
|
|
373
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationCreated).call(this, event);
|
|
374
|
-
break;
|
|
375
|
-
case 'input_audio_buffer.committed':
|
|
376
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferCommitted).call(this, event);
|
|
377
|
-
break;
|
|
378
|
-
case 'input_audio_buffer.cleared':
|
|
379
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferCleared).call(this, event);
|
|
380
|
-
break;
|
|
381
|
-
case 'input_audio_buffer.speech_started':
|
|
382
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferSpeechStarted).call(this, event);
|
|
383
|
-
break;
|
|
384
|
-
case 'input_audio_buffer.speech_stopped':
|
|
385
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleInputAudioBufferSpeechStopped).call(this, event);
|
|
386
|
-
break;
|
|
387
|
-
case 'conversation.item.created':
|
|
388
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemCreated).call(this, event);
|
|
389
|
-
break;
|
|
390
|
-
case 'conversation.item.input_audio_transcription.completed':
|
|
391
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted).call(this, event);
|
|
392
|
-
break;
|
|
393
|
-
case 'conversation.item.input_audio_transcription.failed':
|
|
394
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed).call(this, event);
|
|
395
|
-
break;
|
|
396
|
-
case 'conversation.item.truncated':
|
|
397
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemTruncated).call(this, event);
|
|
398
|
-
break;
|
|
399
|
-
case 'conversation.item.deleted':
|
|
400
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleConversationItemDeleted).call(this, event);
|
|
401
|
-
break;
|
|
402
|
-
case 'response.created':
|
|
403
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseCreated).call(this, event);
|
|
404
|
-
break;
|
|
405
|
-
case 'response.done':
|
|
406
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseDone).call(this, event);
|
|
407
|
-
break;
|
|
408
|
-
case 'response.output_item.added':
|
|
409
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseOutputItemAdded).call(this, event);
|
|
410
|
-
break;
|
|
411
|
-
case 'response.output_item.done':
|
|
412
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseOutputItemDone).call(this, event);
|
|
413
|
-
break;
|
|
414
|
-
case 'response.content_part.added':
|
|
415
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseContentPartAdded).call(this, event);
|
|
416
|
-
break;
|
|
417
|
-
case 'response.content_part.done':
|
|
418
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseContentPartDone).call(this, event);
|
|
419
|
-
break;
|
|
420
|
-
case 'response.text.delta':
|
|
421
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseTextDelta).call(this, event);
|
|
422
|
-
break;
|
|
423
|
-
case 'response.text.done':
|
|
424
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseTextDone).call(this, event);
|
|
425
|
-
break;
|
|
426
|
-
case 'response.audio_transcript.delta':
|
|
427
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioTranscriptDelta).call(this, event);
|
|
428
|
-
break;
|
|
429
|
-
case 'response.audio_transcript.done':
|
|
430
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioTranscriptDone).call(this, event);
|
|
431
|
-
break;
|
|
432
|
-
case 'response.audio.delta':
|
|
433
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioDelta).call(this, event);
|
|
434
|
-
break;
|
|
435
|
-
case 'response.audio.done':
|
|
436
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseAudioDone).call(this, event);
|
|
437
|
-
break;
|
|
438
|
-
case 'response.function_call_arguments.delta':
|
|
439
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseFunctionCallArgumentsDelta).call(this, event);
|
|
440
|
-
break;
|
|
441
|
-
case 'response.function_call_arguments.done':
|
|
442
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleResponseFunctionCallArgumentsDone).call(this, event);
|
|
443
|
-
break;
|
|
444
|
-
case 'rate_limits.updated':
|
|
445
|
-
__classPrivateFieldGet(this, _RealtimeSession_instances, "m", _RealtimeSession_handleRateLimitsUpdated).call(this, event);
|
|
446
|
-
break;
|
|
447
|
-
}
|
|
676
|
+
const newOutput = {
|
|
677
|
+
responseId: responseId,
|
|
678
|
+
itemId: itemData.id,
|
|
679
|
+
outputIndex: event.output_index,
|
|
680
|
+
type: itemData.type,
|
|
681
|
+
role: role,
|
|
682
|
+
content: [],
|
|
683
|
+
doneFut: new Future(),
|
|
448
684
|
};
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
}
|
|
462
|
-
};
|
|
463
|
-
sendTask();
|
|
464
|
-
__classPrivateFieldGet(this, _RealtimeSession_ws, "f").onclose = () => {
|
|
465
|
-
if (__classPrivateFieldGet(this, _RealtimeSession_expiresAt, "f") && Date.now() >= __classPrivateFieldGet(this, _RealtimeSession_expiresAt, "f") * 1000) {
|
|
466
|
-
__classPrivateFieldSet(this, _RealtimeSession_closing, true, "f");
|
|
685
|
+
response.output.push(newOutput);
|
|
686
|
+
this.emit('response_output_added', newOutput);
|
|
687
|
+
}
|
|
688
|
+
#handleResponseOutputItemDone(event) {
|
|
689
|
+
const responseId = event.response_id;
|
|
690
|
+
const response = this.#pendingResponses[responseId];
|
|
691
|
+
const outputIndex = event.output_index;
|
|
692
|
+
const output = response.output[outputIndex];
|
|
693
|
+
if (output.type === 'function_call') {
|
|
694
|
+
if (!this.#fncCtx) {
|
|
695
|
+
this.#logger.error('function call received but no fncCtx is available');
|
|
696
|
+
return;
|
|
467
697
|
}
|
|
468
|
-
|
|
469
|
-
|
|
698
|
+
// parse the arguments and call the function inside the fnc_ctx
|
|
699
|
+
const item = event.item;
|
|
700
|
+
if (item.type !== 'function_call') {
|
|
701
|
+
throw new Error('Expected function_call item');
|
|
470
702
|
}
|
|
471
|
-
|
|
472
|
-
resolve();
|
|
473
|
-
};
|
|
474
|
-
});
|
|
475
|
-
}, _RealtimeSession_getContent = function _RealtimeSession_getContent(ptr) {
|
|
476
|
-
const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[ptr.response_id];
|
|
477
|
-
const output = response.output[ptr.output_index];
|
|
478
|
-
const content = output.content[ptr.content_index];
|
|
479
|
-
return content;
|
|
480
|
-
}, _RealtimeSession_handleError = function _RealtimeSession_handleError(event) {
|
|
481
|
-
__classPrivateFieldGet(this, _RealtimeSession_logger, "f").error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
|
|
482
|
-
}, _RealtimeSession_handleSessionCreated = function _RealtimeSession_handleSessionCreated(event) {
|
|
483
|
-
__classPrivateFieldSet(this, _RealtimeSession_sessionId, event.session.id, "f");
|
|
484
|
-
__classPrivateFieldSet(this, _RealtimeSession_expiresAt, event.session.expires_at, "f");
|
|
485
|
-
__classPrivateFieldSet(this, _RealtimeSession_logger, __classPrivateFieldGet(this, _RealtimeSession_logger, "f").child({ sessionId: __classPrivateFieldGet(this, _RealtimeSession_sessionId, "f") }), "f");
|
|
486
|
-
}, _RealtimeSession_handleSessionUpdated = function _RealtimeSession_handleSessionUpdated(event) { }, _RealtimeSession_handleConversationCreated = function _RealtimeSession_handleConversationCreated(event) { }, _RealtimeSession_handleInputAudioBufferCommitted = function _RealtimeSession_handleInputAudioBufferCommitted(event) {
|
|
487
|
-
this.emit('input_speech_committed', {
|
|
488
|
-
itemId: event.item_id,
|
|
489
|
-
});
|
|
490
|
-
}, _RealtimeSession_handleInputAudioBufferCleared = function _RealtimeSession_handleInputAudioBufferCleared(event) { }, _RealtimeSession_handleInputAudioBufferSpeechStarted = function _RealtimeSession_handleInputAudioBufferSpeechStarted(
|
|
491
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
492
|
-
event) {
|
|
493
|
-
this.emit('input_speech_started', {
|
|
494
|
-
itemId: event.item_id,
|
|
495
|
-
});
|
|
496
|
-
}, _RealtimeSession_handleInputAudioBufferSpeechStopped = function _RealtimeSession_handleInputAudioBufferSpeechStopped(
|
|
497
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
498
|
-
event) {
|
|
499
|
-
this.emit('input_speech_stopped');
|
|
500
|
-
}, _RealtimeSession_handleConversationItemCreated = function _RealtimeSession_handleConversationItemCreated(event) { }, _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted = function _RealtimeSession_handleConversationItemInputAudioTranscriptionCompleted(event) {
|
|
501
|
-
const transcript = event.transcript;
|
|
502
|
-
this.emit('input_speech_transcription_completed', {
|
|
503
|
-
itemId: event.item_id,
|
|
504
|
-
transcript: transcript,
|
|
505
|
-
});
|
|
506
|
-
}, _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed = function _RealtimeSession_handleConversationItemInputAudioTranscriptionFailed(event) {
|
|
507
|
-
const error = event.error;
|
|
508
|
-
__classPrivateFieldGet(this, _RealtimeSession_logger, "f").error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
|
|
509
|
-
this.emit('input_speech_transcription_failed', {
|
|
510
|
-
itemId: event.item_id,
|
|
511
|
-
message: error.message,
|
|
512
|
-
});
|
|
513
|
-
}, _RealtimeSession_handleConversationItemTruncated = function _RealtimeSession_handleConversationItemTruncated(event) { }, _RealtimeSession_handleConversationItemDeleted = function _RealtimeSession_handleConversationItemDeleted(event) { }, _RealtimeSession_handleResponseCreated = function _RealtimeSession_handleResponseCreated(responseCreated) {
|
|
514
|
-
const response = responseCreated.response;
|
|
515
|
-
const doneFut = new Future();
|
|
516
|
-
const newResponse = {
|
|
517
|
-
id: response.id,
|
|
518
|
-
status: response.status,
|
|
519
|
-
statusDetails: response.status_details,
|
|
520
|
-
usage: null,
|
|
521
|
-
output: [],
|
|
522
|
-
doneFut: doneFut,
|
|
523
|
-
};
|
|
524
|
-
__classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[newResponse.id] = newResponse;
|
|
525
|
-
this.emit('response_created', newResponse);
|
|
526
|
-
}, _RealtimeSession_handleResponseDone = function _RealtimeSession_handleResponseDone(event) {
|
|
527
|
-
const responseData = event.response;
|
|
528
|
-
const responseId = responseData.id;
|
|
529
|
-
const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId];
|
|
530
|
-
response.status = responseData.status;
|
|
531
|
-
response.statusDetails = responseData.status_details;
|
|
532
|
-
response.usage = responseData.usage;
|
|
533
|
-
__classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId] = response;
|
|
534
|
-
response.doneFut.resolve();
|
|
535
|
-
this.emit('response_done', response);
|
|
536
|
-
}, _RealtimeSession_handleResponseOutputItemAdded = function _RealtimeSession_handleResponseOutputItemAdded(event) {
|
|
537
|
-
const responseId = event.response_id;
|
|
538
|
-
const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId];
|
|
539
|
-
const itemData = event.item;
|
|
540
|
-
if (itemData.type !== 'message' && itemData.type !== 'function_call') {
|
|
541
|
-
throw new Error(`Unexpected item type: ${itemData.type}`);
|
|
542
|
-
}
|
|
543
|
-
let role;
|
|
544
|
-
if (itemData.type === 'function_call') {
|
|
545
|
-
role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
|
|
546
|
-
}
|
|
547
|
-
else {
|
|
548
|
-
role = itemData.role;
|
|
549
|
-
}
|
|
550
|
-
const newOutput = {
|
|
551
|
-
responseId: responseId,
|
|
552
|
-
itemId: itemData.id,
|
|
553
|
-
outputIndex: event.output_index,
|
|
554
|
-
type: itemData.type,
|
|
555
|
-
role: role,
|
|
556
|
-
content: [],
|
|
557
|
-
doneFut: new Future(),
|
|
558
|
-
};
|
|
559
|
-
response.output.push(newOutput);
|
|
560
|
-
this.emit('response_output_added', newOutput);
|
|
561
|
-
}, _RealtimeSession_handleResponseOutputItemDone = function _RealtimeSession_handleResponseOutputItemDone(event) {
|
|
562
|
-
const responseId = event.response_id;
|
|
563
|
-
const response = __classPrivateFieldGet(this, _RealtimeSession_pendingResponses, "f")[responseId];
|
|
564
|
-
const outputIndex = event.output_index;
|
|
565
|
-
const output = response.output[outputIndex];
|
|
566
|
-
if (output.type === 'function_call') {
|
|
567
|
-
if (!__classPrivateFieldGet(this, _RealtimeSession_fncCtx, "f")) {
|
|
568
|
-
__classPrivateFieldGet(this, _RealtimeSession_logger, "f").error('function call received but no fncCtx is available');
|
|
569
|
-
return;
|
|
570
|
-
}
|
|
571
|
-
// parse the arguments and call the function inside the fnc_ctx
|
|
572
|
-
const item = event.item;
|
|
573
|
-
if (item.type !== 'function_call') {
|
|
574
|
-
throw new Error('Expected function_call item');
|
|
575
|
-
}
|
|
576
|
-
this.emit('function_call_started', {
|
|
577
|
-
callId: item.call_id,
|
|
578
|
-
});
|
|
579
|
-
const parsedArgs = JSON.parse(item.arguments);
|
|
580
|
-
__classPrivateFieldGet(this, _RealtimeSession_logger, "f").debug(`[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`);
|
|
581
|
-
__classPrivateFieldGet(this, _RealtimeSession_fncCtx, "f")[item.name].execute(parsedArgs).then((content) => {
|
|
582
|
-
__classPrivateFieldGet(this, _RealtimeSession_logger, "f").debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
|
|
583
|
-
this.emit('function_call_completed', {
|
|
703
|
+
this.emit('function_call_started', {
|
|
584
704
|
callId: item.call_id,
|
|
585
705
|
});
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
706
|
+
const parsedArgs = JSON.parse(item.arguments);
|
|
707
|
+
this.#logger.debug(`[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`);
|
|
708
|
+
this.#fncCtx[item.name].execute(parsedArgs).then((content) => {
|
|
709
|
+
this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
|
|
710
|
+
this.emit('function_call_completed', {
|
|
711
|
+
callId: item.call_id,
|
|
712
|
+
});
|
|
713
|
+
this.conversation.item.create(llm.ChatMessage.createToolFromFunctionResult({
|
|
714
|
+
name: item.name,
|
|
715
|
+
toolCallId: item.call_id,
|
|
716
|
+
result: content,
|
|
717
|
+
}), output.itemId);
|
|
718
|
+
this.response.create();
|
|
719
|
+
}, (error) => {
|
|
720
|
+
this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
|
|
721
|
+
// TODO: send it back up as failed?
|
|
722
|
+
this.emit('function_call_failed', {
|
|
723
|
+
callId: item.call_id,
|
|
724
|
+
});
|
|
597
725
|
});
|
|
598
|
-
}
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
}
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
726
|
+
}
|
|
727
|
+
output.doneFut.resolve();
|
|
728
|
+
this.emit('response_output_done', output);
|
|
729
|
+
}
|
|
730
|
+
#handleResponseContentPartAdded(event) {
|
|
731
|
+
const responseId = event.response_id;
|
|
732
|
+
const response = this.#pendingResponses[responseId];
|
|
733
|
+
const outputIndex = event.output_index;
|
|
734
|
+
const output = response.output[outputIndex];
|
|
735
|
+
const textStream = new AsyncIterableQueue();
|
|
736
|
+
const audioStream = new AsyncIterableQueue();
|
|
737
|
+
const newContent = {
|
|
738
|
+
responseId: responseId,
|
|
739
|
+
itemId: event.item_id,
|
|
740
|
+
outputIndex: outputIndex,
|
|
741
|
+
contentIndex: event.content_index,
|
|
742
|
+
text: '',
|
|
743
|
+
audio: [],
|
|
744
|
+
textStream: textStream,
|
|
745
|
+
audioStream: audioStream,
|
|
746
|
+
toolCalls: [],
|
|
747
|
+
};
|
|
748
|
+
output.content.push(newContent);
|
|
749
|
+
this.emit('response_content_added', newContent);
|
|
750
|
+
}
|
|
751
|
+
#handleResponseContentPartDone(event) {
|
|
752
|
+
const content = this.#getContent(event);
|
|
753
|
+
this.emit('response_content_done', content);
|
|
754
|
+
}
|
|
755
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
756
|
+
#handleResponseTextDelta(event) { }
|
|
757
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
758
|
+
#handleResponseTextDone(event) { }
|
|
759
|
+
#handleResponseAudioTranscriptDelta(event) {
|
|
760
|
+
const content = this.#getContent(event);
|
|
761
|
+
const transcript = event.delta;
|
|
762
|
+
content.text += transcript;
|
|
763
|
+
content.textStream.put(transcript);
|
|
764
|
+
}
|
|
765
|
+
#handleResponseAudioTranscriptDone(event) {
|
|
766
|
+
const content = this.#getContent(event);
|
|
767
|
+
content.textStream.close();
|
|
768
|
+
}
|
|
769
|
+
#handleResponseAudioDelta(event) {
|
|
770
|
+
const content = this.#getContent(event);
|
|
771
|
+
const data = Buffer.from(event.delta, 'base64');
|
|
772
|
+
const audio = new AudioFrame(new Int16Array(data.buffer), api_proto.SAMPLE_RATE, api_proto.NUM_CHANNELS, data.length / 2);
|
|
773
|
+
content.audio.push(audio);
|
|
774
|
+
content.audioStream.put(audio);
|
|
775
|
+
}
|
|
776
|
+
#handleResponseAudioDone(event) {
|
|
777
|
+
const content = this.#getContent(event);
|
|
778
|
+
content.audioStream.close();
|
|
779
|
+
}
|
|
780
|
+
#handleResponseFunctionCallArgumentsDelta(
|
|
781
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
782
|
+
event) { }
|
|
783
|
+
#handleResponseFunctionCallArgumentsDone(
|
|
784
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
785
|
+
event) { }
|
|
786
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
787
|
+
#handleRateLimitsUpdated(event) { }
|
|
788
|
+
}
|
|
647
789
|
//# sourceMappingURL=realtime_model.js.map
|