@livekit/agents-plugin-openai 1.0.17 → 1.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/llm.cjs +5 -2
- package/dist/llm.cjs.map +1 -1
- package/dist/llm.d.cts +2 -1
- package/dist/llm.d.ts +2 -1
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +5 -2
- package/dist/llm.js.map +1 -1
- package/dist/llm.test.cjs +9 -0
- package/dist/llm.test.cjs.map +1 -1
- package/dist/llm.test.js +10 -1
- package/dist/llm.test.js.map +1 -1
- package/dist/realtime/api_proto.cjs.map +1 -1
- package/dist/realtime/api_proto.d.cts +5 -3
- package/dist/realtime/api_proto.d.ts +5 -3
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +111 -39
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.cts +7 -0
- package/dist/realtime/realtime_model.d.ts +7 -0
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +111 -39
- package/dist/realtime/realtime_model.js.map +1 -1
- package/package.json +7 -7
- package/src/llm.test.ts +11 -1
- package/src/llm.ts +6 -2
- package/src/realtime/api_proto.ts +5 -3
- package/src/realtime/realtime_model.ts +146 -39
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@livekit/agents-plugin-openai",
-  "version": "1.0.17",
+  "version": "1.0.19",
   "description": "OpenAI plugin for LiveKit Node Agents",
   "main": "dist/index.js",
   "require": "dist/index.cjs",

@@ -30,18 +30,18 @@
     "@types/ws": "^8.5.10",
     "tsup": "^8.3.5",
     "typescript": "^5.0.0",
-    "@livekit/agents": "1.0.
-    "@livekit/agents-plugin-silero": "1.0.
-    "@livekit/agents-plugins-test": "1.0.
+    "@livekit/agents": "1.0.19",
+    "@livekit/agents-plugin-silero": "1.0.19",
+    "@livekit/agents-plugins-test": "1.0.19"
   },
   "dependencies": {
     "@livekit/mutex": "^1.1.1",
-    "openai": "^
-    "ws": "^8.
+    "openai": "^6.8.1",
+    "ws": "^8.18.0"
   },
   "peerDependencies": {
     "@livekit/rtc-node": "^0.13.12",
-    "@livekit/agents": "1.0.
+    "@livekit/agents": "1.0.19"
   },
   "scripts": {
     "build": "tsup --onSuccess \"pnpm build:types\"",
package/src/llm.test.ts
CHANGED

@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { llm } from '@livekit/agents-plugins-test';
+import { llm, llmStrict } from '@livekit/agents-plugins-test';
 import { describe } from 'vitest';
 import { LLM } from './llm.js';

@@ -10,5 +10,15 @@ describe('OpenAI', async () => {
     new LLM({
       temperature: 0,
     }),
+    false,
+  );
+});
+
+describe('OpenAI strict tool schema', async () => {
+  await llmStrict(
+    new LLM({
+      temperature: 0,
+      strictToolSchema: true,
+    }),
   );
 });
package/src/llm.ts
CHANGED

@@ -30,17 +30,20 @@ export interface LLMOptions {
   maxCompletionTokens?: number;
   serviceTier?: string;
   store?: boolean;
+  strictToolSchema?: boolean;
 }

 const defaultLLMOptions: LLMOptions = {
   model: 'gpt-4.1',
   apiKey: process.env.OPENAI_API_KEY,
   parallelToolCalls: true,
+  strictToolSchema: false,
 };

 const defaultAzureLLMOptions: LLMOptions = {
   model: 'gpt-4.1',
   apiKey: process.env.AZURE_API_KEY,
+  strictToolSchema: false,
 };

 export class LLM extends llm.LLM {

@@ -445,9 +448,9 @@ export class LLM extends llm.LLM {
     connOptions?: APIConnectOptions;
     parallelToolCalls?: boolean;
     toolChoice?: llm.ToolChoice;
-    extraKwargs?: Record<string,
+    extraKwargs?: Record<string, unknown>;
   }): LLMStream {
-    const extras: Record<string,
+    const extras: Record<string, unknown> = { ...extraKwargs };

     if (this.#opts.metadata) {
       extras.metadata = this.#opts.metadata;

@@ -492,6 +495,7 @@ export class LLM extends llm.LLM {
       toolCtx,
       connOptions,
       modelOptions: extras,
+      strictToolSchema: this.#opts.strictToolSchema || false,
       gatewayOptions: undefined, // OpenAI plugin doesn't use gateway authentication
     });
   }
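Net effect of the llm.ts change: a new `strictToolSchema` option on `LLMOptions`, defaulting to `false` and forwarded to the `LLMStream`. A minimal usage sketch (hypothetical application code; only the option name, its default, and the `LLM` constructor come from the diff, and the package-root export of `LLM` is assumed):

    import { LLM } from '@livekit/agents-plugin-openai';

    // Opt in to strict tool-call schema validation; omitting the flag
    // (or passing false) keeps the previous, non-strict behavior.
    const model = new LLM({
      model: 'gpt-4.1',
      temperature: 0,
      strictToolSchema: true,
    });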
package/src/realtime/api_proto.ts
CHANGED

@@ -190,7 +190,7 @@ export interface SessionResource {
   id: string;
   object: 'realtime.session';
   model: string;
-  modalities: [
+  modalities: Modality[]; // default: ["text", "audio"]
   instructions: string;
   voice: Voice; // default: "alloy"
   input_audio_format: AudioFormat; // default: "pcm16"

@@ -267,7 +267,7 @@ export interface SessionUpdateEvent extends BaseClientEvent {
   type: 'session.update';
   session: Partial<{
     model: Model;
-    modalities: [
+    modalities: Modality[];
     instructions: string;
     voice: Voice;
     input_audio_format: AudioFormat;

@@ -350,7 +350,7 @@ export interface ConversationItemDeleteEvent extends BaseClientEvent {
 export interface ResponseCreateEvent extends BaseClientEvent {
   type: 'response.create';
   response?: Partial<{
-    modalities: [
+    modalities: Modality[];
     instructions: string;
     voice: Voice;
     output_audio_format: AudioFormat;

@@ -511,6 +511,7 @@ export interface ResponseContentPartDoneEvent extends BaseServerEvent {
 export interface ResponseTextDeltaEvent extends BaseServerEvent {
   type: 'response.text.delta';
   response_id: string;
+  item_id: string;
   output_index: number;
   content_index: number;
   delta: string;

@@ -519,6 +520,7 @@ export interface ResponseTextDeltaEvent extends BaseServerEvent {
 export interface ResponseTextDoneEvent extends BaseServerEvent {
   type: 'response.text.done';
   response_id: string;
+  item_id: string;
   output_index: number;
   content_index: number;
   text: string;
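Both text events now carry `item_id`, so a consumer can route streamed text to the specific output item it belongs to rather than keying on the response alone. A sketch of hypothetical consumer code (event types as declared above; the import path is assumed):

    import type * as api_proto from './api_proto.js';

    // Accumulate streamed text per output item.
    const textByItem = new Map<string, string>();

    function onTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
      const prev = textByItem.get(event.item_id) ?? '';
      textByItem.set(event.item_id, prev + event.delta);
    }

    function onTextDone(event: api_proto.ResponseTextDoneEvent): void {
      // `text` is the complete item text; item_id says which item finished.
      textByItem.set(event.item_id, event.text);
    }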
package/src/realtime/realtime_model.ts
CHANGED

@@ -34,6 +34,8 @@ const BASE_URL = 'https://api.openai.com/v1';

 const MOCK_AUDIO_ID_PREFIX = 'lk_mock_audio_item_';

+type Modality = 'text' | 'audio';
+
 interface RealtimeOptions {
   model: api_proto.Model;
   voice: api_proto.Voice;

@@ -54,6 +56,7 @@ interface RealtimeOptions {
   maxSessionDuration: number;
   // reset the connection after this many seconds if provided
   connOptions: APIConnectOptions;
+  modalities: Modality[];
 }

 interface MessageGeneration {

@@ -61,6 +64,7 @@ interface MessageGeneration {
   textChannel: stream.StreamChannel<string>;
   audioChannel: stream.StreamChannel<AudioFrame>;
   audioTranscript: string;
+  modalities: Future<('text' | 'audio')[]>;
 }

 interface ResponseGeneration {

@@ -125,6 +129,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
   maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
   maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
   connOptions: DEFAULT_API_CONNECT_OPTIONS,
+  modalities: ['text', 'audio'] as Modality[],
 };
 export class RealtimeModel extends llm.RealtimeModel {
   sampleRate = api_proto.SAMPLE_RATE;

@@ -142,6 +147,7 @@ export class RealtimeModel extends llm.RealtimeModel {
     temperature?: number;
     toolChoice?: llm.ToolChoice;
     baseURL?: string;
+    modalities?: Modality[];
     inputAudioTranscription?: api_proto.InputAudioTranscription | null;
     // TODO(shubhra): add inputAudioNoiseReduction
     turnDetection?: api_proto.TurnDetectionType | null;

@@ -155,11 +161,15 @@ export class RealtimeModel extends llm.RealtimeModel {
     connOptions?: APIConnectOptions;
   } = {},
   ) {
+    const modalities = (options.modalities ||
+      DEFAULT_REALTIME_MODEL_OPTIONS.modalities) as Modality[];
+
     super({
       messageTruncation: true,
       turnDetection: options.turnDetection !== null,
       userTranscription: options.inputAudioTranscription !== null,
       autoToolReplyGeneration: false,
+      audioOutput: modalities.includes('audio'),
     });

     const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);

@@ -188,13 +198,15 @@ export class RealtimeModel extends llm.RealtimeModel {
       options.baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
     }

+    const { modalities: _, ...optionsWithoutModalities } = options;
     this._options = {
       ...DEFAULT_REALTIME_MODEL_OPTIONS,
-      ...options,
+      ...optionsWithoutModalities,
       baseURL: options.baseURL || BASE_URL,
       apiKey,
       isAzure,
       model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
+      modalities,
     };
   }

@@ -389,6 +401,12 @@ export class RealtimeSession extends llm.RealtimeSession {
   }

   private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
+    // OpenAI supports ['text'] or ['text', 'audio'] (audio always includes text transcript)
+    // We normalize to ensure 'text' is always present when using audio
+    const modalities: Modality[] = this.oaiRealtimeModel._options.modalities.includes('audio')
+      ? ['text', 'audio']
+      : ['text'];
+
     return {
       type: 'session.update',
       session: {

@@ -396,7 +414,7 @@ export class RealtimeSession extends llm.RealtimeSession {
         voice: this.oaiRealtimeModel._options.voice,
         input_audio_format: 'pcm16',
         output_audio_format: 'pcm16',
-        modalities:
+        modalities: modalities,
         turn_detection: this.oaiRealtimeModel._options.turnDetection,
         input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
         // TODO(shubhra): add inputAudioNoiseReduction

@@ -592,7 +610,7 @@ export class RealtimeSession extends llm.RealtimeSession {

   pushAudio(frame: AudioFrame): void {
     for (const f of this.resampleAudio(frame)) {
-      for (const nf of this.bstream.write(f.data.buffer)) {
+      for (const nf of this.bstream.write(f.data.buffer as ArrayBuffer)) {
         this.sendEvent({
           type: 'input_audio_buffer.append',
           audio: Buffer.from(nf.data.buffer).toString('base64'),

@@ -632,13 +650,38 @@ export class RealtimeSession extends llm.RealtimeSession {
     } as api_proto.ResponseCancelEvent);
   }

-  async truncate(_options: {
-
-
-
-
-
-
+  async truncate(_options: {
+    messageId: string;
+    audioEndMs: number;
+    modalities?: Modality[];
+    audioTranscript?: string;
+  }): Promise<void> {
+    if (!_options.modalities || _options.modalities.includes('audio')) {
+      this.sendEvent({
+        type: 'conversation.item.truncate',
+        content_index: 0,
+        item_id: _options.messageId,
+        audio_end_ms: _options.audioEndMs,
+      } as api_proto.ConversationItemTruncateEvent);
+    } else if (_options.audioTranscript !== undefined) {
+      // sync it to the remote chat context
+      const chatCtx = this.chatCtx.copy();
+      const idx = chatCtx.indexById(_options.messageId);
+      if (idx !== undefined) {
+        const item = chatCtx.items[idx];
+        if (item && item.type === 'message') {
+          const newItem = llm.ChatMessage.create({
+            ...item,
+            content: [_options.audioTranscript],
+          });
+          chatCtx.items[idx] = newItem;
+          const events = this.createChatCtxUpdateEvents(chatCtx);
+          for (const ev of events) {
+            this.sendEvent(ev);
+          }
+        }
+      }
+    }
   }

   private loggableEvent(
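The rewritten `truncate()` branches on modalities: when audio is (or may be) present it still emits `conversation.item.truncate`, while a text-only truncation instead rewrites the message in the remote chat context from the supplied transcript. A hypothetical call showing the text-only path (`session` stands for a connected `RealtimeSession`; the id and strings are invented):

    // A text-only reply was interrupted mid-stream: keep only the
    // delivered portion by syncing the transcript to the remote context.
    await session.truncate({
      messageId: 'item_abc123', // hypothetical item id
      audioEndMs: 0, // unused on the text-only path
      modalities: ['text'],
      audioTranscript: 'The portion of the reply that was delivered',
    });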
@@ -907,6 +950,12 @@ export class RealtimeSession extends llm.RealtimeSession {
       case 'response.content_part.done':
         this.handleResponseContentPartDone(event);
         break;
+      case 'response.text.delta':
+        this.handleResponseTextDelta(event);
+        break;
+      case 'response.text.done':
+        this.handleResponseTextDone(event);
+        break;
       case 'response.audio_transcript.delta':
         this.handleResponseAudioTranscriptDelta(event);
         break;

@@ -1049,6 +1098,35 @@ export class RealtimeSession extends llm.RealtimeSession {
       this.textModeRecoveryRetries = 0;
       return;
     }
+
+    const itemId = event.item.id;
+    if (!itemId) {
+      throw new Error('item.id is not set');
+    }
+
+    const modalitiesFut = new Future<Modality[]>();
+    const itemGeneration: MessageGeneration = {
+      messageId: itemId,
+      textChannel: stream.createStreamChannel<string>(),
+      audioChannel: stream.createStreamChannel<AudioFrame>(),
+      audioTranscript: '',
+      modalities: modalitiesFut,
+    };
+
+    // If audioOutput is not supported, close audio channel immediately
+    if (!this.oaiRealtimeModel.capabilities.audioOutput) {
+      itemGeneration.audioChannel.close();
+      modalitiesFut.resolve(['text']);
+    }
+
+    this.currentGeneration.messageChannel.write({
+      messageId: itemId,
+      textStream: itemGeneration.textChannel.stream(),
+      audioStream: itemGeneration.audioChannel.stream(),
+      modalities: modalitiesFut.await,
+    });
+
+    this.currentGeneration.messages.set(itemId, itemGeneration);
   }

   private handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {

@@ -1125,39 +1203,24 @@ export class RealtimeSession extends llm.RealtimeSession {

     const itemId = event.item_id;
     const itemType = event.part.type;
-    const responseId = event.response_id;

-
-
-
-
-
-        'recovered from text-only response',
-      );
-      this.textModeRecoveryRetries = 0;
-    }
+    const itemGeneration = this.currentGeneration.messages.get(itemId);
+    if (!itemGeneration) {
+      this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
+      return;
+    }

-
-
-
-      audioChannel: stream.createStreamChannel<AudioFrame>(),
-      audioTranscript: '',
-    };
-
-    this.currentGeneration.messageChannel.write({
-      messageId: itemId,
-      textStream: itemGeneration.textChannel.stream(),
-      audioStream: itemGeneration.audioChannel.stream(),
-    });
+    if (itemType === 'text' && this.oaiRealtimeModel.capabilities.audioOutput) {
+      this.#logger.warn('Text response received from OpenAI Realtime API in audio modality.');
+    }

-
+    if (!itemGeneration.modalities.done) {
+      const modalityResult: Modality[] = itemType === 'text' ? ['text'] : ['audio', 'text'];
+      itemGeneration.modalities.resolve(modalityResult);
+    }
+
+    if (this.currentGeneration._firstTokenTimestamp === undefined) {
       this.currentGeneration._firstTokenTimestamp = Date.now();
-      return;
-    } else {
-      this.interrupt();
-      if (this.textModeRecoveryRetries === 0) {
-        this.#logger.warn({ responseId }, 'received text-only response from OpenAI Realtime API');
-      }
     }
   }

@@ -1173,6 +1236,33 @@ export class RealtimeSession extends llm.RealtimeSession {
     // TODO(shubhra): handle text mode recovery
   }

+  private handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
+
+    const itemGeneration = this.currentGeneration.messages.get(event.item_id);
+    if (!itemGeneration) {
+      throw new Error('itemGeneration is not set');
+    }
+
+    if (
+      !this.oaiRealtimeModel.capabilities.audioOutput &&
+      !this.currentGeneration._firstTokenTimestamp
+    ) {
+      this.currentGeneration._firstTokenTimestamp = Date.now();
+    }
+
+    itemGeneration.textChannel.write(event.delta);
+    itemGeneration.audioTranscript += event.delta;
+  }
+
+  private handleResponseTextDone(_event: api_proto.ResponseTextDoneEvent): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
+  }
+
   private handleResponseAudioTranscriptDelta(
     event: api_proto.ResponseAudioTranscriptDeltaEvent,
   ): void {

@@ -1204,6 +1294,14 @@ export class RealtimeSession extends llm.RealtimeSession {
       throw new Error('itemGeneration is not set');
     }

+    if (this.currentGeneration._firstTokenTimestamp === undefined) {
+      this.currentGeneration._firstTokenTimestamp = Date.now();
+    }
+
+    if (!itemGeneration.modalities.done) {
+      itemGeneration.modalities.resolve(['audio', 'text']);
+    }
+
     const binaryString = atob(event.delta);
     const len = binaryString.length;
     const bytes = new Uint8Array(len);

@@ -1261,6 +1359,10 @@ export class RealtimeSession extends llm.RealtimeSession {
       // text response doesn't have itemGeneration
       itemGeneration.textChannel.close();
       itemGeneration.audioChannel.close();
+      if (!itemGeneration.modalities.done) {
+        // In case message modalities is not set, this shouldn't happen
+        itemGeneration.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+      }
     }
   }

@@ -1284,6 +1386,9 @@ export class RealtimeSession extends llm.RealtimeSession {
     for (const generation of this.currentGeneration.messages.values()) {
       generation.textChannel.close();
       generation.audioChannel.close();
+      if (!generation.modalities.done) {
+        generation.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+      }
     }

     this.currentGeneration.functionChannel.close();

@@ -1473,6 +1578,8 @@ function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
         role,
         content: contentList,
       } as api_proto.UserItem;
+    default:
+      throw new Error(`Unsupported item type: ${(item as any).type}`);
   }
 }
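Taken together, the realtime changes make output modalities a first-class option: `RealtimeModel` derives its `audioOutput` capability from them, sessions normalize to `['text', 'audio']` or `['text']` in `session.update`, and each message resolves a modalities future as content parts arrive. A usage sketch (hypothetical application code; a package-root `RealtimeModel` export is assumed and may instead live under a `realtime` namespace):

    import { RealtimeModel } from '@livekit/agents-plugin-openai';

    // Default remains ['text', 'audio'].
    const full = new RealtimeModel();

    // Text-only model: audioOutput is reported as unsupported, and each
    // message's audio channel is closed immediately with modalities ['text'].
    const textOnly = new RealtimeModel({ modalities: ['text'] });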