@livekit/agents-plugin-openai 1.0.30 → 1.0.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/realtime/api_proto.cjs.map +1 -1
- package/dist/realtime/api_proto.d.cts +50 -12
- package/dist/realtime/api_proto.d.ts +50 -12
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/index.cjs +19 -0
- package/dist/realtime/index.cjs.map +1 -1
- package/dist/realtime/index.d.cts +1 -0
- package/dist/realtime/index.d.ts +1 -0
- package/dist/realtime/index.d.ts.map +1 -1
- package/dist/realtime/index.js +4 -0
- package/dist/realtime/index.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +69 -33
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.cts +14 -6
- package/dist/realtime/realtime_model.d.ts +14 -6
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +69 -33
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/realtime/realtime_model_beta.cjs +1300 -0
- package/dist/realtime/realtime_model_beta.cjs.map +1 -0
- package/dist/realtime/realtime_model_beta.d.cts +165 -0
- package/dist/realtime/realtime_model_beta.d.ts +165 -0
- package/dist/realtime/realtime_model_beta.d.ts.map +1 -0
- package/dist/realtime/realtime_model_beta.js +1280 -0
- package/dist/realtime/realtime_model_beta.js.map +1 -0
- package/package.json +5 -5
- package/src/realtime/api_proto.ts +76 -17
- package/src/realtime/index.ts +1 -0
- package/src/realtime/realtime_model.ts +86 -49
- package/src/realtime/realtime_model_beta.ts +1665 -0
|
@@ -39,14 +39,13 @@ type Modality = 'text' | 'audio';
|
|
|
39
39
|
interface RealtimeOptions {
|
|
40
40
|
model: api_proto.Model;
|
|
41
41
|
voice: api_proto.Voice;
|
|
42
|
-
temperature: number;
|
|
43
42
|
toolChoice?: llm.ToolChoice;
|
|
44
43
|
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
|
|
45
|
-
|
|
44
|
+
inputAudioNoiseReduction?: api_proto.NoiseReduction | null;
|
|
46
45
|
turnDetection?: api_proto.TurnDetectionType | null;
|
|
47
46
|
maxResponseOutputTokens?: number | 'inf';
|
|
48
47
|
speed?: number;
|
|
49
|
-
|
|
48
|
+
tracing?: api_proto.TracingConfig | null;
|
|
50
49
|
apiKey?: string;
|
|
51
50
|
baseURL: string;
|
|
52
51
|
isAzure: boolean;
|
|
@@ -90,9 +89,7 @@ class CreateResponseHandle {
|
|
|
90
89
|
}
|
|
91
90
|
}
|
|
92
91
|
|
|
93
|
-
// default values got from a "default" session from their API
|
|
94
92
|
const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
|
|
95
|
-
const DEFAULT_TEMPERATURE = 0.8;
|
|
96
93
|
const DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
|
|
97
94
|
type: 'semantic_vad',
|
|
98
95
|
eagerness: 'medium',
|
|
@@ -122,14 +119,15 @@ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1000; // 20 minutes
|
|
|
122
119
|
const DEFAULT_REALTIME_MODEL_OPTIONS = {
|
|
123
120
|
model: 'gpt-realtime',
|
|
124
121
|
voice: 'marin',
|
|
125
|
-
temperature: DEFAULT_TEMPERATURE,
|
|
126
122
|
inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
|
|
123
|
+
inputAudioNoiseReduction: undefined as api_proto.NoiseReduction | undefined,
|
|
127
124
|
turnDetection: DEFAULT_TURN_DETECTION,
|
|
128
125
|
toolChoice: DEFAULT_TOOL_CHOICE,
|
|
129
126
|
maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
|
|
130
127
|
maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
|
|
131
128
|
connOptions: DEFAULT_API_CONNECT_OPTIONS,
|
|
132
129
|
modalities: ['text', 'audio'] as Modality[],
|
|
130
|
+
tracing: undefined as api_proto.TracingConfig | undefined,
|
|
133
131
|
};
|
|
134
132
|
export class RealtimeModel extends llm.RealtimeModel {
|
|
135
133
|
sampleRate = api_proto.SAMPLE_RATE;
|
|
@@ -140,19 +138,24 @@ export class RealtimeModel extends llm.RealtimeModel {
|
|
|
140
138
|
/* @internal */
|
|
141
139
|
_options: RealtimeOptions;
|
|
142
140
|
|
|
141
|
+
get model(): string {
|
|
142
|
+
return this._options.model;
|
|
143
|
+
}
|
|
144
|
+
|
|
143
145
|
constructor(
|
|
144
146
|
options: {
|
|
145
147
|
model?: string;
|
|
146
148
|
voice?: string;
|
|
149
|
+
/** @deprecated Unused in GA API (v1). Temperature is no longer supported. */
|
|
147
150
|
temperature?: number;
|
|
148
151
|
toolChoice?: llm.ToolChoice;
|
|
149
152
|
baseURL?: string;
|
|
150
153
|
modalities?: Modality[];
|
|
151
154
|
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
|
|
152
|
-
|
|
155
|
+
inputAudioNoiseReduction?: api_proto.NoiseReduction | null;
|
|
153
156
|
turnDetection?: api_proto.TurnDetectionType | null;
|
|
154
157
|
speed?: number;
|
|
155
|
-
|
|
158
|
+
tracing?: api_proto.TracingConfig | null;
|
|
156
159
|
azureDeployment?: string;
|
|
157
160
|
apiKey?: string;
|
|
158
161
|
entraToken?: string;
|
|
@@ -221,11 +224,10 @@ export class RealtimeModel extends llm.RealtimeModel {
|
|
|
221
224
|
* @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
|
|
222
225
|
* @param voice - Voice setting for audio outputs. Defaults to "alloy".
|
|
223
226
|
* @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
|
|
227
|
+
* @param inputAudioNoiseReduction - Options for noise reduction. Defaults to undefined.
|
|
224
228
|
* @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
|
|
225
|
-
* @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
|
|
226
229
|
* @param speed - Speed of the audio output. Defaults to 1.0.
|
|
227
|
-
* @param
|
|
228
|
-
* @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
|
|
230
|
+
* @param tracing - Tracing configuration. Defaults to undefined.
|
|
229
231
|
*
|
|
230
232
|
* @returns A RealtimeModel instance configured for Azure OpenAI Service.
|
|
231
233
|
*
|
|
@@ -239,10 +241,12 @@ export class RealtimeModel extends llm.RealtimeModel {
|
|
|
239
241
|
entraToken,
|
|
240
242
|
baseURL,
|
|
241
243
|
voice = 'alloy',
|
|
244
|
+
temperature, // eslint-disable-line @typescript-eslint/no-unused-vars
|
|
242
245
|
inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
|
|
246
|
+
inputAudioNoiseReduction,
|
|
243
247
|
turnDetection = AZURE_DEFAULT_TURN_DETECTION,
|
|
244
|
-
temperature = 0.8,
|
|
245
248
|
speed,
|
|
249
|
+
tracing,
|
|
246
250
|
}: {
|
|
247
251
|
azureDeployment: string;
|
|
248
252
|
azureEndpoint?: string;
|
|
@@ -251,11 +255,13 @@ export class RealtimeModel extends llm.RealtimeModel {
|
|
|
251
255
|
entraToken?: string;
|
|
252
256
|
baseURL?: string;
|
|
253
257
|
voice?: string;
|
|
258
|
+
/** @deprecated Unused in GA API (v1). Temperature is no longer supported. */
|
|
259
|
+
temperature?: number;
|
|
254
260
|
inputAudioTranscription?: api_proto.InputAudioTranscription;
|
|
255
|
-
|
|
261
|
+
inputAudioNoiseReduction?: api_proto.NoiseReduction;
|
|
256
262
|
turnDetection?: api_proto.TurnDetectionType;
|
|
257
|
-
temperature?: number;
|
|
258
263
|
speed?: number;
|
|
264
|
+
tracing?: api_proto.TracingConfig;
|
|
259
265
|
}) {
|
|
260
266
|
apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
|
|
261
267
|
if (!apiKey && !entraToken) {
|
|
@@ -284,9 +290,10 @@ export class RealtimeModel extends llm.RealtimeModel {
|
|
|
284
290
|
return new RealtimeModel({
|
|
285
291
|
voice,
|
|
286
292
|
inputAudioTranscription,
|
|
293
|
+
inputAudioNoiseReduction,
|
|
287
294
|
turnDetection,
|
|
288
|
-
temperature,
|
|
289
295
|
speed,
|
|
296
|
+
tracing,
|
|
290
297
|
apiKey,
|
|
291
298
|
azureDeployment,
|
|
292
299
|
apiVersion,
|
|
@@ -401,32 +408,38 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
401
408
|
}
|
|
402
409
|
|
|
403
410
|
private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
const
|
|
407
|
-
?
|
|
408
|
-
:
|
|
411
|
+
const audioFormat: api_proto.AudioFormat = { type: 'audio/pcm', rate: SAMPLE_RATE };
|
|
412
|
+
|
|
413
|
+
const modality: Modality = this.oaiRealtimeModel._options.modalities.includes('audio')
|
|
414
|
+
? 'audio'
|
|
415
|
+
: 'text';
|
|
409
416
|
|
|
410
417
|
return {
|
|
411
418
|
type: 'session.update',
|
|
412
419
|
session: {
|
|
420
|
+
type: 'realtime',
|
|
413
421
|
model: this.oaiRealtimeModel._options.model,
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
422
|
+
output_modalities: [modality],
|
|
423
|
+
audio: {
|
|
424
|
+
input: {
|
|
425
|
+
format: audioFormat,
|
|
426
|
+
noise_reduction: this.oaiRealtimeModel._options.inputAudioNoiseReduction,
|
|
427
|
+
transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
|
|
428
|
+
turn_detection: this.oaiRealtimeModel._options.turnDetection,
|
|
429
|
+
},
|
|
430
|
+
output: {
|
|
431
|
+
format: audioFormat,
|
|
432
|
+
speed: this.oaiRealtimeModel._options.speed,
|
|
433
|
+
voice: this.oaiRealtimeModel._options.voice,
|
|
434
|
+
},
|
|
435
|
+
},
|
|
436
|
+
max_output_tokens:
|
|
424
437
|
this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity
|
|
425
438
|
? 'inf'
|
|
426
439
|
: this.oaiRealtimeModel._options.maxResponseOutputTokens,
|
|
427
|
-
|
|
440
|
+
tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
|
|
441
|
+
tracing: this.oaiRealtimeModel._options.tracing,
|
|
428
442
|
instructions: this.instructions,
|
|
429
|
-
speed: this.oaiRealtimeModel._options.speed,
|
|
430
443
|
},
|
|
431
444
|
};
|
|
432
445
|
}
|
|
@@ -574,6 +587,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
574
587
|
return {
|
|
575
588
|
type: 'session.update',
|
|
576
589
|
session: {
|
|
590
|
+
type: 'realtime',
|
|
577
591
|
model: this.oaiRealtimeModel._options.model,
|
|
578
592
|
tools: oaiTools,
|
|
579
593
|
},
|
|
@@ -586,6 +600,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
586
600
|
this.sendEvent({
|
|
587
601
|
type: 'session.update',
|
|
588
602
|
session: {
|
|
603
|
+
type: 'realtime',
|
|
589
604
|
instructions: _instructions,
|
|
590
605
|
},
|
|
591
606
|
event_id: eventId,
|
|
@@ -594,7 +609,9 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
594
609
|
}
|
|
595
610
|
|
|
596
611
|
updateOptions({ toolChoice }: { toolChoice?: llm.ToolChoice }): void {
|
|
597
|
-
const options: api_proto.SessionUpdateEvent['session'] = {
|
|
612
|
+
const options: api_proto.SessionUpdateEvent['session'] = {
|
|
613
|
+
type: 'realtime',
|
|
614
|
+
};
|
|
598
615
|
|
|
599
616
|
this.oaiRealtimeModel._options.toolChoice = toolChoice;
|
|
600
617
|
options.tool_choice = toOaiToolChoice(toolChoice);
|
|
@@ -724,8 +741,12 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
724
741
|
throw new Error('Microsoft API key or entraToken is required');
|
|
725
742
|
}
|
|
726
743
|
} else {
|
|
744
|
+
if (!this.oaiRealtimeModel._options.apiKey) {
|
|
745
|
+
throw new Error(
|
|
746
|
+
'OpenAI API key is required but not set. Check OPENAI_API_KEY environment variable.',
|
|
747
|
+
);
|
|
748
|
+
}
|
|
727
749
|
headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
|
|
728
|
-
headers['OpenAI-Beta'] = 'realtime=v1';
|
|
729
750
|
}
|
|
730
751
|
|
|
731
752
|
const url = processBaseURL({
|
|
@@ -912,7 +933,8 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
912
933
|
};
|
|
913
934
|
|
|
914
935
|
wsConn.onmessage = (message: MessageEvent) => {
|
|
915
|
-
|
|
936
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
937
|
+
const event: any = JSON.parse(message.data as string);
|
|
916
938
|
|
|
917
939
|
this.emit('openai_server_event_received', event);
|
|
918
940
|
if (lkOaiDebug) {
|
|
@@ -932,7 +954,8 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
932
954
|
case 'response.output_item.added':
|
|
933
955
|
this.handleResponseOutputItemAdded(event);
|
|
934
956
|
break;
|
|
935
|
-
case 'conversation.item.created':
|
|
957
|
+
case 'conversation.item.added':
|
|
958
|
+
case 'conversation.item.created': // Beta: kept for backward compatibility
|
|
936
959
|
this.handleConversationItemCreated(event);
|
|
937
960
|
break;
|
|
938
961
|
case 'conversation.item.deleted':
|
|
@@ -950,22 +973,28 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
950
973
|
case 'response.content_part.done':
|
|
951
974
|
this.handleResponseContentPartDone(event);
|
|
952
975
|
break;
|
|
953
|
-
case 'response.text.delta':
|
|
976
|
+
case 'response.output_text.delta':
|
|
977
|
+
case 'response.text.delta': // Beta: kept for backward compatibility
|
|
954
978
|
this.handleResponseTextDelta(event);
|
|
955
979
|
break;
|
|
956
|
-
case 'response.text.done':
|
|
980
|
+
case 'response.output_text.done':
|
|
981
|
+
case 'response.text.done': // Beta: kept for backward compatibility
|
|
957
982
|
this.handleResponseTextDone(event);
|
|
958
983
|
break;
|
|
959
|
-
case 'response.audio_transcript.delta':
|
|
984
|
+
case 'response.output_audio_transcript.delta':
|
|
985
|
+
case 'response.audio_transcript.delta': // Beta: kept for backward compatibility
|
|
960
986
|
this.handleResponseAudioTranscriptDelta(event);
|
|
961
987
|
break;
|
|
962
|
-
case 'response.audio.delta':
|
|
988
|
+
case 'response.output_audio.delta':
|
|
989
|
+
case 'response.audio.delta': // Beta: kept for backward compatibility
|
|
963
990
|
this.handleResponseAudioDelta(event);
|
|
964
991
|
break;
|
|
965
|
-
case 'response.audio_transcript.done':
|
|
992
|
+
case 'response.output_audio_transcript.done':
|
|
993
|
+
case 'response.audio_transcript.done': // Beta: kept for backward compatibility
|
|
966
994
|
this.handleResponseAudioTranscriptDone(event);
|
|
967
995
|
break;
|
|
968
|
-
case 'response.audio.done':
|
|
996
|
+
case 'response.output_audio.done':
|
|
997
|
+
case 'response.audio.done': // Beta: kept for backward compatibility
|
|
969
998
|
this.handleResponseAudioDone(event);
|
|
970
999
|
break;
|
|
971
1000
|
case 'response.output_item.done':
|
|
@@ -1059,6 +1088,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1059
1088
|
messageStream: this.currentGeneration.messageChannel.stream(),
|
|
1060
1089
|
functionStream: this.currentGeneration.functionChannel.stream(),
|
|
1061
1090
|
userInitiated: false,
|
|
1091
|
+
responseId: event.response.id,
|
|
1062
1092
|
} as llm.GenerationCreatedEvent;
|
|
1063
1093
|
|
|
1064
1094
|
const clientEventId = event.response.metadata?.client_event_id;
|
|
@@ -1210,12 +1240,13 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1210
1240
|
return;
|
|
1211
1241
|
}
|
|
1212
1242
|
|
|
1213
|
-
|
|
1243
|
+
const isTextType = itemType === 'text' || itemType === 'output_text';
|
|
1244
|
+
if (isTextType && this.oaiRealtimeModel.capabilities.audioOutput) {
|
|
1214
1245
|
this.#logger.warn('Text response received from OpenAI Realtime API in audio modality.');
|
|
1215
1246
|
}
|
|
1216
1247
|
|
|
1217
1248
|
if (!itemGeneration.modalities.done) {
|
|
1218
|
-
const modalityResult: Modality[] =
|
|
1249
|
+
const modalityResult: Modality[] = isTextType ? ['text'] : ['audio', 'text'];
|
|
1219
1250
|
itemGeneration.modalities.resolve(modalityResult);
|
|
1220
1251
|
}
|
|
1221
1252
|
|
|
@@ -1225,6 +1256,9 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1225
1256
|
}
|
|
1226
1257
|
|
|
1227
1258
|
private handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
|
|
1259
|
+
if (!event.part) {
|
|
1260
|
+
return;
|
|
1261
|
+
}
|
|
1228
1262
|
if (event.part.type !== 'text') {
|
|
1229
1263
|
return;
|
|
1230
1264
|
}
|
|
@@ -1346,11 +1380,13 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1346
1380
|
if (!item.call_id || !item.name || !item.arguments) {
|
|
1347
1381
|
throw new Error('item is not a function call');
|
|
1348
1382
|
}
|
|
1349
|
-
this.currentGeneration.functionChannel.write(
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1383
|
+
this.currentGeneration.functionChannel.write(
|
|
1384
|
+
llm.FunctionCall.create({
|
|
1385
|
+
callId: item.call_id,
|
|
1386
|
+
name: item.name,
|
|
1387
|
+
args: item.arguments,
|
|
1388
|
+
}),
|
|
1389
|
+
);
|
|
1354
1390
|
} else if (itemType === 'message') {
|
|
1355
1391
|
const itemGeneration = this.currentGeneration.messages.get(itemId);
|
|
1356
1392
|
if (!itemGeneration) {
|
|
@@ -1518,6 +1554,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1518
1554
|
messageStream: this.currentGeneration.messageChannel.stream(),
|
|
1519
1555
|
functionStream: this.currentGeneration.functionChannel.stream(),
|
|
1520
1556
|
userInitiated: false,
|
|
1557
|
+
responseId,
|
|
1521
1558
|
} as llm.GenerationCreatedEvent;
|
|
1522
1559
|
|
|
1523
1560
|
const handle = this.responseCreatedFutures[responseId];
|