@livekit/agents-plugin-openai 1.0.30 → 1.0.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,14 +39,13 @@ type Modality = 'text' | 'audio';
 interface RealtimeOptions {
   model: api_proto.Model;
   voice: api_proto.Voice;
-  temperature: number;
   toolChoice?: llm.ToolChoice;
   inputAudioTranscription?: api_proto.InputAudioTranscription | null;
-  // TODO(shubhra): add inputAudioNoiseReduction
+  inputAudioNoiseReduction?: api_proto.NoiseReduction | null;
   turnDetection?: api_proto.TurnDetectionType | null;
   maxResponseOutputTokens?: number | 'inf';
   speed?: number;
-  // TODO(shubhra): add openai tracing options
+  tracing?: api_proto.TracingConfig | null;
   apiKey?: string;
   baseURL: string;
   isAzure: boolean;
@@ -90,9 +89,7 @@ class CreateResponseHandle {
   }
 }
 
-// default values got from a "default" session from their API
 const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
-const DEFAULT_TEMPERATURE = 0.8;
 const DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
   type: 'semantic_vad',
   eagerness: 'medium',
@@ -122,14 +119,15 @@ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1000; // 20 minutes
 const DEFAULT_REALTIME_MODEL_OPTIONS = {
   model: 'gpt-realtime',
   voice: 'marin',
-  temperature: DEFAULT_TEMPERATURE,
   inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+  inputAudioNoiseReduction: undefined as api_proto.NoiseReduction | undefined,
   turnDetection: DEFAULT_TURN_DETECTION,
   toolChoice: DEFAULT_TOOL_CHOICE,
   maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
   maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
   connOptions: DEFAULT_API_CONNECT_OPTIONS,
   modalities: ['text', 'audio'] as Modality[],
+  tracing: undefined as api_proto.TracingConfig | undefined,
 };
 export class RealtimeModel extends llm.RealtimeModel {
   sampleRate = api_proto.SAMPLE_RATE;
@@ -140,19 +138,24 @@ export class RealtimeModel extends llm.RealtimeModel {
   /* @internal */
   _options: RealtimeOptions;
 
+  get model(): string {
+    return this._options.model;
+  }
+
   constructor(
     options: {
      model?: string;
      voice?: string;
+      /** @deprecated Unused in GA API (v1). Temperature is no longer supported. */
      temperature?: number;
      toolChoice?: llm.ToolChoice;
      baseURL?: string;
      modalities?: Modality[];
      inputAudioTranscription?: api_proto.InputAudioTranscription | null;
-      // TODO(shubhra): add inputAudioNoiseReduction
+      inputAudioNoiseReduction?: api_proto.NoiseReduction | null;
      turnDetection?: api_proto.TurnDetectionType | null;
      speed?: number;
-      // TODO(shubhra): add openai tracing options
+      tracing?: api_proto.TracingConfig | null;
      azureDeployment?: string;
      apiKey?: string;
      entraToken?: string;
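
With this change the RealtimeModel constructor accepts inputAudioNoiseReduction and tracing directly, and temperature becomes a deprecated no-op. A minimal usage sketch (the option names come from this diff; the import style and the literal values 'near_field' and 'auto' are assumptions based on LiveKit Agents conventions and the OpenAI GA Realtime API, not confirmed by this diff):

import * as openai from '@livekit/agents-plugin-openai';

// Hypothetical construction; the two option values below are assumed shapes.
const model = new openai.realtime.RealtimeModel({
  voice: 'marin',
  inputAudioNoiseReduction: { type: 'near_field' }, // assumed NoiseReduction shape
  tracing: 'auto', // assumed TracingConfig value
});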
@@ -221,11 +224,10 @@ export class RealtimeModel extends llm.RealtimeModel {
    * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
    * @param voice - Voice setting for audio outputs. Defaults to "alloy".
    * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
+   * @param inputAudioNoiseReduction - Options for noise reduction. Defaults to undefined.
    * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
-   * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
    * @param speed - Speed of the audio output. Defaults to 1.0.
-   * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
-   * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
+   * @param tracing - Tracing configuration. Defaults to undefined.
    *
    * @returns A RealtimeModel instance configured for Azure OpenAI Service.
    *
@@ -239,10 +241,12 @@ export class RealtimeModel extends llm.RealtimeModel {
     entraToken,
     baseURL,
     voice = 'alloy',
+    temperature, // eslint-disable-line @typescript-eslint/no-unused-vars
     inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+    inputAudioNoiseReduction,
     turnDetection = AZURE_DEFAULT_TURN_DETECTION,
-    temperature = 0.8,
     speed,
+    tracing,
   }: {
     azureDeployment: string;
     azureEndpoint?: string;
@@ -251,11 +255,13 @@ export class RealtimeModel extends llm.RealtimeModel {
     entraToken?: string;
     baseURL?: string;
     voice?: string;
+    /** @deprecated Unused in GA API (v1). Temperature is no longer supported. */
+    temperature?: number;
     inputAudioTranscription?: api_proto.InputAudioTranscription;
-    // TODO(shubhra): add inputAudioNoiseReduction
+    inputAudioNoiseReduction?: api_proto.NoiseReduction;
     turnDetection?: api_proto.TurnDetectionType;
-    temperature?: number;
     speed?: number;
+    tracing?: api_proto.TracingConfig;
   }) {
     apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
     if (!apiKey && !entraToken) {
@@ -284,9 +290,10 @@ export class RealtimeModel extends llm.RealtimeModel {
     return new RealtimeModel({
       voice,
       inputAudioTranscription,
+      inputAudioNoiseReduction,
       turnDetection,
-      temperature,
       speed,
+      tracing,
       apiKey,
       azureDeployment,
       apiVersion,
@@ -401,32 +408,38 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
-    // OpenAI supports ['text'] or ['text', 'audio'] (audio always includes text transcript)
-    // We normalize to ensure 'text' is always present when using audio
-    const modalities: Modality[] = this.oaiRealtimeModel._options.modalities.includes('audio')
-      ? ['text', 'audio']
-      : ['text'];
+    const audioFormat: api_proto.AudioFormat = { type: 'audio/pcm', rate: SAMPLE_RATE };
+
+    const modality: Modality = this.oaiRealtimeModel._options.modalities.includes('audio')
+      ? 'audio'
+      : 'text';
 
     return {
       type: 'session.update',
       session: {
+        type: 'realtime',
        model: this.oaiRealtimeModel._options.model,
-        voice: this.oaiRealtimeModel._options.voice,
-        input_audio_format: 'pcm16',
-        output_audio_format: 'pcm16',
-        modalities: modalities,
-        turn_detection: this.oaiRealtimeModel._options.turnDetection,
-        input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
-        // TODO(shubhra): add inputAudioNoiseReduction
-        temperature: this.oaiRealtimeModel._options.temperature,
-        tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
-        max_response_output_tokens:
+        output_modalities: [modality],
+        audio: {
+          input: {
+            format: audioFormat,
+            noise_reduction: this.oaiRealtimeModel._options.inputAudioNoiseReduction,
+            transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
+            turn_detection: this.oaiRealtimeModel._options.turnDetection,
+          },
+          output: {
+            format: audioFormat,
+            speed: this.oaiRealtimeModel._options.speed,
+            voice: this.oaiRealtimeModel._options.voice,
+          },
+        },
+        max_output_tokens:
          this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity
            ? 'inf'
            : this.oaiRealtimeModel._options.maxResponseOutputTokens,
-        // TODO(shubhra): add tracing options
+        tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
+        tracing: this.oaiRealtimeModel._options.tracing,
        instructions: this.instructions,
-        speed: this.oaiRealtimeModel._options.speed,
      },
    };
  }
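
The hunk above migrates createSessionUpdateEvent() from the beta API's flat session fields (voice, input_audio_format, modalities, temperature, ...) to the GA API's nested audio layout. Roughly, the event it now emits looks like the sketch below, shown with the package defaults filled in (the 24000 sample rate for SAMPLE_RATE and the 'auto' tool choice are assumptions; instructions and transcription options are elided):

// Sketch of the GA-shape session.update payload; values assumed from defaults.
const sessionUpdate = {
  type: 'session.update',
  session: {
    type: 'realtime',
    model: 'gpt-realtime',
    output_modalities: ['audio'],
    audio: {
      input: {
        format: { type: 'audio/pcm', rate: 24000 },
        turn_detection: { type: 'semantic_vad', eagerness: 'medium' },
      },
      output: {
        format: { type: 'audio/pcm', rate: 24000 },
        voice: 'marin',
      },
    },
    max_output_tokens: 'inf',
    tool_choice: 'auto',
  },
};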
@@ -574,6 +587,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     return {
       type: 'session.update',
       session: {
+        type: 'realtime',
         model: this.oaiRealtimeModel._options.model,
         tools: oaiTools,
       },
@@ -586,6 +600,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     this.sendEvent({
       type: 'session.update',
       session: {
+        type: 'realtime',
         instructions: _instructions,
       },
       event_id: eventId,
@@ -594,7 +609,9 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   updateOptions({ toolChoice }: { toolChoice?: llm.ToolChoice }): void {
-    const options: api_proto.SessionUpdateEvent['session'] = {};
+    const options: api_proto.SessionUpdateEvent['session'] = {
+      type: 'realtime',
+    };
 
     this.oaiRealtimeModel._options.toolChoice = toolChoice;
     options.tool_choice = toOaiToolChoice(toolChoice);
@@ -724,8 +741,12 @@ export class RealtimeSession extends llm.RealtimeSession {
        throw new Error('Microsoft API key or entraToken is required');
      }
    } else {
+      if (!this.oaiRealtimeModel._options.apiKey) {
+        throw new Error(
+          'OpenAI API key is required but not set. Check OPENAI_API_KEY environment variable.',
+        );
+      }
      headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
-      headers['OpenAI-Beta'] = 'realtime=v1';
    }
 
    const url = processBaseURL({
@@ -912,7 +933,8 @@ export class RealtimeSession extends llm.RealtimeSession {
    };
 
    wsConn.onmessage = (message: MessageEvent) => {
-      const event: api_proto.ServerEvent = JSON.parse(message.data as string);
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const event: any = JSON.parse(message.data as string);
 
      this.emit('openai_server_event_received', event);
      if (lkOaiDebug) {
@@ -932,7 +954,8 @@ export class RealtimeSession extends llm.RealtimeSession {
        case 'response.output_item.added':
          this.handleResponseOutputItemAdded(event);
          break;
-        case 'conversation.item.created':
+        case 'conversation.item.added':
+        case 'conversation.item.created': // Beta: kept for backward compatibility
          this.handleConversationItemCreated(event);
          break;
        case 'conversation.item.deleted':
@@ -950,22 +973,28 @@ export class RealtimeSession extends llm.RealtimeSession {
        case 'response.content_part.done':
          this.handleResponseContentPartDone(event);
          break;
-        case 'response.text.delta':
+        case 'response.output_text.delta':
+        case 'response.text.delta': // Beta: kept for backward compatibility
          this.handleResponseTextDelta(event);
          break;
-        case 'response.text.done':
+        case 'response.output_text.done':
+        case 'response.text.done': // Beta: kept for backward compatibility
          this.handleResponseTextDone(event);
          break;
-        case 'response.audio_transcript.delta':
+        case 'response.output_audio_transcript.delta':
+        case 'response.audio_transcript.delta': // Beta: kept for backward compatibility
          this.handleResponseAudioTranscriptDelta(event);
          break;
-        case 'response.audio.delta':
+        case 'response.output_audio.delta':
+        case 'response.audio.delta': // Beta: kept for backward compatibility
          this.handleResponseAudioDelta(event);
          break;
-        case 'response.audio_transcript.done':
+        case 'response.output_audio_transcript.done':
+        case 'response.audio_transcript.done': // Beta: kept for backward compatibility
          this.handleResponseAudioTranscriptDone(event);
          break;
-        case 'response.audio.done':
+        case 'response.output_audio.done':
+        case 'response.audio.done': // Beta: kept for backward compatibility
          this.handleResponseAudioDone(event);
          break;
        case 'response.output_item.done':
@@ -1059,6 +1088,7 @@ export class RealtimeSession extends llm.RealtimeSession {
      messageStream: this.currentGeneration.messageChannel.stream(),
      functionStream: this.currentGeneration.functionChannel.stream(),
      userInitiated: false,
+      responseId: event.response.id,
    } as llm.GenerationCreatedEvent;
 
    const clientEventId = event.response.metadata?.client_event_id;
@@ -1210,12 +1240,13 @@ export class RealtimeSession extends llm.RealtimeSession {
      return;
    }
 
-    if (itemType === 'text' && this.oaiRealtimeModel.capabilities.audioOutput) {
+    const isTextType = itemType === 'text' || itemType === 'output_text';
+    if (isTextType && this.oaiRealtimeModel.capabilities.audioOutput) {
      this.#logger.warn('Text response received from OpenAI Realtime API in audio modality.');
    }
 
    if (!itemGeneration.modalities.done) {
-      const modalityResult: Modality[] = itemType === 'text' ? ['text'] : ['audio', 'text'];
+      const modalityResult: Modality[] = isTextType ? ['text'] : ['audio', 'text'];
      itemGeneration.modalities.resolve(modalityResult);
    }
 
@@ -1225,6 +1256,9 @@ export class RealtimeSession extends llm.RealtimeSession {
  }
 
  private handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
+    if (!event.part) {
+      return;
+    }
    if (event.part.type !== 'text') {
      return;
    }
@@ -1346,11 +1380,13 @@ export class RealtimeSession extends llm.RealtimeSession {
      if (!item.call_id || !item.name || !item.arguments) {
        throw new Error('item is not a function call');
      }
-      this.currentGeneration.functionChannel.write({
-        callId: item.call_id,
-        name: item.name,
-        args: item.arguments,
-      } as llm.FunctionCall);
+      this.currentGeneration.functionChannel.write(
+        llm.FunctionCall.create({
+          callId: item.call_id,
+          name: item.name,
+          args: item.arguments,
+        }),
+      );
    } else if (itemType === 'message') {
      const itemGeneration = this.currentGeneration.messages.get(itemId);
      if (!itemGeneration) {
@@ -1518,6 +1554,7 @@ export class RealtimeSession extends llm.RealtimeSession {
      messageStream: this.currentGeneration.messageChannel.stream(),
      functionStream: this.currentGeneration.functionChannel.stream(),
      userInitiated: false,
+      responseId,
    } as llm.GenerationCreatedEvent;
 
    const handle = this.responseCreatedFutures[responseId];