@livekit/agents-plugin-openai 1.0.31 → 1.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,7 +52,6 @@ class CreateResponseHandle {
52
52
  }
53
53
  }
54
54
  const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
55
- const DEFAULT_TEMPERATURE = 0.8;
56
55
  const DEFAULT_TURN_DETECTION = {
57
56
  type: "semantic_vad",
58
57
  eagerness: "medium",
@@ -78,14 +77,15 @@ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1e3;
78
77
  const DEFAULT_REALTIME_MODEL_OPTIONS = {
79
78
  model: "gpt-realtime",
80
79
  voice: "marin",
81
- temperature: DEFAULT_TEMPERATURE,
82
80
  inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
81
+ inputAudioNoiseReduction: void 0,
83
82
  turnDetection: DEFAULT_TURN_DETECTION,
84
83
  toolChoice: DEFAULT_TOOL_CHOICE,
85
84
  maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
86
85
  maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
87
86
  connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS,
88
- modalities: ["text", "audio"]
87
+ modalities: ["text", "audio"],
88
+ tracing: void 0
89
89
  };
90
90
  class RealtimeModel extends import_agents.llm.RealtimeModel {
91
91
  sampleRate = api_proto.SAMPLE_RATE;
@@ -94,6 +94,9 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
94
94
  outFrameSize = api_proto.OUT_FRAME_SIZE;
95
95
  /* @internal */
96
96
  _options;
97
+ get model() {
98
+ return this._options.model;
99
+ }
97
100
  constructor(options = {}) {
98
101
  const modalities = options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities;
99
102
  super({
@@ -146,11 +149,10 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
146
149
  * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
147
150
  * @param voice - Voice setting for audio outputs. Defaults to "alloy".
148
151
  * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
152
+ * @param inputAudioNoiseReduction - Options for noise reduction. Defaults to undefined.
149
153
  * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
150
- * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
151
154
  * @param speed - Speed of the audio output. Defaults to 1.0.
152
- * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
153
- * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
155
+ * @param tracing - Tracing configuration. Defaults to undefined.
154
156
  *
155
157
  * @returns A RealtimeModel instance configured for Azure OpenAI Service.
156
158
  *
@@ -164,10 +166,13 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
164
166
  entraToken,
165
167
  baseURL,
166
168
  voice = "alloy",
169
+ temperature,
170
+ // eslint-disable-line @typescript-eslint/no-unused-vars
167
171
  inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
172
+ inputAudioNoiseReduction,
168
173
  turnDetection = AZURE_DEFAULT_TURN_DETECTION,
169
- temperature = 0.8,
170
- speed
174
+ speed,
175
+ tracing
171
176
  }) {
172
177
  apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
173
178
  if (!apiKey && !entraToken) {
@@ -193,9 +198,10 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
193
198
  return new RealtimeModel({
194
199
  voice,
195
200
  inputAudioTranscription,
201
+ inputAudioNoiseReduction,
196
202
  turnDetection,
197
- temperature,
198
203
  speed,
204
+ tracing,
199
205
  apiKey,
200
206
  azureDeployment,
201
207
  apiVersion,
@@ -272,24 +278,31 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
272
278
  this.messageChannel.put(command);
273
279
  }
274
280
  createSessionUpdateEvent() {
275
- const modalities = this.oaiRealtimeModel._options.modalities.includes("audio") ? ["text", "audio"] : ["text"];
281
+ const audioFormat = { type: "audio/pcm", rate: SAMPLE_RATE };
282
+ const modality = this.oaiRealtimeModel._options.modalities.includes("audio") ? "audio" : "text";
276
283
  return {
277
284
  type: "session.update",
278
285
  session: {
286
+ type: "realtime",
279
287
  model: this.oaiRealtimeModel._options.model,
280
- voice: this.oaiRealtimeModel._options.voice,
281
- input_audio_format: "pcm16",
282
- output_audio_format: "pcm16",
283
- modalities,
284
- turn_detection: this.oaiRealtimeModel._options.turnDetection,
285
- input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
286
- // TODO(shubhra): add inputAudioNoiseReduction
287
- temperature: this.oaiRealtimeModel._options.temperature,
288
+ output_modalities: [modality],
289
+ audio: {
290
+ input: {
291
+ format: audioFormat,
292
+ noise_reduction: this.oaiRealtimeModel._options.inputAudioNoiseReduction,
293
+ transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
294
+ turn_detection: this.oaiRealtimeModel._options.turnDetection
295
+ },
296
+ output: {
297
+ format: audioFormat,
298
+ speed: this.oaiRealtimeModel._options.speed,
299
+ voice: this.oaiRealtimeModel._options.voice
300
+ }
301
+ },
302
+ max_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
288
303
  tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
289
- max_response_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
290
- // TODO(shubhra): add tracing options
291
- instructions: this.instructions,
292
- speed: this.oaiRealtimeModel._options.speed
304
+ tracing: this.oaiRealtimeModel._options.tracing,
305
+ instructions: this.instructions
293
306
  }
294
307
  };
295
308
  }
@@ -405,6 +418,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
405
418
  return {
406
419
  type: "session.update",
407
420
  session: {
421
+ type: "realtime",
408
422
  model: this.oaiRealtimeModel._options.model,
409
423
  tools: oaiTools
410
424
  },
@@ -416,6 +430,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
416
430
  this.sendEvent({
417
431
  type: "session.update",
418
432
  session: {
433
+ type: "realtime",
419
434
  instructions: _instructions
420
435
  },
421
436
  event_id: eventId
@@ -423,7 +438,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
423
438
  this.instructions = _instructions;
424
439
  }
425
440
  updateOptions({ toolChoice }) {
426
- const options = {};
441
+ const options = {
442
+ type: "realtime"
443
+ };
427
444
  this.oaiRealtimeModel._options.toolChoice = toolChoice;
428
445
  options.tool_choice = toOaiToolChoice(toolChoice);
429
446
  this.sendEvent({
@@ -522,8 +539,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
522
539
  throw new Error("Microsoft API key or entraToken is required");
523
540
  }
524
541
  } else {
542
+ if (!this.oaiRealtimeModel._options.apiKey) {
543
+ throw new Error(
544
+ "OpenAI API key is required but not set. Check OPENAI_API_KEY environment variable."
545
+ );
546
+ }
525
547
  headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
526
- headers["OpenAI-Beta"] = "realtime=v1";
527
548
  }
528
549
  const url = processBaseURL({
529
550
  baseURL: this.oaiRealtimeModel._options.baseURL,
@@ -690,6 +711,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
690
711
  case "response.output_item.added":
691
712
  this.handleResponseOutputItemAdded(event);
692
713
  break;
714
+ case "conversation.item.added":
693
715
  case "conversation.item.created":
694
716
  this.handleConversationItemCreated(event);
695
717
  break;
@@ -708,21 +730,27 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
708
730
  case "response.content_part.done":
709
731
  this.handleResponseContentPartDone(event);
710
732
  break;
733
+ case "response.output_text.delta":
711
734
  case "response.text.delta":
712
735
  this.handleResponseTextDelta(event);
713
736
  break;
737
+ case "response.output_text.done":
714
738
  case "response.text.done":
715
739
  this.handleResponseTextDone(event);
716
740
  break;
741
+ case "response.output_audio_transcript.delta":
717
742
  case "response.audio_transcript.delta":
718
743
  this.handleResponseAudioTranscriptDelta(event);
719
744
  break;
745
+ case "response.output_audio.delta":
720
746
  case "response.audio.delta":
721
747
  this.handleResponseAudioDelta(event);
722
748
  break;
749
+ case "response.output_audio_transcript.done":
723
750
  case "response.audio_transcript.done":
724
751
  this.handleResponseAudioTranscriptDone(event);
725
752
  break;
753
+ case "response.output_audio.done":
726
754
  case "response.audio.done":
727
755
  this.handleResponseAudioDone(event);
728
756
  break;
@@ -798,7 +826,8 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
798
826
  const generationEv = {
799
827
  messageStream: this.currentGeneration.messageChannel.stream(),
800
828
  functionStream: this.currentGeneration.functionChannel.stream(),
801
- userInitiated: false
829
+ userInitiated: false,
830
+ responseId: event.response.id
802
831
  };
803
832
  const clientEventId = (_a = event.response.metadata) == null ? void 0 : _a.client_event_id;
804
833
  if (clientEventId) {
@@ -918,11 +947,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
918
947
  this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
919
948
  return;
920
949
  }
921
- if (itemType === "text" && this.oaiRealtimeModel.capabilities.audioOutput) {
950
+ const isTextType = itemType === "text" || itemType === "output_text";
951
+ if (isTextType && this.oaiRealtimeModel.capabilities.audioOutput) {
922
952
  this.#logger.warn("Text response received from OpenAI Realtime API in audio modality.");
923
953
  }
924
954
  if (!itemGeneration.modalities.done) {
925
- const modalityResult = itemType === "text" ? ["text"] : ["audio", "text"];
955
+ const modalityResult = isTextType ? ["text"] : ["audio", "text"];
926
956
  itemGeneration.modalities.resolve(modalityResult);
927
957
  }
928
958
  if (this.currentGeneration._firstTokenTimestamp === void 0) {
@@ -930,6 +960,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
930
960
  }
931
961
  }
932
962
  handleResponseContentPartDone(event) {
963
+ if (!event.part) {
964
+ return;
965
+ }
933
966
  if (event.part.type !== "text") {
934
967
  return;
935
968
  }
@@ -1020,11 +1053,13 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
1020
1053
  if (!item.call_id || !item.name || !item.arguments) {
1021
1054
  throw new Error("item is not a function call");
1022
1055
  }
1023
- this.currentGeneration.functionChannel.write({
1024
- callId: item.call_id,
1025
- name: item.name,
1026
- args: item.arguments
1027
- });
1056
+ this.currentGeneration.functionChannel.write(
1057
+ import_agents.llm.FunctionCall.create({
1058
+ callId: item.call_id,
1059
+ name: item.name,
1060
+ args: item.arguments
1061
+ })
1062
+ );
1028
1063
  } else if (itemType === "message") {
1029
1064
  const itemGeneration = this.currentGeneration.messages.get(itemId);
1030
1065
  if (!itemGeneration) {
@@ -1157,7 +1192,8 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
1157
1192
  const generation_ev = {
1158
1193
  messageStream: this.currentGeneration.messageChannel.stream(),
1159
1194
  functionStream: this.currentGeneration.functionChannel.stream(),
1160
- userInitiated: false
1195
+ userInitiated: false,
1196
+ responseId
1161
1197
  };
1162
1198
  const handle = this.responseCreatedFutures[responseId];
1163
1199
  if (handle) {