@livekit/agents-plugin-openai 1.0.31 → 1.0.33

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -33,7 +33,6 @@ class CreateResponseHandle {
33
33
  }
34
34
  }
35
35
  const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
36
- const DEFAULT_TEMPERATURE = 0.8;
37
36
  const DEFAULT_TURN_DETECTION = {
38
37
  type: "semantic_vad",
39
38
  eagerness: "medium",
@@ -59,14 +58,15 @@ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1e3;
59
58
  const DEFAULT_REALTIME_MODEL_OPTIONS = {
60
59
  model: "gpt-realtime",
61
60
  voice: "marin",
62
- temperature: DEFAULT_TEMPERATURE,
63
61
  inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
62
+ inputAudioNoiseReduction: void 0,
64
63
  turnDetection: DEFAULT_TURN_DETECTION,
65
64
  toolChoice: DEFAULT_TOOL_CHOICE,
66
65
  maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
67
66
  maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
68
67
  connOptions: DEFAULT_API_CONNECT_OPTIONS,
69
- modalities: ["text", "audio"]
68
+ modalities: ["text", "audio"],
69
+ tracing: void 0
70
70
  };
71
71
  class RealtimeModel extends llm.RealtimeModel {
72
72
  sampleRate = api_proto.SAMPLE_RATE;
@@ -75,6 +75,9 @@ class RealtimeModel extends llm.RealtimeModel {
75
75
  outFrameSize = api_proto.OUT_FRAME_SIZE;
76
76
  /* @internal */
77
77
  _options;
78
+ get model() {
79
+ return this._options.model;
80
+ }
78
81
  constructor(options = {}) {
79
82
  const modalities = options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities;
80
83
  super({
@@ -127,11 +130,10 @@ class RealtimeModel extends llm.RealtimeModel {
127
130
  * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
128
131
  * @param voice - Voice setting for audio outputs. Defaults to "alloy".
129
132
  * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
133
+ * @param inputAudioNoiseReduction - Options for noise reduction. Defaults to undefined.
130
134
  * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
131
- * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
132
135
  * @param speed - Speed of the audio output. Defaults to 1.0.
133
- * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
134
- * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
136
+ * @param tracing - Tracing configuration. Defaults to undefined.
135
137
  *
136
138
  * @returns A RealtimeModel instance configured for Azure OpenAI Service.
137
139
  *
@@ -145,10 +147,13 @@ class RealtimeModel extends llm.RealtimeModel {
145
147
  entraToken,
146
148
  baseURL,
147
149
  voice = "alloy",
150
+ temperature,
151
+ // eslint-disable-line @typescript-eslint/no-unused-vars
148
152
  inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
153
+ inputAudioNoiseReduction,
149
154
  turnDetection = AZURE_DEFAULT_TURN_DETECTION,
150
- temperature = 0.8,
151
- speed
155
+ speed,
156
+ tracing
152
157
  }) {
153
158
  apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
154
159
  if (!apiKey && !entraToken) {
@@ -174,9 +179,10 @@ class RealtimeModel extends llm.RealtimeModel {
174
179
  return new RealtimeModel({
175
180
  voice,
176
181
  inputAudioTranscription,
182
+ inputAudioNoiseReduction,
177
183
  turnDetection,
178
- temperature,
179
184
  speed,
185
+ tracing,
180
186
  apiKey,
181
187
  azureDeployment,
182
188
  apiVersion,
@@ -253,24 +259,31 @@ class RealtimeSession extends llm.RealtimeSession {
253
259
  this.messageChannel.put(command);
254
260
  }
255
261
  createSessionUpdateEvent() {
256
- const modalities = this.oaiRealtimeModel._options.modalities.includes("audio") ? ["text", "audio"] : ["text"];
262
+ const audioFormat = { type: "audio/pcm", rate: SAMPLE_RATE };
263
+ const modality = this.oaiRealtimeModel._options.modalities.includes("audio") ? "audio" : "text";
257
264
  return {
258
265
  type: "session.update",
259
266
  session: {
267
+ type: "realtime",
260
268
  model: this.oaiRealtimeModel._options.model,
261
- voice: this.oaiRealtimeModel._options.voice,
262
- input_audio_format: "pcm16",
263
- output_audio_format: "pcm16",
264
- modalities,
265
- turn_detection: this.oaiRealtimeModel._options.turnDetection,
266
- input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
267
- // TODO(shubhra): add inputAudioNoiseReduction
268
- temperature: this.oaiRealtimeModel._options.temperature,
269
+ output_modalities: [modality],
270
+ audio: {
271
+ input: {
272
+ format: audioFormat,
273
+ noise_reduction: this.oaiRealtimeModel._options.inputAudioNoiseReduction,
274
+ transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
275
+ turn_detection: this.oaiRealtimeModel._options.turnDetection
276
+ },
277
+ output: {
278
+ format: audioFormat,
279
+ speed: this.oaiRealtimeModel._options.speed,
280
+ voice: this.oaiRealtimeModel._options.voice
281
+ }
282
+ },
283
+ max_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
269
284
  tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
270
- max_response_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
271
- // TODO(shubhra): add tracing options
272
- instructions: this.instructions,
273
- speed: this.oaiRealtimeModel._options.speed
285
+ tracing: this.oaiRealtimeModel._options.tracing,
286
+ instructions: this.instructions
274
287
  }
275
288
  };
276
289
  }
@@ -386,6 +399,7 @@ class RealtimeSession extends llm.RealtimeSession {
386
399
  return {
387
400
  type: "session.update",
388
401
  session: {
402
+ type: "realtime",
389
403
  model: this.oaiRealtimeModel._options.model,
390
404
  tools: oaiTools
391
405
  },
@@ -397,6 +411,7 @@ class RealtimeSession extends llm.RealtimeSession {
397
411
  this.sendEvent({
398
412
  type: "session.update",
399
413
  session: {
414
+ type: "realtime",
400
415
  instructions: _instructions
401
416
  },
402
417
  event_id: eventId
@@ -404,7 +419,9 @@ class RealtimeSession extends llm.RealtimeSession {
404
419
  this.instructions = _instructions;
405
420
  }
406
421
  updateOptions({ toolChoice }) {
407
- const options = {};
422
+ const options = {
423
+ type: "realtime"
424
+ };
408
425
  this.oaiRealtimeModel._options.toolChoice = toolChoice;
409
426
  options.tool_choice = toOaiToolChoice(toolChoice);
410
427
  this.sendEvent({
@@ -503,8 +520,12 @@ class RealtimeSession extends llm.RealtimeSession {
503
520
  throw new Error("Microsoft API key or entraToken is required");
504
521
  }
505
522
  } else {
523
+ if (!this.oaiRealtimeModel._options.apiKey) {
524
+ throw new Error(
525
+ "OpenAI API key is required but not set. Check OPENAI_API_KEY environment variable."
526
+ );
527
+ }
506
528
  headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
507
- headers["OpenAI-Beta"] = "realtime=v1";
508
529
  }
509
530
  const url = processBaseURL({
510
531
  baseURL: this.oaiRealtimeModel._options.baseURL,
@@ -671,6 +692,7 @@ class RealtimeSession extends llm.RealtimeSession {
671
692
  case "response.output_item.added":
672
693
  this.handleResponseOutputItemAdded(event);
673
694
  break;
695
+ case "conversation.item.added":
674
696
  case "conversation.item.created":
675
697
  this.handleConversationItemCreated(event);
676
698
  break;
@@ -689,21 +711,27 @@ class RealtimeSession extends llm.RealtimeSession {
689
711
  case "response.content_part.done":
690
712
  this.handleResponseContentPartDone(event);
691
713
  break;
714
+ case "response.output_text.delta":
692
715
  case "response.text.delta":
693
716
  this.handleResponseTextDelta(event);
694
717
  break;
718
+ case "response.output_text.done":
695
719
  case "response.text.done":
696
720
  this.handleResponseTextDone(event);
697
721
  break;
722
+ case "response.output_audio_transcript.delta":
698
723
  case "response.audio_transcript.delta":
699
724
  this.handleResponseAudioTranscriptDelta(event);
700
725
  break;
726
+ case "response.output_audio.delta":
701
727
  case "response.audio.delta":
702
728
  this.handleResponseAudioDelta(event);
703
729
  break;
730
+ case "response.output_audio_transcript.done":
704
731
  case "response.audio_transcript.done":
705
732
  this.handleResponseAudioTranscriptDone(event);
706
733
  break;
734
+ case "response.output_audio.done":
707
735
  case "response.audio.done":
708
736
  this.handleResponseAudioDone(event);
709
737
  break;
@@ -779,7 +807,8 @@ class RealtimeSession extends llm.RealtimeSession {
779
807
  const generationEv = {
780
808
  messageStream: this.currentGeneration.messageChannel.stream(),
781
809
  functionStream: this.currentGeneration.functionChannel.stream(),
782
- userInitiated: false
810
+ userInitiated: false,
811
+ responseId: event.response.id
783
812
  };
784
813
  const clientEventId = (_a = event.response.metadata) == null ? void 0 : _a.client_event_id;
785
814
  if (clientEventId) {
@@ -899,11 +928,12 @@ class RealtimeSession extends llm.RealtimeSession {
899
928
  this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
900
929
  return;
901
930
  }
902
- if (itemType === "text" && this.oaiRealtimeModel.capabilities.audioOutput) {
931
+ const isTextType = itemType === "text" || itemType === "output_text";
932
+ if (isTextType && this.oaiRealtimeModel.capabilities.audioOutput) {
903
933
  this.#logger.warn("Text response received from OpenAI Realtime API in audio modality.");
904
934
  }
905
935
  if (!itemGeneration.modalities.done) {
906
- const modalityResult = itemType === "text" ? ["text"] : ["audio", "text"];
936
+ const modalityResult = isTextType ? ["text"] : ["audio", "text"];
907
937
  itemGeneration.modalities.resolve(modalityResult);
908
938
  }
909
939
  if (this.currentGeneration._firstTokenTimestamp === void 0) {
@@ -911,6 +941,9 @@ class RealtimeSession extends llm.RealtimeSession {
911
941
  }
912
942
  }
913
943
  handleResponseContentPartDone(event) {
944
+ if (!event.part) {
945
+ return;
946
+ }
914
947
  if (event.part.type !== "text") {
915
948
  return;
916
949
  }
@@ -1001,11 +1034,13 @@ class RealtimeSession extends llm.RealtimeSession {
1001
1034
  if (!item.call_id || !item.name || !item.arguments) {
1002
1035
  throw new Error("item is not a function call");
1003
1036
  }
1004
- this.currentGeneration.functionChannel.write({
1005
- callId: item.call_id,
1006
- name: item.name,
1007
- args: item.arguments
1008
- });
1037
+ this.currentGeneration.functionChannel.write(
1038
+ llm.FunctionCall.create({
1039
+ callId: item.call_id,
1040
+ name: item.name,
1041
+ args: item.arguments
1042
+ })
1043
+ );
1009
1044
  } else if (itemType === "message") {
1010
1045
  const itemGeneration = this.currentGeneration.messages.get(itemId);
1011
1046
  if (!itemGeneration) {
@@ -1138,7 +1173,8 @@ class RealtimeSession extends llm.RealtimeSession {
1138
1173
  const generation_ev = {
1139
1174
  messageStream: this.currentGeneration.messageChannel.stream(),
1140
1175
  functionStream: this.currentGeneration.functionChannel.stream(),
1141
- userInitiated: false
1176
+ userInitiated: false,
1177
+ responseId
1142
1178
  };
1143
1179
  const handle = this.responseCreatedFutures[responseId];
1144
1180
  if (handle) {