npm - @livekit/agents-plugin-openai - Versions diffs - 1.0.31 → 1.0.33 - Mend

@livekit/agents-plugin-openai 1.0.31 → 1.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

package/dist/realtime/api_proto.cjs.map +1 -1
package/dist/realtime/api_proto.d.cts +50 -12
package/dist/realtime/api_proto.d.ts +50 -12
package/dist/realtime/api_proto.d.ts.map +1 -1
package/dist/realtime/api_proto.js.map +1 -1
package/dist/realtime/index.cjs +19 -0
package/dist/realtime/index.cjs.map +1 -1
package/dist/realtime/index.d.cts +1 -0
package/dist/realtime/index.d.ts +1 -0
package/dist/realtime/index.d.ts.map +1 -1
package/dist/realtime/index.js +4 -0
package/dist/realtime/index.js.map +1 -1
package/dist/realtime/realtime_model.cjs +69 -33
package/dist/realtime/realtime_model.cjs.map +1 -1
package/dist/realtime/realtime_model.d.cts +14 -6
package/dist/realtime/realtime_model.d.ts +14 -6
package/dist/realtime/realtime_model.d.ts.map +1 -1
package/dist/realtime/realtime_model.js +69 -33
package/dist/realtime/realtime_model.js.map +1 -1
package/dist/realtime/realtime_model_beta.cjs +1300 -0
package/dist/realtime/realtime_model_beta.cjs.map +1 -0
package/dist/realtime/realtime_model_beta.d.cts +165 -0
package/dist/realtime/realtime_model_beta.d.ts +165 -0
package/dist/realtime/realtime_model_beta.d.ts.map +1 -0
package/dist/realtime/realtime_model_beta.js +1280 -0
package/dist/realtime/realtime_model_beta.js.map +1 -0
package/package.json +5 -5
package/src/realtime/api_proto.ts +76 -17
package/src/realtime/index.ts +1 -0
package/src/realtime/realtime_model.ts +86 -49
package/src/realtime/realtime_model_beta.ts +1665 -0

package/dist/realtime/realtime_model.js (changed)

@@ -33,7 +33,6 @@ class CreateResponseHandle {
   }
 }
 const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
-const DEFAULT_TEMPERATURE = 0.8;
 const DEFAULT_TURN_DETECTION = {
   type: "semantic_vad",
   eagerness: "medium",
@@ -59,14 +58,15 @@ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1e3;
 const DEFAULT_REALTIME_MODEL_OPTIONS = {
   model: "gpt-realtime",
   voice: "marin",
-  temperature: DEFAULT_TEMPERATURE,
   inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+  inputAudioNoiseReduction: void 0,
   turnDetection: DEFAULT_TURN_DETECTION,
   toolChoice: DEFAULT_TOOL_CHOICE,
   maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
   maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
   connOptions: DEFAULT_API_CONNECT_OPTIONS,
-  modalities: ["text", "audio"]
+  modalities: ["text", "audio"],
+  tracing: void 0
 };
 class RealtimeModel extends llm.RealtimeModel {
   sampleRate = api_proto.SAMPLE_RATE;
@@ -75,6 +75,9 @@ class RealtimeModel extends llm.RealtimeModel {
   outFrameSize = api_proto.OUT_FRAME_SIZE;
   /* @internal */
   _options;
+  get model() {
+    return this._options.model;
+  }
   constructor(options = {}) {
     const modalities = options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities;
     super({
@@ -127,11 +130,10 @@ class RealtimeModel extends llm.RealtimeModel {
    * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
    * @param voice - Voice setting for audio outputs. Defaults to "alloy".
    * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
+   * @param inputAudioNoiseReduction - Options for noise reduction. Defaults to undefined.
    * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
-   * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
    * @param speed - Speed of the audio output. Defaults to 1.0.
-   * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
-   * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
+   * @param tracing - Tracing configuration. Defaults to undefined.
    *
    * @returns A RealtimeModel instance configured for Azure OpenAI Service.
    *
@@ -145,10 +147,13 @@ class RealtimeModel extends llm.RealtimeModel {
     entraToken,
     baseURL,
     voice = "alloy",
+    temperature,
+    // eslint-disable-line @typescript-eslint/no-unused-vars
     inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+    inputAudioNoiseReduction,
     turnDetection = AZURE_DEFAULT_TURN_DETECTION,
-    temperature = 0.8,
-    speed
+    speed,
+    tracing
   }) {
     apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
     if (!apiKey && !entraToken) {
@@ -174,9 +179,10 @@ class RealtimeModel extends llm.RealtimeModel {
     return new RealtimeModel({
       voice,
       inputAudioTranscription,
+      inputAudioNoiseReduction,
       turnDetection,
-      temperature,
       speed,
+      tracing,
       apiKey,
       azureDeployment,
       apiVersion,
@@ -253,24 +259,31 @@ class RealtimeSession extends llm.RealtimeSession {
     this.messageChannel.put(command);
   }
   createSessionUpdateEvent() {
-    const modalities = this.oaiRealtimeModel._options.modalities.includes("audio") ? ["text", "audio"] : ["text"];
+    const audioFormat = { type: "audio/pcm", rate: SAMPLE_RATE };
+    const modality = this.oaiRealtimeModel._options.modalities.includes("audio") ? "audio" : "text";
     return {
       type: "session.update",
       session: {
+        type: "realtime",
         model: this.oaiRealtimeModel._options.model,
-        voice: this.oaiRealtimeModel._options.voice,
-        input_audio_format: "pcm16",
-        output_audio_format: "pcm16",
-        modalities,
-        turn_detection: this.oaiRealtimeModel._options.turnDetection,
-        input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
-        // TODO(shubhra): add inputAudioNoiseReduction
-        temperature: this.oaiRealtimeModel._options.temperature,
+        output_modalities: [modality],
+        audio: {
+          input: {
+            format: audioFormat,
+            noise_reduction: this.oaiRealtimeModel._options.inputAudioNoiseReduction,
+            transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
+            turn_detection: this.oaiRealtimeModel._options.turnDetection
+          },
+          output: {
+            format: audioFormat,
+            speed: this.oaiRealtimeModel._options.speed,
+            voice: this.oaiRealtimeModel._options.voice
+          }
+        },
+        max_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
         tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
-        max_response_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
-        // TODO(shubhra): add tracing options
-        instructions: this.instructions,
-        speed: this.oaiRealtimeModel._options.speed
+        tracing: this.oaiRealtimeModel._options.tracing,
+        instructions: this.instructions
       }
     };
   }
@@ -386,6 +399,7 @@ class RealtimeSession extends llm.RealtimeSession {
     return {
       type: "session.update",
       session: {
+        type: "realtime",
         model: this.oaiRealtimeModel._options.model,
         tools: oaiTools
       },
@@ -397,6 +411,7 @@ class RealtimeSession extends llm.RealtimeSession {
     this.sendEvent({
       type: "session.update",
       session: {
+        type: "realtime",
         instructions: _instructions
       },
       event_id: eventId
@@ -404,7 +419,9 @@ class RealtimeSession extends llm.RealtimeSession {
     this.instructions = _instructions;
   }
   updateOptions({ toolChoice }) {
-    const options = {};
+    const options = {
+      type: "realtime"
+    };
     this.oaiRealtimeModel._options.toolChoice = toolChoice;
     options.tool_choice = toOaiToolChoice(toolChoice);
     this.sendEvent({
@@ -503,8 +520,12 @@ class RealtimeSession extends llm.RealtimeSession {
         throw new Error("Microsoft API key or entraToken is required");
       }
     } else {
+      if (!this.oaiRealtimeModel._options.apiKey) {
+        throw new Error(
+          "OpenAI API key is required but not set. Check OPENAI_API_KEY environment variable."
+        );
+      }
       headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
-      headers["OpenAI-Beta"] = "realtime=v1";
     }
     const url = processBaseURL({
       baseURL: this.oaiRealtimeModel._options.baseURL,
@@ -671,6 +692,7 @@ class RealtimeSession extends llm.RealtimeSession {
         case "response.output_item.added":
           this.handleResponseOutputItemAdded(event);
           break;
+        case "conversation.item.added":
         case "conversation.item.created":
           this.handleConversationItemCreated(event);
           break;
@@ -689,21 +711,27 @@ class RealtimeSession extends llm.RealtimeSession {
         case "response.content_part.done":
           this.handleResponseContentPartDone(event);
           break;
+        case "response.output_text.delta":
         case "response.text.delta":
           this.handleResponseTextDelta(event);
           break;
+        case "response.output_text.done":
         case "response.text.done":
           this.handleResponseTextDone(event);
           break;
+        case "response.output_audio_transcript.delta":
         case "response.audio_transcript.delta":
           this.handleResponseAudioTranscriptDelta(event);
           break;
+        case "response.output_audio.delta":
         case "response.audio.delta":
           this.handleResponseAudioDelta(event);
           break;
+        case "response.output_audio_transcript.done":
         case "response.audio_transcript.done":
           this.handleResponseAudioTranscriptDone(event);
           break;
+        case "response.output_audio.done":
         case "response.audio.done":
           this.handleResponseAudioDone(event);
           break;
@@ -779,7 +807,8 @@ class RealtimeSession extends llm.RealtimeSession {
     const generationEv = {
       messageStream: this.currentGeneration.messageChannel.stream(),
       functionStream: this.currentGeneration.functionChannel.stream(),
-      userInitiated: false
+      userInitiated: false,
+      responseId: event.response.id
     };
     const clientEventId = (_a = event.response.metadata) == null ? void 0 : _a.client_event_id;
     if (clientEventId) {
@@ -899,11 +928,12 @@ class RealtimeSession extends llm.RealtimeSession {
       this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
       return;
     }
-    if (itemType === "text" && this.oaiRealtimeModel.capabilities.audioOutput) {
+    const isTextType = itemType === "text" || itemType === "output_text";
+    if (isTextType && this.oaiRealtimeModel.capabilities.audioOutput) {
       this.#logger.warn("Text response received from OpenAI Realtime API in audio modality.");
     }
     if (!itemGeneration.modalities.done) {
-      const modalityResult = itemType === "text" ? ["text"] : ["audio", "text"];
+      const modalityResult = isTextType ? ["text"] : ["audio", "text"];
       itemGeneration.modalities.resolve(modalityResult);
     }
     if (this.currentGeneration._firstTokenTimestamp === void 0) {
@@ -911,6 +941,9 @@ class RealtimeSession extends llm.RealtimeSession {
     }
   }
   handleResponseContentPartDone(event) {
+    if (!event.part) {
+      return;
+    }
     if (event.part.type !== "text") {
       return;
     }
@@ -1001,11 +1034,13 @@ class RealtimeSession extends llm.RealtimeSession {
       if (!item.call_id || !item.name || !item.arguments) {
         throw new Error("item is not a function call");
       }
-      this.currentGeneration.functionChannel.write({
-        callId: item.call_id,
-        name: item.name,
-        args: item.arguments
-      });
+      this.currentGeneration.functionChannel.write(
+        llm.FunctionCall.create({
+          callId: item.call_id,
+          name: item.name,
+          args: item.arguments
+        })
+      );
     } else if (itemType === "message") {
       const itemGeneration = this.currentGeneration.messages.get(itemId);
       if (!itemGeneration) {
@@ -1138,7 +1173,8 @@ class RealtimeSession extends llm.RealtimeSession {
     const generation_ev = {
       messageStream: this.currentGeneration.messageChannel.stream(),
       functionStream: this.currentGeneration.functionChannel.stream(),
-      userInitiated: false
+      userInitiated: false,
+      responseId
     };
     const handle = this.responseCreatedFutures[responseId];
     if (handle) {