@livekit/agents-plugin-openai 1.0.30 → 1.0.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,14 +39,13 @@ type Modality = 'text' | 'audio';
 interface RealtimeOptions {
   model: api_proto.Model;
   voice: api_proto.Voice;
-  temperature: number;
   toolChoice?: llm.ToolChoice;
   inputAudioTranscription?: api_proto.InputAudioTranscription | null;
-  // TODO(shubhra): add inputAudioNoiseReduction
+  inputAudioNoiseReduction?: api_proto.NoiseReduction | null;
   turnDetection?: api_proto.TurnDetectionType | null;
   maxResponseOutputTokens?: number | 'inf';
   speed?: number;
-  // TODO(shubhra): add openai tracing options
+  tracing?: api_proto.TracingConfig | null;
   apiKey?: string;
   baseURL: string;
   isAzure: boolean;
@@ -90,9 +89,7 @@ class CreateResponseHandle {
   }
 }
 
-// default values got from a "default" session from their API
 const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
-const DEFAULT_TEMPERATURE = 0.8;
 const DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
   type: 'semantic_vad',
   eagerness: 'medium',
@@ -122,14 +119,15 @@ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1000; // 20 minutes
 const DEFAULT_REALTIME_MODEL_OPTIONS = {
   model: 'gpt-realtime',
   voice: 'marin',
-  temperature: DEFAULT_TEMPERATURE,
   inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+  inputAudioNoiseReduction: undefined as api_proto.NoiseReduction | undefined,
   turnDetection: DEFAULT_TURN_DETECTION,
   toolChoice: DEFAULT_TOOL_CHOICE,
   maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
   maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
   connOptions: DEFAULT_API_CONNECT_OPTIONS,
   modalities: ['text', 'audio'] as Modality[],
+  tracing: undefined as api_proto.TracingConfig | undefined,
 };
 export class RealtimeModel extends llm.RealtimeModel {
   sampleRate = api_proto.SAMPLE_RATE;
@@ -140,19 +138,24 @@ export class RealtimeModel extends llm.RealtimeModel {
   /* @internal */
   _options: RealtimeOptions;
 
+  get model(): string {
+    return this._options.model;
+  }
+
   constructor(
     options: {
      model?: string;
      voice?: string;
+      /** @deprecated Unused in GA API (v1). Temperature is no longer supported. */
      temperature?: number;
      toolChoice?: llm.ToolChoice;
      baseURL?: string;
      modalities?: Modality[];
      inputAudioTranscription?: api_proto.InputAudioTranscription | null;
-      // TODO(shubhra): add inputAudioNoiseReduction
+      inputAudioNoiseReduction?: api_proto.NoiseReduction | null;
      turnDetection?: api_proto.TurnDetectionType | null;
      speed?: number;
-      // TODO(shubhra): add openai tracing options
+      tracing?: api_proto.TracingConfig | null;
      azureDeployment?: string;
      apiKey?: string;
      entraToken?: string;
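
With this change the RealtimeModel constructor accepts inputAudioNoiseReduction and tracing directly, and temperature becomes a deprecated no-op. A minimal usage sketch (the option names come from this diff; the import style and the literal values 'near_field' and 'auto' are assumptions based on LiveKit Agents conventions and the OpenAI GA Realtime API, not confirmed by this diff):

import * as openai from '@livekit/agents-plugin-openai';

// Hypothetical construction; the two option values below are assumed shapes.
const model = new openai.realtime.RealtimeModel({
  voice: 'marin',
  inputAudioNoiseReduction: { type: 'near_field' }, // assumed NoiseReduction shape
  tracing: 'auto', // assumed TracingConfig value
});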
@@ -221,11 +224,10 @@ export class RealtimeModel extends llm.RealtimeModel {
    * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
    * @param voice - Voice setting for audio outputs. Defaults to "alloy".
    * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
+   * @param inputAudioNoiseReduction - Options for noise reduction. Defaults to undefined.
    * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
-   * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
    * @param speed - Speed of the audio output. Defaults to 1.0.
-   * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
-   * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
+   * @param tracing - Tracing configuration. Defaults to undefined.
    *
    * @returns A RealtimeModel instance configured for Azure OpenAI Service.
    *
@@ -239,10 +241,12 @@ export class RealtimeModel extends llm.RealtimeModel {
     entraToken,
     baseURL,
     voice = 'alloy',
+    temperature, // eslint-disable-line @typescript-eslint/no-unused-vars
     inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+    inputAudioNoiseReduction,
     turnDetection = AZURE_DEFAULT_TURN_DETECTION,
-    temperature = 0.8,
     speed,
+    tracing,
   }: {
     azureDeployment: string;
     azureEndpoint?: string;
@@ -251,11 +255,13 @@ export class RealtimeModel extends llm.RealtimeModel {
     entraToken?: string;
     baseURL?: string;
     voice?: string;
+    /** @deprecated Unused in GA API (v1). Temperature is no longer supported. */
+    temperature?: number;
     inputAudioTranscription?: api_proto.InputAudioTranscription;
-    // TODO(shubhra): add inputAudioNoiseReduction
+    inputAudioNoiseReduction?: api_proto.NoiseReduction;
     turnDetection?: api_proto.TurnDetectionType;
-    temperature?: number;
     speed?: number;
+    tracing?: api_proto.TracingConfig;
   }) {
     apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
     if (!apiKey && !entraToken) {
@@ -284,9 +290,10 @@ export class RealtimeModel extends llm.RealtimeModel {
     return new RealtimeModel({
       voice,
       inputAudioTranscription,
+      inputAudioNoiseReduction,
       turnDetection,
-      temperature,
       speed,
+      tracing,
       apiKey,
       azureDeployment,
       apiVersion,
@@ -401,32 +408,38 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
-    // OpenAI supports ['text'] or ['text', 'audio'] (audio always includes text transcript)
-    // We normalize to ensure 'text' is always present when using audio
-    const modalities: Modality[] = this.oaiRealtimeModel._options.modalities.includes('audio')
-      ? ['text', 'audio']
-      : ['text'];
+    const audioFormat: api_proto.AudioFormat = { type: 'audio/pcm', rate: SAMPLE_RATE };
+
+    const modality: Modality = this.oaiRealtimeModel._options.modalities.includes('audio')
+      ? 'audio'
+      : 'text';
 
     return {
       type: 'session.update',
       session: {
+        type: 'realtime',
        model: this.oaiRealtimeModel._options.model,
-        voice: this.oaiRealtimeModel._options.voice,
-        input_audio_format: 'pcm16',
-        output_audio_format: 'pcm16',
-        modalities: modalities,
-        turn_detection: this.oaiRealtimeModel._options.turnDetection,
-        input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
-        // TODO(shubhra): add inputAudioNoiseReduction
-        temperature: this.oaiRealtimeModel._options.temperature,
-        tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
-        max_response_output_tokens:
+        output_modalities: [modality],
+        audio: {
+          input: {
+            format: audioFormat,
+            noise_reduction: this.oaiRealtimeModel._options.inputAudioNoiseReduction,
+            transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
+            turn_detection: this.oaiRealtimeModel._options.turnDetection,
+          },
+          output: {
+            format: audioFormat,
+            speed: this.oaiRealtimeModel._options.speed,
+            voice: this.oaiRealtimeModel._options.voice,
+          },
+        },
+        max_output_tokens:
          this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity
            ? 'inf'
            : this.oaiRealtimeModel._options.maxResponseOutputTokens,
-        // TODO(shubhra): add tracing options
+        tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
+        tracing: this.oaiRealtimeModel._options.tracing,
        instructions: this.instructions,
-        speed: this.oaiRealtimeModel._options.speed,
      },
    };
  }
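
The hunk above migrates createSessionUpdateEvent() from the beta API's flat session fields (voice, input_audio_format, modalities, temperature, ...) to the GA API's nested audio layout. Roughly, the event it now emits looks like the sketch below, shown with the package defaults filled in (the 24000 sample rate for SAMPLE_RATE and the 'auto' tool choice are assumptions; instructions and transcription options are elided):

// Sketch of the GA-shape session.update payload; values assumed from defaults.
const sessionUpdate = {
  type: 'session.update',
  session: {
    type: 'realtime',
    model: 'gpt-realtime',
    output_modalities: ['audio'],
    audio: {
      input: {
        format: { type: 'audio/pcm', rate: 24000 },
        turn_detection: { type: 'semantic_vad', eagerness: 'medium' },
      },
      output: {
        format: { type: 'audio/pcm', rate: 24000 },
        voice: 'marin',
      },
    },
    max_output_tokens: 'inf',
    tool_choice: 'auto',
  },
};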
@@ -574,6 +587,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     return {
       type: 'session.update',
       session: {
+        type: 'realtime',
         model: this.oaiRealtimeModel._options.model,
         tools: oaiTools,
       },
@@ -586,6 +600,7 @@ export class RealtimeSession extends llm.RealtimeSession {
     this.sendEvent({
       type: 'session.update',
       session: {
+        type: 'realtime',
         instructions: _instructions,
       },
       event_id: eventId,
@@ -594,7 +609,9 @@ export class RealtimeSession extends llm.RealtimeSession {
   }
 
   updateOptions({ toolChoice }: { toolChoice?: llm.ToolChoice }): void {
-    const options: api_proto.SessionUpdateEvent['session'] = {};
+    const options: api_proto.SessionUpdateEvent['session'] = {
+      type: 'realtime',
+    };
 
     this.oaiRealtimeModel._options.toolChoice = toolChoice;
     options.tool_choice = toOaiToolChoice(toolChoice);
@@ -724,8 +741,12 @@ export class RealtimeSession extends llm.RealtimeSession {
        throw new Error('Microsoft API key or entraToken is required');
      }
    } else {
+      if (!this.oaiRealtimeModel._options.apiKey) {
+        throw new Error(
+          'OpenAI API key is required but not set. Check OPENAI_API_KEY environment variable.',
+        );
+      }
      headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
-      headers['OpenAI-Beta'] = 'realtime=v1';
    }
 
    const url = processBaseURL({
@@ -912,7 +933,8 @@ export class RealtimeSession extends llm.RealtimeSession {
    };
 
    wsConn.onmessage = (message: MessageEvent) => {
-      const event: api_proto.ServerEvent = JSON.parse(message.data as string);
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const event: any = JSON.parse(message.data as string);
 
      this.emit('openai_server_event_received', event);
      if (lkOaiDebug) {
@@ -932,7 +954,8 @@ export class RealtimeSession extends llm.RealtimeSession {
        case 'response.output_item.added':
          this.handleResponseOutputItemAdded(event);
          break;
-        case 'conversation.item.created':
+        case 'conversation.item.added':
+        case 'conversation.item.created': // Beta: kept for backward compatibility
          this.handleConversationItemCreated(event);
          break;
        case 'conversation.item.deleted':
@@ -950,22 +973,28 @@ export class RealtimeSession extends llm.RealtimeSession {
        case 'response.content_part.done':
          this.handleResponseContentPartDone(event);
          break;
-        case 'response.text.delta':
+        case 'response.output_text.delta':
+        case 'response.text.delta': // Beta: kept for backward compatibility
          this.handleResponseTextDelta(event);
          break;
-        case 'response.text.done':
+        case 'response.output_text.done':
+        case 'response.text.done': // Beta: kept for backward compatibility
          this.handleResponseTextDone(event);
          break;
-        case 'response.audio_transcript.delta':
+        case 'response.output_audio_transcript.delta':
+        case 'response.audio_transcript.delta': // Beta: kept for backward compatibility
          this.handleResponseAudioTranscriptDelta(event);
          break;
-        case 'response.audio.delta':
+        case 'response.output_audio.delta':
+        case 'response.audio.delta': // Beta: kept for backward compatibility
          this.handleResponseAudioDelta(event);
          break;
-        case 'response.audio_transcript.done':
+        case 'response.output_audio_transcript.done':
+        case 'response.audio_transcript.done': // Beta: kept for backward compatibility
          this.handleResponseAudioTranscriptDone(event);
          break;
-        case 'response.audio.done':
+        case 'response.output_audio.done':
+        case 'response.audio.done': // Beta: kept for backward compatibility
          this.handleResponseAudioDone(event);
          break;
        case 'response.output_item.done':
@@ -1059,6 +1088,7 @@ export class RealtimeSession extends llm.RealtimeSession {
      messageStream: this.currentGeneration.messageChannel.stream(),
      functionStream: this.currentGeneration.functionChannel.stream(),
      userInitiated: false,
+      responseId: event.response.id,
    } as llm.GenerationCreatedEvent;
 
    const clientEventId = event.response.metadata?.client_event_id;
@@ -1210,12 +1240,13 @@ export class RealtimeSession extends llm.RealtimeSession {
      return;
    }
 
-    if (itemType === 'text' && this.oaiRealtimeModel.capabilities.audioOutput) {
+    const isTextType = itemType === 'text' || itemType === 'output_text';
+    if (isTextType && this.oaiRealtimeModel.capabilities.audioOutput) {
      this.#logger.warn('Text response received from OpenAI Realtime API in audio modality.');
    }
 
    if (!itemGeneration.modalities.done) {
-      const modalityResult: Modality[] = itemType === 'text' ? ['text'] : ['audio', 'text'];
+      const modalityResult: Modality[] = isTextType ? ['text'] : ['audio', 'text'];
      itemGeneration.modalities.resolve(modalityResult);
    }
 
@@ -1225,6 +1256,9 @@ export class RealtimeSession extends llm.RealtimeSession {
  }
 
  private handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
+    if (!event.part) {
+      return;
+    }
    if (event.part.type !== 'text') {
      return;
    }
@@ -1346,11 +1380,13 @@ export class RealtimeSession extends llm.RealtimeSession {
      if (!item.call_id || !item.name || !item.arguments) {
        throw new Error('item is not a function call');
      }
-      this.currentGeneration.functionChannel.write({
-        callId: item.call_id,
-        name: item.name,
-        args: item.arguments,
-      } as llm.FunctionCall);
+      this.currentGeneration.functionChannel.write(
+        llm.FunctionCall.create({
+          callId: item.call_id,
+          name: item.name,
+          args: item.arguments,
+        }),
+      );
    } else if (itemType === 'message') {
      const itemGeneration = this.currentGeneration.messages.get(itemId);
      if (!itemGeneration) {
@@ -1518,6 +1554,7 @@ export class RealtimeSession extends llm.RealtimeSession {
      messageStream: this.currentGeneration.messageChannel.stream(),
      functionStream: this.currentGeneration.functionChannel.stream(),
      userInitiated: false,
+      responseId,
    } as llm.GenerationCreatedEvent;
 
    const handle = this.responseCreatedFutures[responseId];