@livekit/agents-plugin-openai 1.0.17 → 1.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@livekit/agents-plugin-openai",
- "version": "1.0.17",
+ "version": "1.0.19",
  "description": "OpenAI plugin for LiveKit Node Agents",
  "main": "dist/index.js",
  "require": "dist/index.cjs",
@@ -30,18 +30,18 @@
  "@types/ws": "^8.5.10",
  "tsup": "^8.3.5",
  "typescript": "^5.0.0",
- "@livekit/agents": "1.0.17",
- "@livekit/agents-plugin-silero": "1.0.17",
- "@livekit/agents-plugins-test": "1.0.17"
+ "@livekit/agents": "1.0.19",
+ "@livekit/agents-plugin-silero": "1.0.19",
+ "@livekit/agents-plugins-test": "1.0.19"
  },
  "dependencies": {
  "@livekit/mutex": "^1.1.1",
- "openai": "^4.91.1",
- "ws": "^8.16.0"
+ "openai": "^6.8.1",
+ "ws": "^8.18.0"
  },
  "peerDependencies": {
  "@livekit/rtc-node": "^0.13.12",
- "@livekit/agents": "1.0.17"
+ "@livekit/agents": "1.0.19"
  },
  "scripts": {
  "build": "tsup --onSuccess \"pnpm build:types\"",
package/src/llm.test.ts CHANGED
@@ -1,7 +1,7 @@
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
  //
  // SPDX-License-Identifier: Apache-2.0
- import { llm } from '@livekit/agents-plugins-test';
+ import { llm, llmStrict } from '@livekit/agents-plugins-test';
  import { describe } from 'vitest';
  import { LLM } from './llm.js';

@@ -10,5 +10,15 @@ describe('OpenAI', async () => {
  new LLM({
  temperature: 0,
  }),
+ false,
+ );
+ });
+
+ describe('OpenAI strict tool schema', async () => {
+ await llmStrict(
+ new LLM({
+ temperature: 0,
+ strictToolSchema: true,
+ }),
  );
  });
package/src/llm.ts CHANGED
@@ -30,17 +30,20 @@ export interface LLMOptions {
  maxCompletionTokens?: number;
  serviceTier?: string;
  store?: boolean;
+ strictToolSchema?: boolean;
  }

  const defaultLLMOptions: LLMOptions = {
  model: 'gpt-4.1',
  apiKey: process.env.OPENAI_API_KEY,
  parallelToolCalls: true,
+ strictToolSchema: false,
  };

  const defaultAzureLLMOptions: LLMOptions = {
  model: 'gpt-4.1',
  apiKey: process.env.AZURE_API_KEY,
+ strictToolSchema: false,
  };

  export class LLM extends llm.LLM {
@@ -445,9 +448,9 @@ export class LLM extends llm.LLM {
  connOptions?: APIConnectOptions;
  parallelToolCalls?: boolean;
  toolChoice?: llm.ToolChoice;
- extraKwargs?: Record<string, any>;
+ extraKwargs?: Record<string, unknown>;
  }): LLMStream {
- const extras: Record<string, any> = { ...extraKwargs }; // eslint-disable-line @typescript-eslint/no-explicit-any
+ const extras: Record<string, unknown> = { ...extraKwargs };

  if (this.#opts.metadata) {
  extras.metadata = this.#opts.metadata;
@@ -492,6 +495,7 @@ export class LLM extends llm.LLM {
  toolCtx,
  connOptions,
  modelOptions: extras,
+ strictToolSchema: this.#opts.strictToolSchema || false,
  gatewayOptions: undefined, // OpenAI plugin doesn't use gateway authentication
  });
  }
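
The new `strictToolSchema` option is opt-in and defaults to `false`, so existing callers keep the pre-1.0.19 behavior. A minimal usage sketch (assuming the plugin's top-level `LLM` export; the model and temperature values are illustrative):

```ts
import { LLM } from '@livekit/agents-plugin-openai';

// Opt in to strict JSON-schema validation for tool definitions.
// When omitted, strictToolSchema falls back to its default of false.
const model = new LLM({
  model: 'gpt-4.1',
  temperature: 0,
  strictToolSchema: true,
});
```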
package/src/realtime/api_proto.ts CHANGED
@@ -190,7 +190,7 @@ export interface SessionResource {
  id: string;
  object: 'realtime.session';
  model: string;
- modalities: ['text', 'audio'] | ['text']; // default: ["text", "audio"]
+ modalities: Modality[]; // default: ["text", "audio"]
  instructions: string;
  voice: Voice; // default: "alloy"
  input_audio_format: AudioFormat; // default: "pcm16"
@@ -267,7 +267,7 @@ export interface SessionUpdateEvent extends BaseClientEvent {
  type: 'session.update';
  session: Partial<{
  model: Model;
- modalities: ['text', 'audio'] | ['text'];
+ modalities: Modality[];
  instructions: string;
  voice: Voice;
  input_audio_format: AudioFormat;
@@ -350,7 +350,7 @@ export interface ConversationItemDeleteEvent extends BaseClientEvent {
  export interface ResponseCreateEvent extends BaseClientEvent {
  type: 'response.create';
  response?: Partial<{
- modalities: ['text', 'audio'] | ['text'];
+ modalities: Modality[];
  instructions: string;
  voice: Voice;
  output_audio_format: AudioFormat;
@@ -511,6 +511,7 @@ export interface ResponseContentPartDoneEvent extends BaseServerEvent {
  export interface ResponseTextDeltaEvent extends BaseServerEvent {
  type: 'response.text.delta';
  response_id: string;
+ item_id: string;
  output_index: number;
  content_index: number;
  delta: string;
@@ -519,6 +520,7 @@ export interface ResponseTextDeltaEvent extends BaseServerEvent {
  export interface ResponseTextDoneEvent extends BaseServerEvent {
  type: 'response.text.done';
  response_id: string;
+ item_id: string;
  output_index: number;
  content_index: number;
  text: string;
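
With `modalities` widened from the two literal tuples to `Modality[]`, a client can now request text-only output; the new `item_id` on `response.text.delta`/`response.text.done` lets the session route text deltas to the right message. A sketch of a text-only `session.update` payload (import path and field values are assumptions for illustration):

```ts
import type * as api_proto from './realtime/api_proto.js'; // path assumed

// Text-only session: the server then streams response.text.delta events
// (which now carry item_id) instead of audio deltas.
const update: api_proto.SessionUpdateEvent = {
  type: 'session.update',
  session: {
    modalities: ['text'],
    instructions: 'Reply in plain text only.',
  },
};
```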
package/src/realtime/realtime_model.ts CHANGED
@@ -34,6 +34,8 @@ const BASE_URL = 'https://api.openai.com/v1';

  const MOCK_AUDIO_ID_PREFIX = 'lk_mock_audio_item_';

+ type Modality = 'text' | 'audio';
+
  interface RealtimeOptions {
  model: api_proto.Model;
  voice: api_proto.Voice;
@@ -54,6 +56,7 @@ interface RealtimeOptions {
  maxSessionDuration: number;
  // reset the connection after this many seconds if provided
  connOptions: APIConnectOptions;
+ modalities: Modality[];
  }

  interface MessageGeneration {
@@ -61,6 +64,7 @@ interface MessageGeneration {
  textChannel: stream.StreamChannel<string>;
  audioChannel: stream.StreamChannel<AudioFrame>;
  audioTranscript: string;
+ modalities: Future<('text' | 'audio')[]>;
  }

  interface ResponseGeneration {
@@ -125,6 +129,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
  maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
  maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
  connOptions: DEFAULT_API_CONNECT_OPTIONS,
+ modalities: ['text', 'audio'] as Modality[],
  };
  export class RealtimeModel extends llm.RealtimeModel {
  sampleRate = api_proto.SAMPLE_RATE;
@@ -142,6 +147,7 @@ export class RealtimeModel extends llm.RealtimeModel {
  temperature?: number;
  toolChoice?: llm.ToolChoice;
  baseURL?: string;
+ modalities?: Modality[];
  inputAudioTranscription?: api_proto.InputAudioTranscription | null;
  // TODO(shubhra): add inputAudioNoiseReduction
  turnDetection?: api_proto.TurnDetectionType | null;
@@ -155,11 +161,15 @@ export class RealtimeModel extends llm.RealtimeModel {
  connOptions?: APIConnectOptions;
  } = {},
  ) {
+ const modalities = (options.modalities ||
+ DEFAULT_REALTIME_MODEL_OPTIONS.modalities) as Modality[];
+
  super({
  messageTruncation: true,
  turnDetection: options.turnDetection !== null,
  userTranscription: options.inputAudioTranscription !== null,
  autoToolReplyGeneration: false,
+ audioOutput: modalities.includes('audio'),
  });

  const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
@@ -188,13 +198,15 @@ export class RealtimeModel extends llm.RealtimeModel {
  options.baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
  }

+ const { modalities: _, ...optionsWithoutModalities } = options;
  this._options = {
  ...DEFAULT_REALTIME_MODEL_OPTIONS,
- ...options,
+ ...optionsWithoutModalities,
  baseURL: options.baseURL || BASE_URL,
  apiKey,
  isAzure,
  model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
+ modalities,
  };
  }

@@ -389,6 +401,12 @@ export class RealtimeSession extends llm.RealtimeSession {
  }

  private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
+ // OpenAI supports ['text'] or ['text', 'audio'] (audio always includes text transcript)
+ // We normalize to ensure 'text' is always present when using audio
+ const modalities: Modality[] = this.oaiRealtimeModel._options.modalities.includes('audio')
+ ? ['text', 'audio']
+ : ['text'];
+
  return {
  type: 'session.update',
  session: {
@@ -396,7 +414,7 @@ export class RealtimeSession extends llm.RealtimeSession {
  voice: this.oaiRealtimeModel._options.voice,
  input_audio_format: 'pcm16',
  output_audio_format: 'pcm16',
- modalities: ['text', 'audio'],
+ modalities: modalities,
  turn_detection: this.oaiRealtimeModel._options.turnDetection,
  input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
  // TODO(shubhra): add inputAudioNoiseReduction
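
Taken together, these changes enable text-only realtime sessions: `capabilities.audioOutput` is derived from the `modalities` option, and the session update sends the normalized list. A construction sketch (assuming the plugin's `realtime` namespace export; everything else is illustrative):

```ts
import * as openai from '@livekit/agents-plugin-openai';

// Text-only model: capabilities.audioOutput is false and the
// session.update event carries modalities: ['text'].
const textModel = new openai.realtime.RealtimeModel({
  modalities: ['text'],
});

// The default stays ['text', 'audio']; requesting ['audio'] alone is
// normalized to ['text', 'audio'], since OpenAI always pairs audio
// output with a text transcript.
const audioModel = new openai.realtime.RealtimeModel({
  modalities: ['audio'],
});
```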
@@ -592,7 +610,7 @@ export class RealtimeSession extends llm.RealtimeSession {

  pushAudio(frame: AudioFrame): void {
  for (const f of this.resampleAudio(frame)) {
- for (const nf of this.bstream.write(f.data.buffer)) {
+ for (const nf of this.bstream.write(f.data.buffer as ArrayBuffer)) {
  this.sendEvent({
  type: 'input_audio_buffer.append',
  audio: Buffer.from(nf.data.buffer).toString('base64'),
@@ -632,13 +650,38 @@ export class RealtimeSession extends llm.RealtimeSession {
  } as api_proto.ResponseCancelEvent);
  }

- async truncate(_options: { messageId: string; audioEndMs: number }): Promise<void> {
- this.sendEvent({
- type: 'conversation.item.truncate',
- content_index: 0,
- item_id: _options.messageId,
- audio_end_ms: _options.audioEndMs,
- } as api_proto.ConversationItemTruncateEvent);
+ async truncate(_options: {
+ messageId: string;
+ audioEndMs: number;
+ modalities?: Modality[];
+ audioTranscript?: string;
+ }): Promise<void> {
+ if (!_options.modalities || _options.modalities.includes('audio')) {
+ this.sendEvent({
+ type: 'conversation.item.truncate',
+ content_index: 0,
+ item_id: _options.messageId,
+ audio_end_ms: _options.audioEndMs,
+ } as api_proto.ConversationItemTruncateEvent);
+ } else if (_options.audioTranscript !== undefined) {
+ // sync it to the remote chat context
+ const chatCtx = this.chatCtx.copy();
+ const idx = chatCtx.indexById(_options.messageId);
+ if (idx !== undefined) {
+ const item = chatCtx.items[idx];
+ if (item && item.type === 'message') {
+ const newItem = llm.ChatMessage.create({
+ ...item,
+ content: [_options.audioTranscript],
+ });
+ chatCtx.items[idx] = newItem;
+ const events = this.createChatCtxUpdateEvents(chatCtx);
+ for (const ev of events) {
+ this.sendEvent(ev);
+ }
+ }
+ }
+ }
  }

  private loggableEvent(
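
In text-only mode there is no server-side audio buffer to cut, so the new branch rewrites the truncated item's content in the remote chat context instead. A call-site sketch (the `session` handle and values are illustrative):

```ts
// Audio session: truncates the server-side audio item, as before.
await session.truncate({ messageId: 'item_abc', audioEndMs: 1500 });

// Text-only session: pass the transcript the user actually received so
// the remote chat context is rewritten to match what was delivered.
await session.truncate({
  messageId: 'item_abc',
  audioEndMs: 0,
  modalities: ['text'],
  audioTranscript: 'the partial reply shown to the user',
});
```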
@@ -907,6 +950,12 @@ export class RealtimeSession extends llm.RealtimeSession {
  case 'response.content_part.done':
  this.handleResponseContentPartDone(event);
  break;
+ case 'response.text.delta':
+ this.handleResponseTextDelta(event);
+ break;
+ case 'response.text.done':
+ this.handleResponseTextDone(event);
+ break;
  case 'response.audio_transcript.delta':
  this.handleResponseAudioTranscriptDelta(event);
  break;
@@ -1049,6 +1098,35 @@ export class RealtimeSession extends llm.RealtimeSession {
  this.textModeRecoveryRetries = 0;
  return;
  }
+
+ const itemId = event.item.id;
+ if (!itemId) {
+ throw new Error('item.id is not set');
+ }
+
+ const modalitiesFut = new Future<Modality[]>();
+ const itemGeneration: MessageGeneration = {
+ messageId: itemId,
+ textChannel: stream.createStreamChannel<string>(),
+ audioChannel: stream.createStreamChannel<AudioFrame>(),
+ audioTranscript: '',
+ modalities: modalitiesFut,
+ };
+
+ // If audioOutput is not supported, close audio channel immediately
+ if (!this.oaiRealtimeModel.capabilities.audioOutput) {
+ itemGeneration.audioChannel.close();
+ modalitiesFut.resolve(['text']);
+ }
+
+ this.currentGeneration.messageChannel.write({
+ messageId: itemId,
+ textStream: itemGeneration.textChannel.stream(),
+ audioStream: itemGeneration.audioChannel.stream(),
+ modalities: modalitiesFut.await,
+ });
+
+ this.currentGeneration.messages.set(itemId, itemGeneration);
  }

  private handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {
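
Each outgoing message now carries a `modalities` promise that resolves once the first content part or audio delta reveals what the item contains, or immediately with `['text']` when audio output is disabled. A hypothetical consumer sketch (the stream shape follows the `messageChannel.write` call above; `messageStream` is assumed):

```ts
for await (const msg of messageStream) {
  const modalities = await msg.modalities; // ['text'] or ['audio', 'text']
  if (modalities.includes('audio')) {
    // play msg.audioStream; msg.textStream carries the transcript
  } else {
    // render msg.textStream only; the audio channel was closed up front
  }
}
```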
@@ -1125,39 +1203,24 @@ export class RealtimeSession extends llm.RealtimeSession {

  const itemId = event.item_id;
  const itemType = event.part.type;
- const responseId = event.response_id;

- if (itemType === 'audio') {
- this.resolveGeneration(responseId);
- if (this.textModeRecoveryRetries > 0) {
- this.#logger.info(
- { retries: this.textModeRecoveryRetries },
- 'recovered from text-only response',
- );
- this.textModeRecoveryRetries = 0;
- }
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
+ if (!itemGeneration) {
+ this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
+ return;
+ }

- const itemGeneration: MessageGeneration = {
- messageId: itemId,
- textChannel: stream.createStreamChannel<string>(),
- audioChannel: stream.createStreamChannel<AudioFrame>(),
- audioTranscript: '',
- };
-
- this.currentGeneration.messageChannel.write({
- messageId: itemId,
- textStream: itemGeneration.textChannel.stream(),
- audioStream: itemGeneration.audioChannel.stream(),
- });
+ if (itemType === 'text' && this.oaiRealtimeModel.capabilities.audioOutput) {
+ this.#logger.warn('Text response received from OpenAI Realtime API in audio modality.');
+ }

- this.currentGeneration.messages.set(itemId, itemGeneration);
+ if (!itemGeneration.modalities.done) {
+ const modalityResult: Modality[] = itemType === 'text' ? ['text'] : ['audio', 'text'];
+ itemGeneration.modalities.resolve(modalityResult);
+ }
+
+ if (this.currentGeneration._firstTokenTimestamp === undefined) {
  this.currentGeneration._firstTokenTimestamp = Date.now();
- return;
- } else {
- this.interrupt();
- if (this.textModeRecoveryRetries === 0) {
- this.#logger.warn({ responseId }, 'received text-only response from OpenAI Realtime API');
- }
  }
  }

@@ -1173,6 +1236,33 @@ export class RealtimeSession extends llm.RealtimeSession {
  // TODO(shubhra): handle text mode recovery
  }

+ private handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
+ if (!this.currentGeneration) {
+ throw new Error('currentGeneration is not set');
+ }
+
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
+ if (!itemGeneration) {
+ throw new Error('itemGeneration is not set');
+ }
+
+ if (
+ !this.oaiRealtimeModel.capabilities.audioOutput &&
+ !this.currentGeneration._firstTokenTimestamp
+ ) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+
+ itemGeneration.textChannel.write(event.delta);
+ itemGeneration.audioTranscript += event.delta;
+ }
+
+ private handleResponseTextDone(_event: api_proto.ResponseTextDoneEvent): void {
+ if (!this.currentGeneration) {
+ throw new Error('currentGeneration is not set');
+ }
+ }
+
  private handleResponseAudioTranscriptDelta(
  event: api_proto.ResponseAudioTranscriptDeltaEvent,
  ): void {
@@ -1204,6 +1294,14 @@ export class RealtimeSession extends llm.RealtimeSession {
  throw new Error('itemGeneration is not set');
  }

+ if (this.currentGeneration._firstTokenTimestamp === undefined) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+
+ if (!itemGeneration.modalities.done) {
+ itemGeneration.modalities.resolve(['audio', 'text']);
+ }
+
  const binaryString = atob(event.delta);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
@@ -1261,6 +1359,10 @@ export class RealtimeSession extends llm.RealtimeSession {
  // text response doesn't have itemGeneration
  itemGeneration.textChannel.close();
  itemGeneration.audioChannel.close();
+ if (!itemGeneration.modalities.done) {
+ // In case message modalities is not set, this shouldn't happen
+ itemGeneration.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }
  }

@@ -1284,6 +1386,9 @@ export class RealtimeSession extends llm.RealtimeSession {
  for (const generation of this.currentGeneration.messages.values()) {
  generation.textChannel.close();
  generation.audioChannel.close();
+ if (!generation.modalities.done) {
+ generation.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }

  this.currentGeneration.functionChannel.close();
@@ -1473,6 +1578,8 @@ function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
  role,
  content: contentList,
  } as api_proto.UserItem;
+ default:
+ throw new Error(`Unsupported item type: ${(item as any).type}`);
  }
  }