@livekit/agents-plugin-openai 0.9.1 → 0.9.3

This diff compares the contents of publicly released package versions as published to a supported public registry. It is provided for informational purposes only.
package/dist/realtime_model.d.cts ADDED
@@ -0,0 +1,190 @@
+ import { AsyncIterableQueue, Future, llm, multimodal } from '@livekit/agents';
+ import { AudioFrame } from '@livekit/rtc-node';
+ import * as api_proto from './api_proto.js';
+ interface ModelOptions {
+ modalities: ['text', 'audio'] | ['text'];
+ instructions: string;
+ voice: api_proto.Voice;
+ inputAudioFormat: api_proto.AudioFormat;
+ outputAudioFormat: api_proto.AudioFormat;
+ inputAudioTranscription: api_proto.InputAudioTranscription | null;
+ turnDetection: api_proto.TurnDetectionType | null;
+ temperature: number;
+ maxResponseOutputTokens: number;
+ model: api_proto.Model;
+ apiKey?: string;
+ baseURL: string;
+ isAzure: boolean;
+ entraToken?: string;
+ apiVersion?: string;
+ }
+ export interface RealtimeResponse {
+ id: string;
+ status: api_proto.ResponseStatus;
+ statusDetails: api_proto.ResponseStatusDetails | null;
+ usage: api_proto.ModelUsage | null;
+ output: RealtimeOutput[];
+ doneFut: Future;
+ createdTimestamp: number;
+ firstTokenTimestamp?: number;
+ }
+ export interface RealtimeOutput {
+ responseId: string;
+ itemId: string;
+ outputIndex: number;
+ role: api_proto.Role;
+ type: 'message' | 'function_call';
+ content: RealtimeContent[];
+ doneFut: Future;
+ }
+ export interface RealtimeContent {
+ responseId: string;
+ itemId: string;
+ outputIndex: number;
+ contentIndex: number;
+ text: string;
+ audio: AudioFrame[];
+ textStream: AsyncIterableQueue<string>;
+ audioStream: AsyncIterableQueue<AudioFrame>;
+ toolCalls: RealtimeToolCall[];
+ contentType: api_proto.Modality;
+ }
+ export interface RealtimeToolCall {
+ name: string;
+ arguments: string;
+ toolCallID: string;
+ }
+ export interface InputSpeechTranscriptionCompleted {
+ itemId: string;
+ transcript: string;
+ }
+ export interface InputSpeechTranscriptionFailed {
+ itemId: string;
+ message: string;
+ }
+ export interface InputSpeechStarted {
+ itemId: string;
+ }
+ export interface InputSpeechCommitted {
+ itemId: string;
+ }
+ declare class InputAudioBuffer {
+ #private;
+ constructor(session: RealtimeSession);
+ append(frame: AudioFrame): void;
+ clear(): void;
+ commit(): void;
+ }
+ declare class ConversationItem {
+ #private;
+ constructor(session: RealtimeSession);
+ truncate(itemId: string, contentIndex: number, audioEnd: number): void;
+ delete(itemId: string): void;
+ create(message: llm.ChatMessage, previousItemId?: string): void;
+ }
+ declare class Conversation {
+ #private;
+ constructor(session: RealtimeSession);
+ get item(): ConversationItem;
+ }
+ declare class Response {
+ #private;
+ constructor(session: RealtimeSession);
+ create(): void;
+ cancel(): void;
+ }
+ export declare class RealtimeModel extends multimodal.RealtimeModel {
+ #private;
+ sampleRate: number;
+ numChannels: number;
+ inFrameSize: number;
+ outFrameSize: number;
+ static withAzure({ baseURL, azureDeployment, apiVersion, apiKey, entraToken, instructions, modalities, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, }: {
+ baseURL: string;
+ azureDeployment: string;
+ apiVersion?: string;
+ apiKey?: string;
+ entraToken?: string;
+ instructions?: string;
+ modalities?: ['text', 'audio'] | ['text'];
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
+ turnDetection?: api_proto.TurnDetectionType;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ }): RealtimeModel;
+ constructor({ modalities, instructions, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, model, apiKey, baseURL, isAzure, apiVersion, entraToken, }: {
+ modalities?: ['text', 'audio'] | ['text'];
+ instructions?: string;
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
+ turnDetection?: api_proto.TurnDetectionType;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ model?: api_proto.Model;
+ apiKey?: string;
+ baseURL?: string;
+ isAzure?: boolean;
+ apiVersion?: string;
+ entraToken?: string;
+ });
+ get sessions(): RealtimeSession[];
+ session({ fncCtx, chatCtx, modalities, instructions, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, }: {
+ fncCtx?: llm.FunctionContext;
+ chatCtx?: llm.ChatContext;
+ modalities?: ['text', 'audio'] | ['text'];
+ instructions?: string;
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
+ turnDetection?: api_proto.TurnDetectionType | null;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ }): RealtimeSession;
+ close(): Promise<void>;
+ }
+ export declare class RealtimeSession extends multimodal.RealtimeSession {
+ #private;
+ constructor(opts: ModelOptions, { fncCtx, chatCtx }: {
+ fncCtx?: llm.FunctionContext;
+ chatCtx?: llm.ChatContext;
+ });
+ get chatCtx(): llm.ChatContext | undefined;
+ get fncCtx(): llm.FunctionContext | undefined;
+ set fncCtx(ctx: llm.FunctionContext | undefined);
+ get conversation(): Conversation;
+ get inputAudioBuffer(): InputAudioBuffer;
+ get response(): Response;
+ get expiration(): number;
+ queueMsg(command: api_proto.ClientEvent): void;
+ sessionUpdate({ modalities, instructions, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, toolChoice, selectedTools, }: {
+ modalities: ['text', 'audio'] | ['text'];
+ instructions?: string;
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
+ turnDetection?: api_proto.TurnDetectionType | null;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ toolChoice?: api_proto.ToolChoice;
+ selectedTools?: string[];
+ }): void;
+ /**
+ * Try to recover from a text response to audio mode.
+ *
+ * @remarks
+ * Sometimes the OpenAI Realtime API returns text instead of audio responses.
+ * This method tries to recover from this by requesting a new response after deleting the text
+ * response and creating an empty user audio message.
+ */
+ recoverFromTextResponse(itemId: string): void;
+ close(): Promise<void>;
+ }
+ export {};
+ //# sourceMappingURL=realtime_model.d.ts.map
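These declarations are the new CommonJS type entry for the realtime API. For orientation, here is a minimal usage sketch based only on the declarations above, assuming the package re-exports `RealtimeModel` under a `realtime` namespace as in LiveKit's published examples; the Azure endpoint and deployment names are hypothetical:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// Plain OpenAI: every constructor option is optional; `apiKey` falls back to
// the OPENAI_API_KEY environment variable.
const model = new openai.realtime.RealtimeModel({
  instructions: 'You are a helpful assistant.',
  modalities: ['text', 'audio'],
});

// Azure OpenAI goes through the static factory rather than the constructor.
const azureModel = openai.realtime.RealtimeModel.withAzure({
  baseURL: 'https://my-resource.openai.azure.com/openai', // hypothetical endpoint
  azureDeployment: 'gpt-4o-realtime-preview', // hypothetical deployment name
  apiKey: process.env.AZURE_OPENAI_API_KEY,
});

// A session exposes the conversation, input audio buffer, and response
// controls declared above.
const session = model.session({ instructions: 'Answer in one sentence.' });
await session.close();
await model.close();
```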
package/dist/stt.d.cts ADDED
@@ -0,0 +1,43 @@
+ import { type AudioBuffer, stt } from '@livekit/agents';
+ import { OpenAI } from 'openai';
+ import type { GroqAudioModels, WhisperModels } from './models.js';
+ export interface STTOptions {
+ apiKey?: string;
+ language: string;
+ prompt?: string;
+ detectLanguage: boolean;
+ model: WhisperModels | string;
+ baseURL?: string;
+ client?: OpenAI;
+ }
+ export declare class STT extends stt.STT {
+ #private;
+ label: string;
+ /**
+ * Create a new instance of OpenAI STT.
+ *
+ * @remarks
+ * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
+ * `OPENAI_API_KEY` environmental variable.
+ */
+ constructor(opts?: Partial<STTOptions>);
+ /**
+ * Create a new instance of Groq STT.
+ *
+ * @remarks
+ * `apiKey` must be set to your Groq API key, either using the argument or by setting the
+ * `GROQ_API_KEY` environmental variable.
+ */
+ static withGroq(opts?: Partial<{
+ model: string | GroqAudioModels;
+ apiKey?: string;
+ baseURL?: string;
+ client: OpenAI;
+ language: string;
+ detectLanguage: boolean;
+ }>): STT;
+ _recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent>;
+ /** This method throws an error; streaming is unsupported on OpenAI STT. */
+ stream(): stt.SpeechStream;
+ }
+ //# sourceMappingURL=stt.d.ts.map
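`stt.d.cts` mirrors the existing ESM declarations. A short sketch of the two construction paths, assuming the class is re-exported from the package root as in the ESM build; note that only batch recognition is supported here, since `stream()` throws:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// OpenAI Whisper; `apiKey` falls back to the OPENAI_API_KEY environment variable.
const whisperSTT = new openai.STT({ model: 'whisper-1', language: 'en' });

// Groq-hosted Whisper through the same OpenAI-compatible client; `apiKey`
// falls back to GROQ_API_KEY instead.
const groqSTT = openai.STT.withGroq({ model: 'whisper-large-v3' });
```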
package/dist/stt.test.d.cts ADDED
@@ -0,0 +1,2 @@
+ export {};
+ //# sourceMappingURL=stt.test.d.ts.map
package/dist/tts.d.cts ADDED
@@ -0,0 +1,37 @@
+ import { tts } from '@livekit/agents';
+ import { OpenAI } from 'openai';
+ import type { TTSModels, TTSVoices } from './models.js';
+ export interface TTSOptions {
+ model: TTSModels | string;
+ voice: TTSVoices;
+ speed: number;
+ instructions?: string;
+ baseURL?: string;
+ client?: OpenAI;
+ apiKey?: string;
+ }
+ export declare class TTS extends tts.TTS {
+ #private;
+ label: string;
+ /**
+ * Create a new instance of OpenAI TTS.
+ *
+ * @remarks
+ * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
+ * `OPENAI_API_KEY` environmental variable.
+ */
+ constructor(opts?: Partial<TTSOptions>);
+ updateOptions(opts: {
+ model?: TTSModels | string;
+ voice?: TTSVoices;
+ speed?: number;
+ }): void;
+ synthesize(text: string): ChunkedStream;
+ stream(): tts.SynthesizeStream;
+ }
+ export declare class ChunkedStream extends tts.ChunkedStream {
+ #private;
+ label: string;
+ constructor(tts: TTS, text: string, stream: Promise<any>);
+ }
+ //# sourceMappingURL=tts.d.ts.map
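Likewise for TTS; a minimal sketch under the same root re-export assumption:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// `apiKey` falls back to the OPENAI_API_KEY environment variable.
const tts = new openai.TTS({ model: 'tts-1', voice: 'alloy', speed: 1.0 });

// Model, voice, and speed can change between syntheses without a new client.
tts.updateOptions({ voice: 'nova' });

// synthesize() returns a ChunkedStream carrying the audio for the whole text.
const stream = tts.synthesize('Hello from LiveKit Agents!');
```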
package/dist/tts.test.d.cts ADDED
@@ -0,0 +1,2 @@
+ export {};
+ //# sourceMappingURL=tts.test.d.ts.map
package/package.json CHANGED
@@ -1,15 +1,18 @@
  {
  "name": "@livekit/agents-plugin-openai",
- "version": "0.9.1",
+ "version": "0.9.3",
  "description": "OpenAI plugin for LiveKit Node Agents",
  "main": "dist/index.js",
  "require": "dist/index.cjs",
  "types": "dist/index.d.ts",
  "exports": {
- ".": {
+ "import": {
  "types": "./dist/index.d.ts",
- "import": "./dist/index.js",
- "require": "./dist/index.cjs"
+ "default": "./dist/index.js"
+ },
+ "require": {
+ "types": "./dist/index.d.cts",
+ "default": "./dist/index.cjs"
  }
  },
  "author": "LiveKit",
@@ -25,7 +28,7 @@
  "@livekit/agents": "^x",
  "@livekit/agents-plugin-silero": "^x",
  "@livekit/agents-plugins-test": "^x",
- "@livekit/rtc-node": "^0.13.4",
+ "@livekit/rtc-node": "^0.13.11",
  "@microsoft/api-extractor": "^7.35.0",
  "@types/ws": "^8.5.10",
  "tsup": "^8.3.5",
@@ -37,11 +40,12 @@
  "ws": "^8.16.0"
  },
  "peerDependencies": {
- "@livekit/rtc-node": "^0.13.4",
- "@livekit/agents": "^0.7.2x"
+ "@livekit/rtc-node": "^0.13.11",
+ "@livekit/agents": "^0.7.7x"
  },
  "scripts": {
- "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
+ "build": "tsup --onSuccess \"pnpm build:types\"",
+ "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js",
  "clean": "rm -rf dist",
  "clean:build": "pnpm clean && pnpm build",
  "lint": "eslint -f unix \"src/**/*.{ts,js}\"",
package/src/llm.ts CHANGED
@@ -10,6 +10,7 @@ import type {
  ChatModels,
  DeepSeekChatModels,
  GroqChatModels,
+ MetaChatModels,
  OctoChatModels,
  PerplexityChatModels,
  TelnyxChatModels,
@@ -382,6 +383,36 @@ export class LLM extends llm.LLM {
  });
  }

+ /**
+ * Create a new instance of Meta Llama LLM.
+ *
+ * @remarks
+ * `apiKey` must be set to your Meta Llama API key, either using the argument or by setting the
+ * `LLAMA_API_KEY` environmental variable.
+ */
+ static withMeta(
+ opts: Partial<{
+ apiKey?: string;
+ baseURL?: string;
+ client?: OpenAI;
+ model?: string | MetaChatModels;
+ temperature?: number;
+ user?: string;
+ }> = {},
+ ): LLM {
+ opts.apiKey = opts.apiKey || process.env.LLAMA_API_KEY;
+ opts.baseURL = opts.baseURL || 'https://api.llama.com/compat/v1/';
+ opts.model = opts.model || 'Llama-4-Maverick-17B-128E-Instruct-FP8';
+
+ if (opts.apiKey === undefined) {
+ throw new Error(
+ 'Meta Llama API key is required, either as argument or set LLAMA_API_KEY environmental variable',
+ );
+ }
+
+ return new LLM(opts);
+ }
+
  chat({
  fncCtx,
  chatCtx,
@@ -605,27 +636,37 @@ const buildMessage = async (msg: llm.ChatMessage, cacheKey: any) => {
  break;
  }

- if (typeof msg.content === 'string') {
- oaiMsg.content = msg.content;
- } else if (Array.isArray(msg.content)) {
- oaiMsg.content = (await Promise.all(
- msg.content.map(async (c) => {
- if (typeof c === 'string') {
- return { type: 'text', text: c };
- } else if (
- // typescript type guard for determining ChatAudio vs ChatImage
- ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatImage => {
- return (c as llm.ChatImage).image !== undefined;
- })(c)
- ) {
- return await buildImageContent(c, cacheKey);
- } else {
- throw new Error('ChatAudio is not supported');
- }
- }),
- )) as OpenAI.ChatCompletionContentPart[];
- } else if (msg.content === undefined) {
- oaiMsg.content = '';
+ if (msg.role === llm.ChatRole.TOOL) {
+ try {
+ const serializedContent =
+ typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content);
+ oaiMsg.content = serializedContent;
+ } catch (e) {
+ throw Error(`Tool call output is not JSON serializable: ${e}`);
+ }
+ } else {
+ if (typeof msg.content === 'string') {
+ oaiMsg.content = msg.content;
+ } else if (Array.isArray(msg.content)) {
+ oaiMsg.content = (await Promise.all(
+ msg.content.map(async (c) => {
+ if (typeof c === 'string') {
+ return { type: 'text', text: c };
+ } else if (
+ // typescript type guard for determining ChatAudio vs ChatImage
+ ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatImage => {
+ return (c as llm.ChatImage).image !== undefined;
+ })(c)
+ ) {
+ return await buildImageContent(c, cacheKey);
+ } else {
+ throw new Error('ChatAudio is not supported');
+ }
+ }),
+ )) as OpenAI.ChatCompletionContentPart[];
+ } else if (msg.content === undefined) {
+ oaiMsg.content = '';
+ }
  }

  // make sure to provide when function has been called inside the context
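The new branch fixes tool-result handling: a TOOL-role message whose content was an object previously matched none of the string/array/undefined branches and was sent with no content at all. A small illustration of what the serialization step now produces (values hypothetical):

```ts
// A non-string tool result is JSON-stringified before being attached to the
// OpenAI `tool` message; a string result passes through unchanged.
const toolResult = { temperature: 21, unit: 'C' }; // hypothetical function output
const content =
  typeof toolResult === 'string' ? toolResult : JSON.stringify(toolResult);
// content === '{"temperature":21,"unit":"C"}'
```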
package/src/models.ts CHANGED
@@ -128,3 +128,9 @@ export type OctoChatModels =
  | 'wizardlm-2-8x22bllamaguard-2-7b';

  export type XAIChatModels = 'grok-2' | 'grok-2-mini' | 'grok-2-mini-public' | 'grok-2-public';
+
+ export type MetaChatModels =
+ | 'Llama-4-Scout-17B-16E-Instruct-FP8'
+ | 'Llama-4-Maverick-17B-128E-Instruct-FP8'
+ | 'Llama-3.3-70B-Instruct'
+ | 'Llama-3.3-8B-Instruct';