npm - @livekit/agents-plugin-openai - Versions diffs - 0.3.4 → 0.4.0 - Mend

@livekit/agents-plugin-openai 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/.turbo/turbo-build.log +1 -1
package/CHANGELOG.md +26 -0
package/dist/index.d.ts +2 -0
package/dist/index.d.ts.map +1 -1
package/dist/index.js +2 -0
package/dist/index.js.map +1 -1
package/dist/llm.d.ts +195 -0
package/dist/llm.d.ts.map +1 -0
package/dist/llm.js +453 -0
package/dist/llm.js.map +1 -0
package/dist/models.d.ts +10 -0
package/dist/models.d.ts.map +1 -0
package/dist/models.js +5 -0
package/dist/models.js.map +1 -0
package/dist/realtime/api_proto.d.ts +1 -1
package/dist/realtime/api_proto.d.ts.map +1 -1
package/dist/realtime/realtime_model.d.ts +9 -3
package/dist/realtime/realtime_model.d.ts.map +1 -1
package/dist/realtime/realtime_model.js +601 -457
package/dist/realtime/realtime_model.js.map +1 -1
package/package.json +5 -3
package/src/index.ts +2 -0
package/src/llm.ts +670 -0
package/src/models.ts +107 -0
package/src/realtime/api_proto.ts +1 -1
package/src/realtime/realtime_model.ts +155 -15
package/tsconfig.tsbuildinfo +1 -1

package/src/models.ts ADDED Viewed

@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * String-literal union of chat model names: the `gpt-4o`, `gpt-4o-mini`, `gpt-4-turbo`,
+ * `gpt-4`, `gpt-4-32k` and `gpt-3.5-turbo` families, each with its date/version-suffixed
+ * variants listed explicitly.
+ */
+export type ChatModels =
+  | 'gpt-4o'
+  | 'gpt-4o-2024-05-13'
+  | 'gpt-4o-mini'
+  | 'gpt-4o-mini-2024-07-18'
+  | 'gpt-4-turbo'
+  | 'gpt-4-turbo-2024-04-09'
+  | 'gpt-4-turbo-preview'
+  | 'gpt-4-0125-preview'
+  | 'gpt-4-1106-preview'
+  | 'gpt-4-vision-preview'
+  | 'gpt-4-1106-vision-preview'
+  | 'gpt-4'
+  | 'gpt-4-0314'
+  | 'gpt-4-0613'
+  | 'gpt-4-32k'
+  | 'gpt-4-32k-0314'
+  | 'gpt-4-32k-0613'
+  | 'gpt-3.5-turbo'
+  | 'gpt-3.5-turbo-16k'
+  | 'gpt-3.5-turbo-0301'
+  | 'gpt-3.5-turbo-0613'
+  | 'gpt-3.5-turbo-1106'
+  | 'gpt-3.5-turbo-16k-0613';
+// adapters for OpenAI-compatible LLMs
+/**
+ * String-literal union of model names for the Telnyx adapter: the 8B and 70B
+ * `Meta-Llama-3.1` Instruct models, written with their `meta-llama/` prefix.
+ */
+export type TelnyxChatModels =
+  | 'meta-llama/Meta-Llama-3.1-8B-Instruct'
+  | 'meta-llama/Meta-Llama-3.1-70B-Instruct';
+/** String-literal union of model names for the Cerebras adapter (`llama3.1` in 8b and 70b sizes). */
+export type CerebrasChatModels = 'llama3.1-8b' | 'llama3.1-70b';
+/**
+ * String-literal union of model names for the Perplexity adapter: the `llama-3.1-sonar`
+ * small/large models in `online` and `chat` variants, plus two plain `llama-3.1` instruct models.
+ */
+export type PerplexityChatModels =
+  | 'llama-3.1-sonar-small-128k-online'
+  | 'llama-3.1-sonar-small-128k-chat'
+  | 'llama-3.1-sonar-large-128k-online'
+  | 'llama-3.1-sonar-large-128k-chat'
+  | 'llama-3.1-8b-instruct'
+  | 'llama-3.1-70b-instruct';
+/**
+ * String-literal union of model names for the Groq adapter. Besides Llama 3 / 3.1 variants
+ * it lists tool-use previews, `llama-guard-3-8b`, `mixtral-8x7b-32768` and two Gemma models.
+ */
+export type GroqChatModels =
+  | 'llama-3.1-405b-reasoning'
+  | 'llama-3.1-70b-versatile'
+  | 'llama-3.1-8b-instant'
+  | 'llama3-groq-70b-8192-tool-use-preview'
+  | 'llama3-groq-8b-8192-tool-use-preview'
+  | 'llama-guard-3-8b'
+  | 'llama3-70b-8192'
+  | 'llama3-8b-8192'
+  | 'mixtral-8x7b-32768'
+  | 'gemma-7b-it'
+  | 'gemma2-9b-it';
+/** String-literal union of model names for the DeepSeek adapter (`deepseek-coder`, `deepseek-chat`). */
+export type DeepSeekChatModels = 'deepseek-coder' | 'deepseek-chat';
+/**
+ * String-literal union of model names for the Together adapter. Every member is written
+ * as `<organization>/<model>`, and the list is kept in alphabetical order by that path.
+ */
+export type TogetherChatModels =
+  | 'garage-bAInd/Platypus2-70B-instruct'
+  | 'google/gemma-2-27b-it'
+  | 'google/gemma-2-9b-it'
+  | 'google/gemma-2b-it'
+  | 'google/gemma-7b-it'
+  | 'lmsys/vicuna-13b-v1.5'
+  | 'lmsys/vicuna-7b-v1.5'
+  | 'meta-llama/Llama-2-13b-chat-hf'
+  | 'meta-llama/Llama-2-70b-chat-hf'
+  | 'meta-llama/Llama-2-7b-chat-hf'
+  | 'meta-llama/Llama-3-70b-chat-hf'
+  | 'meta-llama/Llama-3-8b-chat-hf'
+  | 'meta-llama/Meta-Llama-3-70B-Instruct-Lite'
+  | 'meta-llama/Meta-Llama-3-70B-Instruct-Turbo'
+  | 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'
+  | 'meta-llama/Meta-Llama-3-8B-Instruct-Turbo'
+  | 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'
+  | 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
+  | 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
+  | 'mistralai/Mistral-7B-Instruct-v0.1'
+  | 'mistralai/Mistral-7B-Instruct-v0.2'
+  | 'mistralai/Mistral-7B-Instruct-v0.3'
+  | 'mistralai/Mixtral-8x22B-Instruct-v0.1'
+  | 'mistralai/Mixtral-8x7B-Instruct-v0.1'
+  | 'openchat/openchat-3.5-1210'
+  | 'snorkelai/Snorkel-Mistral-PairRM-DPO'
+  | 'teknium/OpenHermes-2-Mistral-7B'
+  | 'teknium/OpenHermes-2p5-Mistral-7B'
+  | 'togethercomputer/Llama-2-7B-32K-Instruct'
+  | 'togethercomputer/RedPajama-INCITE-7B-Chat'
+  | 'togethercomputer/RedPajama-INCITE-Chat-3B-v1'
+  | 'togethercomputer/StripedHyena-Nous-7B'
+  | 'togethercomputer/alpaca-7b'
+  | 'upstage/SOLAR-10.7B-Instruct-v1.0'
+  | 'zero-one-ai/Yi-34B-Chat';
+/**
+ * String-literal union of model names for the Octo adapter.
+ *
+ * `wizardlm-2-8x22b` and `llamaguard-2-7b` are two separate model ids and must stay two
+ * separate members; joined into one literal they name a model that does not exist.
+ */
+export type OctoChatModels =
+  | 'meta-llama-3-70b-instruct'
+  | 'meta-llama-3.1-405b-instruct'
+  | 'meta-llama-3.1-70b-instruct'
+  | 'meta-llama-3.1-8b-instruct'
+  | 'mistral-7b-instruct'
+  | 'mixtral-8x7b-instruct'
+  | 'wizardlm-2-8x22b'
+  | 'llamaguard-2-7b';
+/** String-literal union of model names for the xAI adapter (`grok-2`, `grok-2-mini` and their `-public` variants). */
+export type XAIChatModels = 'grok-2' | 'grok-2-mini' | 'grok-2-mini-public' | 'grok-2-public';

package/src/realtime/api_proto.ts CHANGED Viewed

@@ -79,7 +79,7 @@ export interface Tool {
         [prop: string]: any;
       };
     };
-    required_properties: string[];
+    required: string[];
   };
 }

package/src/realtime/realtime_model.ts CHANGED Viewed

@@ -1,9 +1,17 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { AsyncIterableQueue, Future, Queue, llm, log, multimodal } from '@livekit/agents';
+import {
+  AsyncIterableQueue,
+  Future,
+  Queue,
+  llm,
+  log,
+  mergeFrames,
+  multimodal,
+} from '@livekit/agents';
 import { AudioFrame } from '@livekit/rtc-node';
-import { once } from 'events';
+import { once } from 'node:events';
 import { WebSocket } from 'ws';
 import * as api_proto from './api_proto.js';
@@ -29,6 +37,7 @@ export interface RealtimeResponse {
   id: string;
   status: api_proto.ResponseStatus;
   statusDetails: api_proto.ResponseStatusDetails | null;
+  usage: api_proto.ResponseResource['usage'] | null;
   output: RealtimeOutput[];
   doneFut: Future;
 }
@@ -108,6 +117,7 @@ class InputAudioBuffer {
 class ConversationItem {
   #session: RealtimeSession;
+  #logger = log();
   constructor(session: RealtimeSession) {
     this.#session = session;
@@ -129,12 +139,126 @@ class ConversationItem {
     });
   }
-  create(item: api_proto.ConversationItemCreateContent, previousItemId?: string): void {
-    this.#session.queueMsg({
-      type: 'conversation.item.create',
-      item,
-      previous_item_id: previousItemId,
-    });
+  /**
+   * Translate a chat message into a `conversation.item.create` event and queue it on the
+   * session.
+   *
+   * - A message carrying `toolCallId` becomes a `function_call_output` item.
+   * - USER messages become `input_text` / `input_audio` content parts.
+   * - ASSISTANT messages become `text` parts; SYSTEM messages become `input_text` parts.
+   *   Audio in either is not sent and only produces a warning.
+   * - Any other role is logged and dropped.
+   *
+   * Messages with falsy `content` (missing, or an empty string) are ignored. Content parts
+   * that are neither strings nor audio (i.e. images) are skipped without a warning.
+   *
+   * @param message - chat message to convert
+   * @param previousItemId - forwarded as the event's `previous_item_id`
+   * @throws TypeError if `message.toolCallId` is set and `message.content` is not a string
+   */
+  create(message: llm.ChatMessage, previousItemId?: string): void {
+    if (!message.content) {
+      return;
+    }
+    let event: api_proto.ConversationItemCreateEvent;
+    if (message.toolCallId) {
+      if (typeof message.content !== 'string') {
+        throw new TypeError('message.content must be a string');
+      }
+      event = {
+        type: 'conversation.item.create',
+        previous_item_id: previousItemId,
+        item: {
+          type: 'function_call_output',
+          call_id: message.toolCallId,
+          output: message.content,
+        },
+      };
+    } else {
+      // Normalise to an array so every role can iterate the same way.
+      const parts = Array.isArray(message.content) ? message.content : [message.content];
+      // typescript type guard for determining ChatAudio vs ChatImage
+      const isAudio = (c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio =>
+        (c as llm.ChatAudio).frame !== undefined;
+      // Shared by the text-only roles: keep string parts, warn on audio, skip the rest.
+      const collectText = <T extends 'text' | 'input_text'>(
+        type: T,
+        role: 'assistant' | 'system',
+      ): { type: T; text: string }[] => {
+        const collected: { type: T; text: string }[] = [];
+        for (const c of parts) {
+          if (typeof c === 'string') {
+            collected.push({ type, text: c });
+          } else if (isAudio(c)) {
+            this.#logger.warn(`audio content in ${role} message is not supported`);
+          }
+        }
+        return collected;
+      };
+      if (message.role === llm.ChatRole.USER) {
+        const contents: (api_proto.InputTextContent | api_proto.InputAudioContent)[] = [];
+        for (const c of parts) {
+          if (typeof c === 'string') {
+            contents.push({
+              type: 'input_text',
+              text: c,
+            });
+          } else if (isAudio(c)) {
+            const pcm = mergeFrames(c.frame).data;
+            contents.push({
+              type: 'input_audio',
+              // Encode only the bytes this typed array covers; its backing ArrayBuffer
+              // may be larger than the view.
+              audio: Buffer.from(pcm.buffer, pcm.byteOffset, pcm.byteLength).toString('base64'),
+            });
+          }
+        }
+        event = {
+          type: 'conversation.item.create',
+          previous_item_id: previousItemId,
+          item: {
+            type: 'message',
+            role: 'user',
+            content: contents,
+          },
+        };
+      } else if (message.role === llm.ChatRole.ASSISTANT) {
+        const contents: api_proto.TextContent[] = collectText('text', 'assistant');
+        event = {
+          type: 'conversation.item.create',
+          previous_item_id: previousItemId,
+          item: {
+            type: 'message',
+            role: 'assistant',
+            content: contents,
+          },
+        };
+      } else if (message.role === llm.ChatRole.SYSTEM) {
+        const contents: api_proto.InputTextContent[] = collectText('input_text', 'system');
+        event = {
+          type: 'conversation.item.create',
+          previous_item_id: previousItemId,
+          item: {
+            type: 'message',
+            role: 'system',
+            content: contents,
+          },
+        };
+      } else {
+        this.#logger
+          .child({ message })
+          .warn('chat message is not supported inside the realtime API');
+        return;
+      }
+    }
+    this.#session.queueMsg(event);
   }
 }
@@ -302,6 +426,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
   session({
     fncCtx,
+    chatCtx,
     modalities = this.#defaultOpts.modalities,
     instructions = this.#defaultOpts.instructions,
     voice = this.#defaultOpts.voice,
@@ -313,6 +438,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
     maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens,
   }: {
     fncCtx?: llm.FunctionContext;
+    chatCtx?: llm.ChatContext;
     modalities?: ['text', 'audio'] | ['text'];
     instructions?: string;
     voice?: api_proto.Voice;
@@ -341,7 +467,10 @@ export class RealtimeModel extends multimodal.RealtimeModel {
       entraToken: this.#defaultOpts.entraToken,
     };
-    const newSession = new RealtimeSession(opts, fncCtx);
+    const newSession = new RealtimeSession(opts, {
+      chatCtx: chatCtx || new llm.ChatContext(),
+      fncCtx,
+    });
     this.#sessions.push(newSession);
     return newSession;
   }
@@ -352,6 +481,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
 }
 export class RealtimeSession extends multimodal.RealtimeSession {
+  #chatCtx: llm.ChatContext | undefined = undefined;
   #fncCtx: llm.FunctionContext | undefined = undefined;
   #opts: ModelOptions;
   #pendingResponses: { [id: string]: RealtimeResponse } = {};
@@ -363,10 +493,14 @@ export class RealtimeSession extends multimodal.RealtimeSession {
   #closing = true;
   #sendQueue = new Queue<api_proto.ClientEvent>();
-  constructor(opts: ModelOptions, fncCtx?: llm.FunctionContext | undefined) {
+  constructor(
+    opts: ModelOptions,
+    { fncCtx, chatCtx }: { fncCtx?: llm.FunctionContext; chatCtx?: llm.ChatContext },
+  ) {
     super();
     this.#opts = opts;
+    this.#chatCtx = chatCtx;
     this.#fncCtx = fncCtx;
     this.#task = this.#start();
@@ -385,6 +519,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
     });
   }
+  /**
+   * Returns the private `#chatCtx` field, which the constructor sets from its `chatCtx`
+   * option; `undefined` when that option was omitted.
+   */
+  get chatCtx(): llm.ChatContext | undefined {
+    return this.#chatCtx;
+  }
   get fncCtx(): llm.FunctionContext | undefined {
     return this.#fncCtx;
   }
@@ -787,6 +925,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
       id: response.id,
       status: response.status,
       statusDetails: response.status_details,
+      usage: null,
       output: [],
       doneFut: doneFut,
     };
@@ -800,6 +939,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
     const response = this.#pendingResponses[responseId];
     response.status = responseData.status;
     response.statusDetails = responseData.status_details;
+    response.usage = responseData.usage;
     this.#pendingResponses[responseId] = response;
     response.doneFut.resolve();
     this.emit('response_done', response);
@@ -869,11 +1009,11 @@ export class RealtimeSession extends multimodal.RealtimeSession {
             callId: item.call_id,
           });
           this.conversation.item.create(
-            {
-              type: 'function_call_output',
-              call_id: item.call_id,
-              output: content,
-            },
+            llm.ChatMessage.createToolFromFunctionResult({
+              name: item.name,
+              toolCallId: item.call_id,
+              result: content,
+            }),
             output.itemId,
           );
           this.response.create();