@livekit/agents-plugin-openai 0.9.1 → 0.9.3

This diff compares the contents of publicly released package versions as published to a supported public registry. It is provided for informational purposes only.
package/dist/realtime_model.d.cts ADDED
@@ -0,0 +1,190 @@
+ import { AsyncIterableQueue, Future, llm, multimodal } from '@livekit/agents';
+ import { AudioFrame } from '@livekit/rtc-node';
+ import * as api_proto from './api_proto.js';
+ interface ModelOptions {
+ modalities: ['text', 'audio'] | ['text'];
+ instructions: string;
+ voice: api_proto.Voice;
+ inputAudioFormat: api_proto.AudioFormat;
+ outputAudioFormat: api_proto.AudioFormat;
+ inputAudioTranscription: api_proto.InputAudioTranscription | null;
+ turnDetection: api_proto.TurnDetectionType | null;
+ temperature: number;
+ maxResponseOutputTokens: number;
+ model: api_proto.Model;
+ apiKey?: string;
+ baseURL: string;
+ isAzure: boolean;
+ entraToken?: string;
+ apiVersion?: string;
+ }
+ export interface RealtimeResponse {
+ id: string;
+ status: api_proto.ResponseStatus;
+ statusDetails: api_proto.ResponseStatusDetails | null;
+ usage: api_proto.ModelUsage | null;
+ output: RealtimeOutput[];
+ doneFut: Future;
+ createdTimestamp: number;
+ firstTokenTimestamp?: number;
+ }
+ export interface RealtimeOutput {
+ responseId: string;
+ itemId: string;
+ outputIndex: number;
+ role: api_proto.Role;
+ type: 'message' | 'function_call';
+ content: RealtimeContent[];
+ doneFut: Future;
+ }
+ export interface RealtimeContent {
+ responseId: string;
+ itemId: string;
+ outputIndex: number;
+ contentIndex: number;
+ text: string;
+ audio: AudioFrame[];
+ textStream: AsyncIterableQueue<string>;
+ audioStream: AsyncIterableQueue<AudioFrame>;
+ toolCalls: RealtimeToolCall[];
+ contentType: api_proto.Modality;
+ }
+ export interface RealtimeToolCall {
+ name: string;
+ arguments: string;
+ toolCallID: string;
+ }
+ export interface InputSpeechTranscriptionCompleted {
+ itemId: string;
+ transcript: string;
+ }
+ export interface InputSpeechTranscriptionFailed {
+ itemId: string;
+ message: string;
+ }
+ export interface InputSpeechStarted {
+ itemId: string;
+ }
+ export interface InputSpeechCommitted {
+ itemId: string;
+ }
+ declare class InputAudioBuffer {
+ #private;
+ constructor(session: RealtimeSession);
+ append(frame: AudioFrame): void;
+ clear(): void;
+ commit(): void;
+ }
+ declare class ConversationItem {
+ #private;
+ constructor(session: RealtimeSession);
+ truncate(itemId: string, contentIndex: number, audioEnd: number): void;
+ delete(itemId: string): void;
+ create(message: llm.ChatMessage, previousItemId?: string): void;
+ }
+ declare class Conversation {
+ #private;
+ constructor(session: RealtimeSession);
+ get item(): ConversationItem;
+ }
+ declare class Response {
+ #private;
+ constructor(session: RealtimeSession);
+ create(): void;
+ cancel(): void;
+ }
+ export declare class RealtimeModel extends multimodal.RealtimeModel {
+ #private;
+ sampleRate: number;
+ numChannels: number;
+ inFrameSize: number;
+ outFrameSize: number;
+ static withAzure({ baseURL, azureDeployment, apiVersion, apiKey, entraToken, instructions, modalities, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, }: {
+ baseURL: string;
+ azureDeployment: string;
+ apiVersion?: string;
+ apiKey?: string;
+ entraToken?: string;
+ instructions?: string;
+ modalities?: ['text', 'audio'] | ['text'];
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
+ turnDetection?: api_proto.TurnDetectionType;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ }): RealtimeModel;
+ constructor({ modalities, instructions, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, model, apiKey, baseURL, isAzure, apiVersion, entraToken, }: {
+ modalities?: ['text', 'audio'] | ['text'];
+ instructions?: string;
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
+ turnDetection?: api_proto.TurnDetectionType;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ model?: api_proto.Model;
+ apiKey?: string;
+ baseURL?: string;
+ isAzure?: boolean;
+ apiVersion?: string;
+ entraToken?: string;
+ });
+ get sessions(): RealtimeSession[];
+ session({ fncCtx, chatCtx, modalities, instructions, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, }: {
+ fncCtx?: llm.FunctionContext;
+ chatCtx?: llm.ChatContext;
+ modalities?: ['text', 'audio'] | ['text'];
+ instructions?: string;
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
+ turnDetection?: api_proto.TurnDetectionType | null;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ }): RealtimeSession;
+ close(): Promise<void>;
+ }
+ export declare class RealtimeSession extends multimodal.RealtimeSession {
+ #private;
+ constructor(opts: ModelOptions, { fncCtx, chatCtx }: {
+ fncCtx?: llm.FunctionContext;
+ chatCtx?: llm.ChatContext;
+ });
+ get chatCtx(): llm.ChatContext | undefined;
+ get fncCtx(): llm.FunctionContext | undefined;
+ set fncCtx(ctx: llm.FunctionContext | undefined);
+ get conversation(): Conversation;
+ get inputAudioBuffer(): InputAudioBuffer;
+ get response(): Response;
+ get expiration(): number;
+ queueMsg(command: api_proto.ClientEvent): void;
+ sessionUpdate({ modalities, instructions, voice, inputAudioFormat, outputAudioFormat, inputAudioTranscription, turnDetection, temperature, maxResponseOutputTokens, toolChoice, selectedTools, }: {
+ modalities: ['text', 'audio'] | ['text'];
+ instructions?: string;
+ voice?: api_proto.Voice;
+ inputAudioFormat?: api_proto.AudioFormat;
+ outputAudioFormat?: api_proto.AudioFormat;
+ inputAudioTranscription?: api_proto.InputAudioTranscription | null;
+ turnDetection?: api_proto.TurnDetectionType | null;
+ temperature?: number;
+ maxResponseOutputTokens?: number;
+ toolChoice?: api_proto.ToolChoice;
+ selectedTools?: string[];
+ }): void;
+ /**
+ * Try to recover from a text response to audio mode.
+ *
+ * @remarks
+ * Sometimes the OpenAI Realtime API returns text instead of audio responses.
+ * This method tries to recover from this by requesting a new response after deleting the text
+ * response and creating an empty user audio message.
+ */
+ recoverFromTextResponse(itemId: string): void;
+ close(): Promise<void>;
+ }
+ export {};
+ //# sourceMappingURL=realtime_model.d.ts.map
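These declarations are the new CommonJS type entry for the realtime API. For orientation, here is a minimal usage sketch based only on the declarations above, assuming the package re-exports `RealtimeModel` under a `realtime` namespace as in LiveKit's published examples; the Azure endpoint and deployment names are hypothetical:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// Plain OpenAI: every constructor option is optional; `apiKey` falls back to
// the OPENAI_API_KEY environment variable.
const model = new openai.realtime.RealtimeModel({
  instructions: 'You are a helpful assistant.',
  modalities: ['text', 'audio'],
});

// Azure OpenAI goes through the static factory rather than the constructor.
const azureModel = openai.realtime.RealtimeModel.withAzure({
  baseURL: 'https://my-resource.openai.azure.com/openai', // hypothetical endpoint
  azureDeployment: 'gpt-4o-realtime-preview', // hypothetical deployment name
  apiKey: process.env.AZURE_OPENAI_API_KEY,
});

// A session exposes the conversation, input audio buffer, and response
// controls declared above.
const session = model.session({ instructions: 'Answer in one sentence.' });
await session.close();
await model.close();
```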
package/dist/stt.d.cts ADDED
@@ -0,0 +1,43 @@
+ import { type AudioBuffer, stt } from '@livekit/agents';
+ import { OpenAI } from 'openai';
+ import type { GroqAudioModels, WhisperModels } from './models.js';
+ export interface STTOptions {
+ apiKey?: string;
+ language: string;
+ prompt?: string;
+ detectLanguage: boolean;
+ model: WhisperModels | string;
+ baseURL?: string;
+ client?: OpenAI;
+ }
+ export declare class STT extends stt.STT {
+ #private;
+ label: string;
+ /**
+ * Create a new instance of OpenAI STT.
+ *
+ * @remarks
+ * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
+ * `OPENAI_API_KEY` environmental variable.
+ */
+ constructor(opts?: Partial<STTOptions>);
+ /**
+ * Create a new instance of Groq STT.
+ *
+ * @remarks
+ * `apiKey` must be set to your Groq API key, either using the argument or by setting the
+ * `GROQ_API_KEY` environmental variable.
+ */
+ static withGroq(opts?: Partial<{
+ model: string | GroqAudioModels;
+ apiKey?: string;
+ baseURL?: string;
+ client: OpenAI;
+ language: string;
+ detectLanguage: boolean;
+ }>): STT;
+ _recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent>;
+ /** This method throws an error; streaming is unsupported on OpenAI STT. */
+ stream(): stt.SpeechStream;
+ }
+ //# sourceMappingURL=stt.d.ts.map
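`stt.d.cts` mirrors the existing ESM declarations. A short sketch of the two construction paths, assuming the class is re-exported from the package root as in the ESM build; note that only batch recognition is supported here, since `stream()` throws:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// OpenAI Whisper; `apiKey` falls back to the OPENAI_API_KEY environment variable.
const whisperSTT = new openai.STT({ model: 'whisper-1', language: 'en' });

// Groq-hosted Whisper through the same OpenAI-compatible client; `apiKey`
// falls back to GROQ_API_KEY instead.
const groqSTT = openai.STT.withGroq({ model: 'whisper-large-v3' });
```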
package/dist/stt.test.d.cts ADDED
@@ -0,0 +1,2 @@
+ export {};
+ //# sourceMappingURL=stt.test.d.ts.map
package/dist/tts.d.cts ADDED
@@ -0,0 +1,37 @@
+ import { tts } from '@livekit/agents';
+ import { OpenAI } from 'openai';
+ import type { TTSModels, TTSVoices } from './models.js';
+ export interface TTSOptions {
+ model: TTSModels | string;
+ voice: TTSVoices;
+ speed: number;
+ instructions?: string;
+ baseURL?: string;
+ client?: OpenAI;
+ apiKey?: string;
+ }
+ export declare class TTS extends tts.TTS {
+ #private;
+ label: string;
+ /**
+ * Create a new instance of OpenAI TTS.
+ *
+ * @remarks
+ * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
+ * `OPENAI_API_KEY` environmental variable.
+ */
+ constructor(opts?: Partial<TTSOptions>);
+ updateOptions(opts: {
+ model?: TTSModels | string;
+ voice?: TTSVoices;
+ speed?: number;
+ }): void;
+ synthesize(text: string): ChunkedStream;
+ stream(): tts.SynthesizeStream;
+ }
+ export declare class ChunkedStream extends tts.ChunkedStream {
+ #private;
+ label: string;
+ constructor(tts: TTS, text: string, stream: Promise<any>);
+ }
+ //# sourceMappingURL=tts.d.ts.map
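Likewise for TTS; a minimal sketch under the same root re-export assumption:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// `apiKey` falls back to the OPENAI_API_KEY environment variable.
const tts = new openai.TTS({ model: 'tts-1', voice: 'alloy', speed: 1.0 });

// Model, voice, and speed can change between syntheses without a new client.
tts.updateOptions({ voice: 'nova' });

// synthesize() returns a ChunkedStream carrying the audio for the whole text.
const stream = tts.synthesize('Hello from LiveKit Agents!');
```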
package/dist/tts.test.d.cts ADDED
@@ -0,0 +1,2 @@
+ export {};
+ //# sourceMappingURL=tts.test.d.ts.map
package/package.json CHANGED
@@ -1,15 +1,18 @@
  {
  "name": "@livekit/agents-plugin-openai",
- "version": "0.9.1",
+ "version": "0.9.3",
  "description": "OpenAI plugin for LiveKit Node Agents",
  "main": "dist/index.js",
  "require": "dist/index.cjs",
  "types": "dist/index.d.ts",
  "exports": {
- ".": {
+ "import": {
  "types": "./dist/index.d.ts",
- "import": "./dist/index.js",
- "require": "./dist/index.cjs"
+ "default": "./dist/index.js"
+ },
+ "require": {
+ "types": "./dist/index.d.cts",
+ "default": "./dist/index.cjs"
  }
  },
  "author": "LiveKit",
@@ -25,7 +28,7 @@
  "@livekit/agents": "^x",
  "@livekit/agents-plugin-silero": "^x",
  "@livekit/agents-plugins-test": "^x",
- "@livekit/rtc-node": "^0.13.4",
+ "@livekit/rtc-node": "^0.13.11",
  "@microsoft/api-extractor": "^7.35.0",
  "@types/ws": "^8.5.10",
  "tsup": "^8.3.5",
@@ -37,11 +40,12 @@
  "ws": "^8.16.0"
  },
  "peerDependencies": {
- "@livekit/rtc-node": "^0.13.4",
- "@livekit/agents": "^0.7.2x"
+ "@livekit/rtc-node": "^0.13.11",
+ "@livekit/agents": "^0.7.7x"
  },
  "scripts": {
- "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
+ "build": "tsup --onSuccess \"pnpm build:types\"",
+ "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js",
  "clean": "rm -rf dist",
  "clean:build": "pnpm clean && pnpm build",
  "lint": "eslint -f unix \"src/**/*.{ts,js}\"",
package/src/llm.ts CHANGED
@@ -10,6 +10,7 @@ import type {
  ChatModels,
  DeepSeekChatModels,
  GroqChatModels,
+ MetaChatModels,
  OctoChatModels,
  PerplexityChatModels,
  TelnyxChatModels,
@@ -382,6 +383,36 @@ export class LLM extends llm.LLM {
  });
  }

+ /**
+ * Create a new instance of Meta Llama LLM.
+ *
+ * @remarks
+ * `apiKey` must be set to your Meta Llama API key, either using the argument or by setting the
+ * `LLAMA_API_KEY` environmental variable.
+ */
+ static withMeta(
+ opts: Partial<{
+ apiKey?: string;
+ baseURL?: string;
+ client?: OpenAI;
+ model?: string | MetaChatModels;
+ temperature?: number;
+ user?: string;
+ }> = {},
+ ): LLM {
+ opts.apiKey = opts.apiKey || process.env.LLAMA_API_KEY;
+ opts.baseURL = opts.baseURL || 'https://api.llama.com/compat/v1/';
+ opts.model = opts.model || 'Llama-4-Maverick-17B-128E-Instruct-FP8';
+
+ if (opts.apiKey === undefined) {
+ throw new Error(
+ 'Meta Llama API key is required, either as argument or set LLAMA_API_KEY environmental variable',
+ );
+ }
+
+ return new LLM(opts);
+ }
+
  chat({
  fncCtx,
  chatCtx,
@@ -605,27 +636,37 @@ const buildMessage = async (msg: llm.ChatMessage, cacheKey: any) => {
  break;
  }

- if (typeof msg.content === 'string') {
- oaiMsg.content = msg.content;
- } else if (Array.isArray(msg.content)) {
- oaiMsg.content = (await Promise.all(
- msg.content.map(async (c) => {
- if (typeof c === 'string') {
- return { type: 'text', text: c };
- } else if (
- // typescript type guard for determining ChatAudio vs ChatImage
- ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatImage => {
- return (c as llm.ChatImage).image !== undefined;
- })(c)
- ) {
- return await buildImageContent(c, cacheKey);
- } else {
- throw new Error('ChatAudio is not supported');
- }
- }),
- )) as OpenAI.ChatCompletionContentPart[];
- } else if (msg.content === undefined) {
- oaiMsg.content = '';
+ if (msg.role === llm.ChatRole.TOOL) {
+ try {
+ const serializedContent =
+ typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content);
+ oaiMsg.content = serializedContent;
+ } catch (e) {
+ throw Error(`Tool call output is not JSON serializable: ${e}`);
+ }
+ } else {
+ if (typeof msg.content === 'string') {
+ oaiMsg.content = msg.content;
+ } else if (Array.isArray(msg.content)) {
+ oaiMsg.content = (await Promise.all(
+ msg.content.map(async (c) => {
+ if (typeof c === 'string') {
+ return { type: 'text', text: c };
+ } else if (
+ // typescript type guard for determining ChatAudio vs ChatImage
+ ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatImage => {
+ return (c as llm.ChatImage).image !== undefined;
+ })(c)
+ ) {
+ return await buildImageContent(c, cacheKey);
+ } else {
+ throw new Error('ChatAudio is not supported');
+ }
+ }),
+ )) as OpenAI.ChatCompletionContentPart[];
+ } else if (msg.content === undefined) {
+ oaiMsg.content = '';
+ }
  }

  // make sure to provide when function has been called inside the context
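The new branch fixes tool-result handling: a TOOL-role message whose content was an object previously matched none of the string/array/undefined branches and was sent with no content at all. A small illustration of what the serialization step now produces (values hypothetical):

```ts
// A non-string tool result is JSON-stringified before being attached to the
// OpenAI `tool` message; a string result passes through unchanged.
const toolResult = { temperature: 21, unit: 'C' }; // hypothetical function output
const content =
  typeof toolResult === 'string' ? toolResult : JSON.stringify(toolResult);
// content === '{"temperature":21,"unit":"C"}'
```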
package/src/models.ts CHANGED
@@ -128,3 +128,9 @@ export type OctoChatModels =
  | 'wizardlm-2-8x22bllamaguard-2-7b';

  export type XAIChatModels = 'grok-2' | 'grok-2-mini' | 'grok-2-mini-public' | 'grok-2-public';
+
+ export type MetaChatModels =
+ | 'Llama-4-Scout-17B-16E-Instruct-FP8'
+ | 'Llama-4-Maverick-17B-128E-Instruct-FP8'
+ | 'Llama-3.3-70B-Instruct'
+ | 'Llama-3.3-8B-Instruct';