@livekit/agents-plugin-openai 0.3.5 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/models.ts ADDED
@@ -0,0 +1,107 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
/** Chat model identifiers for OpenAI itself (the `gpt-4o`, `gpt-4` and `gpt-3.5` families). */
export type ChatModels =
  | 'gpt-4o'
  | 'gpt-4o-2024-05-13'
  | 'gpt-4o-mini'
  | 'gpt-4o-mini-2024-07-18'
  | 'gpt-4-turbo'
  | 'gpt-4-turbo-2024-04-09'
  | 'gpt-4-turbo-preview'
  | 'gpt-4-0125-preview'
  | 'gpt-4-1106-preview'
  | 'gpt-4-vision-preview'
  | 'gpt-4-1106-vision-preview'
  | 'gpt-4'
  | 'gpt-4-0314'
  | 'gpt-4-0613'
  | 'gpt-4-32k'
  | 'gpt-4-32k-0314'
  | 'gpt-4-32k-0613'
  | 'gpt-3.5-turbo'
  | 'gpt-3.5-turbo-16k'
  | 'gpt-3.5-turbo-0301'
  | 'gpt-3.5-turbo-0613'
  | 'gpt-3.5-turbo-1106'
  | 'gpt-3.5-turbo-16k-0613';

// adapters for OpenAI-compatible LLMs

/** Chat model identifiers for the Telnyx adapter. */
export type TelnyxChatModels =
  | 'meta-llama/Meta-Llama-3.1-8B-Instruct'
  | 'meta-llama/Meta-Llama-3.1-70B-Instruct';

/** Chat model identifiers for the Cerebras adapter. */
export type CerebrasChatModels = 'llama3.1-8b' | 'llama3.1-70b';

/** Chat model identifiers for the Perplexity adapter. */
export type PerplexityChatModels =
  | 'llama-3.1-sonar-small-128k-online'
  | 'llama-3.1-sonar-small-128k-chat'
  | 'llama-3.1-sonar-large-128k-online'
  | 'llama-3.1-sonar-large-128k-chat'
  | 'llama-3.1-8b-instruct'
  | 'llama-3.1-70b-instruct';

/** Chat model identifiers for the Groq adapter. */
export type GroqChatModels =
  | 'llama-3.1-405b-reasoning'
  | 'llama-3.1-70b-versatile'
  | 'llama-3.1-8b-instant'
  | 'llama3-groq-70b-8192-tool-use-preview'
  | 'llama3-groq-8b-8192-tool-use-preview'
  | 'llama-guard-3-8b'
  | 'llama3-70b-8192'
  | 'llama3-8b-8192'
  | 'mixtral-8x7b-32768'
  | 'gemma-7b-it'
  | 'gemma2-9b-it';

/** Chat model identifiers for the DeepSeek adapter. */
export type DeepSeekChatModels = 'deepseek-coder' | 'deepseek-chat';

/** Chat model identifiers for the Together adapter. */
export type TogetherChatModels =
  | 'garage-bAInd/Platypus2-70B-instruct'
  | 'google/gemma-2-27b-it'
  | 'google/gemma-2-9b-it'
  | 'google/gemma-2b-it'
  | 'google/gemma-7b-it'
  | 'lmsys/vicuna-13b-v1.5'
  | 'lmsys/vicuna-7b-v1.5'
  | 'meta-llama/Llama-2-13b-chat-hf'
  | 'meta-llama/Llama-2-70b-chat-hf'
  | 'meta-llama/Llama-2-7b-chat-hf'
  | 'meta-llama/Llama-3-70b-chat-hf'
  | 'meta-llama/Llama-3-8b-chat-hf'
  | 'meta-llama/Meta-Llama-3-70B-Instruct-Lite'
  | 'meta-llama/Meta-Llama-3-70B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'
  | 'meta-llama/Meta-Llama-3-8B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
  | 'mistralai/Mistral-7B-Instruct-v0.1'
  | 'mistralai/Mistral-7B-Instruct-v0.2'
  | 'mistralai/Mistral-7B-Instruct-v0.3'
  | 'mistralai/Mixtral-8x22B-Instruct-v0.1'
  | 'mistralai/Mixtral-8x7B-Instruct-v0.1'
  | 'openchat/openchat-3.5-1210'
  | 'snorkelai/Snorkel-Mistral-PairRM-DPO'
  | 'teknium/OpenHermes-2-Mistral-7B'
  | 'teknium/OpenHermes-2p5-Mistral-7B'
  | 'togethercomputer/Llama-2-7B-32K-Instruct'
  | 'togethercomputer/RedPajama-INCITE-7B-Chat'
  | 'togethercomputer/RedPajama-INCITE-Chat-3B-v1'
  | 'togethercomputer/StripedHyena-Nous-7B'
  | 'togethercomputer/alpaca-7b'
  | 'upstage/SOLAR-10.7B-Instruct-v1.0'
  | 'zero-one-ai/Yi-34B-Chat';

/** Chat model identifiers for the OctoAI adapter. */
export type OctoChatModels =
  | 'meta-llama-3-70b-instruct'
  | 'meta-llama-3.1-405b-instruct'
  | 'meta-llama-3.1-70b-instruct'
  | 'meta-llama-3.1-8b-instruct'
  | 'mistral-7b-instruct'
  | 'mixtral-8x7b-instruct'
  | 'wizardlm-2-8x22b'
  | 'llamaguard-2-7b'
  // NOTE(review): this literal looks like the two ids above fused by a missing `' | '`.
  // It is kept only so code that already references it keeps type-checking.
  | 'wizardlm-2-8x22bllamaguard-2-7b';

/** Chat model identifiers for the xAI adapter. */
export type XAIChatModels = 'grok-2' | 'grok-2-mini' | 'grok-2-mini-public' | 'grok-2-public';
@@ -79,7 +79,7 @@ export interface Tool {
79
79
  [prop: string]: any;
80
80
  };
81
81
  };
82
- required_properties: string[];
82
+ required: string[];
83
83
  };
84
84
  }
85
85
 
@@ -1,9 +1,17 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import { AsyncIterableQueue, Future, Queue, llm, log, multimodal } from '@livekit/agents';
4
+ import {
5
+ AsyncIterableQueue,
6
+ Future,
7
+ Queue,
8
+ llm,
9
+ log,
10
+ mergeFrames,
11
+ multimodal,
12
+ } from '@livekit/agents';
5
13
  import { AudioFrame } from '@livekit/rtc-node';
6
- import { once } from 'events';
14
+ import { once } from 'node:events';
7
15
  import { WebSocket } from 'ws';
8
16
  import * as api_proto from './api_proto.js';
9
17
 
@@ -109,6 +117,7 @@ class InputAudioBuffer {
109
117
 
110
118
  class ConversationItem {
111
119
  #session: RealtimeSession;
120
+ #logger = log();
112
121
 
113
122
  constructor(session: RealtimeSession) {
114
123
  this.#session = session;
@@ -130,12 +139,126 @@ class ConversationItem {
130
139
  });
131
140
  }
132
141
 
133
/**
 * Converts a chat message into a `conversation.item.create` client event and
 * queues it on the owning session.
 *
 * Mapping performed here:
 * - a message with a `toolCallId` becomes a `function_call_output` item;
 * - a USER message becomes a `message` item whose string parts are sent as
 *   `input_text` and whose audio parts are sent as base64 `input_audio`;
 * - ASSISTANT and SYSTEM messages become `message` items carrying only their
 *   string parts (`text` and `input_text` respectively); audio parts are
 *   logged and dropped;
 * - any other role is logged and nothing is queued.
 *
 * Non-string parts that are not audio (i.e. without a `frame`) are skipped
 * without a log entry. A message with falsy `content` is ignored entirely.
 *
 * @param message - chat message to send.
 * @param previousItemId - forwarded unchanged as `previous_item_id`.
 * @throws TypeError if the message has a `toolCallId` but non-string content.
 */
create(message: llm.ChatMessage, previousItemId?: string): void {
  if (!message.content) {
    return;
  }

  if (message.toolCallId) {
    if (typeof message.content !== 'string') {
      throw new TypeError('message.content must be a string');
    }

    const toolOutput: api_proto.ConversationItemCreateEvent = {
      type: 'conversation.item.create',
      previous_item_id: previousItemId,
      item: {
        type: 'function_call_output',
        call_id: message.toolCallId,
        output: message.content,
      },
    };
    this.#session.queueMsg(toolOutput);
    return;
  }

  // Normalize to a list so single-part and multi-part messages share one path.
  const parts = Array.isArray(message.content) ? message.content : [message.content];

  // Type guard telling ChatAudio apart from ChatImage: only audio carries `frame`.
  const isChatAudio = (part: llm.ChatAudio | llm.ChatImage): part is llm.ChatAudio =>
    (part as llm.ChatAudio).frame !== undefined;

  // Collects the string parts for roles that cannot carry audio, warning once
  // per audio part that has to be dropped.
  const collectText = <T extends 'text' | 'input_text'>(
    type: T,
    roleLabel: 'assistant' | 'system',
  ): { type: T; text: string }[] => {
    const collected: { type: T; text: string }[] = [];
    for (const part of parts) {
      if (typeof part === 'string') {
        collected.push({ type, text: part });
      } else if (isChatAudio(part)) {
        this.#logger.warn(`audio content in ${roleLabel} message is not supported`);
      }
    }
    return collected;
  };

  let event: api_proto.ConversationItemCreateEvent;

  switch (message.role) {
    case llm.ChatRole.USER: {
      const contents: (api_proto.InputTextContent | api_proto.InputAudioContent)[] = [];
      for (const part of parts) {
        if (typeof part === 'string') {
          contents.push({ type: 'input_text', text: part });
        } else if (isChatAudio(part)) {
          const samples = mergeFrames(part.frame).data;
          contents.push({
            type: 'input_audio',
            // Encode only the bytes this view covers: `samples.buffer` is the whole
            // backing ArrayBuffer and may be larger than (or offset from) the samples.
            audio: Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength).toString(
              'base64',
            ),
          });
        }
      }

      event = {
        type: 'conversation.item.create',
        previous_item_id: previousItemId,
        item: {
          type: 'message',
          role: 'user',
          content: contents,
        },
      };
      break;
    }
    case llm.ChatRole.ASSISTANT: {
      event = {
        type: 'conversation.item.create',
        previous_item_id: previousItemId,
        item: {
          type: 'message',
          role: 'assistant',
          content: collectText('text', 'assistant'),
        },
      };
      break;
    }
    case llm.ChatRole.SYSTEM: {
      event = {
        type: 'conversation.item.create',
        previous_item_id: previousItemId,
        item: {
          type: 'message',
          role: 'system',
          content: collectText('input_text', 'system'),
        },
      };
      break;
    }
    default:
      this.#logger
        .child({ message })
        .warn('chat message is not supported inside the realtime API');
      return;
  }

  this.#session.queueMsg(event);
}
140
263
  }
141
264
 
@@ -303,6 +426,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
303
426
 
304
427
  session({
305
428
  fncCtx,
429
+ chatCtx,
306
430
  modalities = this.#defaultOpts.modalities,
307
431
  instructions = this.#defaultOpts.instructions,
308
432
  voice = this.#defaultOpts.voice,
@@ -314,6 +438,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
314
438
  maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens,
315
439
  }: {
316
440
  fncCtx?: llm.FunctionContext;
441
+ chatCtx?: llm.ChatContext;
317
442
  modalities?: ['text', 'audio'] | ['text'];
318
443
  instructions?: string;
319
444
  voice?: api_proto.Voice;
@@ -342,7 +467,10 @@ export class RealtimeModel extends multimodal.RealtimeModel {
342
467
  entraToken: this.#defaultOpts.entraToken,
343
468
  };
344
469
 
345
- const newSession = new RealtimeSession(opts, fncCtx);
470
+ const newSession = new RealtimeSession(opts, {
471
+ chatCtx: chatCtx || new llm.ChatContext(),
472
+ fncCtx,
473
+ });
346
474
  this.#sessions.push(newSession);
347
475
  return newSession;
348
476
  }
@@ -353,6 +481,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
353
481
  }
354
482
 
355
483
  export class RealtimeSession extends multimodal.RealtimeSession {
484
+ #chatCtx: llm.ChatContext | undefined = undefined;
356
485
  #fncCtx: llm.FunctionContext | undefined = undefined;
357
486
  #opts: ModelOptions;
358
487
  #pendingResponses: { [id: string]: RealtimeResponse } = {};
@@ -364,10 +493,14 @@ export class RealtimeSession extends multimodal.RealtimeSession {
364
493
  #closing = true;
365
494
  #sendQueue = new Queue<api_proto.ClientEvent>();
366
495
 
367
- constructor(opts: ModelOptions, fncCtx?: llm.FunctionContext | undefined) {
496
+ constructor(
497
+ opts: ModelOptions,
498
+ { fncCtx, chatCtx }: { fncCtx?: llm.FunctionContext; chatCtx?: llm.ChatContext },
499
+ ) {
368
500
  super();
369
501
 
370
502
  this.#opts = opts;
503
+ this.#chatCtx = chatCtx;
371
504
  this.#fncCtx = fncCtx;
372
505
 
373
506
  this.#task = this.#start();
@@ -386,6 +519,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
386
519
  });
387
520
  }
388
521
 
522
/** Returns the private `#chatCtx` field, which the constructor sets from its `chatCtx` option; `undefined` when none was given. */
+ get chatCtx(): llm.ChatContext | undefined {
523
+ return this.#chatCtx;
524
+ }
525
+
389
526
  /** Returns the private `#fncCtx` field, which the constructor sets from its `fncCtx` option; `undefined` when none was given. */
  get fncCtx(): llm.FunctionContext | undefined {
390
527
  return this.#fncCtx;
391
528
  }
@@ -872,11 +1009,11 @@ export class RealtimeSession extends multimodal.RealtimeSession {
872
1009
  callId: item.call_id,
873
1010
  });
874
1011
  this.conversation.item.create(
875
- {
876
- type: 'function_call_output',
877
- call_id: item.call_id,
878
- output: content,
879
- },
1012
+ llm.ChatMessage.createToolFromFunctionResult({
1013
+ name: item.name,
1014
+ toolCallId: item.call_id,
1015
+ result: content,
1016
+ }),
880
1017
  output.itemId,
881
1018
  );
882
1019
  this.response.create();