@livekit/agents-plugin-openai 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/models.ts ADDED
@@ -0,0 +1,107 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

/**
 * Model identifier strings for the `gpt-*` chat model family.
 */
export type ChatModels =
  | 'gpt-4o'
  | 'gpt-4o-2024-05-13'
  | 'gpt-4o-mini'
  | 'gpt-4o-mini-2024-07-18'
  | 'gpt-4-turbo'
  | 'gpt-4-turbo-2024-04-09'
  | 'gpt-4-turbo-preview'
  | 'gpt-4-0125-preview'
  | 'gpt-4-1106-preview'
  | 'gpt-4-vision-preview'
  | 'gpt-4-1106-vision-preview'
  | 'gpt-4'
  | 'gpt-4-0314'
  | 'gpt-4-0613'
  | 'gpt-4-32k'
  | 'gpt-4-32k-0314'
  | 'gpt-4-32k-0613'
  | 'gpt-3.5-turbo'
  | 'gpt-3.5-turbo-16k'
  | 'gpt-3.5-turbo-0301'
  | 'gpt-3.5-turbo-0613'
  | 'gpt-3.5-turbo-1106'
  | 'gpt-3.5-turbo-16k-0613';

// adapters for OpenAI-compatible LLMs
// (each alias below is a closed set of model identifier strings; the provider
// is the one named by the alias)

/** Model identifier strings for the Telnyx adapter. */
export type TelnyxChatModels =
  | 'meta-llama/Meta-Llama-3.1-8B-Instruct'
  | 'meta-llama/Meta-Llama-3.1-70B-Instruct';

/** Model identifier strings for the Cerebras adapter. */
export type CerebrasChatModels = 'llama3.1-8b' | 'llama3.1-70b';

/** Model identifier strings for the Perplexity adapter. */
export type PerplexityChatModels =
  | 'llama-3.1-sonar-small-128k-online'
  | 'llama-3.1-sonar-small-128k-chat'
  | 'llama-3.1-sonar-large-128k-online'
  | 'llama-3.1-sonar-large-128k-chat'
  | 'llama-3.1-8b-instruct'
  | 'llama-3.1-70b-instruct';

/** Model identifier strings for the Groq adapter. */
export type GroqChatModels =
  | 'llama-3.1-405b-reasoning'
  | 'llama-3.1-70b-versatile'
  | 'llama-3.1-8b-instant'
  | 'llama3-groq-70b-8192-tool-use-preview'
  | 'llama3-groq-8b-8192-tool-use-preview'
  | 'llama-guard-3-8b'
  | 'llama3-70b-8192'
  | 'llama3-8b-8192'
  | 'mixtral-8x7b-32768'
  | 'gemma-7b-it'
  | 'gemma2-9b-it';

/** Model identifier strings for the DeepSeek adapter. */
export type DeepSeekChatModels = 'deepseek-coder' | 'deepseek-chat';

/** Model identifier strings for the Together adapter. */
export type TogetherChatModels =
  | 'garage-bAInd/Platypus2-70B-instruct'
  | 'google/gemma-2-27b-it'
  | 'google/gemma-2-9b-it'
  | 'google/gemma-2b-it'
  | 'google/gemma-7b-it'
  | 'lmsys/vicuna-13b-v1.5'
  | 'lmsys/vicuna-7b-v1.5'
  | 'meta-llama/Llama-2-13b-chat-hf'
  | 'meta-llama/Llama-2-70b-chat-hf'
  | 'meta-llama/Llama-2-7b-chat-hf'
  | 'meta-llama/Llama-3-70b-chat-hf'
  | 'meta-llama/Llama-3-8b-chat-hf'
  | 'meta-llama/Meta-Llama-3-70B-Instruct-Lite'
  | 'meta-llama/Meta-Llama-3-70B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'
  | 'meta-llama/Meta-Llama-3-8B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
  | 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
  | 'mistralai/Mistral-7B-Instruct-v0.1'
  | 'mistralai/Mistral-7B-Instruct-v0.2'
  | 'mistralai/Mistral-7B-Instruct-v0.3'
  | 'mistralai/Mixtral-8x22B-Instruct-v0.1'
  | 'mistralai/Mixtral-8x7B-Instruct-v0.1'
  | 'openchat/openchat-3.5-1210'
  | 'snorkelai/Snorkel-Mistral-PairRM-DPO'
  | 'teknium/OpenHermes-2-Mistral-7B'
  | 'teknium/OpenHermes-2p5-Mistral-7B'
  | 'togethercomputer/Llama-2-7B-32K-Instruct'
  | 'togethercomputer/RedPajama-INCITE-7B-Chat'
  | 'togethercomputer/RedPajama-INCITE-Chat-3B-v1'
  | 'togethercomputer/StripedHyena-Nous-7B'
  | 'togethercomputer/alpaca-7b'
  | 'upstage/SOLAR-10.7B-Instruct-v1.0'
  | 'zero-one-ai/Yi-34B-Chat';

/**
 * Model identifier strings for the Octo adapter.
 *
 * `'wizardlm-2-8x22b'` and `'llamaguard-2-7b'` are two separate identifiers;
 * they were previously fused into the single literal
 * `'wizardlm-2-8x22bllamaguard-2-7b'` because the union separator was missing.
 */
export type OctoChatModels =
  | 'meta-llama-3-70b-instruct'
  | 'meta-llama-3.1-405b-instruct'
  | 'meta-llama-3.1-70b-instruct'
  | 'meta-llama-3.1-8b-instruct'
  | 'mistral-7b-instruct'
  | 'mixtral-8x7b-instruct'
  | 'wizardlm-2-8x22b'
  | 'llamaguard-2-7b';

/** Model identifier strings for the xAI adapter. */
export type XAIChatModels = 'grok-2' | 'grok-2-mini' | 'grok-2-mini-public' | 'grok-2-public';
@@ -79,7 +79,7 @@ export interface Tool {
79
79
  [prop: string]: any;
80
80
  };
81
81
  };
82
- required_properties: string[];
82
+ required: string[];
83
83
  };
84
84
  }
85
85
 
@@ -1,9 +1,17 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import { AsyncIterableQueue, Future, Queue, llm, log, multimodal } from '@livekit/agents';
4
+ import {
5
+ AsyncIterableQueue,
6
+ Future,
7
+ Queue,
8
+ llm,
9
+ log,
10
+ mergeFrames,
11
+ multimodal,
12
+ } from '@livekit/agents';
5
13
  import { AudioFrame } from '@livekit/rtc-node';
6
- import { once } from 'events';
14
+ import { once } from 'node:events';
7
15
  import { WebSocket } from 'ws';
8
16
  import * as api_proto from './api_proto.js';
9
17
 
@@ -29,6 +37,7 @@ export interface RealtimeResponse {
29
37
  id: string;
30
38
  status: api_proto.ResponseStatus;
31
39
  statusDetails: api_proto.ResponseStatusDetails | null;
40
+ usage: api_proto.ResponseResource['usage'] | null;
32
41
  output: RealtimeOutput[];
33
42
  doneFut: Future;
34
43
  }
@@ -108,6 +117,7 @@ class InputAudioBuffer {
108
117
 
109
118
  class ConversationItem {
110
119
  #session: RealtimeSession;
120
+ #logger = log();
111
121
 
112
122
  constructor(session: RealtimeSession) {
113
123
  this.#session = session;
@@ -129,12 +139,126 @@ class ConversationItem {
129
139
  });
130
140
  }
131
141
 
132
- create(item: api_proto.ConversationItemCreateContent, previousItemId?: string): void {
133
- this.#session.queueMsg({
134
- type: 'conversation.item.create',
135
- item,
136
- previous_item_id: previousItemId,
137
- });
142
+ create(message: llm.ChatMessage, previousItemId?: string): void {
143
+ if (!message.content) {
144
+ return;
145
+ }
146
+
147
+ let event: api_proto.ConversationItemCreateEvent;
148
+
149
+ if (message.toolCallId) {
150
+ if (typeof message.content !== 'string') {
151
+ throw new TypeError('message.content must be a string');
152
+ }
153
+
154
+ event = {
155
+ type: 'conversation.item.create',
156
+ previous_item_id: previousItemId,
157
+ item: {
158
+ type: 'function_call_output',
159
+ call_id: message.toolCallId,
160
+ output: message.content,
161
+ },
162
+ };
163
+ } else {
164
+ let content = message.content;
165
+ if (!Array.isArray(content)) {
166
+ content = [content];
167
+ }
168
+
169
+ if (message.role === llm.ChatRole.USER) {
170
+ const contents: (api_proto.InputTextContent | api_proto.InputAudioContent)[] = [];
171
+ for (const c of content) {
172
+ if (typeof c === 'string') {
173
+ contents.push({
174
+ type: 'input_text',
175
+ text: c,
176
+ });
177
+ } else if (
178
+ // typescript type guard for determining ChatAudio vs ChatImage
179
+ ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
180
+ return (c as llm.ChatAudio).frame !== undefined;
181
+ })(c)
182
+ ) {
183
+ contents.push({
184
+ type: 'input_audio',
185
+ audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'),
186
+ });
187
+ }
188
+ }
189
+
190
+ event = {
191
+ type: 'conversation.item.create',
192
+ previous_item_id: previousItemId,
193
+ item: {
194
+ type: 'message',
195
+ role: 'user',
196
+ content: contents,
197
+ },
198
+ };
199
+ } else if (message.role === llm.ChatRole.ASSISTANT) {
200
+ const contents: api_proto.TextContent[] = [];
201
+ for (const c of content) {
202
+ if (typeof c === 'string') {
203
+ contents.push({
204
+ type: 'text',
205
+ text: c,
206
+ });
207
+ } else if (
208
+ // typescript type guard for determining ChatAudio vs ChatImage
209
+ ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
210
+ return (c as llm.ChatAudio).frame !== undefined;
211
+ })(c)
212
+ ) {
213
+ this.#logger.warn('audio content in assistant message is not supported');
214
+ }
215
+ }
216
+
217
+ event = {
218
+ type: 'conversation.item.create',
219
+ previous_item_id: previousItemId,
220
+ item: {
221
+ type: 'message',
222
+ role: 'assistant',
223
+ content: contents,
224
+ },
225
+ };
226
+ } else if (message.role === llm.ChatRole.SYSTEM) {
227
+ const contents: api_proto.InputTextContent[] = [];
228
+ for (const c of content) {
229
+ if (typeof c === 'string') {
230
+ contents.push({
231
+ type: 'input_text',
232
+ text: c,
233
+ });
234
+ } else if (
235
+ // typescript type guard for determining ChatAudio vs ChatImage
236
+ ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
237
+ return (c as llm.ChatAudio).frame !== undefined;
238
+ })(c)
239
+ ) {
240
+ this.#logger.warn('audio content in system message is not supported');
241
+ }
242
+ }
243
+
244
+ event = {
245
+ type: 'conversation.item.create',
246
+ previous_item_id: previousItemId,
247
+ item: {
248
+ type: 'message',
249
+ role: 'system',
250
+ content: contents,
251
+ },
252
+ };
253
+ } else {
254
+ this.#logger
255
+ .child({ message })
256
+ .warn('chat message is not supported inside the realtime API');
257
+ return;
258
+ }
259
+ }
260
+
261
+ this.#session.queueMsg(event);
138
262
  }
139
263
  }
140
264
 
@@ -302,6 +426,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
302
426
 
303
427
  session({
304
428
  fncCtx,
429
+ chatCtx,
305
430
  modalities = this.#defaultOpts.modalities,
306
431
  instructions = this.#defaultOpts.instructions,
307
432
  voice = this.#defaultOpts.voice,
@@ -313,6 +438,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
313
438
  maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens,
314
439
  }: {
315
440
  fncCtx?: llm.FunctionContext;
441
+ chatCtx?: llm.ChatContext;
316
442
  modalities?: ['text', 'audio'] | ['text'];
317
443
  instructions?: string;
318
444
  voice?: api_proto.Voice;
@@ -341,7 +467,10 @@ export class RealtimeModel extends multimodal.RealtimeModel {
341
467
  entraToken: this.#defaultOpts.entraToken,
342
468
  };
343
469
 
344
- const newSession = new RealtimeSession(opts, fncCtx);
470
+ const newSession = new RealtimeSession(opts, {
471
+ chatCtx: chatCtx || new llm.ChatContext(),
472
+ fncCtx,
473
+ });
345
474
  this.#sessions.push(newSession);
346
475
  return newSession;
347
476
  }
@@ -352,6 +481,7 @@ export class RealtimeModel extends multimodal.RealtimeModel {
352
481
  }
353
482
 
354
483
  export class RealtimeSession extends multimodal.RealtimeSession {
484
+ #chatCtx: llm.ChatContext | undefined = undefined;
355
485
  #fncCtx: llm.FunctionContext | undefined = undefined;
356
486
  #opts: ModelOptions;
357
487
  #pendingResponses: { [id: string]: RealtimeResponse } = {};
@@ -363,10 +493,14 @@ export class RealtimeSession extends multimodal.RealtimeSession {
363
493
  #closing = true;
364
494
  #sendQueue = new Queue<api_proto.ClientEvent>();
365
495
 
366
- constructor(opts: ModelOptions, fncCtx?: llm.FunctionContext | undefined) {
496
+ constructor(
497
+ opts: ModelOptions,
498
+ { fncCtx, chatCtx }: { fncCtx?: llm.FunctionContext; chatCtx?: llm.ChatContext },
499
+ ) {
367
500
  super();
368
501
 
369
502
  this.#opts = opts;
503
+ this.#chatCtx = chatCtx;
370
504
  this.#fncCtx = fncCtx;
371
505
 
372
506
  this.#task = this.#start();
@@ -385,6 +519,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
385
519
  });
386
520
  }
387
521
 
522
// Read-only accessor for the chat context stored by the constructor; undefined when none was passed in.
+ get chatCtx(): llm.ChatContext | undefined {
523
+ return this.#chatCtx;
524
+ }
525
+
388
526
  get fncCtx(): llm.FunctionContext | undefined {
389
527
  return this.#fncCtx;
390
528
  }
@@ -787,6 +925,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
787
925
  id: response.id,
788
926
  status: response.status,
789
927
  statusDetails: response.status_details,
928
+ usage: null,
790
929
  output: [],
791
930
  doneFut: doneFut,
792
931
  };
@@ -800,6 +939,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
800
939
  const response = this.#pendingResponses[responseId];
801
940
  response.status = responseData.status;
802
941
  response.statusDetails = responseData.status_details;
942
+ response.usage = responseData.usage;
803
943
  this.#pendingResponses[responseId] = response;
804
944
  response.doneFut.resolve();
805
945
  this.emit('response_done', response);
@@ -869,11 +1009,11 @@ export class RealtimeSession extends multimodal.RealtimeSession {
869
1009
  callId: item.call_id,
870
1010
  });
871
1011
  this.conversation.item.create(
872
- {
873
- type: 'function_call_output',
874
- call_id: item.call_id,
875
- output: content,
876
- },
1012
+ llm.ChatMessage.createToolFromFunctionResult({
1013
+ name: item.name,
1014
+ toolCallId: item.call_id,
1015
+ result: content,
1016
+ }),
877
1017
  output.itemId,
878
1018
  );
879
1019
  this.response.create();