@livekit/agents-plugin-openai 1.0.17 → 1.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@livekit/agents-plugin-openai",
- "version": "1.0.17",
+ "version": "1.0.19",
  "description": "OpenAI plugin for LiveKit Node Agents",
  "main": "dist/index.js",
  "require": "dist/index.cjs",
@@ -30,18 +30,18 @@
  "@types/ws": "^8.5.10",
  "tsup": "^8.3.5",
  "typescript": "^5.0.0",
- "@livekit/agents": "1.0.17",
- "@livekit/agents-plugin-silero": "1.0.17",
- "@livekit/agents-plugins-test": "1.0.17"
+ "@livekit/agents": "1.0.19",
+ "@livekit/agents-plugin-silero": "1.0.19",
+ "@livekit/agents-plugins-test": "1.0.19"
  },
  "dependencies": {
  "@livekit/mutex": "^1.1.1",
- "openai": "^4.91.1",
- "ws": "^8.16.0"
+ "openai": "^6.8.1",
+ "ws": "^8.18.0"
  },
  "peerDependencies": {
  "@livekit/rtc-node": "^0.13.12",
- "@livekit/agents": "1.0.17"
+ "@livekit/agents": "1.0.19"
  },
  "scripts": {
  "build": "tsup --onSuccess \"pnpm build:types\"",
package/src/llm.test.ts CHANGED
@@ -1,7 +1,7 @@
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
  //
  // SPDX-License-Identifier: Apache-2.0
- import { llm } from '@livekit/agents-plugins-test';
+ import { llm, llmStrict } from '@livekit/agents-plugins-test';
  import { describe } from 'vitest';
  import { LLM } from './llm.js';

@@ -10,5 +10,15 @@ describe('OpenAI', async () => {
  new LLM({
  temperature: 0,
  }),
+ false,
+ );
+ });
+
+ describe('OpenAI strict tool schema', async () => {
+ await llmStrict(
+ new LLM({
+ temperature: 0,
+ strictToolSchema: true,
+ }),
  );
  });
package/src/llm.ts CHANGED
@@ -30,17 +30,20 @@ export interface LLMOptions {
  maxCompletionTokens?: number;
  serviceTier?: string;
  store?: boolean;
+ strictToolSchema?: boolean;
  }

  const defaultLLMOptions: LLMOptions = {
  model: 'gpt-4.1',
  apiKey: process.env.OPENAI_API_KEY,
  parallelToolCalls: true,
+ strictToolSchema: false,
  };

  const defaultAzureLLMOptions: LLMOptions = {
  model: 'gpt-4.1',
  apiKey: process.env.AZURE_API_KEY,
+ strictToolSchema: false,
  };

  export class LLM extends llm.LLM {
@@ -445,9 +448,9 @@ export class LLM extends llm.LLM {
  connOptions?: APIConnectOptions;
  parallelToolCalls?: boolean;
  toolChoice?: llm.ToolChoice;
- extraKwargs?: Record<string, any>;
+ extraKwargs?: Record<string, unknown>;
  }): LLMStream {
- const extras: Record<string, any> = { ...extraKwargs }; // eslint-disable-line @typescript-eslint/no-explicit-any
+ const extras: Record<string, unknown> = { ...extraKwargs };

  if (this.#opts.metadata) {
  extras.metadata = this.#opts.metadata;
@@ -492,6 +495,7 @@ export class LLM extends llm.LLM {
  toolCtx,
  connOptions,
  modelOptions: extras,
+ strictToolSchema: this.#opts.strictToolSchema || false,
  gatewayOptions: undefined, // OpenAI plugin doesn't use gateway authentication
  });
  }
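
The new `strictToolSchema` option is opt-in and defaults to `false`, so existing callers keep the pre-1.0.19 behavior. A minimal usage sketch (assuming the plugin's top-level `LLM` export; the model and temperature values are illustrative):

```ts
import { LLM } from '@livekit/agents-plugin-openai';

// Opt in to strict JSON-schema validation for tool definitions.
// When omitted, strictToolSchema falls back to its default of false.
const model = new LLM({
  model: 'gpt-4.1',
  temperature: 0,
  strictToolSchema: true,
});
```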
package/src/realtime/api_proto.ts CHANGED
@@ -190,7 +190,7 @@ export interface SessionResource {
  id: string;
  object: 'realtime.session';
  model: string;
- modalities: ['text', 'audio'] | ['text']; // default: ["text", "audio"]
+ modalities: Modality[]; // default: ["text", "audio"]
  instructions: string;
  voice: Voice; // default: "alloy"
  input_audio_format: AudioFormat; // default: "pcm16"
@@ -267,7 +267,7 @@ export interface SessionUpdateEvent extends BaseClientEvent {
  type: 'session.update';
  session: Partial<{
  model: Model;
- modalities: ['text', 'audio'] | ['text'];
+ modalities: Modality[];
  instructions: string;
  voice: Voice;
  input_audio_format: AudioFormat;
@@ -350,7 +350,7 @@ export interface ConversationItemDeleteEvent extends BaseClientEvent {
  export interface ResponseCreateEvent extends BaseClientEvent {
  type: 'response.create';
  response?: Partial<{
- modalities: ['text', 'audio'] | ['text'];
+ modalities: Modality[];
  instructions: string;
  voice: Voice;
  output_audio_format: AudioFormat;
@@ -511,6 +511,7 @@ export interface ResponseContentPartDoneEvent extends BaseServerEvent {
  export interface ResponseTextDeltaEvent extends BaseServerEvent {
  type: 'response.text.delta';
  response_id: string;
+ item_id: string;
  output_index: number;
  content_index: number;
  delta: string;
@@ -519,6 +520,7 @@ export interface ResponseTextDeltaEvent extends BaseServerEvent {
  export interface ResponseTextDoneEvent extends BaseServerEvent {
  type: 'response.text.done';
  response_id: string;
+ item_id: string;
  output_index: number;
  content_index: number;
  text: string;
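
With `modalities` widened from the two literal tuples to `Modality[]`, a client can now request text-only output; the new `item_id` on `response.text.delta`/`response.text.done` lets the session route text deltas to the right message. A sketch of a text-only `session.update` payload (import path and field values are assumptions for illustration):

```ts
import type * as api_proto from './realtime/api_proto.js'; // path assumed

// Text-only session: the server then streams response.text.delta events
// (which now carry item_id) instead of audio deltas.
const update: api_proto.SessionUpdateEvent = {
  type: 'session.update',
  session: {
    modalities: ['text'],
    instructions: 'Reply in plain text only.',
  },
};
```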
package/src/realtime/realtime_model.ts CHANGED
@@ -34,6 +34,8 @@ const BASE_URL = 'https://api.openai.com/v1';

  const MOCK_AUDIO_ID_PREFIX = 'lk_mock_audio_item_';

+ type Modality = 'text' | 'audio';
+
  interface RealtimeOptions {
  model: api_proto.Model;
  voice: api_proto.Voice;
@@ -54,6 +56,7 @@ interface RealtimeOptions {
  maxSessionDuration: number;
  // reset the connection after this many seconds if provided
  connOptions: APIConnectOptions;
+ modalities: Modality[];
  }

  interface MessageGeneration {
@@ -61,6 +64,7 @@ interface MessageGeneration {
  textChannel: stream.StreamChannel<string>;
  audioChannel: stream.StreamChannel<AudioFrame>;
  audioTranscript: string;
+ modalities: Future<('text' | 'audio')[]>;
  }

  interface ResponseGeneration {
@@ -125,6 +129,7 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
  maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
  maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
  connOptions: DEFAULT_API_CONNECT_OPTIONS,
+ modalities: ['text', 'audio'] as Modality[],
  };
  export class RealtimeModel extends llm.RealtimeModel {
  sampleRate = api_proto.SAMPLE_RATE;
@@ -142,6 +147,7 @@ export class RealtimeModel extends llm.RealtimeModel {
  temperature?: number;
  toolChoice?: llm.ToolChoice;
  baseURL?: string;
+ modalities?: Modality[];
  inputAudioTranscription?: api_proto.InputAudioTranscription | null;
  // TODO(shubhra): add inputAudioNoiseReduction
  turnDetection?: api_proto.TurnDetectionType | null;
@@ -155,11 +161,15 @@ export class RealtimeModel extends llm.RealtimeModel {
  connOptions?: APIConnectOptions;
  } = {},
  ) {
+ const modalities = (options.modalities ||
+ DEFAULT_REALTIME_MODEL_OPTIONS.modalities) as Modality[];
+
  super({
  messageTruncation: true,
  turnDetection: options.turnDetection !== null,
  userTranscription: options.inputAudioTranscription !== null,
  autoToolReplyGeneration: false,
+ audioOutput: modalities.includes('audio'),
  });

  const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
@@ -188,13 +198,15 @@ export class RealtimeModel extends llm.RealtimeModel {
  options.baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
  }

+ const { modalities: _, ...optionsWithoutModalities } = options;
  this._options = {
  ...DEFAULT_REALTIME_MODEL_OPTIONS,
- ...options,
+ ...optionsWithoutModalities,
  baseURL: options.baseURL || BASE_URL,
  apiKey,
  isAzure,
  model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
+ modalities,
  };
  }

@@ -389,6 +401,12 @@ export class RealtimeSession extends llm.RealtimeSession {
  }

  private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
+ // OpenAI supports ['text'] or ['text', 'audio'] (audio always includes text transcript)
+ // We normalize to ensure 'text' is always present when using audio
+ const modalities: Modality[] = this.oaiRealtimeModel._options.modalities.includes('audio')
+ ? ['text', 'audio']
+ : ['text'];
+
  return {
  type: 'session.update',
  session: {
@@ -396,7 +414,7 @@ export class RealtimeSession extends llm.RealtimeSession {
  voice: this.oaiRealtimeModel._options.voice,
  input_audio_format: 'pcm16',
  output_audio_format: 'pcm16',
- modalities: ['text', 'audio'],
+ modalities: modalities,
  turn_detection: this.oaiRealtimeModel._options.turnDetection,
  input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
  // TODO(shubhra): add inputAudioNoiseReduction
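
Taken together, these changes enable text-only realtime sessions: `capabilities.audioOutput` is derived from the `modalities` option, and the session update sends the normalized list. A construction sketch (assuming the plugin's `realtime` namespace export; everything else is illustrative):

```ts
import * as openai from '@livekit/agents-plugin-openai';

// Text-only model: capabilities.audioOutput is false and the
// session.update event carries modalities: ['text'].
const textModel = new openai.realtime.RealtimeModel({
  modalities: ['text'],
});

// The default stays ['text', 'audio']; requesting ['audio'] alone is
// normalized to ['text', 'audio'], since OpenAI always pairs audio
// output with a text transcript.
const audioModel = new openai.realtime.RealtimeModel({
  modalities: ['audio'],
});
```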
@@ -592,7 +610,7 @@ export class RealtimeSession extends llm.RealtimeSession {

  pushAudio(frame: AudioFrame): void {
  for (const f of this.resampleAudio(frame)) {
- for (const nf of this.bstream.write(f.data.buffer)) {
+ for (const nf of this.bstream.write(f.data.buffer as ArrayBuffer)) {
  this.sendEvent({
  type: 'input_audio_buffer.append',
  audio: Buffer.from(nf.data.buffer).toString('base64'),
@@ -632,13 +650,38 @@ export class RealtimeSession extends llm.RealtimeSession {
  } as api_proto.ResponseCancelEvent);
  }

- async truncate(_options: { messageId: string; audioEndMs: number }): Promise<void> {
- this.sendEvent({
- type: 'conversation.item.truncate',
- content_index: 0,
- item_id: _options.messageId,
- audio_end_ms: _options.audioEndMs,
- } as api_proto.ConversationItemTruncateEvent);
+ async truncate(_options: {
+ messageId: string;
+ audioEndMs: number;
+ modalities?: Modality[];
+ audioTranscript?: string;
+ }): Promise<void> {
+ if (!_options.modalities || _options.modalities.includes('audio')) {
+ this.sendEvent({
+ type: 'conversation.item.truncate',
+ content_index: 0,
+ item_id: _options.messageId,
+ audio_end_ms: _options.audioEndMs,
+ } as api_proto.ConversationItemTruncateEvent);
+ } else if (_options.audioTranscript !== undefined) {
+ // sync it to the remote chat context
+ const chatCtx = this.chatCtx.copy();
+ const idx = chatCtx.indexById(_options.messageId);
+ if (idx !== undefined) {
+ const item = chatCtx.items[idx];
+ if (item && item.type === 'message') {
+ const newItem = llm.ChatMessage.create({
+ ...item,
+ content: [_options.audioTranscript],
+ });
+ chatCtx.items[idx] = newItem;
+ const events = this.createChatCtxUpdateEvents(chatCtx);
+ for (const ev of events) {
+ this.sendEvent(ev);
+ }
+ }
+ }
+ }
  }

  private loggableEvent(
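
In text-only mode there is no server-side audio buffer to cut, so the new branch rewrites the truncated item's content in the remote chat context instead. A call-site sketch (the `session` handle and values are illustrative):

```ts
// Audio session: truncates the server-side audio item, as before.
await session.truncate({ messageId: 'item_abc', audioEndMs: 1500 });

// Text-only session: pass the transcript the user actually received so
// the remote chat context is rewritten to match what was delivered.
await session.truncate({
  messageId: 'item_abc',
  audioEndMs: 0,
  modalities: ['text'],
  audioTranscript: 'the partial reply shown to the user',
});
```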
@@ -907,6 +950,12 @@ export class RealtimeSession extends llm.RealtimeSession {
  case 'response.content_part.done':
  this.handleResponseContentPartDone(event);
  break;
+ case 'response.text.delta':
+ this.handleResponseTextDelta(event);
+ break;
+ case 'response.text.done':
+ this.handleResponseTextDone(event);
+ break;
  case 'response.audio_transcript.delta':
  this.handleResponseAudioTranscriptDelta(event);
  break;
@@ -1049,6 +1098,35 @@ export class RealtimeSession extends llm.RealtimeSession {
  this.textModeRecoveryRetries = 0;
  return;
  }
+
+ const itemId = event.item.id;
+ if (!itemId) {
+ throw new Error('item.id is not set');
+ }
+
+ const modalitiesFut = new Future<Modality[]>();
+ const itemGeneration: MessageGeneration = {
+ messageId: itemId,
+ textChannel: stream.createStreamChannel<string>(),
+ audioChannel: stream.createStreamChannel<AudioFrame>(),
+ audioTranscript: '',
+ modalities: modalitiesFut,
+ };
+
+ // If audioOutput is not supported, close audio channel immediately
+ if (!this.oaiRealtimeModel.capabilities.audioOutput) {
+ itemGeneration.audioChannel.close();
+ modalitiesFut.resolve(['text']);
+ }
+
+ this.currentGeneration.messageChannel.write({
+ messageId: itemId,
+ textStream: itemGeneration.textChannel.stream(),
+ audioStream: itemGeneration.audioChannel.stream(),
+ modalities: modalitiesFut.await,
+ });
+
+ this.currentGeneration.messages.set(itemId, itemGeneration);
  }

  private handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {
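
Each outgoing message now carries a `modalities` promise that resolves once the first content part or audio delta reveals what the item contains, or immediately with `['text']` when audio output is disabled. A hypothetical consumer sketch (the stream shape follows the `messageChannel.write` call above; `messageStream` is assumed):

```ts
for await (const msg of messageStream) {
  const modalities = await msg.modalities; // ['text'] or ['audio', 'text']
  if (modalities.includes('audio')) {
    // play msg.audioStream; msg.textStream carries the transcript
  } else {
    // render msg.textStream only; the audio channel was closed up front
  }
}
```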
@@ -1125,39 +1203,24 @@ export class RealtimeSession extends llm.RealtimeSession {

  const itemId = event.item_id;
  const itemType = event.part.type;
- const responseId = event.response_id;

- if (itemType === 'audio') {
- this.resolveGeneration(responseId);
- if (this.textModeRecoveryRetries > 0) {
- this.#logger.info(
- { retries: this.textModeRecoveryRetries },
- 'recovered from text-only response',
- );
- this.textModeRecoveryRetries = 0;
- }
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
+ if (!itemGeneration) {
+ this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
+ return;
+ }

- const itemGeneration: MessageGeneration = {
- messageId: itemId,
- textChannel: stream.createStreamChannel<string>(),
- audioChannel: stream.createStreamChannel<AudioFrame>(),
- audioTranscript: '',
- };
-
- this.currentGeneration.messageChannel.write({
- messageId: itemId,
- textStream: itemGeneration.textChannel.stream(),
- audioStream: itemGeneration.audioChannel.stream(),
- });
+ if (itemType === 'text' && this.oaiRealtimeModel.capabilities.audioOutput) {
+ this.#logger.warn('Text response received from OpenAI Realtime API in audio modality.');
+ }

- this.currentGeneration.messages.set(itemId, itemGeneration);
+ if (!itemGeneration.modalities.done) {
+ const modalityResult: Modality[] = itemType === 'text' ? ['text'] : ['audio', 'text'];
+ itemGeneration.modalities.resolve(modalityResult);
+ }
+
+ if (this.currentGeneration._firstTokenTimestamp === undefined) {
  this.currentGeneration._firstTokenTimestamp = Date.now();
- return;
- } else {
- this.interrupt();
- if (this.textModeRecoveryRetries === 0) {
- this.#logger.warn({ responseId }, 'received text-only response from OpenAI Realtime API');
- }
  }
  }

@@ -1173,6 +1236,33 @@ export class RealtimeSession extends llm.RealtimeSession {
  // TODO(shubhra): handle text mode recovery
  }

+ private handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
+ if (!this.currentGeneration) {
+ throw new Error('currentGeneration is not set');
+ }
+
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
+ if (!itemGeneration) {
+ throw new Error('itemGeneration is not set');
+ }
+
+ if (
+ !this.oaiRealtimeModel.capabilities.audioOutput &&
+ !this.currentGeneration._firstTokenTimestamp
+ ) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+
+ itemGeneration.textChannel.write(event.delta);
+ itemGeneration.audioTranscript += event.delta;
+ }
+
+ private handleResponseTextDone(_event: api_proto.ResponseTextDoneEvent): void {
+ if (!this.currentGeneration) {
+ throw new Error('currentGeneration is not set');
+ }
+ }
+
  private handleResponseAudioTranscriptDelta(
  event: api_proto.ResponseAudioTranscriptDeltaEvent,
  ): void {
@@ -1204,6 +1294,14 @@ export class RealtimeSession extends llm.RealtimeSession {
  throw new Error('itemGeneration is not set');
  }

+ if (this.currentGeneration._firstTokenTimestamp === undefined) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+
+ if (!itemGeneration.modalities.done) {
+ itemGeneration.modalities.resolve(['audio', 'text']);
+ }
+
  const binaryString = atob(event.delta);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
@@ -1261,6 +1359,10 @@ export class RealtimeSession extends llm.RealtimeSession {
  // text response doesn't have itemGeneration
  itemGeneration.textChannel.close();
  itemGeneration.audioChannel.close();
+ if (!itemGeneration.modalities.done) {
+ // In case message modalities is not set, this shouldn't happen
+ itemGeneration.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }
  }

@@ -1284,6 +1386,9 @@ export class RealtimeSession extends llm.RealtimeSession {
  for (const generation of this.currentGeneration.messages.values()) {
  generation.textChannel.close();
  generation.audioChannel.close();
+ if (!generation.modalities.done) {
+ generation.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }

  this.currentGeneration.functionChannel.close();
@@ -1473,6 +1578,8 @@ function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
  role,
  content: contentList,
  } as api_proto.UserItem;
+ default:
+ throw new Error(`Unsupported item type: ${(item as any).type}`);
  }
  }