@livekit/agents-plugin-openai 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents-plugin-openai",
3
- "version": "0.7.2",
3
+ "version": "0.8.0",
4
4
  "description": "OpenAI plugin for LiveKit Node Agents",
5
5
  "main": "dist/index.js",
6
6
  "require": "dist/index.cjs",
@@ -25,7 +25,7 @@
25
25
  "@livekit/agents": "^x",
26
26
  "@livekit/agents-plugin-silero": "^x",
27
27
  "@livekit/agents-plugins-test": "^x",
28
- "@livekit/rtc-node": "^0.12.1",
28
+ "@livekit/rtc-node": "^0.13.1",
29
29
  "@microsoft/api-extractor": "^7.35.0",
30
30
  "@types/ws": "^8.5.10",
31
31
  "tsup": "^8.3.5",
@@ -37,8 +37,8 @@
37
37
  "ws": "^8.16.0"
38
38
  },
39
39
  "peerDependencies": {
40
- "@livekit/rtc-node": "^0.12.1",
41
- "@livekit/agents": "^0.5.2"
40
+ "@livekit/rtc-node": "^0.13.1",
41
+ "@livekit/agents": "^0.6.1"
42
42
  },
43
43
  "scripts": {
44
44
  "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
package/src/llm.ts CHANGED
@@ -398,6 +398,7 @@ export class LLM extends llm.LLM {
398
398
  temperature = temperature || this.#opts.temperature;
399
399
 
400
400
  return new LLMStream(
401
+ this,
401
402
  this.#client,
402
403
  chatCtx,
403
404
  fncCtx,
@@ -416,8 +417,10 @@ export class LLMStream extends llm.LLMStream {
416
417
  #client: OpenAI;
417
418
  #logger = log();
418
419
  #id = randomUUID();
420
+ label = 'openai.LLMStream';
419
421
 
420
422
  constructor(
423
+ llm: LLM,
421
424
  client: OpenAI,
422
425
  chatCtx: llm.ChatContext,
423
426
  fncCtx: llm.FunctionContext | undefined,
@@ -426,7 +429,7 @@ export class LLMStream extends llm.LLMStream {
426
429
  temperature?: number,
427
430
  n?: number,
428
431
  ) {
429
- super(chatCtx, fncCtx);
432
+ super(llm, chatCtx, fncCtx);
430
433
  this.#client = client;
431
434
  this.#run(opts, n, parallelToolCalls, temperature);
432
435
  }
@@ -8,6 +8,7 @@ import {
8
8
  llm,
9
9
  log,
10
10
  mergeFrames,
11
+ metrics,
11
12
  multimodal,
12
13
  } from '@livekit/agents';
13
14
  import { AudioFrame } from '@livekit/rtc-node';
@@ -40,6 +41,8 @@ export interface RealtimeResponse {
40
41
  usage: api_proto.ModelUsage | null;
41
42
  output: RealtimeOutput[];
42
43
  doneFut: Future;
44
+ createdTimestamp: number;
45
+ firstTokenTimestamp?: number;
43
46
  }
44
47
 
45
48
  export interface RealtimeOutput {
@@ -62,6 +65,7 @@ export interface RealtimeContent {
62
65
  textStream: AsyncIterableQueue<string>;
63
66
  audioStream: AsyncIterableQueue<AudioFrame>;
64
67
  toolCalls: RealtimeToolCall[];
68
+ contentType: api_proto.Modality;
65
69
  }
66
70
 
67
71
  export interface RealtimeToolCall {
@@ -666,6 +670,38 @@ export class RealtimeSession extends multimodal.RealtimeSession {
666
670
  this.queueMsg(sessionUpdateEvent);
667
671
  }
668
672
 
673
+ /** Create an empty audio message with the given duration. */
674
+ #createEmptyUserAudioMessage(duration: number): llm.ChatMessage {
675
+ const samples = duration * api_proto.SAMPLE_RATE;
676
+ return new llm.ChatMessage({
677
+ role: llm.ChatRole.USER,
678
+ content: {
679
+ frame: new AudioFrame(
680
+ new Int16Array(samples * api_proto.NUM_CHANNELS),
681
+ api_proto.SAMPLE_RATE,
682
+ api_proto.NUM_CHANNELS,
683
+ samples,
684
+ ),
685
+ },
686
+ });
687
+ }
688
+
689
+ /**
690
+ * Try to recover from a text response to audio mode.
691
+ *
692
+ * @remarks
693
+ * Sometimes the OpenAI Realtime API returns text instead of audio responses.
694
+ * This method tries to recover from this by requesting a new response after deleting the text
695
+ * response and creating an empty user audio message.
696
+ */
697
+ recoverFromTextResponse(itemId: string) {
698
+ if (itemId) {
699
+ this.conversation.item.delete(itemId);
700
+ }
701
+ this.conversation.item.create(this.#createEmptyUserAudioMessage(1));
702
+ this.response.create();
703
+ }
704
+
669
705
  #start(): Promise<void> {
670
706
  return new Promise(async (resolve, reject) => {
671
707
  const headers: Record<string, string> = {
@@ -932,6 +968,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
932
968
  usage: null,
933
969
  output: [],
934
970
  doneFut: doneFut,
971
+ createdTimestamp: Date.now(),
935
972
  };
936
973
  this.#pendingResponses[newResponse.id] = newResponse;
937
974
  this.emit('response_created', newResponse);
@@ -946,7 +983,70 @@ export class RealtimeSession extends multimodal.RealtimeSession {
946
983
  response.usage = responseData.usage ?? null;
947
984
  this.#pendingResponses[responseId] = response;
948
985
  response.doneFut.resolve();
986
+
987
+ let metricsError: Error | undefined;
988
+ let cancelled = false;
989
+ switch (response.status) {
990
+ case 'failed': {
991
+ if (response.statusDetails.type !== 'failed') break;
992
+ const err = response.statusDetails.error;
993
+ metricsError = new metrics.MultimodalLLMError({
994
+ type: response.statusDetails.type,
995
+ code: err?.code,
996
+ message: err?.message,
997
+ });
998
+ this.#logger
999
+ .child({ code: err?.code, error: err?.message })
1000
+ .error('response generation failed');
1001
+ break;
1002
+ }
1003
+ case 'incomplete': {
1004
+ if (response.statusDetails.type !== 'incomplete') break;
1005
+ const reason = response.statusDetails.reason;
1006
+ metricsError = new metrics.MultimodalLLMError({
1007
+ type: response.statusDetails.type,
1008
+ reason,
1009
+ });
1010
+ this.#logger.child({ reason }).error('response generation incomplete');
1011
+ break;
1012
+ }
1013
+ case 'cancelled': {
1014
+ cancelled = true;
1015
+ break;
1016
+ }
1017
+ }
949
1018
  this.emit('response_done', response);
1019
+
1020
+ let ttft: number | undefined;
1021
+ if (response.firstTokenTimestamp) {
1022
+ ttft = response.firstTokenTimestamp - response.createdTimestamp;
1023
+ }
1024
+ const duration = Date.now() - response.createdTimestamp;
1025
+
1026
+ const usage = response.usage;
1027
+ const metric: metrics.MultimodalLLMMetrics = {
1028
+ timestamp: response.createdTimestamp,
1029
+ requestId: response.id,
1030
+ ttft: ttft!,
1031
+ duration,
1032
+ cancelled,
1033
+ label: this.constructor.name,
1034
+ completionTokens: usage?.output_tokens || 0,
1035
+ promptTokens: usage?.input_tokens || 0,
1036
+ totalTokens: usage?.total_tokens || 0,
1037
+ tokensPerSecond: ((usage?.output_tokens || 0) / duration) * 1000,
1038
+ error: metricsError,
1039
+ inputTokenDetails: {
1040
+ cachedTokens: usage?.input_token_details.cached_tokens || 0,
1041
+ textTokens: usage?.input_token_details.text_tokens || 0,
1042
+ audioTokens: usage?.input_token_details.audio_tokens || 0,
1043
+ },
1044
+ outputTokenDetails: {
1045
+ textTokens: usage?.output_token_details.text_tokens || 0,
1046
+ audioTokens: usage?.output_token_details.audio_tokens || 0,
1047
+ },
1048
+ };
1049
+ this.emit('metrics_collected', metric);
950
1050
  }
951
1051
 
952
1052
  #handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
@@ -1060,8 +1160,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
1060
1160
  textStream: textStream,
1061
1161
  audioStream: audioStream,
1062
1162
  toolCalls: [],
1163
+ contentType: event.part.type,
1063
1164
  };
1064
1165
  output?.content.push(newContent);
1166
+ response!.firstTokenTimestamp = Date.now();
1065
1167
  this.emit('response_content_added', newContent);
1066
1168
  }
1067
1169
 
@@ -1075,6 +1177,8 @@ export class RealtimeSession extends multimodal.RealtimeSession {
1075
1177
  }
1076
1178
 
1077
1179
  #handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {
1180
+ const content = this.#getContent(event);
1181
+ content.text = event.text;
1078
1182
  this.emit('response_text_done', event);
1079
1183
  }
1080
1184
 
package/src/stt.ts CHANGED
@@ -9,6 +9,7 @@ import type { GroqAudioModels, WhisperModels } from './models.js';
9
9
  export interface STTOptions {
10
10
  apiKey?: string;
11
11
  language: string;
12
+ prompt?: string;
12
13
  detectLanguage: boolean;
13
14
  model: WhisperModels | string;
14
15
  baseURL?: string;
@@ -25,6 +26,7 @@ const defaultSTTOptions: STTOptions = {
25
26
  export class STT extends stt.STT {
26
27
  #opts: STTOptions;
27
28
  #client: OpenAI;
29
+ label = 'openai.STT';
28
30
 
29
31
  /**
30
32
  * Create a new instance of OpenAI STT.
@@ -108,7 +110,7 @@ export class STT extends stt.STT {
108
110
  return Buffer.concat([header, Buffer.from(frame.data.buffer)]);
109
111
  }
110
112
 
111
- async recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
113
+ async _recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
112
114
  const config = this.#sanitizeOptions(language);
113
115
  buffer = mergeFrames(buffer);
114
116
  const file = new File([this.#createWav(buffer)], 'audio.wav', { type: 'audio/wav' });
@@ -116,6 +118,7 @@ export class STT extends stt.STT {
116
118
  file,
117
119
  model: this.#opts.model,
118
120
  language: config.language,
121
+ prompt: config.prompt,
119
122
  response_format: 'json',
120
123
  });
121
124
 
package/src/tts.ts CHANGED
@@ -2,6 +2,7 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { AudioByteStream, tts } from '@livekit/agents';
5
+ import type { AudioFrame } from '@livekit/rtc-node';
5
6
  import { randomUUID } from 'crypto';
6
7
  import { OpenAI } from 'openai';
7
8
  import type { TTSModels, TTSVoices } from './models.js';
@@ -28,6 +29,7 @@ const defaultTTSOptions: TTSOptions = {
28
29
  export class TTS extends tts.TTS {
29
30
  #opts: TTSOptions;
30
31
  #client: OpenAI;
32
+ label = 'openai.TTS';
31
33
 
32
34
  /**
33
35
  * Create a new instance of OpenAI TTS.
@@ -58,6 +60,8 @@ export class TTS extends tts.TTS {
58
60
 
59
61
  synthesize(text: string): ChunkedStream {
60
62
  return new ChunkedStream(
63
+ this,
64
+ text,
61
65
  this.#client.audio.speech.create({
62
66
  input: text,
63
67
  model: this.#opts.model,
@@ -74,9 +78,11 @@ export class TTS extends tts.TTS {
74
78
  }
75
79
 
76
80
  export class ChunkedStream extends tts.ChunkedStream {
81
+ label = 'openai.ChunkedStream';
82
+
77
83
  // set Promise<T> to any because OpenAI returns an annoying Response type
78
- constructor(stream: Promise<any>) {
79
- super();
84
+ constructor(tts: TTS, text: string, stream: Promise<any>) {
85
+ super(text, tts);
80
86
  this.#run(stream);
81
87
  }
82
88
 
@@ -86,13 +92,20 @@ export class ChunkedStream extends tts.ChunkedStream {
86
92
  const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);
87
93
  const frames = audioByteStream.write(buffer);
88
94
 
95
+ let lastFrame: AudioFrame | undefined;
96
+ const sendLastFrame = (segmentId: string, final: boolean) => {
97
+ if (lastFrame) {
98
+ this.queue.put({ requestId, segmentId, frame: lastFrame, final });
99
+ lastFrame = undefined;
100
+ }
101
+ };
102
+
89
103
  for (const frame of frames) {
90
- this.queue.put({
91
- frame,
92
- requestId,
93
- segmentId: requestId,
94
- });
104
+ sendLastFrame(requestId, false);
105
+ lastFrame = frame;
95
106
  }
107
+ sendLastFrame(requestId, true);
108
+
96
109
  this.queue.close();
97
110
  }
98
111
  }