@livekit/agents-plugin-openai 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/llm.cjs +4 -2
- package/dist/llm.cjs.map +1 -1
- package/dist/llm.d.ts +2 -1
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +4 -2
- package/dist/llm.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +95 -2
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.ts +12 -0
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +96 -2
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/stt.cjs +3 -1
- package/dist/stt.cjs.map +1 -1
- package/dist/stt.d.ts +3 -1
- package/dist/stt.d.ts.map +1 -1
- package/dist/stt.js +3 -1
- package/dist/stt.js.map +1 -1
- package/dist/tts.cjs +16 -7
- package/dist/tts.cjs.map +1 -1
- package/dist/tts.d.ts +3 -1
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +16 -7
- package/dist/tts.js.map +1 -1
- package/package.json +4 -4
- package/src/llm.ts +4 -1
- package/src/realtime/realtime_model.ts +104 -0
- package/src/stt.ts +4 -1
- package/src/tts.ts +20 -7
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-openai",
|
|
3
|
-
"version": "0.7.2",
|
|
3
|
+
"version": "0.8.0",
|
|
4
4
|
"description": "OpenAI plugin for LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"require": "dist/index.cjs",
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
"@livekit/agents": "^x",
|
|
26
26
|
"@livekit/agents-plugin-silero": "^x",
|
|
27
27
|
"@livekit/agents-plugins-test": "^x",
|
|
28
|
-
"@livekit/rtc-node": "^0.
|
|
28
|
+
"@livekit/rtc-node": "^0.13.1",
|
|
29
29
|
"@microsoft/api-extractor": "^7.35.0",
|
|
30
30
|
"@types/ws": "^8.5.10",
|
|
31
31
|
"tsup": "^8.3.5",
|
|
@@ -37,8 +37,8 @@
|
|
|
37
37
|
"ws": "^8.16.0"
|
|
38
38
|
},
|
|
39
39
|
"peerDependencies": {
|
|
40
|
-
"@livekit/rtc-node": "^0.
|
|
41
|
-
"@livekit/agents": "^0.
|
|
40
|
+
"@livekit/rtc-node": "^0.13.1",
|
|
41
|
+
"@livekit/agents": "^0.6.1x"
|
|
42
42
|
},
|
|
43
43
|
"scripts": {
|
|
44
44
|
"build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
|
package/src/llm.ts
CHANGED
|
@@ -398,6 +398,7 @@ export class LLM extends llm.LLM {
|
|
|
398
398
|
temperature = temperature || this.#opts.temperature;
|
|
399
399
|
|
|
400
400
|
return new LLMStream(
|
|
401
|
+
this,
|
|
401
402
|
this.#client,
|
|
402
403
|
chatCtx,
|
|
403
404
|
fncCtx,
|
|
@@ -416,8 +417,10 @@ export class LLMStream extends llm.LLMStream {
|
|
|
416
417
|
#client: OpenAI;
|
|
417
418
|
#logger = log();
|
|
418
419
|
#id = randomUUID();
|
|
420
|
+
label = 'openai.LLMStream';
|
|
419
421
|
|
|
420
422
|
constructor(
|
|
423
|
+
llm: LLM,
|
|
421
424
|
client: OpenAI,
|
|
422
425
|
chatCtx: llm.ChatContext,
|
|
423
426
|
fncCtx: llm.FunctionContext | undefined,
|
|
@@ -426,7 +429,7 @@ export class LLMStream extends llm.LLMStream {
|
|
|
426
429
|
temperature?: number,
|
|
427
430
|
n?: number,
|
|
428
431
|
) {
|
|
429
|
-
super(chatCtx, fncCtx);
|
|
432
|
+
super(llm, chatCtx, fncCtx);
|
|
430
433
|
this.#client = client;
|
|
431
434
|
this.#run(opts, n, parallelToolCalls, temperature);
|
|
432
435
|
}
|
|
package/src/realtime/realtime_model.ts
CHANGED
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
llm,
|
|
9
9
|
log,
|
|
10
10
|
mergeFrames,
|
|
11
|
+
metrics,
|
|
11
12
|
multimodal,
|
|
12
13
|
} from '@livekit/agents';
|
|
13
14
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
@@ -40,6 +41,8 @@ export interface RealtimeResponse {
|
|
|
40
41
|
usage: api_proto.ModelUsage | null;
|
|
41
42
|
output: RealtimeOutput[];
|
|
42
43
|
doneFut: Future;
|
|
44
|
+
createdTimestamp: number;
|
|
45
|
+
firstTokenTimestamp?: number;
|
|
43
46
|
}
|
|
44
47
|
|
|
45
48
|
export interface RealtimeOutput {
|
|
@@ -62,6 +65,7 @@ export interface RealtimeContent {
|
|
|
62
65
|
textStream: AsyncIterableQueue<string>;
|
|
63
66
|
audioStream: AsyncIterableQueue<AudioFrame>;
|
|
64
67
|
toolCalls: RealtimeToolCall[];
|
|
68
|
+
contentType: api_proto.Modality;
|
|
65
69
|
}
|
|
66
70
|
|
|
67
71
|
export interface RealtimeToolCall {
|
|
@@ -666,6 +670,38 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
666
670
|
this.queueMsg(sessionUpdateEvent);
|
|
667
671
|
}
|
|
668
672
|
|
|
673
|
+
/** Create an empty audio message with the given duration. */
|
|
674
|
+
#createEmptyUserAudioMessage(duration: number): llm.ChatMessage {
|
|
675
|
+
const samples = duration * api_proto.SAMPLE_RATE;
|
|
676
|
+
return new llm.ChatMessage({
|
|
677
|
+
role: llm.ChatRole.USER,
|
|
678
|
+
content: {
|
|
679
|
+
frame: new AudioFrame(
|
|
680
|
+
new Int16Array(samples * api_proto.NUM_CHANNELS),
|
|
681
|
+
api_proto.SAMPLE_RATE,
|
|
682
|
+
api_proto.NUM_CHANNELS,
|
|
683
|
+
samples,
|
|
684
|
+
),
|
|
685
|
+
},
|
|
686
|
+
});
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
/**
|
|
690
|
+
* Try to recover from a text response to audio mode.
|
|
691
|
+
*
|
|
692
|
+
* @remarks
|
|
693
|
+
* Sometimes the OpenAI Realtime API returns text instead of audio responses.
|
|
694
|
+
* This method tries to recover from this by requesting a new response after deleting the text
|
|
695
|
+
* response and creating an empty user audio message.
|
|
696
|
+
*/
|
|
697
|
+
recoverFromTextResponse(itemId: string) {
|
|
698
|
+
if (itemId) {
|
|
699
|
+
this.conversation.item.delete(itemId);
|
|
700
|
+
}
|
|
701
|
+
this.conversation.item.create(this.#createEmptyUserAudioMessage(1));
|
|
702
|
+
this.response.create();
|
|
703
|
+
}
|
|
704
|
+
|
|
669
705
|
#start(): Promise<void> {
|
|
670
706
|
return new Promise(async (resolve, reject) => {
|
|
671
707
|
const headers: Record<string, string> = {
|
|
@@ -932,6 +968,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
932
968
|
usage: null,
|
|
933
969
|
output: [],
|
|
934
970
|
doneFut: doneFut,
|
|
971
|
+
createdTimestamp: Date.now(),
|
|
935
972
|
};
|
|
936
973
|
this.#pendingResponses[newResponse.id] = newResponse;
|
|
937
974
|
this.emit('response_created', newResponse);
|
|
@@ -946,7 +983,70 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
946
983
|
response.usage = responseData.usage ?? null;
|
|
947
984
|
this.#pendingResponses[responseId] = response;
|
|
948
985
|
response.doneFut.resolve();
|
|
986
|
+
|
|
987
|
+
let metricsError: Error | undefined;
|
|
988
|
+
let cancelled = false;
|
|
989
|
+
switch (response.status) {
|
|
990
|
+
case 'failed': {
|
|
991
|
+
if (response.statusDetails.type !== 'failed') break;
|
|
992
|
+
const err = response.statusDetails.error;
|
|
993
|
+
metricsError = new metrics.MultimodalLLMError({
|
|
994
|
+
type: response.statusDetails.type,
|
|
995
|
+
code: err?.code,
|
|
996
|
+
message: err?.message,
|
|
997
|
+
});
|
|
998
|
+
this.#logger
|
|
999
|
+
.child({ code: err?.code, error: err?.message })
|
|
1000
|
+
.error('response generation failed');
|
|
1001
|
+
break;
|
|
1002
|
+
}
|
|
1003
|
+
case 'incomplete': {
|
|
1004
|
+
if (response.statusDetails.type !== 'incomplete') break;
|
|
1005
|
+
const reason = response.statusDetails.reason;
|
|
1006
|
+
metricsError = new metrics.MultimodalLLMError({
|
|
1007
|
+
type: response.statusDetails.type,
|
|
1008
|
+
reason,
|
|
1009
|
+
});
|
|
1010
|
+
this.#logger.child({ reason }).error('response generation incomplete');
|
|
1011
|
+
break;
|
|
1012
|
+
}
|
|
1013
|
+
case 'cancelled': {
|
|
1014
|
+
cancelled = true;
|
|
1015
|
+
break;
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
949
1018
|
this.emit('response_done', response);
|
|
1019
|
+
|
|
1020
|
+
let ttft: number | undefined;
|
|
1021
|
+
if (response.firstTokenTimestamp) {
|
|
1022
|
+
ttft = response.firstTokenTimestamp - response.createdTimestamp;
|
|
1023
|
+
}
|
|
1024
|
+
const duration = Date.now() - response.createdTimestamp;
|
|
1025
|
+
|
|
1026
|
+
const usage = response.usage;
|
|
1027
|
+
const metric: metrics.MultimodalLLMMetrics = {
|
|
1028
|
+
timestamp: response.createdTimestamp,
|
|
1029
|
+
requestId: response.id,
|
|
1030
|
+
ttft: ttft!,
|
|
1031
|
+
duration,
|
|
1032
|
+
cancelled,
|
|
1033
|
+
label: this.constructor.name,
|
|
1034
|
+
completionTokens: usage?.output_tokens || 0,
|
|
1035
|
+
promptTokens: usage?.input_tokens || 0,
|
|
1036
|
+
totalTokens: usage?.total_tokens || 0,
|
|
1037
|
+
tokensPerSecond: ((usage?.output_tokens || 0) / duration) * 1000,
|
|
1038
|
+
error: metricsError,
|
|
1039
|
+
inputTokenDetails: {
|
|
1040
|
+
cachedTokens: usage?.input_token_details.cached_tokens || 0,
|
|
1041
|
+
textTokens: usage?.input_token_details.text_tokens || 0,
|
|
1042
|
+
audioTokens: usage?.input_token_details.audio_tokens || 0,
|
|
1043
|
+
},
|
|
1044
|
+
outputTokenDetails: {
|
|
1045
|
+
textTokens: usage?.output_token_details.text_tokens || 0,
|
|
1046
|
+
audioTokens: usage?.output_token_details.audio_tokens || 0,
|
|
1047
|
+
},
|
|
1048
|
+
};
|
|
1049
|
+
this.emit('metrics_collected', metric);
|
|
950
1050
|
}
|
|
951
1051
|
|
|
952
1052
|
#handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
|
|
@@ -1060,8 +1160,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1060
1160
|
textStream: textStream,
|
|
1061
1161
|
audioStream: audioStream,
|
|
1062
1162
|
toolCalls: [],
|
|
1163
|
+
contentType: event.part.type,
|
|
1063
1164
|
};
|
|
1064
1165
|
output?.content.push(newContent);
|
|
1166
|
+
response!.firstTokenTimestamp = Date.now();
|
|
1065
1167
|
this.emit('response_content_added', newContent);
|
|
1066
1168
|
}
|
|
1067
1169
|
|
|
@@ -1075,6 +1177,8 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1075
1177
|
}
|
|
1076
1178
|
|
|
1077
1179
|
#handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {
|
|
1180
|
+
const content = this.#getContent(event);
|
|
1181
|
+
content.text = event.text;
|
|
1078
1182
|
this.emit('response_text_done', event);
|
|
1079
1183
|
}
|
|
1080
1184
|
|
package/src/stt.ts
CHANGED
|
@@ -9,6 +9,7 @@ import type { GroqAudioModels, WhisperModels } from './models.js';
|
|
|
9
9
|
export interface STTOptions {
|
|
10
10
|
apiKey?: string;
|
|
11
11
|
language: string;
|
|
12
|
+
prompt?: string;
|
|
12
13
|
detectLanguage: boolean;
|
|
13
14
|
model: WhisperModels | string;
|
|
14
15
|
baseURL?: string;
|
|
@@ -25,6 +26,7 @@ const defaultSTTOptions: STTOptions = {
|
|
|
25
26
|
export class STT extends stt.STT {
|
|
26
27
|
#opts: STTOptions;
|
|
27
28
|
#client: OpenAI;
|
|
29
|
+
label = 'openai.STT';
|
|
28
30
|
|
|
29
31
|
/**
|
|
30
32
|
* Create a new instance of OpenAI STT.
|
|
@@ -108,7 +110,7 @@ export class STT extends stt.STT {
|
|
|
108
110
|
return Buffer.concat([header, Buffer.from(frame.data.buffer)]);
|
|
109
111
|
}
|
|
110
112
|
|
|
111
|
-
async
|
|
113
|
+
async _recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
|
|
112
114
|
const config = this.#sanitizeOptions(language);
|
|
113
115
|
buffer = mergeFrames(buffer);
|
|
114
116
|
const file = new File([this.#createWav(buffer)], 'audio.wav', { type: 'audio/wav' });
|
|
@@ -116,6 +118,7 @@ export class STT extends stt.STT {
|
|
|
116
118
|
file,
|
|
117
119
|
model: this.#opts.model,
|
|
118
120
|
language: config.language,
|
|
121
|
+
prompt: config.prompt,
|
|
119
122
|
response_format: 'json',
|
|
120
123
|
});
|
|
121
124
|
|
package/src/tts.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import { AudioByteStream, tts } from '@livekit/agents';
|
|
5
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
6
|
import { randomUUID } from 'crypto';
|
|
6
7
|
import { OpenAI } from 'openai';
|
|
7
8
|
import type { TTSModels, TTSVoices } from './models.js';
|
|
@@ -28,6 +29,7 @@ const defaultTTSOptions: TTSOptions = {
|
|
|
28
29
|
export class TTS extends tts.TTS {
|
|
29
30
|
#opts: TTSOptions;
|
|
30
31
|
#client: OpenAI;
|
|
32
|
+
label = 'openai.TTS';
|
|
31
33
|
|
|
32
34
|
/**
|
|
33
35
|
* Create a new instance of OpenAI TTS.
|
|
@@ -58,6 +60,8 @@ export class TTS extends tts.TTS {
|
|
|
58
60
|
|
|
59
61
|
synthesize(text: string): ChunkedStream {
|
|
60
62
|
return new ChunkedStream(
|
|
63
|
+
this,
|
|
64
|
+
text,
|
|
61
65
|
this.#client.audio.speech.create({
|
|
62
66
|
input: text,
|
|
63
67
|
model: this.#opts.model,
|
|
@@ -74,9 +78,11 @@ export class TTS extends tts.TTS {
|
|
|
74
78
|
}
|
|
75
79
|
|
|
76
80
|
export class ChunkedStream extends tts.ChunkedStream {
|
|
81
|
+
label = 'openai.ChunkedStream';
|
|
82
|
+
|
|
77
83
|
// set Promise<T> to any because OpenAI returns an annoying Response type
|
|
78
|
-
constructor(stream: Promise<any>) {
|
|
79
|
-
super();
|
|
84
|
+
constructor(tts: TTS, text: string, stream: Promise<any>) {
|
|
85
|
+
super(text, tts);
|
|
80
86
|
this.#run(stream);
|
|
81
87
|
}
|
|
82
88
|
|
|
@@ -86,13 +92,20 @@ export class ChunkedStream extends tts.ChunkedStream {
|
|
|
86
92
|
const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);
|
|
87
93
|
const frames = audioByteStream.write(buffer);
|
|
88
94
|
|
|
95
|
+
let lastFrame: AudioFrame | undefined;
|
|
96
|
+
const sendLastFrame = (segmentId: string, final: boolean) => {
|
|
97
|
+
if (lastFrame) {
|
|
98
|
+
this.queue.put({ requestId, segmentId, frame: lastFrame, final });
|
|
99
|
+
lastFrame = undefined;
|
|
100
|
+
}
|
|
101
|
+
};
|
|
102
|
+
|
|
89
103
|
for (const frame of frames) {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
requestId,
|
|
93
|
-
segmentId: requestId,
|
|
94
|
-
});
|
|
104
|
+
sendLastFrame(requestId, false);
|
|
105
|
+
lastFrame = frame;
|
|
95
106
|
}
|
|
107
|
+
sendLastFrame(requestId, true);
|
|
108
|
+
|
|
96
109
|
this.queue.close();
|
|
97
110
|
}
|
|
98
111
|
}
|