@livekit/agents-plugin-openai 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/llm.cjs +4 -2
- package/dist/llm.cjs.map +1 -1
- package/dist/llm.d.ts +2 -1
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +4 -2
- package/dist/llm.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +61 -1
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.ts +2 -0
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +62 -1
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/stt.cjs +2 -1
- package/dist/stt.cjs.map +1 -1
- package/dist/stt.d.ts +2 -1
- package/dist/stt.d.ts.map +1 -1
- package/dist/stt.js +2 -1
- package/dist/stt.js.map +1 -1
- package/dist/tts.cjs +16 -7
- package/dist/tts.cjs.map +1 -1
- package/dist/tts.d.ts +3 -1
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +16 -7
- package/dist/tts.js.map +1 -1
- package/package.json +2 -2
- package/src/llm.ts +4 -1
- package/src/realtime/realtime_model.ts +68 -0
- package/src/stt.ts +2 -1
- package/src/tts.ts +20 -7
package/src/llm.ts
CHANGED
|
@@ -398,6 +398,7 @@ export class LLM extends llm.LLM {
|
|
|
398
398
|
temperature = temperature || this.#opts.temperature;
|
|
399
399
|
|
|
400
400
|
return new LLMStream(
|
|
401
|
+
this,
|
|
401
402
|
this.#client,
|
|
402
403
|
chatCtx,
|
|
403
404
|
fncCtx,
|
|
@@ -416,8 +417,10 @@ export class LLMStream extends llm.LLMStream {
|
|
|
416
417
|
#client: OpenAI;
|
|
417
418
|
#logger = log();
|
|
418
419
|
#id = randomUUID();
|
|
420
|
+
label = 'openai.LLMStream';
|
|
419
421
|
|
|
420
422
|
constructor(
|
|
423
|
+
llm: LLM,
|
|
421
424
|
client: OpenAI,
|
|
422
425
|
chatCtx: llm.ChatContext,
|
|
423
426
|
fncCtx: llm.FunctionContext | undefined,
|
|
@@ -426,7 +429,7 @@ export class LLMStream extends llm.LLMStream {
|
|
|
426
429
|
temperature?: number,
|
|
427
430
|
n?: number,
|
|
428
431
|
) {
|
|
429
|
-
super(chatCtx, fncCtx);
|
|
432
|
+
super(llm, chatCtx, fncCtx);
|
|
430
433
|
this.#client = client;
|
|
431
434
|
this.#run(opts, n, parallelToolCalls, temperature);
|
|
432
435
|
}
|
|
package/src/realtime/realtime_model.ts
CHANGED
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
llm,
|
|
9
9
|
log,
|
|
10
10
|
mergeFrames,
|
|
11
|
+
metrics,
|
|
11
12
|
multimodal,
|
|
12
13
|
} from '@livekit/agents';
|
|
13
14
|
import { AudioFrame } from '@livekit/rtc-node';
|
|
@@ -40,6 +41,8 @@ export interface RealtimeResponse {
|
|
|
40
41
|
usage: api_proto.ModelUsage | null;
|
|
41
42
|
output: RealtimeOutput[];
|
|
42
43
|
doneFut: Future;
|
|
44
|
+
createdTimestamp: number;
|
|
45
|
+
firstTokenTimestamp?: number;
|
|
43
46
|
}
|
|
44
47
|
|
|
45
48
|
export interface RealtimeOutput {
|
|
@@ -932,6 +935,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
932
935
|
usage: null,
|
|
933
936
|
output: [],
|
|
934
937
|
doneFut: doneFut,
|
|
938
|
+
createdTimestamp: Date.now(),
|
|
935
939
|
};
|
|
936
940
|
this.#pendingResponses[newResponse.id] = newResponse;
|
|
937
941
|
this.emit('response_created', newResponse);
|
|
@@ -946,7 +950,70 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
946
950
|
response.usage = responseData.usage ?? null;
|
|
947
951
|
this.#pendingResponses[responseId] = response;
|
|
948
952
|
response.doneFut.resolve();
|
|
953
|
+
|
|
954
|
+
let metricsError: Error | undefined;
|
|
955
|
+
let cancelled = false;
|
|
956
|
+
switch (response.status) {
|
|
957
|
+
case 'failed': {
|
|
958
|
+
if (response.statusDetails.type !== 'failed') break;
|
|
959
|
+
const err = response.statusDetails.error;
|
|
960
|
+
metricsError = new metrics.MultimodalLLMError({
|
|
961
|
+
type: response.statusDetails.type,
|
|
962
|
+
code: err?.code,
|
|
963
|
+
message: err?.message,
|
|
964
|
+
});
|
|
965
|
+
this.#logger
|
|
966
|
+
.child({ code: err?.code, error: err?.message })
|
|
967
|
+
.error('response generation failed');
|
|
968
|
+
break;
|
|
969
|
+
}
|
|
970
|
+
case 'incomplete': {
|
|
971
|
+
if (response.statusDetails.type !== 'incomplete') break;
|
|
972
|
+
const reason = response.statusDetails.reason;
|
|
973
|
+
metricsError = new metrics.MultimodalLLMError({
|
|
974
|
+
type: response.statusDetails.type,
|
|
975
|
+
reason,
|
|
976
|
+
});
|
|
977
|
+
this.#logger.child({ reason }).error('response generation incomplete');
|
|
978
|
+
break;
|
|
979
|
+
}
|
|
980
|
+
case 'cancelled': {
|
|
981
|
+
cancelled = true;
|
|
982
|
+
break;
|
|
983
|
+
}
|
|
984
|
+
}
|
|
949
985
|
this.emit('response_done', response);
|
|
986
|
+
|
|
987
|
+
let ttft: number | undefined;
|
|
988
|
+
if (response.firstTokenTimestamp) {
|
|
989
|
+
ttft = response.firstTokenTimestamp - response.createdTimestamp;
|
|
990
|
+
}
|
|
991
|
+
const duration = Date.now() - response.createdTimestamp;
|
|
992
|
+
|
|
993
|
+
const usage = response.usage;
|
|
994
|
+
const metric: metrics.MultimodalLLMMetrics = {
|
|
995
|
+
timestamp: response.createdTimestamp,
|
|
996
|
+
requestId: response.id,
|
|
997
|
+
ttft: ttft!,
|
|
998
|
+
duration,
|
|
999
|
+
cancelled,
|
|
1000
|
+
label: this.constructor.name,
|
|
1001
|
+
completionTokens: usage?.output_tokens || 0,
|
|
1002
|
+
promptTokens: usage?.input_tokens || 0,
|
|
1003
|
+
totalTokens: usage?.total_tokens || 0,
|
|
1004
|
+
tokensPerSecond: ((usage?.output_tokens || 0) / duration) * 1000,
|
|
1005
|
+
error: metricsError,
|
|
1006
|
+
inputTokenDetails: {
|
|
1007
|
+
cachedTokens: usage?.input_token_details.cached_tokens || 0,
|
|
1008
|
+
textTokens: usage?.input_token_details.text_tokens || 0,
|
|
1009
|
+
audioTokens: usage?.input_token_details.audio_tokens || 0,
|
|
1010
|
+
},
|
|
1011
|
+
outputTokenDetails: {
|
|
1012
|
+
textTokens: usage?.output_token_details.text_tokens || 0,
|
|
1013
|
+
audioTokens: usage?.output_token_details.audio_tokens || 0,
|
|
1014
|
+
},
|
|
1015
|
+
};
|
|
1016
|
+
this.emit('metrics_collected', metric);
|
|
950
1017
|
}
|
|
951
1018
|
|
|
952
1019
|
#handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
|
|
@@ -1062,6 +1129,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1062
1129
|
toolCalls: [],
|
|
1063
1130
|
};
|
|
1064
1131
|
output?.content.push(newContent);
|
|
1132
|
+
response!.firstTokenTimestamp = Date.now();
|
|
1065
1133
|
this.emit('response_content_added', newContent);
|
|
1066
1134
|
}
|
|
1067
1135
|
|
package/src/stt.ts
CHANGED
|
@@ -25,6 +25,7 @@ const defaultSTTOptions: STTOptions = {
|
|
|
25
25
|
export class STT extends stt.STT {
|
|
26
26
|
#opts: STTOptions;
|
|
27
27
|
#client: OpenAI;
|
|
28
|
+
label = 'openai.STT';
|
|
28
29
|
|
|
29
30
|
/**
|
|
30
31
|
* Create a new instance of OpenAI STT.
|
|
@@ -108,7 +109,7 @@ export class STT extends stt.STT {
|
|
|
108
109
|
return Buffer.concat([header, Buffer.from(frame.data.buffer)]);
|
|
109
110
|
}
|
|
110
111
|
|
|
111
|
-
async recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
|
|
112
|
+
async _recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
|
|
112
113
|
const config = this.#sanitizeOptions(language);
|
|
113
114
|
buffer = mergeFrames(buffer);
|
|
114
115
|
const file = new File([this.#createWav(buffer)], 'audio.wav', { type: 'audio/wav' });
|
package/src/tts.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import { AudioByteStream, tts } from '@livekit/agents';
|
|
5
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
6
|
import { randomUUID } from 'crypto';
|
|
6
7
|
import { OpenAI } from 'openai';
|
|
7
8
|
import type { TTSModels, TTSVoices } from './models.js';
|
|
@@ -28,6 +29,7 @@ const defaultTTSOptions: TTSOptions = {
|
|
|
28
29
|
export class TTS extends tts.TTS {
|
|
29
30
|
#opts: TTSOptions;
|
|
30
31
|
#client: OpenAI;
|
|
32
|
+
label = 'openai.TTS';
|
|
31
33
|
|
|
32
34
|
/**
|
|
33
35
|
* Create a new instance of OpenAI TTS.
|
|
@@ -58,6 +60,8 @@ export class TTS extends tts.TTS {
|
|
|
58
60
|
|
|
59
61
|
synthesize(text: string): ChunkedStream {
|
|
60
62
|
return new ChunkedStream(
|
|
63
|
+
this,
|
|
64
|
+
text,
|
|
61
65
|
this.#client.audio.speech.create({
|
|
62
66
|
input: text,
|
|
63
67
|
model: this.#opts.model,
|
|
@@ -74,9 +78,11 @@ export class TTS extends tts.TTS {
|
|
|
74
78
|
}
|
|
75
79
|
|
|
76
80
|
export class ChunkedStream extends tts.ChunkedStream {
|
|
81
|
+
label = 'openai.ChunkedStream';
|
|
82
|
+
|
|
77
83
|
// set Promise<T> to any because OpenAI returns an annoying Response type
|
|
78
|
-
constructor(stream: Promise<any>) {
|
|
79
|
-
super();
|
|
84
|
+
constructor(tts: TTS, text: string, stream: Promise<any>) {
|
|
85
|
+
super(text, tts);
|
|
80
86
|
this.#run(stream);
|
|
81
87
|
}
|
|
82
88
|
|
|
@@ -86,13 +92,20 @@ export class ChunkedStream extends tts.ChunkedStream {
|
|
|
86
92
|
const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);
|
|
87
93
|
const frames = audioByteStream.write(buffer);
|
|
88
94
|
|
|
95
|
+
let lastFrame: AudioFrame | undefined;
|
|
96
|
+
const sendLastFrame = (segmentId: string, final: boolean) => {
|
|
97
|
+
if (lastFrame) {
|
|
98
|
+
this.queue.put({ requestId, segmentId, frame: lastFrame, final });
|
|
99
|
+
lastFrame = undefined;
|
|
100
|
+
}
|
|
101
|
+
};
|
|
102
|
+
|
|
89
103
|
for (const frame of frames) {
|
|
90
|
-
this.queue.put({
|
|
91
|
-
frame,
|
|
92
|
-
requestId,
|
|
93
|
-
segmentId: requestId,
|
|
94
|
-
});
|
|
104
|
+
sendLastFrame(requestId, false);
|
|
105
|
+
lastFrame = frame;
|
|
95
106
|
}
|
|
107
|
+
sendLastFrame(requestId, true);
|
|
108
|
+
|
|
96
109
|
this.queue.close();
|
|
97
110
|
}
|
|
98
111
|
}
|