@livekit/agents-plugin-openai 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +13 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/models.d.ts +4 -0
- package/dist/models.d.ts.map +1 -1
- package/dist/realtime/api_proto.d.ts +19 -5
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/realtime_model.d.ts +1 -1
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +1 -1
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/stt.d.ts +41 -0
- package/dist/stt.d.ts.map +1 -0
- package/dist/stt.js +109 -0
- package/dist/stt.js.map +1 -0
- package/dist/tts.d.ts +34 -0
- package/dist/tts.d.ts.map +1 -0
- package/dist/tts.js +73 -0
- package/dist/tts.js.map +1 -0
- package/package.json +2 -2
- package/src/index.ts +2 -0
- package/src/models.ts +12 -1
- package/src/realtime/api_proto.ts +20 -5
- package/src/realtime/realtime_model.ts +2 -2
- package/src/stt.ts +140 -0
- package/src/tts.ts +96 -0
- package/tsconfig.tsbuildinfo +1 -1
package/dist/tts.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tts.js","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAE,eAAe,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACvD,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAGhC,MAAM,sBAAsB,GAAG,KAAK,CAAC;AACrC,MAAM,mBAAmB,GAAG,CAAC,CAAC;AAW9B,MAAM,iBAAiB,GAAe;IACpC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;IAClC,KAAK,EAAE,OAAO;IACd,KAAK,EAAE,OAAO;IACd,KAAK,EAAE,CAAC;CACT,CAAC;AAEF,MAAM,OAAO,GAAI,SAAQ,GAAG,CAAC,GAAG;IAC9B,KAAK,CAAa;IAClB,OAAO,CAAS;IAEhB;;;;;;OAMG;IACH,YAAY,OAA4B,iBAAiB;QACvD,KAAK,CAAC,sBAAsB,EAAE,mBAAmB,EAAE,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;QAEzE,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,iBAAiB,EAAE,GAAG,IAAI,EAAE,CAAC;QAC/C,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;QAC9F,CAAC;QAED,IAAI,CAAC,OAAO;YACV,IAAI,CAAC,KAAK,CAAC,MAAM;gBACjB,IAAI,MAAM,CAAC;oBACT,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,MAAM,EAAE,IAAI,CAAC,MAAM;iBACpB,CAAC,CAAC;IACP,CAAC;IAED,aAAa,CAAC,IAAuE;QACnF,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,GAAG,IAAI,EAAE,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,IAAY;QACrB,OAAO,IAAI,aAAa,CACtB,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YAC/B,KAAK,EAAE,IAAI;YACX,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,eAAe,EAAE,KAAK;YACtB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;SACxB,CAAC,CACH,CAAC;IACJ,CAAC;IAED,MAAM;QACJ,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAC9D,CAAC;CACF;AAED,MAAM,OAAO,aAAc,SAAQ,GAAG,CAAC,aAAa;IAClD,YAAY,MAAyB;QACnC,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACpB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,MAAyB;QAClC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;QACtC,MAAM,eAAe,GAAG,IAAI,eAAe,CAAC,sBAAsB,EAAE,mBAAmB,CAAC,CAAC;QACzF,MAAM,MAAM,GAAG,eAAe,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAE7C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC;gBACb,KAAK;gBACL,SAAS;gBACT,SAAS,EAAE,SAAS;aACrB,CAAC,CAAC;QACL,CAAC;QACD,I
AAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;CACF"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-openai",
|
|
3
|
-
"version": "0.4.3",
|
|
3
|
+
"version": "0.5.0",
|
|
4
4
|
"description": "OpenAI plugin for LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
"openai": "^4.70.2",
|
|
17
17
|
"sharp": "^0.33.5",
|
|
18
18
|
"ws": "^8.16.0",
|
|
19
|
-
"@livekit/agents": "0.4.
|
|
19
|
+
"@livekit/agents": "0.4.4"
|
|
20
20
|
},
|
|
21
21
|
"scripts": {
|
|
22
22
|
"build": "tsc -b tsconfig.json",
|
package/src/index.ts
CHANGED
package/src/models.ts
CHANGED
|
@@ -27,7 +27,13 @@ export type ChatModels =
|
|
|
27
27
|
| 'gpt-3.5-turbo-1106'
|
|
28
28
|
| 'gpt-3.5-turbo-16k-0613';
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
/** Known OpenAI speech-to-text model identifiers. */
export type WhisperModels = 'whisper-1';
|
|
31
|
+
|
|
32
|
+
/** Known OpenAI text-to-speech model identifiers. */
export type TTSModels = 'tts-1' | 'tts-1-hd';
|
|
33
|
+
|
|
34
|
+
/** Voice names selectable for OpenAI text-to-speech. */
export type TTSVoices = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
|
|
35
|
+
|
|
36
|
+
// adapters for OpenAI-compatible LLMs, TTSs, STTs
|
|
31
37
|
|
|
32
38
|
export type TelnyxChatModels =
|
|
33
39
|
| 'meta-llama/Meta-Llama-3.1-8B-Instruct'
|
|
@@ -56,6 +62,11 @@ export type GroqChatModels =
|
|
|
56
62
|
| 'gemma-7b-it'
|
|
57
63
|
| 'gemma2-9b-it';
|
|
58
64
|
|
|
65
|
+
/** Known audio (transcription) model identifiers for Groq's OpenAI-compatible endpoint. */
export type GroqAudioModels = 'whisper-large-v3' | 'distil-whisper-large-v3-en' | 'whisper-large-v3-turbo';
|
|
69
|
+
|
|
59
70
|
export type DeepSeekChatModels = 'deepseek-coder' | 'deepseek-chat';
|
|
60
71
|
|
|
61
72
|
export type TogetherChatModels =
|
|
@@ -208,17 +208,32 @@ export type ResponseStatusDetails =
|
|
|
208
208
|
reason: 'turn_detected' | 'client_cancelled' | string;
|
|
209
209
|
};
|
|
210
210
|
|
|
211
|
+
/** Token counts broken down into text and audio. */
interface ModelUsageModalityTokens {
  text_tokens: number;
  audio_tokens: number;
}

/**
 * Token accounting carried in the optional `usage` field of a realtime response.
 *
 * Totals are given for the whole response and separately for input and output; the
 * `*_token_details` members split those counts into text and audio.
 */
export interface ModelUsage {
  total_tokens: number;
  input_tokens: number;
  output_tokens: number;
  /** Input-side breakdown, including a cached-token count with its own text/audio split. */
  input_token_details: ModelUsageModalityTokens & {
    cached_tokens: number;
    cached_tokens_details: ModelUsageModalityTokens;
  };
  /** Output-side breakdown. */
  output_token_details: ModelUsageModalityTokens;
}
|
|
229
|
+
|
|
211
230
|
export interface ResponseResource {
|
|
212
231
|
id: string;
|
|
213
232
|
object: 'realtime.response';
|
|
214
233
|
status: ResponseStatus;
|
|
215
234
|
status_details: ResponseStatusDetails;
|
|
216
235
|
output: ItemResource[];
|
|
217
|
-
usage?: {
|
|
218
|
-
total_tokens: number;
|
|
219
|
-
input_tokens: number;
|
|
220
|
-
output_tokens: number;
|
|
221
|
-
};
|
|
236
|
+
usage?: ModelUsage;
|
|
222
237
|
}
|
|
223
238
|
|
|
224
239
|
// Client Events
|
|
@@ -37,7 +37,7 @@ export interface RealtimeResponse {
|
|
|
37
37
|
id: string;
|
|
38
38
|
status: api_proto.ResponseStatus;
|
|
39
39
|
statusDetails: api_proto.ResponseStatusDetails | null;
|
|
40
|
-
usage: api_proto.
|
|
40
|
+
usage: api_proto.ModelUsage | null;
|
|
41
41
|
output: RealtimeOutput[];
|
|
42
42
|
doneFut: Future;
|
|
43
43
|
}
|
|
@@ -939,7 +939,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
939
939
|
const response = this.#pendingResponses[responseId];
|
|
940
940
|
response.status = responseData.status;
|
|
941
941
|
response.statusDetails = responseData.status_details;
|
|
942
|
-
response.usage = responseData.usage;
|
|
942
|
+
response.usage = responseData.usage ?? null;
|
|
943
943
|
this.#pendingResponses[responseId] = response;
|
|
944
944
|
response.doneFut.resolve();
|
|
945
945
|
this.emit('response_done', response);
|
package/src/stt.ts
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { type AudioBuffer, mergeFrames, stt } from '@livekit/agents';
|
|
5
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
6
|
+
import { OpenAI } from 'openai';
|
|
7
|
+
import type { GroqAudioModels, WhisperModels } from './models.js';
|
|
8
|
+
|
|
9
|
+
/** Configuration accepted by the STT plugin. */
export interface STTOptions {
  /** Model identifier sent with each transcription request. */
  model: WhisperModels | string;
  /** Language sent with each transcription request unless overridden per call. */
  language: string;
  // NOTE(review): stored with the options, but no code in this file reads it — confirm intent.
  detectLanguage: boolean;
  /** API key; the constructor rejects the options when this ends up undefined. */
  apiKey?: string;
  /** Base URL handed to the OpenAI client when one has to be constructed. */
  baseURL?: string;
  /** Pre-built client; when present it is used instead of constructing a new one. */
  client?: OpenAI;
}

/** Defaults merged underneath whatever the caller passes to the STT constructor. */
const defaultSTTOptions: STTOptions = {
  apiKey: process.env.OPENAI_API_KEY,
  language: 'en',
  detectLanguage: false,
  model: 'whisper-1',
};
|
|
24
|
+
|
|
25
|
+
/**
 * Speech-to-text over the OpenAI audio transcription endpoint, or any endpoint reachable
 * through an OpenAI-compatible client (see {@link STT.withGroq}).
 *
 * Recognition is request/response only: the instance registers itself with
 * `streaming: false` and {@link STT.stream} always throws.
 */
export class STT extends stt.STT {
  #opts: STTOptions;
  #client: OpenAI;

  /**
   * Create a new instance of OpenAI STT.
   *
   * @remarks
   * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
   * `OPENAI_API_KEY` environmental variable.
   *
   * @param opts - Overrides merged on top of the defaults.
   * @throws Error when no API key is available after merging.
   */
  constructor(opts: Partial<STTOptions> = defaultSTTOptions) {
    super({ streaming: false, interimResults: false });

    this.#opts = { ...defaultSTTOptions, ...opts };
    if (this.#opts.apiKey === undefined) {
      throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
    }

    // Build the client from the merged options so it uses exactly the values validated above,
    // not whatever subset the caller happened to pass.
    this.#client =
      this.#opts.client ||
      new OpenAI({
        baseURL: this.#opts.baseURL,
        apiKey: this.#opts.apiKey,
      });
  }

  /**
   * Create a new instance of Groq STT.
   *
   * @remarks
   * `apiKey` must be set to your Groq API key, either using the argument or by setting the
   * `GROQ_API_KEY` environmental variable.
   *
   * @param opts - Overrides for the Groq defaults (model and base URL). Not modified.
   * @throws Error when no API key is available.
   */
  static withGroq(
    opts: Partial<{
      model: string | GroqAudioModels;
      apiKey?: string;
      baseURL?: string;
      client: OpenAI;
      language: string;
      detectLanguage: boolean;
    }> = {},
  ): STT {
    // Resolve the key into a local rather than writing it back onto the caller's object.
    const apiKey = opts.apiKey || process.env.GROQ_API_KEY;
    if (apiKey === undefined) {
      throw new Error('Groq API key is required, whether as an argument or as $GROQ_API_KEY');
    }

    return new STT({
      model: 'whisper-large-v3-turbo',
      baseURL: 'https://api.groq.com/openai/v1',
      ...opts,
      apiKey,
    });
  }

  /** Returns the stored options, with `language` replaced when a non-empty override is given. */
  #sanitizeOptions(language?: string): STTOptions {
    if (language) {
      return { ...this.#opts, language };
    }
    return this.#opts;
  }

  /**
   * Wraps a frame's samples in a 44-byte PCM WAV container.
   *
   * The header hard-codes 16 bits per sample.
   * NOTE(review): assumes `frame.data` holds 16-bit samples — confirm against `AudioFrame`.
   */
  #createWav(frame: AudioFrame): Buffer {
    const bitsPerSample = 16;
    const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8;
    const blockAlign = (frame.channels * bitsPerSample) / 8;
    const dataLength = frame.data.byteLength;

    const header = Buffer.alloc(44);
    header.write('RIFF', 0);
    header.writeUInt32LE(36 + dataLength, 4); // RIFF chunk size: rest of header + data
    header.write('WAVE', 8);
    header.write('fmt ', 12);
    header.writeUInt32LE(16, 16); // fmt chunk size
    header.writeUInt16LE(1, 20); // format tag 1 = PCM
    header.writeUInt16LE(frame.channels, 22);
    header.writeUInt32LE(frame.sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write('data', 36);
    header.writeUInt32LE(dataLength, 40);

    // Respect the view's window: `frame.data` may cover only part of a larger ArrayBuffer, and
    // the header above already declares exactly `dataLength` bytes of payload.
    const samples = Buffer.from(frame.data.buffer, frame.data.byteOffset, dataLength);
    return Buffer.concat([header, samples]);
  }

  /**
   * Transcribes the given audio with a single request.
   *
   * @param buffer - Audio to transcribe; merged into one frame before upload.
   * @param language - Optional per-call override of the configured language.
   * @returns A `FINAL_TRANSCRIPT` event with one alternative. Timing and confidence are set
   * to 0; they are not populated from the response.
   */
  async recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
    const config = this.#sanitizeOptions(language);
    const frame = mergeFrames(buffer);
    const file = new File([this.#createWav(frame)], 'audio.wav', { type: 'audio/wav' });
    const resp = await this.#client.audio.transcriptions.create({
      file,
      model: config.model,
      language: config.language,
      response_format: 'json',
    });

    return {
      type: stt.SpeechEventType.FINAL_TRANSCRIPT,
      alternatives: [
        {
          text: resp.text || '',
          // Report the language actually sent with the request, not just the override.
          language: config.language || '',
          startTime: 0,
          endTime: 0,
          confidence: 0,
        },
      ],
    };
  }

  /** This method throws an error; streaming is unsupported on OpenAI STT. */
  stream(): stt.SpeechStream {
    throw new Error('Streaming is not supported on OpenAI STT');
  }
}
|
package/src/tts.ts
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { AudioByteStream, tts } from '@livekit/agents';
|
|
5
|
+
import { OpenAI } from 'openai';
|
|
6
|
+
import type { TTSModels, TTSVoices } from './models.js';
|
|
7
|
+
|
|
8
|
+
/** Sample rate passed to the base TTS class and to the frame splitter. */
const OPENAI_TTS_SAMPLE_RATE = 24_000;
/** Channel count passed to the base TTS class and to the frame splitter. */
const OPENAI_TTS_CHANNELS = 1;

/** Configuration accepted by the TTS plugin. */
export interface TTSOptions {
  /** Model identifier sent with each synthesis request. */
  model: TTSModels | string;
  /** Voice sent with each synthesis request. */
  voice: TTSVoices;
  /** Speed value sent with each synthesis request. */
  speed: number;
  /** API key; the constructor rejects the options when this ends up undefined. */
  apiKey?: string;
  /** Base URL handed to the OpenAI client when one has to be constructed. */
  baseURL?: string;
  /** Pre-built client; when present it is used instead of constructing a new one. */
  client?: OpenAI;
}

/** Defaults merged underneath whatever the caller passes to the TTS constructor. */
const defaultTTSOptions: TTSOptions = {
  apiKey: process.env.OPENAI_API_KEY,
  model: 'tts-1',
  voice: 'alloy',
  speed: 1,
};
|
|
26
|
+
|
|
27
|
+
/**
 * Text-to-speech over the OpenAI speech endpoint.
 *
 * Synthesis is request/response only: the instance registers itself with
 * `streaming: false` and {@link TTS.stream} always throws.
 */
export class TTS extends tts.TTS {
  #opts: TTSOptions;
  #client: OpenAI;

  /**
   * Create a new instance of OpenAI TTS.
   *
   * @remarks
   * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
   * `OPENAI_API_KEY` environmental variable.
   *
   * @param opts - Overrides merged on top of the defaults.
   * @throws Error when no API key is available after merging.
   */
  constructor(opts: Partial<TTSOptions> = defaultTTSOptions) {
    super(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS, { streaming: false });

    this.#opts = { ...defaultTTSOptions, ...opts };
    if (this.#opts.apiKey === undefined) {
      throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
    }

    // Build the client from the merged options so it uses exactly the values validated above,
    // not whatever subset the caller happened to pass.
    this.#client =
      this.#opts.client ||
      new OpenAI({
        baseURL: this.#opts.baseURL,
        apiKey: this.#opts.apiKey,
      });
  }

  /** Replaces the model, voice and/or speed used by subsequent `synthesize` calls. */
  updateOptions(opts: { model?: TTSModels | string; voice?: TTSVoices; speed?: number }) {
    this.#opts = { ...this.#opts, ...opts };
  }

  /**
   * Starts a synthesis request for `text` and returns a stream fed from its response.
   *
   * The request asks for raw `pcm` output; the pending response is handed to
   * {@link ChunkedStream} without being awaited here.
   */
  synthesize(text: string): ChunkedStream {
    const { model, voice, speed } = this.#opts;
    const pending = this.#client.audio.speech.create({
      input: text,
      model,
      voice,
      response_format: 'pcm',
      speed,
    });
    return new ChunkedStream(pending);
  }

  /** This method throws an error; streaming is unsupported on OpenAI TTS. */
  stream(): tts.SynthesizeStream {
    throw new Error('Streaming is not supported on OpenAI TTS');
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Feeds the audio of one synthesis response into the stream's queue.
 *
 * The whole response body is read, split into frames, queued, and then the queue is closed.
 */
export class ChunkedStream extends tts.ChunkedStream {
  /**
   * @param stream - Pending HTTP response whose body is the synthesized audio.
   */
  constructor(stream: Promise<Response>) {
    super();
    this.#run(stream);
  }

  /**
   * Reads the response, queues its frames, and always closes the queue.
   *
   * Errors from the request or from reading the body still propagate as before; the
   * `finally` only guarantees that the queue is closed on failure, which the previous
   * straight-line code did not do.
   */
  async #run(stream: Promise<Response>) {
    try {
      const response = await stream;
      const buffer = await response.arrayBuffer();
      // One id serves as both request id and segment id for every frame of this response.
      const requestId = crypto.randomUUID();
      const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);

      // NOTE(review): only frames returned by write() are queued; if AudioByteStream keeps a
      // partial trailing frame internally it is never emitted — confirm whether that matters.
      for (const frame of audioByteStream.write(buffer)) {
        this.queue.put({
          frame,
          requestId,
          segmentId: requestId,
        });
      }
    } finally {
      this.queue.close();
    }
  }
}
|