@livekit/agents-plugin-openai 0.4.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +30 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +4 -1
- package/dist/llm.js.map +1 -1
- package/dist/models.d.ts +4 -0
- package/dist/models.d.ts.map +1 -1
- package/dist/realtime/api_proto.d.ts +19 -5
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/realtime_model.d.ts +1 -1
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +22 -11
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/stt.d.ts +41 -0
- package/dist/stt.d.ts.map +1 -0
- package/dist/stt.js +109 -0
- package/dist/stt.js.map +1 -0
- package/dist/tts.d.ts +34 -0
- package/dist/tts.d.ts.map +1 -0
- package/dist/tts.js +73 -0
- package/dist/tts.js.map +1 -0
- package/package.json +9 -5
- package/src/index.ts +2 -0
- package/src/llm.ts +5 -1
- package/src/models.ts +12 -1
- package/src/realtime/api_proto.ts +20 -5
- package/src/realtime/realtime_model.ts +28 -17
- package/src/stt.ts +140 -0
- package/src/tts.ts +96 -0
- package/tsconfig.tsbuildinfo +1 -1
package/dist/stt.js
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { mergeFrames, stt } from '@livekit/agents';
|
|
5
|
+
import { OpenAI } from 'openai';
|
|
6
|
+
// Fallback option values for STT. The STT constructor spreads these first and the
// caller-supplied options over them, so any field the caller omits keeps the value below.
const defaultSTTOptions = {
|
|
7
|
+
// Read from the environment once, when this module is evaluated.
apiKey: process.env.OPENAI_API_KEY,
|
|
8
|
+
language: 'en',
|
|
9
|
+
// NOTE(review): this flag is not read by the STT class in this file — confirm intended use.
detectLanguage: false,
|
|
10
|
+
model: 'whisper-1',
|
|
11
|
+
};
|
|
12
|
+
export class STT extends stt.STT {
    #opts;
    #client;
    /**
     * Create a new instance of OpenAI STT.
     *
     * @remarks
     * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
     * `OPENAI_API_KEY` environment variable.
     *
     * @param opts - Options laid over `defaultSTTOptions`; omitted fields keep their defaults.
     * @throws Error when no API key is present after merging `opts` over the defaults.
     */
    constructor(opts = defaultSTTOptions) {
        super({ streaming: false, interimResults: false });
        this.#opts = { ...defaultSTTOptions, ...opts };
        if (this.#opts.apiKey === undefined) {
            throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
        }
        // Build the client from the merged options so the key that was just validated
        // (and every defaulted field) is exactly what is handed to the SDK.
        this.#client =
            this.#opts.client ||
                new OpenAI({
                    baseURL: this.#opts.baseURL,
                    apiKey: this.#opts.apiKey,
                });
    }
    /**
     * Create a new instance of Groq STT.
     *
     * @remarks
     * `apiKey` must be set to your Groq API key, either using the argument or by setting the
     * `GROQ_API_KEY` environment variable.
     *
     * @param opts - Options laid over the Groq model and base URL defaults.
     * @throws Error when neither `opts.apiKey` nor `GROQ_API_KEY` provides a key.
     */
    static withGroq(opts = {}) {
        // Resolve the key into a local so the caller's options object is never written to.
        const apiKey = opts.apiKey || process.env.GROQ_API_KEY;
        if (apiKey === undefined) {
            throw new Error('Groq API key is required, whether as an argument or as $GROQ_API_KEY');
        }
        return new STT({
            model: 'whisper-large-v3-turbo',
            baseURL: 'https://api.groq.com/openai/v1',
            ...opts,
            apiKey,
        });
    }
    /**
     * Return the options to use for one request: the stored options, with `language`
     * replaced when a truthy per-call language is given.
     */
    #sanitizeOptions(language) {
        return language ? { ...this.#opts, language } : this.#opts;
    }
    /**
     * Wrap the samples of `frame` in a 44-byte RIFF/WAVE header describing 16-bit PCM
     * and return header and samples as a single Buffer.
     */
    #createWav(frame) {
        const bitsPerSample = 16;
        const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8;
        const blockAlign = (frame.channels * bitsPerSample) / 8;
        const dataBytes = frame.data.byteLength;
        const header = Buffer.alloc(44);
        header.write('RIFF', 0);
        header.writeUInt32LE(36 + dataBytes, 4); // RIFF chunk size: rest of header + samples
        header.write('WAVE', 8);
        header.write('fmt ', 12);
        header.writeUInt32LE(16, 16); // size of the fmt chunk body
        header.writeUInt16LE(1, 20); // format tag 1 (PCM)
        header.writeUInt16LE(frame.channels, 22);
        header.writeUInt32LE(frame.sampleRate, 24);
        header.writeUInt32LE(byteRate, 28);
        header.writeUInt16LE(blockAlign, 32);
        header.writeUInt16LE(bitsPerSample, 34);
        header.write('data', 36);
        header.writeUInt32LE(dataBytes, 40);
        // Copy only the bytes the typed-array view covers. The backing ArrayBuffer may be
        // larger than the view, which would make the payload disagree with the sizes above.
        const samples = Buffer.from(frame.data.buffer, frame.data.byteOffset, dataBytes);
        return Buffer.concat([header, samples]);
    }
    /**
     * Transcribe `buffer` with a single, non-streaming transcription request.
     *
     * @param buffer - Audio to transcribe; merged into one frame before upload.
     * @param language - Optional per-call language overriding the configured one.
     * @returns A FINAL_TRANSCRIPT event with one alternative holding the response text.
     */
    async recognize(buffer, language) {
        // NOTE(review): `detectLanguage` is not consulted here; the configured language is always sent.
        const config = this.#sanitizeOptions(language);
        buffer = mergeFrames(buffer);
        const file = new File([this.#createWav(buffer)], 'audio.wav', { type: 'audio/wav' });
        const resp = await this.#client.audio.transcriptions.create({
            file,
            model: config.model,
            language: config.language,
            response_format: 'json',
        });
        return {
            type: stt.SpeechEventType.FINAL_TRANSCRIPT,
            alternatives: [
                {
                    text: resp.text || '',
                    // Reports only the per-call language argument, as before.
                    language: language || '',
                    startTime: 0,
                    endTime: 0,
                    confidence: 0,
                },
            ],
        };
    }
    /** This method throws an error; streaming is unsupported on OpenAI STT. */
    stream() {
        throw new Error('Streaming is not supported on OpenAI STT');
    }
}
|
|
109
|
+
//# sourceMappingURL=stt.js.map
|
package/dist/stt.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stt.js","sourceRoot":"","sources":["../src/stt.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAoB,WAAW,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAErE,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAYhC,MAAM,iBAAiB,GAAe;IACpC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;IAClC,QAAQ,EAAE,IAAI;IACd,cAAc,EAAE,KAAK;IACrB,KAAK,EAAE,WAAW;CACnB,CAAC;AAEF,MAAM,OAAO,GAAI,SAAQ,GAAG,CAAC,GAAG;IAC9B,KAAK,CAAa;IAClB,OAAO,CAAS;IAEhB;;;;;;OAMG;IACH,YAAY,OAA4B,iBAAiB;QACvD,KAAK,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,cAAc,EAAE,KAAK,EAAE,CAAC,CAAC;QAEnD,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,iBAAiB,EAAE,GAAG,IAAI,EAAE,CAAC;QAC/C,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;QAC9F,CAAC;QAED,IAAI,CAAC,OAAO;YACV,IAAI,CAAC,KAAK,CAAC,MAAM;gBACjB,IAAI,MAAM,CAAC;oBACT,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,MAAM,EAAE,IAAI,CAAC,MAAM;iBACpB,CAAC,CAAC;IACP,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,QAAQ,CACb,OAOK,EAAE;QAEP,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;QACtD,IAAI,IAAI,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YAC9B,MAAM,IAAI,KAAK,CAAC,sEAAsE,CAAC,CAAC;QAC1F,CAAC;QAED,OAAO,IAAI,GAAG,CAAC;YACb,KAAK,EAAE,wBAAwB;YAC/B,OAAO,EAAE,gCAAgC;YACzC,GAAG,IAAI;SACR,CAAC,CAAC;IACL,CAAC;IAED,gBAAgB,CAAC,QAAiB;QAChC,IAAI,QAAQ,EAAE,CAAC;YACb,OAAO,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC;QACrC,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,KAAK,CAAC;QACpB,CAAC;IACH,CAAC;IAED,UAAU,CAAC,KAAiB;QAC1B,MAAM,aAAa,GAAG,EAAE,CAAC;QACzB,MAAM,QAAQ,GAAG,CAAC,KAAK,CAAC,UAAU,GAAG,KAAK,CAAC,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;QACzE,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;QAExD,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAChC,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACxB,MAAM,CAAC,aAAa,CAAC,EAAE,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACxB,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACzB,MAAM,CAAC,aAAa,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;QAC7B,MAAM,CAAC,aAAa,CAAC,CAAC,EAAE,EAAE,CAAC,CA
AC;QAC5B,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACzC,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAC3C,MAAM,CAAC,aAAa,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACnC,MAAM,CAAC,aAAa,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QACrC,MAAM,CAAC,aAAa,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;QAC7B,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACzB,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAChD,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;IACjE,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,MAAmB,EAAE,QAAiB;QACpD,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QAC/C,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;QAC7B,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;QACrF,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,cAAc,CAAC,MAAM,CAAC;YAC1D,IAAI;YACJ,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,eAAe,EAAE,MAAM;SACxB,CAAC,CAAC;QAEH,OAAO;YACL,IAAI,EAAE,GAAG,CAAC,eAAe,CAAC,gBAAgB;YAC1C,YAAY,EAAE;gBACZ;oBACE,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,EAAE;oBACrB,QAAQ,EAAE,QAAQ,IAAI,EAAE;oBACxB,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,UAAU,EAAE,CAAC;iBACd;aACF;SACF,CAAC;IACJ,CAAC;IAED,2EAA2E;IAC3E,MAAM;QACJ,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAC9D,CAAC;CACF"}
|
package/dist/tts.d.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { tts } from '@livekit/agents';
|
|
2
|
+
import { OpenAI } from 'openai';
|
|
3
|
+
import type { TTSModels, TTSVoices } from './models.js';
|
|
4
|
+
/** Options accepted by the OpenAI `TTS` class. */
export interface TTSOptions {
|
|
5
|
+
/** Model name sent as `model` on each speech request. */
model: TTSModels | string;
|
|
6
|
+
/** Voice sent as `voice` on each speech request. */
voice: TTSVoices;
|
|
7
|
+
/** Value sent as `speed` on each speech request. */
speed: number;
|
|
8
|
+
/** Base URL given to the OpenAI client when the constructor creates one. */
baseURL?: string;
|
|
9
|
+
/** Pre-built OpenAI client; when set, the constructor uses it instead of creating its own. */
client?: OpenAI;
|
|
10
|
+
/** API key; the constructor throws when this is still undefined after merging with the defaults. */
apiKey?: string;
|
|
11
|
+
}
|
|
12
|
+
/** Non-streaming text-to-speech backed by the OpenAI speech endpoint. */
export declare class TTS extends tts.TTS {
|
|
13
|
+
#private;
|
|
14
|
+
/**
|
|
15
|
+
* Create a new instance of OpenAI TTS.
|
|
16
|
+
*
|
|
17
|
+
* @remarks
|
|
18
|
+
* `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
|
|
19
|
+
* `OPENAI_API_KEY` environmental variable.
|
|
20
|
+
*/
|
|
21
|
+
constructor(opts?: Partial<TTSOptions>);
|
|
22
|
+
/** Merge the given fields over the stored options; later `synthesize` calls read the merged values. */
updateOptions(opts: {
|
|
23
|
+
model?: TTSModels | string;
|
|
24
|
+
voice?: TTSVoices;
|
|
25
|
+
speed?: number;
|
|
26
|
+
}): void;
|
|
27
|
+
/** Start a speech request for `text` and wrap the pending response in a `ChunkedStream`. */
synthesize(text: string): ChunkedStream;
|
|
28
|
+
/** Always throws: streaming is not supported on OpenAI TTS. */
stream(): tts.SynthesizeStream;
|
|
29
|
+
}
|
|
30
|
+
/** Chunked stream that turns one pending speech response into queued audio frames. */
export declare class ChunkedStream extends tts.ChunkedStream {
|
|
31
|
+
#private;
|
|
32
|
+
/** @param stream - Pending response whose body is read in full and split into frames. */
constructor(stream: Promise<Response>);
|
|
33
|
+
}
|
|
34
|
+
//# sourceMappingURL=tts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAGA,OAAO,EAAmB,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACvD,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAKxD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,SAAS,GAAG,MAAM,CAAC;IAC1B,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AASD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAI9B;;;;;;OAMG;gBACS,IAAI,GAAE,OAAO,CAAC,UAAU,CAAqB;IAgBzD,aAAa,CAAC,IAAI,EAAE;QAAE,KAAK,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,SAAS,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE;IAIrF,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa;IAYvC,MAAM,IAAI,GAAG,CAAC,gBAAgB;CAG/B;AAED,qBAAa,aAAc,SAAQ,GAAG,CAAC,aAAa;;gBACtC,MAAM,EAAE,OAAO,CAAC,QAAQ,CAAC;CAoBtC"}
|
package/dist/tts.js
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { AudioByteStream, tts } from '@livekit/agents';
|
|
5
|
+
import { OpenAI } from 'openai';
|
|
6
|
+
// Sample rate handed to the base tts.TTS constructor and to AudioByteStream when framing the response.
const OPENAI_TTS_SAMPLE_RATE = 24000;
|
|
7
|
+
// Channel count handed to the same two places as the sample rate above.
const OPENAI_TTS_CHANNELS = 1;
|
|
8
|
+
// Fallback option values for TTS. The TTS constructor spreads these first and the
// caller-supplied options over them.
const defaultTTSOptions = {
|
|
9
|
+
// Read from the environment once, when this module is evaluated.
apiKey: process.env.OPENAI_API_KEY,
|
|
10
|
+
model: 'tts-1',
|
|
11
|
+
voice: 'alloy',
|
|
12
|
+
speed: 1,
|
|
13
|
+
};
|
|
14
|
+
export class TTS extends tts.TTS {
    #opts;
    #client;
    /**
     * Create a new instance of OpenAI TTS.
     *
     * @remarks
     * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
     * `OPENAI_API_KEY` environment variable.
     *
     * @param opts - Options laid over `defaultTTSOptions`; omitted fields keep their defaults.
     * @throws Error when no API key is present after merging `opts` over the defaults.
     */
    constructor(opts = defaultTTSOptions) {
        super(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS, { streaming: false });
        this.#opts = { ...defaultTTSOptions, ...opts };
        if (this.#opts.apiKey === undefined) {
            throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
        }
        // Build the client from the merged options so the key that was just validated
        // (and every defaulted field) is exactly what is handed to the SDK.
        this.#client =
            this.#opts.client ||
                new OpenAI({
                    baseURL: this.#opts.baseURL,
                    apiKey: this.#opts.apiKey,
                });
    }
    /**
     * Merge `opts` over the stored options. The client is not rebuilt; only values read
     * by later `synthesize` calls change.
     */
    updateOptions(opts) {
        this.#opts = { ...this.#opts, ...opts };
    }
    /**
     * Start a speech request for `text` using the stored model, voice and speed, asking
     * for the `pcm` response format, and wrap the pending response in a ChunkedStream.
     */
    synthesize(text) {
        const pendingResponse = this.#client.audio.speech.create({
            input: text,
            model: this.#opts.model,
            voice: this.#opts.voice,
            response_format: 'pcm',
            speed: this.#opts.speed,
        });
        return new ChunkedStream(pendingResponse);
    }
    /** This method throws an error; streaming is unsupported on OpenAI TTS. */
    stream() {
        throw new Error('Streaming is not supported on OpenAI TTS');
    }
}
|
|
53
|
+
export class ChunkedStream extends tts.ChunkedStream {
    /**
     * @param stream - Pending speech response; its body is read in full and split into frames.
     */
    constructor(stream) {
        super();
        // Runs in the background; frames show up on `this.queue` as they are produced.
        this.#run(stream);
    }
    /**
     * Await the response, read its whole body, cut it into audio frames and queue them
     * under one freshly generated id used as both request and segment id.
     */
    async #run(stream) {
        try {
            const response = await stream;
            const buffer = await response.arrayBuffer();
            const requestId = crypto.randomUUID();
            const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);
            for (const frame of audioByteStream.write(buffer)) {
                this.queue.put({
                    frame,
                    requestId,
                    segmentId: requestId,
                });
            }
        }
        finally {
            // Close the queue even when the request or body read fails, so consumers
            // iterating the stream finish instead of waiting forever. The error itself
            // still propagates as the rejection of this method's promise.
            this.queue.close();
        }
    }
}
|
|
73
|
+
//# sourceMappingURL=tts.js.map
|
package/dist/tts.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tts.js","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAE,eAAe,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACvD,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAGhC,MAAM,sBAAsB,GAAG,KAAK,CAAC;AACrC,MAAM,mBAAmB,GAAG,CAAC,CAAC;AAW9B,MAAM,iBAAiB,GAAe;IACpC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;IAClC,KAAK,EAAE,OAAO;IACd,KAAK,EAAE,OAAO;IACd,KAAK,EAAE,CAAC;CACT,CAAC;AAEF,MAAM,OAAO,GAAI,SAAQ,GAAG,CAAC,GAAG;IAC9B,KAAK,CAAa;IAClB,OAAO,CAAS;IAEhB;;;;;;OAMG;IACH,YAAY,OAA4B,iBAAiB;QACvD,KAAK,CAAC,sBAAsB,EAAE,mBAAmB,EAAE,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;QAEzE,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,iBAAiB,EAAE,GAAG,IAAI,EAAE,CAAC;QAC/C,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;QAC9F,CAAC;QAED,IAAI,CAAC,OAAO;YACV,IAAI,CAAC,KAAK,CAAC,MAAM;gBACjB,IAAI,MAAM,CAAC;oBACT,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,MAAM,EAAE,IAAI,CAAC,MAAM;iBACpB,CAAC,CAAC;IACP,CAAC;IAED,aAAa,CAAC,IAAuE;QACnF,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,GAAG,IAAI,EAAE,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,IAAY;QACrB,OAAO,IAAI,aAAa,CACtB,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YAC/B,KAAK,EAAE,IAAI;YACX,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,eAAe,EAAE,KAAK;YACtB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;SACxB,CAAC,CACH,CAAC;IACJ,CAAC;IAED,MAAM;QACJ,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAC9D,CAAC;CACF;AAED,MAAM,OAAO,aAAc,SAAQ,GAAG,CAAC,aAAa;IAClD,YAAY,MAAyB;QACnC,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACpB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,MAAyB;QAClC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;QACtC,MAAM,eAAe,GAAG,IAAI,eAAe,CAAC,sBAAsB,EAAE,mBAAmB,CAAC,CAAC;QACzF,MAAM,MAAM,GAAG,eAAe,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAE7C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC;gBACb,KAAK;gBACL,SAAS;gBACT,SAAS,EAAE,SAAS;aACrB,CAAC,CAAC;QACL,CAAC;QACD,I
AAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;CACF"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-openai",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"description": "OpenAI plugin for LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -8,15 +8,19 @@
|
|
|
8
8
|
"type": "module",
|
|
9
9
|
"devDependencies": {
|
|
10
10
|
"@microsoft/api-extractor": "^7.35.0",
|
|
11
|
+
"@livekit/rtc-node": "^0.11.1",
|
|
11
12
|
"@types/ws": "^8.5.10",
|
|
12
|
-
"typescript": "^5.0.0"
|
|
13
|
+
"typescript": "^5.0.0",
|
|
14
|
+
"@livekit/agents": "^0.4.5"
|
|
13
15
|
},
|
|
14
16
|
"dependencies": {
|
|
15
|
-
"@livekit/rtc-node": "^0.11.1",
|
|
16
17
|
"openai": "^4.70.2",
|
|
17
18
|
"sharp": "^0.33.5",
|
|
18
|
-
"ws": "^8.16.0"
|
|
19
|
-
|
|
19
|
+
"ws": "^8.16.0"
|
|
20
|
+
},
|
|
21
|
+
"peerDependencies": {
|
|
22
|
+
"@livekit/rtc-node": "^0.11.1",
|
|
23
|
+
"@livekit/agents": "^0.4.5"
|
|
20
24
|
},
|
|
21
25
|
"scripts": {
|
|
22
26
|
"build": "tsc -b tsconfig.json",
|
package/src/index.ts
CHANGED
package/src/llm.ts
CHANGED
|
@@ -438,7 +438,11 @@ export class LLMStream extends llm.LLMStream {
|
|
|
438
438
|
function: {
|
|
439
439
|
name,
|
|
440
440
|
description: func.description,
|
|
441
|
-
parameters
|
|
441
|
+
// don't format parameters if they are raw openai params
|
|
442
|
+
parameters:
|
|
443
|
+
func.parameters.type == ('object' as const)
|
|
444
|
+
? func.parameters
|
|
445
|
+
: llm.oaiParams(func.parameters),
|
|
442
446
|
},
|
|
443
447
|
}))
|
|
444
448
|
: undefined;
|
package/src/models.ts
CHANGED
|
@@ -27,7 +27,13 @@ export type ChatModels =
|
|
|
27
27
|
| 'gpt-3.5-turbo-1106'
|
|
28
28
|
| 'gpt-3.5-turbo-16k-0613';
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
export type WhisperModels = 'whisper-1';
|
|
31
|
+
|
|
32
|
+
export type TTSModels = 'tts-1' | 'tts-1-hd';
|
|
33
|
+
|
|
34
|
+
export type TTSVoices = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
|
|
35
|
+
|
|
36
|
+
// adapters for OpenAI-compatible LLMs, TTSs, STTs
|
|
31
37
|
|
|
32
38
|
export type TelnyxChatModels =
|
|
33
39
|
| 'meta-llama/Meta-Llama-3.1-8B-Instruct'
|
|
@@ -56,6 +62,11 @@ export type GroqChatModels =
|
|
|
56
62
|
| 'gemma-7b-it'
|
|
57
63
|
| 'gemma2-9b-it';
|
|
58
64
|
|
|
65
|
+
export type GroqAudioModels =
|
|
66
|
+
| 'whisper-large-v3'
|
|
67
|
+
| 'distil-whisper-large-v3-en'
|
|
68
|
+
| 'whisper-large-v3-turbo';
|
|
69
|
+
|
|
59
70
|
export type DeepSeekChatModels = 'deepseek-coder' | 'deepseek-chat';
|
|
60
71
|
|
|
61
72
|
export type TogetherChatModels =
|
|
@@ -208,17 +208,32 @@ export type ResponseStatusDetails =
|
|
|
208
208
|
reason: 'turn_detected' | 'client_cancelled' | string;
|
|
209
209
|
};
|
|
210
210
|
|
|
211
|
+
export interface ModelUsage {
|
|
212
|
+
total_tokens: number;
|
|
213
|
+
input_tokens: number;
|
|
214
|
+
output_tokens: number;
|
|
215
|
+
input_token_details: {
|
|
216
|
+
text_tokens: number;
|
|
217
|
+
audio_tokens: number;
|
|
218
|
+
cached_tokens: number;
|
|
219
|
+
cached_tokens_details: {
|
|
220
|
+
text_tokens: number;
|
|
221
|
+
audio_tokens: number;
|
|
222
|
+
};
|
|
223
|
+
};
|
|
224
|
+
output_token_details: {
|
|
225
|
+
text_tokens: number;
|
|
226
|
+
audio_tokens: number;
|
|
227
|
+
};
|
|
228
|
+
}
|
|
229
|
+
|
|
211
230
|
export interface ResponseResource {
|
|
212
231
|
id: string;
|
|
213
232
|
object: 'realtime.response';
|
|
214
233
|
status: ResponseStatus;
|
|
215
234
|
status_details: ResponseStatusDetails;
|
|
216
235
|
output: ItemResource[];
|
|
217
|
-
usage?:
|
|
218
|
-
total_tokens: number;
|
|
219
|
-
input_tokens: number;
|
|
220
|
-
output_tokens: number;
|
|
221
|
-
};
|
|
236
|
+
usage?: ModelUsage;
|
|
222
237
|
}
|
|
223
238
|
|
|
224
239
|
// Client Events
|
|
@@ -37,7 +37,7 @@ export interface RealtimeResponse {
|
|
|
37
37
|
id: string;
|
|
38
38
|
status: api_proto.ResponseStatus;
|
|
39
39
|
statusDetails: api_proto.ResponseStatusDetails | null;
|
|
40
|
-
usage: api_proto.
|
|
40
|
+
usage: api_proto.ModelUsage | null;
|
|
41
41
|
output: RealtimeOutput[];
|
|
42
42
|
doneFut: Future;
|
|
43
43
|
}
|
|
@@ -630,7 +630,11 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
630
630
|
type: 'function' as const,
|
|
631
631
|
name,
|
|
632
632
|
description: func.description,
|
|
633
|
-
parameters:
|
|
633
|
+
parameters:
|
|
634
|
+
// don't format parameters if they are raw openai params
|
|
635
|
+
func.parameters.type == ('object' as const)
|
|
636
|
+
? func.parameters
|
|
637
|
+
: llm.oaiParams(func.parameters),
|
|
634
638
|
}))
|
|
635
639
|
: [];
|
|
636
640
|
|
|
@@ -842,8 +846,8 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
842
846
|
|
|
843
847
|
#getContent(ptr: ContentPtr): RealtimeContent {
|
|
844
848
|
const response = this.#pendingResponses[ptr.response_id];
|
|
845
|
-
const output = response
|
|
846
|
-
const content = output
|
|
849
|
+
const output = response!.output[ptr.output_index];
|
|
850
|
+
const content = output!.content[ptr.content_index]!;
|
|
847
851
|
return content;
|
|
848
852
|
}
|
|
849
853
|
|
|
@@ -936,10 +940,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
936
940
|
#handleResponseDone(event: api_proto.ResponseDoneEvent): void {
|
|
937
941
|
const responseData = event.response;
|
|
938
942
|
const responseId = responseData.id;
|
|
939
|
-
const response = this.#pendingResponses[responseId]
|
|
943
|
+
const response = this.#pendingResponses[responseId]!;
|
|
940
944
|
response.status = responseData.status;
|
|
941
945
|
response.statusDetails = responseData.status_details;
|
|
942
|
-
response.usage = responseData.usage;
|
|
946
|
+
response.usage = responseData.usage ?? null;
|
|
943
947
|
this.#pendingResponses[responseId] = response;
|
|
944
948
|
response.doneFut.resolve();
|
|
945
949
|
this.emit('response_done', response);
|
|
@@ -970,7 +974,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
970
974
|
content: [],
|
|
971
975
|
doneFut: new Future(),
|
|
972
976
|
};
|
|
973
|
-
response
|
|
977
|
+
response?.output.push(newOutput);
|
|
974
978
|
this.emit('response_output_added', newOutput);
|
|
975
979
|
}
|
|
976
980
|
|
|
@@ -978,9 +982,9 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
978
982
|
const responseId = event.response_id;
|
|
979
983
|
const response = this.#pendingResponses[responseId];
|
|
980
984
|
const outputIndex = event.output_index;
|
|
981
|
-
const output = response
|
|
985
|
+
const output = response!.output[outputIndex];
|
|
982
986
|
|
|
983
|
-
if (output
|
|
987
|
+
if (output?.type === 'function_call') {
|
|
984
988
|
if (!this.#fncCtx) {
|
|
985
989
|
this.#logger.error('function call received but no fncCtx is available');
|
|
986
990
|
return;
|
|
@@ -991,6 +995,11 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
991
995
|
if (item.type !== 'function_call') {
|
|
992
996
|
throw new Error('Expected function_call item');
|
|
993
997
|
}
|
|
998
|
+
const func = this.#fncCtx[item.name];
|
|
999
|
+
if (!func) {
|
|
1000
|
+
this.#logger.error(`no function with name ${item.name} in fncCtx`);
|
|
1001
|
+
return;
|
|
1002
|
+
}
|
|
994
1003
|
|
|
995
1004
|
this.emit('function_call_started', {
|
|
996
1005
|
callId: item.call_id,
|
|
@@ -1002,7 +1011,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1002
1011
|
`[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`,
|
|
1003
1012
|
);
|
|
1004
1013
|
|
|
1005
|
-
|
|
1014
|
+
func.execute(parsedArgs).then(
|
|
1006
1015
|
(content) => {
|
|
1007
1016
|
this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
|
|
1008
1017
|
this.emit('function_call_completed', {
|
|
@@ -1028,7 +1037,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1028
1037
|
);
|
|
1029
1038
|
}
|
|
1030
1039
|
|
|
1031
|
-
output
|
|
1040
|
+
output?.doneFut.resolve();
|
|
1032
1041
|
this.emit('response_output_done', output);
|
|
1033
1042
|
}
|
|
1034
1043
|
|
|
@@ -1036,7 +1045,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1036
1045
|
const responseId = event.response_id;
|
|
1037
1046
|
const response = this.#pendingResponses[responseId];
|
|
1038
1047
|
const outputIndex = event.output_index;
|
|
1039
|
-
const output = response
|
|
1048
|
+
const output = response!.output[outputIndex];
|
|
1040
1049
|
|
|
1041
1050
|
const textStream = new AsyncIterableQueue<string>();
|
|
1042
1051
|
const audioStream = new AsyncIterableQueue<AudioFrame>();
|
|
@@ -1052,7 +1061,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1052
1061
|
audioStream: audioStream,
|
|
1053
1062
|
toolCalls: [],
|
|
1054
1063
|
};
|
|
1055
|
-
output
|
|
1064
|
+
output?.content.push(newContent);
|
|
1056
1065
|
this.emit('response_content_added', newContent);
|
|
1057
1066
|
}
|
|
1058
1067
|
|
|
@@ -1061,11 +1070,13 @@ export class RealtimeSession extends multimodal.RealtimeSession {
|
|
|
1061
1070
|
this.emit('response_content_done', content);
|
|
1062
1071
|
}
|
|
1063
1072
|
|
|
1064
|
-
|
|
1065
|
-
|
|
1073
|
+
#handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
|
|
1074
|
+
this.emit('response_text_delta', event);
|
|
1075
|
+
}
|
|
1066
1076
|
|
|
1067
|
-
|
|
1068
|
-
|
|
1077
|
+
#handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {
|
|
1078
|
+
this.emit('response_text_done', event);
|
|
1079
|
+
}
|
|
1069
1080
|
|
|
1070
1081
|
#handleResponseAudioTranscriptDelta(event: api_proto.ResponseAudioTranscriptDeltaEvent): void {
|
|
1071
1082
|
const content = this.#getContent(event);
|
package/src/stt.ts
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { type AudioBuffer, mergeFrames, stt } from '@livekit/agents';
|
|
5
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
6
|
+
import { OpenAI } from 'openai';
|
|
7
|
+
import type { GroqAudioModels, WhisperModels } from './models.js';
|
|
8
|
+
|
|
9
|
+
/** Options accepted by the OpenAI `STT` class. */
export interface STTOptions {
|
|
10
|
+
/** API key; the constructor throws when this is still undefined after merging with the defaults. */
apiKey?: string;
|
|
11
|
+
/** Language sent as `language` on the transcription request unless `recognize` is given its own. */
language: string;
|
|
12
|
+
/** NOTE(review): not read anywhere in the STT class visible here — confirm whether it should take effect. */
detectLanguage: boolean;
|
|
13
|
+
/** Model name sent as `model` on the transcription request. */
model: WhisperModels | string;
|
|
14
|
+
/** Base URL given to the OpenAI client when the constructor creates one. */
baseURL?: string;
|
|
15
|
+
/** Pre-built OpenAI client; when set, the constructor uses it instead of creating its own. */
client?: OpenAI;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
// Fallback option values for STT. The STT constructor spreads these first and the
// caller-supplied options over them, so any field the caller omits keeps the value below.
const defaultSTTOptions: STTOptions = {
|
|
19
|
+
// Read from the environment once, when this module is evaluated.
apiKey: process.env.OPENAI_API_KEY,
|
|
20
|
+
language: 'en',
|
|
21
|
+
// NOTE(review): this flag is not read by the STT class in this file — confirm intended use.
detectLanguage: false,
|
|
22
|
+
model: 'whisper-1',
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
export class STT extends stt.STT {
  #opts: STTOptions;
  #client: OpenAI;

  /**
   * Create a new instance of OpenAI STT.
   *
   * @remarks
   * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
   * `OPENAI_API_KEY` environment variable.
   *
   * @param opts - Options laid over `defaultSTTOptions`; omitted fields keep their defaults.
   * @throws Error when no API key is present after merging `opts` over the defaults.
   */
  constructor(opts: Partial<STTOptions> = defaultSTTOptions) {
    super({ streaming: false, interimResults: false });

    this.#opts = { ...defaultSTTOptions, ...opts };
    if (this.#opts.apiKey === undefined) {
      throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
    }

    // Build the client from the merged options so the key that was just validated
    // (and every defaulted field) is exactly what is handed to the SDK.
    this.#client =
      this.#opts.client ||
      new OpenAI({
        baseURL: this.#opts.baseURL,
        apiKey: this.#opts.apiKey,
      });
  }

  /**
   * Create a new instance of Groq STT.
   *
   * @remarks
   * `apiKey` must be set to your Groq API key, either using the argument or by setting the
   * `GROQ_API_KEY` environment variable.
   *
   * @param opts - Options laid over the Groq model and base URL defaults.
   * @throws Error when neither `opts.apiKey` nor `GROQ_API_KEY` provides a key.
   */
  static withGroq(
    opts: Partial<{
      model: string | GroqAudioModels;
      apiKey?: string;
      baseURL?: string;
      client: OpenAI;
      language: string;
      detectLanguage: boolean;
    }> = {},
  ): STT {
    // Resolve the key into a local so the caller's options object is never written to.
    const apiKey = opts.apiKey || process.env.GROQ_API_KEY;
    if (apiKey === undefined) {
      throw new Error('Groq API key is required, whether as an argument or as $GROQ_API_KEY');
    }

    return new STT({
      model: 'whisper-large-v3-turbo',
      baseURL: 'https://api.groq.com/openai/v1',
      ...opts,
      apiKey,
    });
  }

  /**
   * Return the options to use for one request: the stored options, with `language`
   * replaced when a truthy per-call language is given.
   */
  #sanitizeOptions(language?: string): STTOptions {
    return language ? { ...this.#opts, language } : this.#opts;
  }

  /**
   * Wrap the samples of `frame` in a 44-byte RIFF/WAVE header describing 16-bit PCM
   * and return header and samples as a single Buffer.
   */
  #createWav(frame: AudioFrame): Buffer {
    const bitsPerSample = 16;
    const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8;
    const blockAlign = (frame.channels * bitsPerSample) / 8;
    const dataBytes = frame.data.byteLength;

    const header = Buffer.alloc(44);
    header.write('RIFF', 0);
    header.writeUInt32LE(36 + dataBytes, 4); // RIFF chunk size: rest of header + samples
    header.write('WAVE', 8);
    header.write('fmt ', 12);
    header.writeUInt32LE(16, 16); // size of the fmt chunk body
    header.writeUInt16LE(1, 20); // format tag 1 (PCM)
    header.writeUInt16LE(frame.channels, 22);
    header.writeUInt32LE(frame.sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write('data', 36);
    header.writeUInt32LE(dataBytes, 40);

    // Copy only the bytes the typed-array view covers. The backing ArrayBuffer may be
    // larger than the view, which would make the payload disagree with the sizes above.
    const samples = Buffer.from(frame.data.buffer, frame.data.byteOffset, dataBytes);
    return Buffer.concat([header, samples]);
  }

  /**
   * Transcribe `buffer` with a single, non-streaming transcription request.
   *
   * @param buffer - Audio to transcribe; merged into one frame before upload.
   * @param language - Optional per-call language overriding the configured one.
   * @returns A FINAL_TRANSCRIPT event with one alternative holding the response text.
   */
  async recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
    // NOTE(review): `detectLanguage` is not consulted here; the configured language is always sent.
    const config = this.#sanitizeOptions(language);
    buffer = mergeFrames(buffer);
    const file = new File([this.#createWav(buffer)], 'audio.wav', { type: 'audio/wav' });
    const resp = await this.#client.audio.transcriptions.create({
      file,
      model: config.model,
      language: config.language,
      response_format: 'json',
    });

    return {
      type: stt.SpeechEventType.FINAL_TRANSCRIPT,
      alternatives: [
        {
          text: resp.text || '',
          // Reports only the per-call language argument, as before.
          language: language || '',
          startTime: 0,
          endTime: 0,
          confidence: 0,
        },
      ],
    };
  }

  /** This method throws an error; streaming is unsupported on OpenAI STT. */
  stream(): stt.SpeechStream {
    throw new Error('Streaming is not supported on OpenAI STT');
  }
}
|