@livekit/agents-plugin-openai 0.4.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/stt.js ADDED
@@ -0,0 +1,109 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { mergeFrames, stt } from '@livekit/agents';
5
+ import { OpenAI } from 'openai';
6
/** Defaults merged underneath the caller's options by the `STT` constructor. */
const defaultSTTOptions = {
  apiKey: process.env.OPENAI_API_KEY,
  language: 'en',
  detectLanguage: false,
  model: 'whisper-1',
};

/** Non-streaming speech-to-text that sends merged audio to an OpenAI-style transcription API. */
export class STT extends stt.STT {
  #opts;
  #client;

  /**
   * Create a new instance of OpenAI STT.
   *
   * @remarks
   * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
   * `OPENAI_API_KEY` environmental variable.
   *
   * @param opts - Partial options; anything omitted falls back to `defaultSTTOptions`.
   * @throws Error if no API key is available after merging the defaults.
   */
  constructor(opts = defaultSTTOptions) {
    super({ streaming: false, interimResults: false });
    this.#opts = { ...defaultSTTOptions, ...opts };
    if (this.#opts.apiKey === undefined) {
      throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
    }
    // Build the client from the merged options so defaulted values (e.g. the API key taken
    // from the environment) are the ones actually handed to the client.
    this.#client =
      this.#opts.client ||
      new OpenAI({
        baseURL: this.#opts.baseURL,
        apiKey: this.#opts.apiKey,
      });
  }

  /**
   * Create a new instance of Groq STT.
   *
   * @remarks
   * `apiKey` must be set to your Groq API key, either using the argument or by setting the
   * `GROQ_API_KEY` environmental variable.
   *
   * @param opts - Partial options; `model` and `baseURL` default to Groq values.
   * @throws Error if no API key is available.
   */
  static withGroq(opts = {}) {
    // Resolve the key into a local so the caller's object is left untouched.
    const apiKey = opts.apiKey || process.env.GROQ_API_KEY;
    if (apiKey === undefined) {
      throw new Error('Groq API key is required, whether as an argument or as $GROQ_API_KEY');
    }
    return new STT({
      model: 'whisper-large-v3-turbo',
      baseURL: 'https://api.groq.com/openai/v1',
      ...opts,
      apiKey,
    });
  }

  /**
   * Resolve the options for a single request.
   *
   * An explicit `language` wins. Otherwise, when `detectLanguage` is set, the language is
   * blanked so no hint is sent; otherwise the configured options are used as-is.
   */
  #sanitizeOptions(language) {
    if (language) {
      return { ...this.#opts, language };
    }
    if (this.#opts.detectLanguage) {
      return { ...this.#opts, language: '' };
    }
    return this.#opts;
  }

  /**
   * Wrap the frame's samples in a 44-byte RIFF/WAVE header declaring PCM (format tag 1) at
   * 16 bits per sample.
   */
  #createWav(frame) {
    const bitsPerSample = 16;
    const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8;
    const blockAlign = (frame.channels * bitsPerSample) / 8;
    const dataLength = frame.data.byteLength;

    const header = Buffer.alloc(44);
    header.write('RIFF', 0);
    header.writeUInt32LE(36 + dataLength, 4); // RIFF chunk size: rest of header + payload
    header.write('WAVE', 8);
    header.write('fmt ', 12);
    header.writeUInt32LE(16, 16); // size of the fmt chunk
    header.writeUInt16LE(1, 20); // format tag 1
    header.writeUInt16LE(frame.channels, 22);
    header.writeUInt32LE(frame.sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write('data', 36);
    header.writeUInt32LE(dataLength, 40);

    // Copy only the bytes covered by the view: the backing ArrayBuffer may be larger than
    // the view or start at a non-zero offset, which would contradict the header's length.
    const payload = Buffer.from(frame.data.buffer, frame.data.byteOffset, dataLength);
    return Buffer.concat([header, payload]);
  }

  /**
   * Transcribe a complete audio buffer in a single request.
   *
   * @param buffer - Audio to transcribe; it is merged into one frame first.
   * @param language - Optional per-call language overriding the configured one.
   * @returns A final-transcript event with one alternative. Timing and confidence are
   * reported as 0.
   */
  async recognize(buffer, language) {
    const config = this.#sanitizeOptions(language);
    const merged = mergeFrames(buffer);
    const file = new File([this.#createWav(merged)], 'audio.wav', { type: 'audio/wav' });
    const resp = await this.#client.audio.transcriptions.create({
      file,
      model: config.model,
      // An empty language means "no hint"; omit the field rather than sending ''.
      language: config.language || undefined,
      response_format: 'json',
    });
    return {
      type: stt.SpeechEventType.FINAL_TRANSCRIPT,
      alternatives: [
        {
          text: resp.text || '',
          // Report the language that was actually sent, not just the per-call argument.
          language: config.language || '',
          startTime: 0,
          endTime: 0,
          confidence: 0,
        },
      ],
    };
  }

  /** This method throws an error; streaming is unsupported on OpenAI STT. */
  stream() {
    throw new Error('Streaming is not supported on OpenAI STT');
  }
}
109
+ //# sourceMappingURL=stt.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"stt.js","sourceRoot":"","sources":["../src/stt.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAoB,WAAW,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAErE,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAYhC,MAAM,iBAAiB,GAAe;IACpC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;IAClC,QAAQ,EAAE,IAAI;IACd,cAAc,EAAE,KAAK;IACrB,KAAK,EAAE,WAAW;CACnB,CAAC;AAEF,MAAM,OAAO,GAAI,SAAQ,GAAG,CAAC,GAAG;IAC9B,KAAK,CAAa;IAClB,OAAO,CAAS;IAEhB;;;;;;OAMG;IACH,YAAY,OAA4B,iBAAiB;QACvD,KAAK,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,cAAc,EAAE,KAAK,EAAE,CAAC,CAAC;QAEnD,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,iBAAiB,EAAE,GAAG,IAAI,EAAE,CAAC;QAC/C,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;QAC9F,CAAC;QAED,IAAI,CAAC,OAAO;YACV,IAAI,CAAC,KAAK,CAAC,MAAM;gBACjB,IAAI,MAAM,CAAC;oBACT,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,MAAM,EAAE,IAAI,CAAC,MAAM;iBACpB,CAAC,CAAC;IACP,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,QAAQ,CACb,OAOK,EAAE;QAEP,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC;QACtD,IAAI,IAAI,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YAC9B,MAAM,IAAI,KAAK,CAAC,sEAAsE,CAAC,CAAC;QAC1F,CAAC;QAED,OAAO,IAAI,GAAG,CAAC;YACb,KAAK,EAAE,wBAAwB;YAC/B,OAAO,EAAE,gCAAgC;YACzC,GAAG,IAAI;SACR,CAAC,CAAC;IACL,CAAC;IAED,gBAAgB,CAAC,QAAiB;QAChC,IAAI,QAAQ,EAAE,CAAC;YACb,OAAO,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC;QACrC,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,KAAK,CAAC;QACpB,CAAC;IACH,CAAC;IAED,UAAU,CAAC,KAAiB;QAC1B,MAAM,aAAa,GAAG,EAAE,CAAC;QACzB,MAAM,QAAQ,GAAG,CAAC,KAAK,CAAC,UAAU,GAAG,KAAK,CAAC,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;QACzE,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;QAExD,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAChC,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACxB,MAAM,CAAC,aAAa,CAAC,EAAE,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACxB,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACzB,MAAM,CAAC,aAAa,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;QAC7B,MAAM,CAAC,aAAa,CAAC,CAAC,EAAE,EAAE,CAAC,
CAAC;QAC5B,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACzC,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAC3C,MAAM,CAAC,aAAa,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACnC,MAAM,CAAC,aAAa,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QACrC,MAAM,CAAC,aAAa,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;QAC7B,MAAM,CAAC,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACzB,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAChD,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;IACjE,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,MAAmB,EAAE,QAAiB;QACpD,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QAC/C,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;QAC7B,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;QACrF,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,cAAc,CAAC,MAAM,CAAC;YAC1D,IAAI;YACJ,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,eAAe,EAAE,MAAM;SACxB,CAAC,CAAC;QAEH,OAAO;YACL,IAAI,EAAE,GAAG,CAAC,eAAe,CAAC,gBAAgB;YAC1C,YAAY,EAAE;gBACZ;oBACE,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,EAAE;oBACrB,QAAQ,EAAE,QAAQ,IAAI,EAAE;oBACxB,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,UAAU,EAAE,CAAC;iBACd;aACF;SACF,CAAC;IACJ,CAAC;IAED,2EAA2E;IAC3E,MAAM;QACJ,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAC9D,CAAC;CACF"}
package/dist/tts.d.ts ADDED
@@ -0,0 +1,34 @@
1
+ import { tts } from '@livekit/agents';
2
+ import { OpenAI } from 'openai';
3
+ import type { TTSModels, TTSVoices } from './models.js';
4
/** Options accepted by the OpenAI `TTS` constructor. */
export interface TTSOptions {
    /** Model identifier sent with each speech request. */
    model: TTSModels | string;
    /** Voice sent with each speech request. */
    voice: TTSVoices;
    /** Speed value sent with each speech request. */
    speed: number;
    /** API key handed to the client when one is created internally. */
    apiKey?: string;
    /** Base URL handed to the client when one is created internally. */
    baseURL?: string;
    /** Pre-built client; when supplied it is used instead of creating a new one. */
    client?: OpenAI;
}
12
/** Text-to-speech backed by the OpenAI speech endpoint; only chunked synthesis is offered. */
export declare class TTS extends tts.TTS {
    #private;
    /**
     * Create a new instance of OpenAI TTS.
     *
     * @remarks
     * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
     * `OPENAI_API_KEY` environmental variable.
     */
    constructor(opts?: Partial<TTSOptions>);
    /** Overlay new `model`, `voice` and/or `speed` values onto the current options. */
    updateOptions(opts: {
        model?: TTSModels | string;
        voice?: TTSVoices;
        speed?: number;
    }): void;
    /** Request speech for `text` and return a stream of the resulting audio. */
    synthesize(text: string): ChunkedStream;
    /** Always throws: streaming synthesis is not supported. */
    stream(): tts.SynthesizeStream;
}
30
/** Stream of audio frames produced from a single pending speech response. */
export declare class ChunkedStream extends tts.ChunkedStream {
    #private;
    /**
     * @param stream - Pending HTTP response whose body carries the synthesized audio.
     */
    constructor(stream: Promise<Response>);
}
34
+ //# sourceMappingURL=tts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAGA,OAAO,EAAmB,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACvD,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAKxD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,SAAS,GAAG,MAAM,CAAC;IAC1B,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AASD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAI9B;;;;;;OAMG;gBACS,IAAI,GAAE,OAAO,CAAC,UAAU,CAAqB;IAgBzD,aAAa,CAAC,IAAI,EAAE;QAAE,KAAK,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,SAAS,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE;IAIrF,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa;IAYvC,MAAM,IAAI,GAAG,CAAC,gBAAgB;CAG/B;AAED,qBAAa,aAAc,SAAQ,GAAG,CAAC,aAAa;;gBACtC,MAAM,EAAE,OAAO,CAAC,QAAQ,CAAC;CAoBtC"}
package/dist/tts.js ADDED
@@ -0,0 +1,73 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AudioByteStream, tts } from '@livekit/agents';
5
+ import { OpenAI } from 'openai';
6
/** Sample rate used for the frames built from the `pcm` speech response. */
const OPENAI_TTS_SAMPLE_RATE = 24000;
/** Channel count used for the frames built from the `pcm` speech response. */
const OPENAI_TTS_CHANNELS = 1;

/** Defaults merged underneath the caller's options by the `TTS` constructor. */
const defaultTTSOptions = {
  apiKey: process.env.OPENAI_API_KEY,
  model: 'tts-1',
  voice: 'alloy',
  speed: 1,
};

/** Text-to-speech backed by the OpenAI speech endpoint; only chunked synthesis is offered. */
export class TTS extends tts.TTS {
  #opts;
  #client;

  /**
   * Create a new instance of OpenAI TTS.
   *
   * @remarks
   * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
   * `OPENAI_API_KEY` environmental variable.
   *
   * @param opts - Partial options; anything omitted falls back to `defaultTTSOptions`.
   * @throws Error if no API key is available after merging the defaults.
   */
  constructor(opts = defaultTTSOptions) {
    super(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS, { streaming: false });
    this.#opts = { ...defaultTTSOptions, ...opts };
    if (this.#opts.apiKey === undefined) {
      throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
    }
    // Build the client from the merged options so defaulted values (e.g. the API key taken
    // from the environment) are the ones actually handed to the client.
    this.#client =
      this.#opts.client ||
      new OpenAI({
        baseURL: this.#opts.baseURL,
        apiKey: this.#opts.apiKey,
      });
  }

  /** Overlay the given values onto the current options; later requests use the result. */
  updateOptions(opts) {
    this.#opts = { ...this.#opts, ...opts };
  }

  /**
   * Start a speech request for `text` and wrap the pending response in a `ChunkedStream`.
   * Audio is requested as `pcm`.
   */
  synthesize(text) {
    const pendingResponse = this.#client.audio.speech.create({
      input: text,
      model: this.#opts.model,
      voice: this.#opts.voice,
      response_format: 'pcm',
      speed: this.#opts.speed,
    });
    return new ChunkedStream(pendingResponse);
  }

  /** This method throws an error; streaming is unsupported on OpenAI TTS. */
  stream() {
    throw new Error('Streaming is not supported on OpenAI TTS');
  }
}
53
/** Turns one pending speech response into audio frames on the inherited queue. */
export class ChunkedStream extends tts.ChunkedStream {
  /**
   * @param stream - Pending response whose body carries the synthesized audio.
   */
  constructor(stream) {
    super();
    // Started without awaiting, so frames are produced in the background.
    this.#run(stream);
  }

  /**
   * Read the whole response body, split it into frames and queue each one.
   * The queue is closed even when the request or decoding fails, so consumers are not
   * left waiting on a stream that will never finish; the failure itself still rejects.
   */
  async #run(stream) {
    try {
      const response = await stream;
      const buffer = await response.arrayBuffer();
      // A single id is used as both request and segment id for every frame.
      const requestId = crypto.randomUUID();
      const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);
      for (const frame of audioByteStream.write(buffer)) {
        this.queue.put({
          frame,
          requestId,
          segmentId: requestId,
        });
      }
    } finally {
      this.queue.close();
    }
  }
}
73
+ //# sourceMappingURL=tts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tts.js","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAE,eAAe,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AACvD,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAGhC,MAAM,sBAAsB,GAAG,KAAK,CAAC;AACrC,MAAM,mBAAmB,GAAG,CAAC,CAAC;AAW9B,MAAM,iBAAiB,GAAe;IACpC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;IAClC,KAAK,EAAE,OAAO;IACd,KAAK,EAAE,OAAO;IACd,KAAK,EAAE,CAAC;CACT,CAAC;AAEF,MAAM,OAAO,GAAI,SAAQ,GAAG,CAAC,GAAG;IAC9B,KAAK,CAAa;IAClB,OAAO,CAAS;IAEhB;;;;;;OAMG;IACH,YAAY,OAA4B,iBAAiB;QACvD,KAAK,CAAC,sBAAsB,EAAE,mBAAmB,EAAE,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;QAEzE,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,iBAAiB,EAAE,GAAG,IAAI,EAAE,CAAC;QAC/C,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAC;QAC9F,CAAC;QAED,IAAI,CAAC,OAAO;YACV,IAAI,CAAC,KAAK,CAAC,MAAM;gBACjB,IAAI,MAAM,CAAC;oBACT,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,MAAM,EAAE,IAAI,CAAC,MAAM;iBACpB,CAAC,CAAC;IACP,CAAC;IAED,aAAa,CAAC,IAAuE;QACnF,IAAI,CAAC,KAAK,GAAG,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,GAAG,IAAI,EAAE,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,IAAY;QACrB,OAAO,IAAI,aAAa,CACtB,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YAC/B,KAAK,EAAE,IAAI;YACX,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;YACvB,eAAe,EAAE,KAAK;YACtB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK;SACxB,CAAC,CACH,CAAC;IACJ,CAAC;IAED,MAAM;QACJ,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAC9D,CAAC;CACF;AAED,MAAM,OAAO,aAAc,SAAQ,GAAG,CAAC,aAAa;IAClD,YAAY,MAAyB;QACnC,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACpB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,MAAyB;QAClC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;QACtC,MAAM,eAAe,GAAG,IAAI,eAAe,CAAC,sBAAsB,EAAE,mBAAmB,CAAC,CAAC;QACzF,MAAM,MAAM,GAAG,eAAe,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAE7C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC;gBACb,KAAK;gBACL,SAAS;gBACT,SAAS,EAAE,SAAS;aACrB,CAAC,CAAC;QACL,CAAC;QACD
,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;CACF"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents-plugin-openai",
3
- "version": "0.4.3",
3
+ "version": "0.6.0",
4
4
  "description": "OpenAI plugin for LiveKit Node Agents",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -8,15 +8,19 @@
8
8
  "type": "module",
9
9
  "devDependencies": {
10
10
  "@microsoft/api-extractor": "^7.35.0",
11
+ "@livekit/rtc-node": "^0.11.1",
11
12
  "@types/ws": "^8.5.10",
12
- "typescript": "^5.0.0"
13
+ "typescript": "^5.0.0",
14
+ "@livekit/agents": "^0.4.5"
13
15
  },
14
16
  "dependencies": {
15
- "@livekit/rtc-node": "^0.11.1",
16
17
  "openai": "^4.70.2",
17
18
  "sharp": "^0.33.5",
18
- "ws": "^8.16.0",
19
- "@livekit/agents": "0.4.3"
19
+ "ws": "^8.16.0"
20
+ },
21
+ "peerDependencies": {
22
+ "@livekit/rtc-node": "^0.11.1",
23
+ "@livekit/agents": "^0.4.5"
20
24
  },
21
25
  "scripts": {
22
26
  "build": "tsc -b tsconfig.json",
package/src/index.ts CHANGED
@@ -4,3 +4,5 @@
4
4
  export * as realtime from './realtime/index.js';
5
5
  export * from './models.js';
6
6
  export { type LLMOptions, LLM, LLMStream } from './llm.js';
7
+ export { type STTOptions, STT } from './stt.js';
8
+ export { type TTSOptions, TTS, ChunkedStream } from './tts.js';
package/src/llm.ts CHANGED
@@ -438,7 +438,11 @@ export class LLMStream extends llm.LLMStream {
438
438
  function: {
439
439
  name,
440
440
  description: func.description,
441
- parameters: llm.oaiParams(func.parameters),
441
+ // don't format parameters if they are raw openai params
442
+ parameters:
443
+ func.parameters.type == ('object' as const)
444
+ ? func.parameters
445
+ : llm.oaiParams(func.parameters),
442
446
  },
443
447
  }))
444
448
  : undefined;
package/src/models.ts CHANGED
@@ -27,7 +27,13 @@ export type ChatModels =
27
27
  | 'gpt-3.5-turbo-1106'
28
28
  | 'gpt-3.5-turbo-16k-0613';
29
29
 
30
- // adapters for OpenAI-compatible LLMs
30
+ export type WhisperModels = 'whisper-1';
31
+
32
+ export type TTSModels = 'tts-1' | 'tts-1-hd';
33
+
34
+ export type TTSVoices = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
35
+
36
+ // adapters for OpenAI-compatible LLMs, TTSs, STTs
31
37
 
32
38
  export type TelnyxChatModels =
33
39
  | 'meta-llama/Meta-Llama-3.1-8B-Instruct'
@@ -56,6 +62,11 @@ export type GroqChatModels =
56
62
  | 'gemma-7b-it'
57
63
  | 'gemma2-9b-it';
58
64
 
65
+ export type GroqAudioModels =
66
+ | 'whisper-large-v3'
67
+ | 'distil-whisper-large-v3-en'
68
+ | 'whisper-large-v3-turbo';
69
+
59
70
  export type DeepSeekChatModels = 'deepseek-coder' | 'deepseek-chat';
60
71
 
61
72
  export type TogetherChatModels =
@@ -208,17 +208,32 @@ export type ResponseStatusDetails =
208
208
  reason: 'turn_detected' | 'client_cancelled' | string;
209
209
  };
210
210
 
211
+ export interface ModelUsage {
212
+ total_tokens: number;
213
+ input_tokens: number;
214
+ output_tokens: number;
215
+ input_token_details: {
216
+ text_tokens: number;
217
+ audio_tokens: number;
218
+ cached_tokens: number;
219
+ cached_tokens_details: {
220
+ text_tokens: number;
221
+ audio_tokens: number;
222
+ };
223
+ };
224
+ output_token_details: {
225
+ text_tokens: number;
226
+ audio_tokens: number;
227
+ };
228
+ }
229
+
211
230
  export interface ResponseResource {
212
231
  id: string;
213
232
  object: 'realtime.response';
214
233
  status: ResponseStatus;
215
234
  status_details: ResponseStatusDetails;
216
235
  output: ItemResource[];
217
- usage?: {
218
- total_tokens: number;
219
- input_tokens: number;
220
- output_tokens: number;
221
- };
236
+ usage?: ModelUsage;
222
237
  }
223
238
 
224
239
  // Client Events
@@ -37,7 +37,7 @@ export interface RealtimeResponse {
37
37
  id: string;
38
38
  status: api_proto.ResponseStatus;
39
39
  statusDetails: api_proto.ResponseStatusDetails | null;
40
- usage: api_proto.ResponseResource['usage'] | null;
40
+ usage: api_proto.ModelUsage | null;
41
41
  output: RealtimeOutput[];
42
42
  doneFut: Future;
43
43
  }
@@ -630,7 +630,11 @@ export class RealtimeSession extends multimodal.RealtimeSession {
630
630
  type: 'function' as const,
631
631
  name,
632
632
  description: func.description,
633
- parameters: llm.oaiParams(func.parameters),
633
+ parameters:
634
+ // don't format parameters if they are raw openai params
635
+ func.parameters.type == ('object' as const)
636
+ ? func.parameters
637
+ : llm.oaiParams(func.parameters),
634
638
  }))
635
639
  : [];
636
640
 
@@ -842,8 +846,8 @@ export class RealtimeSession extends multimodal.RealtimeSession {
842
846
 
843
847
  #getContent(ptr: ContentPtr): RealtimeContent {
844
848
  const response = this.#pendingResponses[ptr.response_id];
845
- const output = response.output[ptr.output_index];
846
- const content = output.content[ptr.content_index];
849
+ const output = response!.output[ptr.output_index];
850
+ const content = output!.content[ptr.content_index]!;
847
851
  return content;
848
852
  }
849
853
 
@@ -936,10 +940,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
936
940
  #handleResponseDone(event: api_proto.ResponseDoneEvent): void {
937
941
  const responseData = event.response;
938
942
  const responseId = responseData.id;
939
- const response = this.#pendingResponses[responseId];
943
+ const response = this.#pendingResponses[responseId]!;
940
944
  response.status = responseData.status;
941
945
  response.statusDetails = responseData.status_details;
942
- response.usage = responseData.usage;
946
+ response.usage = responseData.usage ?? null;
943
947
  this.#pendingResponses[responseId] = response;
944
948
  response.doneFut.resolve();
945
949
  this.emit('response_done', response);
@@ -970,7 +974,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
970
974
  content: [],
971
975
  doneFut: new Future(),
972
976
  };
973
- response.output.push(newOutput);
977
+ response?.output.push(newOutput);
974
978
  this.emit('response_output_added', newOutput);
975
979
  }
976
980
 
@@ -978,9 +982,9 @@ export class RealtimeSession extends multimodal.RealtimeSession {
978
982
  const responseId = event.response_id;
979
983
  const response = this.#pendingResponses[responseId];
980
984
  const outputIndex = event.output_index;
981
- const output = response.output[outputIndex];
985
+ const output = response!.output[outputIndex];
982
986
 
983
- if (output.type === 'function_call') {
987
+ if (output?.type === 'function_call') {
984
988
  if (!this.#fncCtx) {
985
989
  this.#logger.error('function call received but no fncCtx is available');
986
990
  return;
@@ -991,6 +995,11 @@ export class RealtimeSession extends multimodal.RealtimeSession {
991
995
  if (item.type !== 'function_call') {
992
996
  throw new Error('Expected function_call item');
993
997
  }
998
+ const func = this.#fncCtx[item.name];
999
+ if (!func) {
1000
+ this.#logger.error(`no function with name ${item.name} in fncCtx`);
1001
+ return;
1002
+ }
994
1003
 
995
1004
  this.emit('function_call_started', {
996
1005
  callId: item.call_id,
@@ -1002,7 +1011,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
1002
1011
  `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`,
1003
1012
  );
1004
1013
 
1005
- this.#fncCtx[item.name].execute(parsedArgs).then(
1014
+ func.execute(parsedArgs).then(
1006
1015
  (content) => {
1007
1016
  this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
1008
1017
  this.emit('function_call_completed', {
@@ -1028,7 +1037,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
1028
1037
  );
1029
1038
  }
1030
1039
 
1031
- output.doneFut.resolve();
1040
+ output?.doneFut.resolve();
1032
1041
  this.emit('response_output_done', output);
1033
1042
  }
1034
1043
 
@@ -1036,7 +1045,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
1036
1045
  const responseId = event.response_id;
1037
1046
  const response = this.#pendingResponses[responseId];
1038
1047
  const outputIndex = event.output_index;
1039
- const output = response.output[outputIndex];
1048
+ const output = response!.output[outputIndex];
1040
1049
 
1041
1050
  const textStream = new AsyncIterableQueue<string>();
1042
1051
  const audioStream = new AsyncIterableQueue<AudioFrame>();
@@ -1052,7 +1061,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
1052
1061
  audioStream: audioStream,
1053
1062
  toolCalls: [],
1054
1063
  };
1055
- output.content.push(newContent);
1064
+ output?.content.push(newContent);
1056
1065
  this.emit('response_content_added', newContent);
1057
1066
  }
1058
1067
 
@@ -1061,11 +1070,13 @@ export class RealtimeSession extends multimodal.RealtimeSession {
1061
1070
  this.emit('response_content_done', content);
1062
1071
  }
1063
1072
 
1064
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
1065
- #handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {}
1073
+ #handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
1074
+ this.emit('response_text_delta', event);
1075
+ }
1066
1076
 
1067
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
1068
- #handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {}
1077
+ #handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {
1078
+ this.emit('response_text_done', event);
1079
+ }
1069
1080
 
1070
1081
  #handleResponseAudioTranscriptDelta(event: api_proto.ResponseAudioTranscriptDeltaEvent): void {
1071
1082
  const content = this.#getContent(event);
package/src/stt.ts ADDED
@@ -0,0 +1,140 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { type AudioBuffer, mergeFrames, stt } from '@livekit/agents';
5
+ import type { AudioFrame } from '@livekit/rtc-node';
6
+ import { OpenAI } from 'openai';
7
+ import type { GroqAudioModels, WhisperModels } from './models.js';
8
+
9
/** Options accepted by the OpenAI `STT` constructor. */
export interface STTOptions {
  /** Model identifier sent with each transcription request. */
  model: WhisperModels | string;
  /** Language sent with each transcription request unless a per-call language is given. */
  language: string;
  /**
   * Language auto-detection flag.
   * NOTE(review): confirm how the STT class consumes this flag.
   */
  detectLanguage: boolean;
  /** API key handed to the client when one is created internally. */
  apiKey?: string;
  /** Base URL handed to the client when one is created internally. */
  baseURL?: string;
  /** Pre-built client; when supplied it is used instead of creating a new one. */
  client?: OpenAI;
}
17
+
18
/** Defaults merged underneath the caller's options by the `STT` constructor. */
const defaultSTTOptions: STTOptions = {
  apiKey: process.env.OPENAI_API_KEY,
  language: 'en',
  detectLanguage: false,
  model: 'whisper-1',
};

/** Non-streaming speech-to-text that sends merged audio to an OpenAI-style transcription API. */
export class STT extends stt.STT {
  #opts: STTOptions;
  #client: OpenAI;

  /**
   * Create a new instance of OpenAI STT.
   *
   * @remarks
   * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the
   * `OPENAI_API_KEY` environmental variable.
   *
   * @param opts - Partial options; anything omitted falls back to `defaultSTTOptions`.
   * @throws Error if no API key is available after merging the defaults.
   */
  constructor(opts: Partial<STTOptions> = defaultSTTOptions) {
    super({ streaming: false, interimResults: false });

    this.#opts = { ...defaultSTTOptions, ...opts };
    if (this.#opts.apiKey === undefined) {
      throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
    }

    // Build the client from the merged options so defaulted values (e.g. the API key taken
    // from the environment) are the ones actually handed to the client.
    this.#client =
      this.#opts.client ||
      new OpenAI({
        baseURL: this.#opts.baseURL,
        apiKey: this.#opts.apiKey,
      });
  }

  /**
   * Create a new instance of Groq STT.
   *
   * @remarks
   * `apiKey` must be set to your Groq API key, either using the argument or by setting the
   * `GROQ_API_KEY` environmental variable.
   *
   * @param opts - Partial options; `model` and `baseURL` default to Groq values.
   * @throws Error if no API key is available.
   */
  static withGroq(
    opts: Partial<{
      model: string | GroqAudioModels;
      apiKey?: string;
      baseURL?: string;
      client: OpenAI;
      language: string;
      detectLanguage: boolean;
    }> = {},
  ): STT {
    // Resolve the key into a local so the caller's object is left untouched.
    const apiKey = opts.apiKey || process.env.GROQ_API_KEY;
    if (apiKey === undefined) {
      throw new Error('Groq API key is required, whether as an argument or as $GROQ_API_KEY');
    }

    return new STT({
      model: 'whisper-large-v3-turbo',
      baseURL: 'https://api.groq.com/openai/v1',
      ...opts,
      apiKey,
    });
  }

  /**
   * Resolve the options for a single request.
   *
   * An explicit `language` wins. Otherwise, when `detectLanguage` is set, the language is
   * blanked so no hint is sent; otherwise the configured options are used as-is.
   */
  #sanitizeOptions(language?: string): STTOptions {
    if (language) {
      return { ...this.#opts, language };
    }
    if (this.#opts.detectLanguage) {
      return { ...this.#opts, language: '' };
    }
    return this.#opts;
  }

  /**
   * Wrap the frame's samples in a 44-byte RIFF/WAVE header declaring PCM (format tag 1) at
   * 16 bits per sample.
   */
  #createWav(frame: AudioFrame): Buffer {
    const bitsPerSample = 16;
    const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8;
    const blockAlign = (frame.channels * bitsPerSample) / 8;
    const dataLength = frame.data.byteLength;

    const header = Buffer.alloc(44);
    header.write('RIFF', 0);
    header.writeUInt32LE(36 + dataLength, 4); // RIFF chunk size: rest of header + payload
    header.write('WAVE', 8);
    header.write('fmt ', 12);
    header.writeUInt32LE(16, 16); // size of the fmt chunk
    header.writeUInt16LE(1, 20); // format tag 1
    header.writeUInt16LE(frame.channels, 22);
    header.writeUInt32LE(frame.sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write('data', 36);
    header.writeUInt32LE(dataLength, 40);

    // Copy only the bytes covered by the view: the backing ArrayBuffer may be larger than
    // the view or start at a non-zero offset, which would contradict the header's length.
    const payload = Buffer.from(frame.data.buffer, frame.data.byteOffset, dataLength);
    return Buffer.concat([header, payload]);
  }

  /**
   * Transcribe a complete audio buffer in a single request.
   *
   * @param buffer - Audio to transcribe; it is merged into one frame first.
   * @param language - Optional per-call language overriding the configured one.
   * @returns A final-transcript event with one alternative. Timing and confidence are
   * reported as 0.
   */
  async recognize(buffer: AudioBuffer, language?: string): Promise<stt.SpeechEvent> {
    const config = this.#sanitizeOptions(language);
    const merged = mergeFrames(buffer);
    const file = new File([this.#createWav(merged)], 'audio.wav', { type: 'audio/wav' });
    const resp = await this.#client.audio.transcriptions.create({
      file,
      model: config.model,
      // An empty language means "no hint"; omit the field rather than sending ''.
      language: config.language || undefined,
      response_format: 'json',
    });

    return {
      type: stt.SpeechEventType.FINAL_TRANSCRIPT,
      alternatives: [
        {
          text: resp.text || '',
          // Report the language that was actually sent, not just the per-call argument.
          language: config.language || '',
          startTime: 0,
          endTime: 0,
          confidence: 0,
        },
      ],
    };
  }

  /** This method throws an error; streaming is unsupported on OpenAI STT. */
  stream(): stt.SpeechStream {
    throw new Error('Streaming is not supported on OpenAI STT');
  }
}