@livekit/agents-plugin-elevenlabs 0.1.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tts.ts CHANGED
@@ -1,11 +1,13 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import { log, tts } from '@livekit/agents';
4
+ import { AsyncIterableQueue, log, tokenize, tts } from '@livekit/agents';
5
+ import type { WordStream } from '@livekit/agents/dist/tokenize/tokenizer.js';
5
6
  import { AudioFrame } from '@livekit/rtc-node';
6
- import { URL } from 'url';
7
+ import { randomUUID } from 'node:crypto';
8
+ import { URL } from 'node:url';
7
9
  import { type RawData, WebSocket } from 'ws';
8
- import type { TTSModels } from './models.js';
10
+ import type { TTSEncoding, TTSModels } from './models.js';
9
11
 
10
12
  type Voice = {
11
13
  id: string;
@@ -33,52 +35,53 @@ const DEFAULT_VOICE: Voice = {
33
35
  },
34
36
  };
35
37
 
36
- const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1';
38
+ const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';
37
39
  const AUTHORIZATION_HEADER = 'xi-api-key';
38
- const STREAM_EOS = '';
39
40
 
40
- type TTSOptions = {
41
- apiKey: string;
41
+ export interface TTSOptions {
42
+ apiKey?: string;
42
43
  voice: Voice;
43
44
  modelID: TTSModels;
44
45
  baseURL: string;
45
- sampleRate: number;
46
- latency: number;
46
+ encoding: TTSEncoding;
47
+ streamingLatency: number;
48
+ wordTokenizer: tokenize.WordTokenizer;
49
+ chunkLengthSchedule: number[];
50
+ enableSsmlParsing: boolean;
51
+ }
52
+
53
+ const defaultTTSOptions: TTSOptions = {
54
+ apiKey: process.env.ELEVEN_API_KEY,
55
+ voice: DEFAULT_VOICE,
56
+ modelID: 'eleven_turbo_v2_5',
57
+ baseURL: API_BASE_URL_V1,
58
+ encoding: 'pcm_22050',
59
+ streamingLatency: 3,
60
+ wordTokenizer: new tokenize.basic.WordTokenizer(false),
61
+ chunkLengthSchedule: [],
62
+ enableSsmlParsing: false,
47
63
  };
48
64
 
49
65
  export class TTS extends tts.TTS {
50
- config: TTSOptions;
51
-
52
- constructor(
53
- voice = DEFAULT_VOICE,
54
- modelID: TTSModels = 'eleven_multilingual_v2',
55
- apiKey?: string,
56
- baseURL?: string,
57
- sampleRate = 24000,
58
- latency = 2,
59
- ) {
60
- super(true);
61
- apiKey = apiKey || process.env.ELEVEN_API_KEY;
62
- if (apiKey === undefined) {
66
+ #opts: TTSOptions;
67
+
68
+ constructor(opts: Partial<TTSOptions> = defaultTTSOptions) {
69
+ super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {
70
+ streaming: true,
71
+ });
72
+ if (opts.apiKey === undefined) {
63
73
  throw new Error(
64
74
  'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',
65
75
  );
66
76
  }
67
77
 
68
- this.config = {
69
- voice,
70
- modelID,
71
- apiKey,
72
- baseURL: baseURL || API_BASE_URL_V1,
73
- sampleRate,
74
- latency,
75
- };
78
+ this.#opts = { ...defaultTTSOptions, ...opts };
76
79
  }
77
80
 
78
81
  async listVoices(): Promise<Voice[]> {
79
- return fetch(this.config.baseURL + '/voices', {
82
+ return fetch(this.#opts.baseURL + '/voices', {
80
83
  headers: {
81
- [AUTHORIZATION_HEADER]: this.config.apiKey,
84
+ [AUTHORIZATION_HEADER]: this.#opts.apiKey!,
82
85
  },
83
86
  })
84
87
  .then((data) => data.json())
@@ -98,62 +101,76 @@ export class TTS extends tts.TTS {
98
101
  });
99
102
  }
100
103
 
101
- async synthesize(text: string): Promise<tts.ChunkedStream> {
102
- return new ChunkedStream(text, this.config);
103
- }
104
-
105
104
  stream(): tts.SynthesizeStream {
106
- return new SynthesizeStream(this.config);
105
+ return new SynthesizeStream(this.#opts);
107
106
  }
108
107
  }
109
108
 
110
109
  export class SynthesizeStream extends tts.SynthesizeStream {
111
- closed: boolean;
112
- config: TTSOptions;
113
- text: string;
114
- task: {
115
- run: Promise<void>;
116
- cancel: () => void;
117
- };
118
- queue: string[] = [];
119
- eventQueue: (tts.SynthesisEvent | undefined)[] = [];
120
-
121
- constructor(config: TTSOptions) {
110
+ #opts: TTSOptions;
111
+ #logger = log();
112
+ readonly streamURL: URL;
113
+
114
+ constructor(opts: TTSOptions) {
122
115
  super();
123
- this.config = config;
116
+ this.#opts = opts;
124
117
  this.closed = false;
125
- this.text = '';
126
- this.task = {
127
- run: new Promise(() => {
128
- this.run(32);
129
- }),
130
- cancel: () => {},
118
+
119
+ // add trailing slash to URL if needed
120
+ const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');
121
+
122
+ this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);
123
+ const params = {
124
+ model_id: opts.modelID,
125
+ output_format: opts.encoding,
126
+ optimize_streaming_latency: `${opts.streamingLatency}`,
127
+ enable_ssml_parsing: `${opts.enableSsmlParsing}`,
131
128
  };
132
- }
129
+ Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));
130
+ this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');
133
131
 
134
- get streamURL(): string {
135
- return `${this.config.baseURL}/text-to-speech/${this.config.voice.id}/stream-input?model_id=${this.config.modelID}&optimize_streaming_latency=${this.config.latency}`;
132
+ this.#run();
136
133
  }
137
134
 
138
- pushText(token?: string | undefined): void {
139
- if (this.closed) throw new Error('cannot push to a closed stream');
140
- if (!token || token.length === 0) return;
135
+ async #run() {
136
+ const segments = new AsyncIterableQueue<WordStream>();
141
137
 
142
- const splitters = '.,?!;:—-()[]} ';
143
- this.text += token;
144
- if (splitters.includes(token[token.length - 1])) {
145
- this.queue.push(this.text);
146
- this.text = '';
147
- }
138
+ const tokenizeInput = async () => {
139
+ let stream: tokenize.WordStream | null = null;
140
+ for await (const text of this.input) {
141
+ if (text === SynthesizeStream.FLUSH_SENTINEL) {
142
+ if (stream) {
143
+ stream.close();
144
+ }
145
+ stream = null;
146
+ } else {
147
+ if (!stream) {
148
+ stream = this.#opts.wordTokenizer.stream();
149
+ segments.put(stream);
150
+ }
151
+ stream.pushText(text);
152
+ }
153
+ }
154
+ segments.close();
155
+ };
156
+
157
+ const runStream = async () => {
158
+ for await (const stream of segments) {
159
+ await this.#runWS(stream);
160
+ this.queue.put(SynthesizeStream.END_OF_STREAM);
161
+ }
162
+ };
163
+
164
+ await Promise.all([tokenizeInput(), runStream()]);
165
+ this.close();
148
166
  }
149
167
 
150
- async run(maxRetry: number) {
168
+ async #runWS(stream: tokenize.WordStream, maxRetry = 3) {
151
169
  let retries = 0;
152
- while (!this.closed) {
153
- const url = new URL(this.streamURL);
154
- url.protocol = url.protocol.replace('http', 'ws');
155
- const ws = new WebSocket(url, {
156
- headers: { [AUTHORIZATION_HEADER]: this.config.apiKey },
170
+ let ws: WebSocket;
171
+ while (true) {
172
+ ws = new WebSocket(this.streamURL, {
173
+ headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },
157
174
  });
158
175
 
159
176
  try {
@@ -162,38 +179,7 @@ export class SynthesizeStream extends tts.SynthesizeStream {
162
179
  ws.on('error', (error) => reject(error));
163
180
  ws.on('close', (code) => reject(`WebSocket returned ${code}`));
164
181
  });
165
-
166
- ws.send(JSON.stringify({ text: ' ', voice_settings: this.config.voice }));
167
- let started = false;
168
- const retryQueue: string[] = [];
169
- const task = this.listenTask(ws);
170
- while (ws.readyState !== ws.CLOSED) {
171
- let text = undefined;
172
- if (retryQueue.length === 0) {
173
- text = this.queue.shift();
174
- } else {
175
- text = retryQueue.shift();
176
- }
177
-
178
- if (!started) {
179
- this.eventQueue.push(new tts.SynthesisEvent(tts.SynthesisEventType.STARTED));
180
- started = true;
181
- }
182
-
183
- try {
184
- ws.send(JSON.stringify({ text, try_trigger_generation: true }));
185
- } catch (e) {
186
- // XI closes idle connections after a while.
187
- retryQueue.push(text!);
188
- break;
189
- }
190
-
191
- if (text == STREAM_EOS) {
192
- await task;
193
- this.eventQueue.push(new tts.SynthesisEvent(tts.SynthesisEventType.FINISHED));
194
- break;
195
- }
196
- }
182
+ break;
197
183
  } catch (e) {
198
184
  if (retries >= maxRetry) {
199
185
  throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);
@@ -202,130 +188,87 @@ export class SynthesizeStream extends tts.SynthesizeStream {
202
188
  const delay = Math.min(retries * 5, 5);
203
189
  retries++;
204
190
 
205
- log.warn(
191
+ this.#logger.warn(
206
192
  `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,
207
193
  );
208
194
  await new Promise((resolve) => setTimeout(resolve, delay * 1000));
209
195
  }
210
196
  }
211
- this.closed = true;
212
- }
213
197
 
214
- async listenTask(ws: WebSocket) {
215
- while (!this.closed) {
216
- try {
217
- await new Promise<RawData>((resolve, reject) => {
218
- ws.on('message', (data) => resolve(data));
219
- ws.on('close', (code, reason) => reject(`WebSocket closed with code ${code}: ${reason}`));
220
- }).then((msg) => {
221
- const json = JSON.parse(msg.toString());
222
- if ('audio' in json) {
223
- const data = new Uint16Array(Buffer.from(json.audio, 'base64'));
224
- const audioFrame = new AudioFrame(
225
- data,
226
- this.config.sampleRate,
227
- 1,
228
- Math.trunc(data.length / 2),
229
- );
230
- this.eventQueue.push(
231
- new tts.SynthesisEvent(tts.SynthesisEventType.AUDIO, { text: '', data: audioFrame }),
232
- );
198
+ const requestId = randomUUID();
199
+ const segmentId = randomUUID();
200
+
201
+ ws.send(
202
+ JSON.stringify({
203
+ text: ' ',
204
+ voice_settings: this.#opts.voice.settings,
205
+ try_trigger_generation: true,
206
+ chunk_length_schedule: this.#opts.chunkLengthSchedule,
207
+ }),
208
+ );
209
+ let eosSent = false;
210
+
211
+ const sendTask = async () => {
212
+ let xmlContent: string[] = [];
213
+ for await (const data of stream) {
214
+ let text = data.token;
215
+
216
+ if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {
217
+ xmlContent.push(text);
218
+ if (text.indexOf('</phoneme>') !== -1) {
219
+ text = xmlContent.join(' ');
220
+ xmlContent = [];
221
+ } else {
222
+ continue;
233
223
  }
234
- });
235
- } catch {
236
- break;
224
+ }
225
+
226
+ ws.send(JSON.stringify({ text: text + ' ', try_trigger_generation: false }));
237
227
  }
238
- }
239
- }
240
228
 
241
- flush() {
242
- this.queue.push(this.text + ' ');
243
- this.text = '';
244
- this.queue.push('');
245
- }
229
+ if (xmlContent.length) {
230
+ this.#logger.warn('ElevenLabs stream ended with incomplete XML content');
231
+ }
246
232
 
247
- next(): IteratorResult<tts.SynthesisEvent> {
248
- const event = this.eventQueue.shift();
249
- if (event) {
250
- return { done: false, value: event };
251
- } else {
252
- return { done: true, value: undefined };
253
- }
254
- }
233
+ ws.send(JSON.stringify({ text: '' }));
234
+ eosSent = true;
235
+ };
255
236
 
256
- async close(wait: boolean) {
257
- if (wait) {
258
- log.warn('wait is not yet supported for ElevenLabs TTS');
259
- }
237
+ const listenTask = async () => {
238
+ while (!this.closed) {
239
+ try {
240
+ await new Promise<RawData>((resolve, reject) => {
241
+ ws.removeAllListeners();
242
+ ws.on('message', (data) => resolve(data));
243
+ ws.on('close', (code, reason) => {
244
+ if (!eosSent) {
245
+ this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);
246
+ }
247
+ reject();
248
+ });
249
+ }).then((msg) => {
250
+ const json = JSON.parse(msg.toString());
251
+ if ('audio' in json) {
252
+ const data = new Int16Array(Buffer.from(json.audio, 'base64').buffer);
253
+ const frame = new AudioFrame(
254
+ data,
255
+ sampleRateFromFormat(this.#opts.encoding),
256
+ 1,
257
+ data.length,
258
+ );
259
+ this.queue.put({ requestId, segmentId, frame });
260
+ }
261
+ });
262
+ } catch {
263
+ break;
264
+ }
265
+ }
266
+ };
260
267
 
261
- try {
262
- await this.task.run;
263
- } finally {
264
- this.eventQueue.push(undefined);
265
- }
268
+ await Promise.all([sendTask(), listenTask()]);
266
269
  }
267
270
  }
268
271
 
269
- class ChunkedStream extends tts.ChunkedStream {
270
- config: TTSOptions;
271
- text: string;
272
- queue: (tts.SynthesizedAudio | undefined)[] = [];
273
-
274
- constructor(text: string, config: TTSOptions) {
275
- super();
276
- this.config = config;
277
- this.text = text;
278
- }
279
-
280
- async next(): Promise<IteratorResult<tts.SynthesizedAudio>> {
281
- await this.run();
282
- const audio = this.queue.shift();
283
- if (audio) {
284
- return { done: false, value: audio };
285
- } else {
286
- return { done: true, value: undefined };
287
- }
288
- }
289
-
290
- async close() {
291
- this.queue.push(undefined);
292
- }
293
-
294
- async run() {
295
- const voice = this.config.voice;
296
-
297
- const url = new URL(`${this.config.baseURL}/text-to-speech/${voice.id}/stream`);
298
- url.searchParams.append('output_format', 'pcm_' + this.config.sampleRate);
299
- url.searchParams.append('optimize_streaming_latency', this.config.latency.toString());
300
-
301
- await fetch(url.toString(), {
302
- method: 'POST',
303
- headers: {
304
- [AUTHORIZATION_HEADER]: this.config.apiKey,
305
- 'Content-Type': 'application/json',
306
- },
307
- body: JSON.stringify({
308
- text: this.text,
309
- model_id: this.config.modelID,
310
- voice_settings: this.config.voice.settings || undefined,
311
- }),
312
- })
313
- .then((data) => data.arrayBuffer())
314
- .then((data) => new DataView(data, 0, data.byteLength))
315
- .then((data) =>
316
- this.queue.push(
317
- {
318
- text: this.text,
319
- data: new AudioFrame(
320
- new Uint16Array(data.buffer),
321
- this.config.sampleRate,
322
- 1,
323
- data.byteLength / 2,
324
- ),
325
- },
326
- undefined,
327
- ),
328
- )
329
- .catch(() => this.queue.push(undefined));
330
- }
331
- }
272
+ const sampleRateFromFormat = (encoding: TTSEncoding): number => {
273
+ return Number(encoding.split('_')[1]);
274
+ };
package/tsconfig.json CHANGED
@@ -6,5 +6,11 @@
6
6
  "rootDir": "./src",
7
7
  "declarationDir": "./dist",
8
8
  "outDir": "./dist"
9
+ },
10
+ "typedocOptions": {
11
+ "name": "plugins/agents-plugin-elevenlabs",
12
+ "entryPointStrategy": "resolve",
13
+ "readme": "none",
14
+ "entryPoints": ["src/index.ts"]
9
15
  }
10
16
  }