npm - @livekit/agents-plugin-elevenlabs - Versions diffs - 0.1.0 → 0.4.0 - Mend

@livekit/agents-plugin-elevenlabs 0.1.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/src/tts.ts CHANGED

@@ -1,11 +1,13 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { log, tts } from '@livekit/agents';
+import { AsyncIterableQueue, log, tokenize, tts } from '@livekit/agents';
+import type { WordStream } from '@livekit/agents/dist/tokenize/tokenizer.js';
 import { AudioFrame } from '@livekit/rtc-node';
-import { URL } from 'url';
+import { randomUUID } from 'node:crypto';
+import { URL } from 'node:url';
 import { type RawData, WebSocket } from 'ws';
-import type { TTSModels } from './models.js';
+import type { TTSEncoding, TTSModels } from './models.js';
 type Voice = {
   id: string;
@@ -33,52 +35,53 @@ const DEFAULT_VOICE: Voice = {
   },
 };
-const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1';
+const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';
 const AUTHORIZATION_HEADER = 'xi-api-key';
-const STREAM_EOS = '';
-type TTSOptions = {
-  apiKey: string;
+export interface TTSOptions {
+  apiKey?: string;
   voice: Voice;
   modelID: TTSModels;
   baseURL: string;
-  sampleRate: number;
-  latency: number;
+  encoding: TTSEncoding;
+  streamingLatency: number;
+  wordTokenizer: tokenize.WordTokenizer;
+  chunkLengthSchedule: number[];
+  enableSsmlParsing: boolean;
+}
+const defaultTTSOptions: TTSOptions = {
+  apiKey: process.env.ELEVEN_API_KEY,
+  voice: DEFAULT_VOICE,
+  modelID: 'eleven_turbo_v2_5',
+  baseURL: API_BASE_URL_V1,
+  encoding: 'pcm_22050',
+  streamingLatency: 3,
+  wordTokenizer: new tokenize.basic.WordTokenizer(false),
+  chunkLengthSchedule: [],
+  enableSsmlParsing: false,
 };
 export class TTS extends tts.TTS {
-  config: TTSOptions;
-  constructor(
-    voice = DEFAULT_VOICE,
-    modelID: TTSModels = 'eleven_multilingual_v2',
-    apiKey?: string,
-    baseURL?: string,
-    sampleRate = 24000,
-    latency = 2,
-  ) {
-    super(true);
-    apiKey = apiKey || process.env.ELEVEN_API_KEY;
-    if (apiKey === undefined) {
+  #opts: TTSOptions;
+  constructor(opts: Partial<TTSOptions> = defaultTTSOptions) {
+    super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {
+      streaming: true,
+    });
+    if (opts.apiKey === undefined) {
       throw new Error(
         'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',
       );
     }
-    this.config = {
-      voice,
-      modelID,
-      apiKey,
-      baseURL: baseURL || API_BASE_URL_V1,
-      sampleRate,
-      latency,
-    };
+    this.#opts = { ...defaultTTSOptions, ...opts };
   }
   async listVoices(): Promise<Voice[]> {
-    return fetch(this.config.baseURL + '/voices', {
+    return fetch(this.#opts.baseURL + '/voices', {
       headers: {
-        [AUTHORIZATION_HEADER]: this.config.apiKey,
+        [AUTHORIZATION_HEADER]: this.#opts.apiKey!,
       },
     })
       .then((data) => data.json())
@@ -98,62 +101,76 @@ export class TTS extends tts.TTS {
       });
   }
-  async synthesize(text: string): Promise<tts.ChunkedStream> {
-    return new ChunkedStream(text, this.config);
-  }
   stream(): tts.SynthesizeStream {
-    return new SynthesizeStream(this.config);
+    return new SynthesizeStream(this.#opts);
   }
 }
 export class SynthesizeStream extends tts.SynthesizeStream {
-  closed: boolean;
-  config: TTSOptions;
-  text: string;
-  task: {
-    run: Promise<void>;
-    cancel: () => void;
-  };
-  queue: string[] = [];
-  eventQueue: (tts.SynthesisEvent | undefined)[] = [];
-  constructor(config: TTSOptions) {
+  #opts: TTSOptions;
+  #logger = log();
+  readonly streamURL: URL;
+  constructor(opts: TTSOptions) {
     super();
-    this.config = config;
+    this.#opts = opts;
     this.closed = false;
-    this.text = '';
-    this.task = {
-      run: new Promise(() => {
-        this.run(32);
-      }),
-      cancel: () => {},
+    // add trailing slash to URL if needed
+    const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');
+    this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);
+    const params = {
+      model_id: opts.modelID,
+      output_format: opts.encoding,
+      optimize_streaming_latency: `${opts.streamingLatency}`,
+      enable_ssml_parsing: `${opts.enableSsmlParsing}`,
     };
-  }
+    Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));
+    this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');
-  get streamURL(): string {
-    return `${this.config.baseURL}/text-to-speech/${this.config.voice.id}/stream-input?model_id=${this.config.modelID}&optimize_streaming_latency=${this.config.latency}`;
+    this.#run();
   }
-  pushText(token?: string | undefined): void {
-    if (this.closed) throw new Error('cannot push to a closed stream');
-    if (!token || token.length === 0) return;
+  async #run() {
+    const segments = new AsyncIterableQueue<WordStream>();
-    const splitters = '.,?!;:—-()[]} ';
-    this.text += token;
-    if (splitters.includes(token[token.length - 1])) {
-      this.queue.push(this.text);
-      this.text = '';
-    }
+    const tokenizeInput = async () => {
+      let stream: tokenize.WordStream | null = null;
+      for await (const text of this.input) {
+        if (text === SynthesizeStream.FLUSH_SENTINEL) {
+          if (stream) {
+            stream.close();
+          }
+          stream = null;
+        } else {
+          if (!stream) {
+            stream = this.#opts.wordTokenizer.stream();
+            segments.put(stream);
+          }
+          stream.pushText(text);
+        }
+      }
+      segments.close();
+    };
+    const runStream = async () => {
+      for await (const stream of segments) {
+        await this.#runWS(stream);
+        this.queue.put(SynthesizeStream.END_OF_STREAM);
+      }
+    };
+    await Promise.all([tokenizeInput(), runStream()]);
+    this.close();
   }
-  async run(maxRetry: number) {
+  async #runWS(stream: tokenize.WordStream, maxRetry = 3) {
     let retries = 0;
-    while (!this.closed) {
-      const url = new URL(this.streamURL);
-      url.protocol = url.protocol.replace('http', 'ws');
-      const ws = new WebSocket(url, {
-        headers: { [AUTHORIZATION_HEADER]: this.config.apiKey },
+    let ws: WebSocket;
+    while (true) {
+      ws = new WebSocket(this.streamURL, {
+        headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },
       });
       try {
@@ -162,38 +179,7 @@ export class SynthesizeStream extends tts.SynthesizeStream {
           ws.on('error', (error) => reject(error));
           ws.on('close', (code) => reject(`WebSocket returned ${code}`));
         });
-        ws.send(JSON.stringify({ text: ' ', voice_settings: this.config.voice }));
-        let started = false;
-        const retryQueue: string[] = [];
-        const task = this.listenTask(ws);
-        while (ws.readyState !== ws.CLOSED) {
-          let text = undefined;
-          if (retryQueue.length === 0) {
-            text = this.queue.shift();
-          } else {
-            text = retryQueue.shift();
-          }
-          if (!started) {
-            this.eventQueue.push(new tts.SynthesisEvent(tts.SynthesisEventType.STARTED));
-            started = true;
-          }
-          try {
-            ws.send(JSON.stringify({ text, try_trigger_generation: true }));
-          } catch (e) {
-            // XI closes idle connections after a while.
-            retryQueue.push(text!);
-            break;
-          }
-          if (text == STREAM_EOS) {
-            await task;
-            this.eventQueue.push(new tts.SynthesisEvent(tts.SynthesisEventType.FINISHED));
-            break;
-          }
-        }
+        break;
       } catch (e) {
         if (retries >= maxRetry) {
           throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);
@@ -202,130 +188,87 @@ export class SynthesizeStream extends tts.SynthesizeStream {
         const delay = Math.min(retries * 5, 5);
         retries++;
-        log.warn(
+        this.#logger.warn(
           `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,
         );
         await new Promise((resolve) => setTimeout(resolve, delay * 1000));
       }
     }
-    this.closed = true;
-  }
-  async listenTask(ws: WebSocket) {
-    while (!this.closed) {
-      try {
-        await new Promise<RawData>((resolve, reject) => {
-          ws.on('message', (data) => resolve(data));
-          ws.on('close', (code, reason) => reject(`WebSocket closed with code ${code}: ${reason}`));
-        }).then((msg) => {
-          const json = JSON.parse(msg.toString());
-          if ('audio' in json) {
-            const data = new Uint16Array(Buffer.from(json.audio, 'base64'));
-            const audioFrame = new AudioFrame(
-              data,
-              this.config.sampleRate,
-              1,
-              Math.trunc(data.length / 2),
-            );
-            this.eventQueue.push(
-              new tts.SynthesisEvent(tts.SynthesisEventType.AUDIO, { text: '', data: audioFrame }),
-            );
+    const requestId = randomUUID();
+    const segmentId = randomUUID();
+    ws.send(
+      JSON.stringify({
+        text: ' ',
+        voice_settings: this.#opts.voice.settings,
+        try_trigger_generation: true,
+        chunk_length_schedule: this.#opts.chunkLengthSchedule,
+      }),
+    );
+    let eosSent = false;
+    const sendTask = async () => {
+      let xmlContent: string[] = [];
+      for await (const data of stream) {
+        let text = data.token;
+        if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {
+          xmlContent.push(text);
+          if (text.indexOf('</phoneme>') !== -1) {
+            text = xmlContent.join(' ');
+            xmlContent = [];
+          } else {
+            continue;
           }
-        });
-      } catch {
-        break;
+        }
+        ws.send(JSON.stringify({ text: text + ' ', try_trigger_generation: false }));
       }
-    }
-  }
-  flush() {
-    this.queue.push(this.text + ' ');
-    this.text = '';
-    this.queue.push('');
-  }
+      if (xmlContent.length) {
+        this.#logger.warn('ElevenLabs stream ended with incomplete XML content');
+      }
-  next(): IteratorResult<tts.SynthesisEvent> {
-    const event = this.eventQueue.shift();
-    if (event) {
-      return { done: false, value: event };
-    } else {
-      return { done: true, value: undefined };
-    }
-  }
+      ws.send(JSON.stringify({ text: '' }));
+      eosSent = true;
+    };
-  async close(wait: boolean) {
-    if (wait) {
-      log.warn('wait is not yet supported for ElevenLabs TTS');
-    }
+    const listenTask = async () => {
+      while (!this.closed) {
+        try {
+          await new Promise<RawData>((resolve, reject) => {
+            ws.removeAllListeners();
+            ws.on('message', (data) => resolve(data));
+            ws.on('close', (code, reason) => {
+              if (!eosSent) {
+                this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);
+              }
+              reject();
+            });
+          }).then((msg) => {
+            const json = JSON.parse(msg.toString());
+            if ('audio' in json) {
+              const data = new Int16Array(Buffer.from(json.audio, 'base64').buffer);
+              const frame = new AudioFrame(
+                data,
+                sampleRateFromFormat(this.#opts.encoding),
+                1,
+                data.length,
+              );
+              this.queue.put({ requestId, segmentId, frame });
+            }
+          });
+        } catch {
+          break;
+        }
+      }
+    };
-    try {
-      await this.task.run;
-    } finally {
-      this.eventQueue.push(undefined);
-    }
+    await Promise.all([sendTask(), listenTask()]);
   }
 }
-class ChunkedStream extends tts.ChunkedStream {
-  config: TTSOptions;
-  text: string;
-  queue: (tts.SynthesizedAudio | undefined)[] = [];
-  constructor(text: string, config: TTSOptions) {
-    super();
-    this.config = config;
-    this.text = text;
-  }
-  async next(): Promise<IteratorResult<tts.SynthesizedAudio>> {
-    await this.run();
-    const audio = this.queue.shift();
-    if (audio) {
-      return { done: false, value: audio };
-    } else {
-      return { done: true, value: undefined };
-    }
-  }
-  async close() {
-    this.queue.push(undefined);
-  }
-  async run() {
-    const voice = this.config.voice;
-    const url = new URL(`${this.config.baseURL}/text-to-speech/${voice.id}/stream`);
-    url.searchParams.append('output_format', 'pcm_' + this.config.sampleRate);
-    url.searchParams.append('optimize_streaming_latency', this.config.latency.toString());
-    await fetch(url.toString(), {
-      method: 'POST',
-      headers: {
-        [AUTHORIZATION_HEADER]: this.config.apiKey,
-        'Content-Type': 'application/json',
-      },
-      body: JSON.stringify({
-        text: this.text,
-        model_id: this.config.modelID,
-        voice_settings: this.config.voice.settings || undefined,
-      }),
-    })
-      .then((data) => data.arrayBuffer())
-      .then((data) => new DataView(data, 0, data.byteLength))
-      .then((data) =>
-        this.queue.push(
-          {
-            text: this.text,
-            data: new AudioFrame(
-              new Uint16Array(data.buffer),
-              this.config.sampleRate,
-              1,
-              data.byteLength / 2,
-            ),
-          },
-          undefined,
-        ),
-      )
-      .catch(() => this.queue.push(undefined));
-  }
-}
+const sampleRateFromFormat = (encoding: TTSEncoding): number => {
+  return Number(encoding.split('_')[1]);
+};

package/tsconfig.json CHANGED

@@ -6,5 +6,11 @@
     "rootDir": "./src",
     "declarationDir": "./dist",
     "outDir": "./dist"
+  },
+  "typedocOptions": {
+    "name": "plugins/agents-plugin-elevenlabs",
+    "entryPointStrategy": "resolve",
+    "readme": "none",
+    "entryPoints": ["src/index.ts"]
   }
 }