@livekit/agents-plugin-elevenlabs 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tts.cjs +24 -12
- package/dist/tts.cjs.map +1 -1
- package/dist/tts.d.ts +3 -1
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +25 -13
- package/dist/tts.js.map +1 -1
- package/package.json +2 -2
- package/src/tts.ts +27 -13
package/dist/tts.cjs
CHANGED
|
@@ -23,7 +23,6 @@ __export(tts_exports, {
|
|
|
23
23
|
});
|
|
24
24
|
module.exports = __toCommonJS(tts_exports);
|
|
25
25
|
var import_agents = require("@livekit/agents");
|
|
26
|
-
var import_rtc_node = require("@livekit/rtc-node");
|
|
27
26
|
var import_node_crypto = require("node:crypto");
|
|
28
27
|
var import_node_url = require("node:url");
|
|
29
28
|
var import_ws = require("ws");
|
|
@@ -53,6 +52,7 @@ const defaultTTSOptions = {
|
|
|
53
52
|
};
|
|
54
53
|
class TTS extends import_agents.tts.TTS {
|
|
55
54
|
#opts;
|
|
55
|
+
label = "elevenlabs.TTS";
|
|
56
56
|
constructor(opts = {}) {
|
|
57
57
|
super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {
|
|
58
58
|
streaming: true
|
|
@@ -89,15 +89,16 @@ class TTS extends import_agents.tts.TTS {
|
|
|
89
89
|
throw new Error("Chunked responses are not supported on ElevenLabs TTS");
|
|
90
90
|
}
|
|
91
91
|
stream() {
|
|
92
|
-
return new SynthesizeStream(this.#opts);
|
|
92
|
+
return new SynthesizeStream(this, this.#opts);
|
|
93
93
|
}
|
|
94
94
|
}
|
|
95
95
|
class SynthesizeStream extends import_agents.tts.SynthesizeStream {
|
|
96
96
|
#opts;
|
|
97
97
|
#logger = (0, import_agents.log)();
|
|
98
|
+
label = "elevenlabs.SynthesizeStream";
|
|
98
99
|
streamURL;
|
|
99
|
-
constructor(opts) {
|
|
100
|
-
super();
|
|
100
|
+
constructor(tts2, opts) {
|
|
101
|
+
super(tts2);
|
|
101
102
|
this.#opts = opts;
|
|
102
103
|
this.closed = false;
|
|
103
104
|
const baseURL = opts.baseURL + (opts.baseURL.endsWith("/") ? "" : "/");
|
|
@@ -197,7 +198,15 @@ class SynthesizeStream extends import_agents.tts.SynthesizeStream {
|
|
|
197
198
|
ws.send(JSON.stringify({ text: "" }));
|
|
198
199
|
eosSent = true;
|
|
199
200
|
};
|
|
201
|
+
let lastFrame;
|
|
202
|
+
const sendLastFrame = (segmentId2, final) => {
|
|
203
|
+
if (lastFrame) {
|
|
204
|
+
this.queue.put({ requestId, segmentId: segmentId2, frame: lastFrame, final });
|
|
205
|
+
lastFrame = void 0;
|
|
206
|
+
}
|
|
207
|
+
};
|
|
200
208
|
const listenTask = async () => {
|
|
209
|
+
const bstream = new import_agents.AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);
|
|
201
210
|
while (!this.closed) {
|
|
202
211
|
try {
|
|
203
212
|
await new Promise((resolve, reject) => {
|
|
@@ -212,14 +221,17 @@ class SynthesizeStream extends import_agents.tts.SynthesizeStream {
|
|
|
212
221
|
}).then((msg) => {
|
|
213
222
|
const json = JSON.parse(msg.toString());
|
|
214
223
|
if ("audio" in json) {
|
|
215
|
-
const data = new
|
|
216
|
-
const frame
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
)
|
|
222
|
-
|
|
224
|
+
const data = new Int8Array(Buffer.from(json.audio, "base64").buffer);
|
|
225
|
+
for (const frame of bstream.write(data)) {
|
|
226
|
+
sendLastFrame(segmentId, false);
|
|
227
|
+
lastFrame = frame;
|
|
228
|
+
}
|
|
229
|
+
} else if ("isFinal" in json) {
|
|
230
|
+
for (const frame of bstream.flush()) {
|
|
231
|
+
sendLastFrame(segmentId, false);
|
|
232
|
+
lastFrame = frame;
|
|
233
|
+
}
|
|
234
|
+
sendLastFrame(segmentId, true);
|
|
223
235
|
}
|
|
224
236
|
});
|
|
225
237
|
} catch {
|
package/dist/tts.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { AsyncIterableQueue, log, tokenize, tts } from '@livekit/agents';\nimport { AudioFrame } from '@livekit/rtc-node';\nimport { randomUUID } from 'node:crypto';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'EXAVITQu4vr4xnSDxMaL',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency: number;\n wordTokenizer: tokenize.WordTokenizer;\n chunkLengthSchedule: number[];\n enableSsmlParsing: boolean;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050',\n streamingLatency: 3,\n wordTokenizer: new tokenize.basic.WordTokenizer(false),\n chunkLengthSchedule: [],\n enableSsmlParsing: false,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {\n streaming: true,\n });\n\n this.#opts = {\n ...defaultTTSOptions,\n ...opts,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n readonly streamURL: URL;\n\n constructor(opts: TTSOptions) {\n super();\n this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n optimize_streaming_latency: `${opts.streamingLatency}`,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n\n this.#run();\n }\n\n async #run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | null = null;\n for await (const text of this.input) {\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n this.close();\n }\n\n async #runWS(stream: tokenize.WordStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = randomUUID();\n const segmentId = randomUUID();\n\n ws.send(\n JSON.stringify({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n try_trigger_generation: true,\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n }),\n );\n let eosSent = false;\n\n const sendTask = async () => {\n let xmlContent: string[] = [];\n for await (const data of stream) {\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n ws.send(JSON.stringify({ text: text + ' ', try_trigger_generation: false }));\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n ws.send(JSON.stringify({ text: '' }));\n eosSent = true;\n };\n\n const listenTask = async () => {\n while (!this.closed) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n reject();\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n if ('audio' in json) {\n const data = new Int16Array(Buffer.from(json.audio, 'base64').buffer);\n const frame = new AudioFrame(\n data,\n sampleRateFromFormat(this.#opts.encoding),\n 1,\n data.length,\n );\n this.queue.put({ requestId, segmentId, frame });\n }\n });\n } catch {\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return Number(encoding.split('_')[1]);\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAAuD;AACvD,sBAA2B;AAC3B,yBAA2B;AAC3B,sBAAoB;AACpB,gBAAwC;AAiBxC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAc7B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,kBAAkB;AAAA,EAClB,eAAe,IAAI,uBAAS,MAAM,cAAc,KAAK;AAAA,EACrD,qBAAqB,CAAC;AAAA,EACtB,mBAAmB;AACrB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EAEA,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,kBAAkB,QAAQ,GAAG,GAAG;AAAA,MAC1E,WAAW;AAAA,IACb,CAAC;AAED,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,IACL;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,KAAK,KAAK;AAAA,EACxC;AACF;AAEO,MAAM,yBAAyB,kBAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,cAAU,mBAAI;AAAA,EACL;AAAA,EAET,YAAY,MAAkB;AAC5B,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,oBAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,4BAA4B,GAAG,KAAK,gBAAgB;AAAA,MACpD,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,IAChD;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAEtE,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,WAAW,IAAI,iCAAwC;AAE7D,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAAqC;AACzC,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAChD,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,MAAM,OAAO,QAA6B,WAAW,GAAG;AACtD,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,oBAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,gBAAY,+BAAW;AAC7B,UAAM,gBAAY,+BAAW;AAE7B,OAAG;AAAA,MACD,KAAK,UAAU;AAAA,QACb,MAAM;AAAA,QACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,QACjC,wBAAwB;AAAA,QACxB,uBAAuB,KAAK,MAAM;AAAA,MACpC,CAAC;AAAA,IACH;AACA,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAC3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,OAAO,KAAK;AAEhB,YAAK,KAAK,MAAM,qBAAqB,KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,WAAG,KAAK,KAAK,UAAU,EAAE,MAAM,OAAO,KAAK,wBAAwB,MAAM,CAAC,CAAC;AAAA,MAC7E;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAEA,SAAG,KAAK,KAAK,UAAU,EAAE,MAAM,GAAG,CAAC,CAAC;AACpC,gBAAU;AAAA,IACZ;AAEA,UAAM,aAAa,YAAY;AAC7B,aAAO,CAAC,KAAK,QAAQ;AACnB,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,qBAAO;AAAA,YACT,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AACtC,gBAAI,WAAW,MAAM;AACnB,oBAAM,OAAO,IAAI,WAAW,OAAO,KAAK,KAAK,OAAO,QAAQ,EAAE,MAAM;AACpE,oBAAM,QAAQ,IAAI;AAAA,gBAChB;AAAA,gBACA,qBAAqB,KAAK,MAAM,QAAQ;AAAA,gBACxC;AAAA,gBACA,KAAK;AAAA,cACP;AACA,mBAAK,MAAM,IAAI,EAAE,WAAW,WAAW,MAAM,CAAC;AAAA,YAChD;AAAA,UACF,CAAC;AAAA,QACH,QAAQ;AACN;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { AsyncIterableQueue, AudioByteStream, log, tokenize, tts } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { randomUUID } from 'node:crypto';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'EXAVITQu4vr4xnSDxMaL',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency: number;\n wordTokenizer: tokenize.WordTokenizer;\n chunkLengthSchedule: number[];\n enableSsmlParsing: boolean;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050',\n streamingLatency: 3,\n wordTokenizer: new tokenize.basic.WordTokenizer(false),\n chunkLengthSchedule: [],\n enableSsmlParsing: false,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n label = 'elevenlabs.TTS';\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {\n streaming: true,\n });\n\n this.#opts = {\n ...defaultTTSOptions,\n ...opts,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this, this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n label = 'elevenlabs.SynthesizeStream';\n readonly streamURL: URL;\n\n constructor(tts: TTS, opts: TTSOptions) {\n super(tts);\n this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n optimize_streaming_latency: `${opts.streamingLatency}`,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n\n this.#run();\n }\n\n async #run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | null = null;\n for await (const text of this.input) {\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n this.close();\n }\n\n async #runWS(stream: tokenize.WordStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = randomUUID();\n const segmentId = randomUUID();\n\n ws.send(\n JSON.stringify({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n try_trigger_generation: true,\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n }),\n );\n let eosSent = false;\n\n const sendTask = async () => {\n let xmlContent: string[] = [];\n for await (const data of stream) {\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n ws.send(JSON.stringify({ text: text + ' ', try_trigger_generation: false }));\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n ws.send(JSON.stringify({ text: '' }));\n eosSent = true;\n };\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n const listenTask = async () => {\n const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);\n while (!this.closed) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n reject();\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n if ('audio' in json) {\n const data = new Int8Array(Buffer.from(json.audio, 'base64').buffer);\n for (const frame of bstream.write(data)) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n } else if ('isFinal' in json) {\n for (const frame of bstream.flush()) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n sendLastFrame(segmentId, true);\n }\n });\n } catch {\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return Number(encoding.split('_')[1]);\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAAwE;AAExE,yBAA2B;AAC3B,sBAAoB;AACpB,gBAAwC;AAiBxC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAc7B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,kBAAkB;AAAA,EAClB,eAAe,IAAI,uBAAS,MAAM,cAAc,KAAK;AAAA,EACrD,qBAAqB,CAAC;AAAA,EACtB,mBAAmB;AACrB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,kBAAkB,QAAQ,GAAG,GAAG;AAAA,MAC1E,WAAW;AAAA,IACb,CAAC;AAED,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,IACL;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,MAAM,KAAK,KAAK;AAAA,EAC9C;AACF;AAEO,MAAM,yBAAyB,kBAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,QAAQ;AAAA,EACC;AAAA,EAET,YAAYA,MAAU,MAAkB;AACtC,UAAMA,IAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,oBAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,4BAA4B,GAAG,KAAK,gBAAgB;AAAA,MACpD,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,IAChD;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAEtE,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,WAAW,IAAI,iCAAwC;AAE7D,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAAqC;AACzC,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAChD,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,MAAM,OAAO,QAA6B,WAAW,GAAG;AACtD,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,oBAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,gBAAY,+BAAW;AAC7B,UAAM,gBAAY,+BAAW;AAE7B,OAAG;AAAA,MACD,KAAK,UAAU;AAAA,QACb,MAAM;AAAA,QACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,QACjC,wBAAwB;AAAA,QACxB,uBAAuB,KAAK,MAAM;AAAA,MACpC,CAAC;AAAA,IACH;AACA,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAC3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,OAAO,KAAK;AAEhB,YAAK,KAAK,MAAM,qBAAqB,KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,WAAG,KAAK,KAAK,UAAU,EAAE,MAAM,OAAO,KAAK,wBAAwB,MAAM,CAAC,CAAC;AAAA,MAC7E;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAEA,SAAG,KAAK,KAAK,UAAU,EAAE,MAAM,GAAG,CAAC,CAAC;AACpC,gBAAU;AAAA,IACZ;AAEA,QAAI;AACJ,UAAM,gBAAgB,CAACC,YAAmB,UAAmB;AAC3D,UAAI,WAAW;AACb,aAAK,MAAM,IAAI,EAAE,WAAW,WAAAA,YAAW,OAAO,WAAW,MAAM,CAAC;AAChE,oBAAY;AAAA,MACd;AAAA,IACF;AAEA,UAAM,aAAa,YAAY;AAC7B,YAAM,UAAU,IAAI,8BAAgB,qBAAqB,KAAK,MAAM,QAAQ,GAAG,CAAC;AAChF,aAAO,CAAC,KAAK,QAAQ;AACnB,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,qBAAO;AAAA,YACT,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AACtC,gBAAI,WAAW,MAAM;AACnB,oBAAM,OAAO,IAAI,UAAU,OAAO,KAAK,KAAK,OAAO,QAAQ,EAAE,MAAM;AACnE,yBAAW,SAAS,QAAQ,MAAM,IAAI,GAAG;AACvC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AAAA,YACF,WAAW,aAAa,MAAM;AAC5B,yBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AACA,4BAAc,WAAW,IAAI;AAAA,YAC/B;AAAA,UACF,CAAC;AAAA,QACH,QAAQ;AACN;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":["tts","segmentId"]}
|
package/dist/tts.d.ts
CHANGED
|
@@ -27,6 +27,7 @@ export interface TTSOptions {
|
|
|
27
27
|
}
|
|
28
28
|
export declare class TTS extends tts.TTS {
|
|
29
29
|
#private;
|
|
30
|
+
label: string;
|
|
30
31
|
constructor(opts?: Partial<TTSOptions>);
|
|
31
32
|
listVoices(): Promise<Voice[]>;
|
|
32
33
|
synthesize(): tts.ChunkedStream;
|
|
@@ -34,8 +35,9 @@ export declare class TTS extends tts.TTS {
|
|
|
34
35
|
}
|
|
35
36
|
export declare class SynthesizeStream extends tts.SynthesizeStream {
|
|
36
37
|
#private;
|
|
38
|
+
label: string;
|
|
37
39
|
readonly streamURL: URL;
|
|
38
|
-
constructor(opts: TTSOptions);
|
|
40
|
+
constructor(tts: TTS, opts: TTSOptions);
|
|
39
41
|
}
|
|
40
42
|
export {};
|
|
41
43
|
//# sourceMappingURL=tts.d.ts.map
|
package/dist/tts.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":";AAGA,OAAO,
|
|
1
|
+
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":";AAGA,OAAO,EAA4C,QAAQ,EAAE,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAG1F,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,OAAO,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE1D,KAAK,KAAK,GAAG;IACX,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,aAAa,CAAC;CAC1B,CAAC;AAEF,KAAK,aAAa,GAAG;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iBAAiB,EAAE,OAAO,CAAC;CAC5B,CAAC;AAiBF,MAAM,WAAW,UAAU;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,KAAK,CAAC;IACb,OAAO,EAAE,SAAS,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,WAAW,CAAC;IACtB,gBAAgB,EAAE,MAAM,CAAC;IACzB,aAAa,EAAE,QAAQ,CAAC,aAAa,CAAC;IACtC,mBAAmB,EAAE,MAAM,EAAE,CAAC;IAC9B,iBAAiB,EAAE,OAAO,CAAC;CAC5B;AAcD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAE9B,KAAK,SAAoB;gBAEb,IAAI,GAAE,OAAO,CAAC,UAAU,CAAM;IAiBpC,UAAU,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;IAuBpC,UAAU,IAAI,GAAG,CAAC,aAAa;IAI/B,MAAM,IAAI,GAAG,CAAC,gBAAgB;CAG/B;AAED,qBAAa,gBAAiB,SAAQ,GAAG,CAAC,gBAAgB;;IAGxD,KAAK,SAAiC;IACtC,QAAQ,CAAC,SAAS,EAAE,GAAG,CAAC;gBAEZ,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU;CAsKvC"}
|
package/dist/tts.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import { AsyncIterableQueue, log, tokenize, tts } from "@livekit/agents";
|
|
2
|
-
import { AudioFrame } from "@livekit/rtc-node";
|
|
1
|
+
import { AsyncIterableQueue, AudioByteStream, log, tokenize, tts } from "@livekit/agents";
|
|
3
2
|
import { randomUUID } from "node:crypto";
|
|
4
3
|
import { URL } from "node:url";
|
|
5
4
|
import { WebSocket } from "ws";
|
|
@@ -29,6 +28,7 @@ const defaultTTSOptions = {
|
|
|
29
28
|
};
|
|
30
29
|
class TTS extends tts.TTS {
|
|
31
30
|
#opts;
|
|
31
|
+
label = "elevenlabs.TTS";
|
|
32
32
|
constructor(opts = {}) {
|
|
33
33
|
super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {
|
|
34
34
|
streaming: true
|
|
@@ -65,15 +65,16 @@ class TTS extends tts.TTS {
|
|
|
65
65
|
throw new Error("Chunked responses are not supported on ElevenLabs TTS");
|
|
66
66
|
}
|
|
67
67
|
stream() {
|
|
68
|
-
return new SynthesizeStream(this.#opts);
|
|
68
|
+
return new SynthesizeStream(this, this.#opts);
|
|
69
69
|
}
|
|
70
70
|
}
|
|
71
71
|
class SynthesizeStream extends tts.SynthesizeStream {
|
|
72
72
|
#opts;
|
|
73
73
|
#logger = log();
|
|
74
|
+
label = "elevenlabs.SynthesizeStream";
|
|
74
75
|
streamURL;
|
|
75
|
-
constructor(opts) {
|
|
76
|
-
super();
|
|
76
|
+
constructor(tts2, opts) {
|
|
77
|
+
super(tts2);
|
|
77
78
|
this.#opts = opts;
|
|
78
79
|
this.closed = false;
|
|
79
80
|
const baseURL = opts.baseURL + (opts.baseURL.endsWith("/") ? "" : "/");
|
|
@@ -173,7 +174,15 @@ class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
173
174
|
ws.send(JSON.stringify({ text: "" }));
|
|
174
175
|
eosSent = true;
|
|
175
176
|
};
|
|
177
|
+
let lastFrame;
|
|
178
|
+
const sendLastFrame = (segmentId2, final) => {
|
|
179
|
+
if (lastFrame) {
|
|
180
|
+
this.queue.put({ requestId, segmentId: segmentId2, frame: lastFrame, final });
|
|
181
|
+
lastFrame = void 0;
|
|
182
|
+
}
|
|
183
|
+
};
|
|
176
184
|
const listenTask = async () => {
|
|
185
|
+
const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);
|
|
177
186
|
while (!this.closed) {
|
|
178
187
|
try {
|
|
179
188
|
await new Promise((resolve, reject) => {
|
|
@@ -188,14 +197,17 @@ class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
188
197
|
}).then((msg) => {
|
|
189
198
|
const json = JSON.parse(msg.toString());
|
|
190
199
|
if ("audio" in json) {
|
|
191
|
-
const data = new
|
|
192
|
-
const frame
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
)
|
|
198
|
-
|
|
200
|
+
const data = new Int8Array(Buffer.from(json.audio, "base64").buffer);
|
|
201
|
+
for (const frame of bstream.write(data)) {
|
|
202
|
+
sendLastFrame(segmentId, false);
|
|
203
|
+
lastFrame = frame;
|
|
204
|
+
}
|
|
205
|
+
} else if ("isFinal" in json) {
|
|
206
|
+
for (const frame of bstream.flush()) {
|
|
207
|
+
sendLastFrame(segmentId, false);
|
|
208
|
+
lastFrame = frame;
|
|
209
|
+
}
|
|
210
|
+
sendLastFrame(segmentId, true);
|
|
199
211
|
}
|
|
200
212
|
});
|
|
201
213
|
} catch {
|
package/dist/tts.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { AsyncIterableQueue, log, tokenize, tts } from '@livekit/agents';\nimport { AudioFrame } from '@livekit/rtc-node';\nimport { randomUUID } from 'node:crypto';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'EXAVITQu4vr4xnSDxMaL',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency: number;\n wordTokenizer: tokenize.WordTokenizer;\n chunkLengthSchedule: number[];\n enableSsmlParsing: boolean;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050',\n streamingLatency: 3,\n wordTokenizer: new tokenize.basic.WordTokenizer(false),\n chunkLengthSchedule: [],\n enableSsmlParsing: false,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {\n streaming: true,\n });\n\n this.#opts = {\n ...defaultTTSOptions,\n ...opts,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n readonly streamURL: URL;\n\n constructor(opts: TTSOptions) {\n super();\n this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n optimize_streaming_latency: `${opts.streamingLatency}`,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n\n this.#run();\n }\n\n async #run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | null = null;\n for await (const text of this.input) {\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n this.close();\n }\n\n async #runWS(stream: tokenize.WordStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = randomUUID();\n const segmentId = randomUUID();\n\n ws.send(\n JSON.stringify({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n try_trigger_generation: true,\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n }),\n );\n let eosSent = false;\n\n const sendTask = async () => {\n let xmlContent: string[] = [];\n for await (const data of stream) {\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n ws.send(JSON.stringify({ text: text + ' ', try_trigger_generation: false }));\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n ws.send(JSON.stringify({ text: '' }));\n eosSent = true;\n };\n\n const listenTask = async () => {\n while (!this.closed) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n reject();\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n if ('audio' in json) {\n const data = new Int16Array(Buffer.from(json.audio, 'base64').buffer);\n const frame = new AudioFrame(\n data,\n sampleRateFromFormat(this.#opts.encoding),\n 1,\n data.length,\n );\n this.queue.put({ requestId, segmentId, frame });\n }\n });\n } catch {\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return Number(encoding.split('_')[1]);\n};\n"],"mappings":"AAGA,SAAS,oBAAoB,KAAK,UAAU,WAAW;AACvD,SAAS,kBAAkB;AAC3B,SAAS,kBAAkB;AAC3B,SAAS,WAAW;AACpB,SAAuB,iBAAiB;AAiBxC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAc7B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,kBAAkB;AAAA,EAClB,eAAe,IAAI,SAAS,MAAM,cAAc,KAAK;AAAA,EACrD,qBAAqB,CAAC;AAAA,EACtB,mBAAmB;AACrB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EAEA,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,kBAAkB,QAAQ,GAAG,GAAG;AAAA,MAC1E,WAAW;AAAA,IACb,CAAC;AAED,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,IACL;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,KAAK,KAAK;AAAA,EACxC;AACF;AAEO,MAAM,yBAAyB,IAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,UAAU,IAAI;AAAA,EACL;AAAA,EAET,YAAY,MAAkB;AAC5B,UAAM;AACN,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,IAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,4BAA4B,GAAG,KAAK,gBAAgB;AAAA,MACpD,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,IAChD;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAEtE,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,WAAW,IAAI,mBAAwC;AAE7D,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAAqC;AACzC,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAChD,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,MAAM,OAAO,QAA6B,WAAW,GAAG;AACtD,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,UAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,YAAY,WAAW;AAC7B,UAAM,YAAY,WAAW;AAE7B,OAAG;AAAA,MACD,KAAK,UAAU;AAAA,QACb,MAAM;AAAA,QACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,QACjC,wBAAwB;AAAA,QACxB,uBAAuB,KAAK,MAAM;AAAA,MACpC,CAAC;AAAA,IACH;AACA,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAC3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,OAAO,KAAK;AAEhB,YAAK,KAAK,MAAM,qBAAqB,KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,WAAG,KAAK,KAAK,UAAU,EAAE,MAAM,OAAO,KAAK,wBAAwB,MAAM,CAAC,CAAC;AAAA,MAC7E;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAEA,SAAG,KAAK,KAAK,UAAU,EAAE,MAAM,GAAG,CAAC,CAAC;AACpC,gBAAU;AAAA,IACZ;AAEA,UAAM,aAAa,YAAY;AAC7B,aAAO,CAAC,KAAK,QAAQ;AACnB,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,qBAAO;AAAA,YACT,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AACtC,gBAAI,WAAW,MAAM;AACnB,oBAAM,OAAO,IAAI,WAAW,OAAO,KAAK,KAAK,OAAO,QAAQ,EAAE,MAAM;AACpE,oBAAM,QAAQ,IAAI;AAAA,gBAChB;AAAA,gBACA,qBAAqB,KAAK,MAAM,QAAQ;AAAA,gBACxC;AAAA,gBACA,KAAK;AAAA,cACP;AACA,mBAAK,MAAM,IAAI,EAAE,WAAW,WAAW,MAAM,CAAC;AAAA,YAChD;AAAA,UACF,CAAC;AAAA,QACH,QAAQ;AACN;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { AsyncIterableQueue, AudioByteStream, log, tokenize, tts } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { randomUUID } from 'node:crypto';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'EXAVITQu4vr4xnSDxMaL',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency: number;\n wordTokenizer: tokenize.WordTokenizer;\n chunkLengthSchedule: number[];\n enableSsmlParsing: boolean;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050',\n streamingLatency: 3,\n wordTokenizer: new tokenize.basic.WordTokenizer(false),\n chunkLengthSchedule: [],\n enableSsmlParsing: false,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n label = 'elevenlabs.TTS';\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {\n streaming: true,\n });\n\n this.#opts = {\n ...defaultTTSOptions,\n ...opts,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this, this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n label = 'elevenlabs.SynthesizeStream';\n readonly streamURL: URL;\n\n constructor(tts: TTS, opts: TTSOptions) {\n super(tts);\n this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n optimize_streaming_latency: `${opts.streamingLatency}`,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n\n this.#run();\n }\n\n async #run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | null = null;\n for await (const text of this.input) {\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n this.close();\n }\n\n async #runWS(stream: tokenize.WordStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = randomUUID();\n const segmentId = randomUUID();\n\n ws.send(\n JSON.stringify({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n try_trigger_generation: true,\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n }),\n );\n let eosSent = false;\n\n const sendTask = async () => {\n let xmlContent: string[] = [];\n for await (const data of stream) {\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n ws.send(JSON.stringify({ text: text + ' ', try_trigger_generation: false }));\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n ws.send(JSON.stringify({ text: '' }));\n eosSent = true;\n };\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n const listenTask = async () => {\n const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);\n while (!this.closed) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n reject();\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n if ('audio' in json) {\n const data = new Int8Array(Buffer.from(json.audio, 'base64').buffer);\n for (const frame of bstream.write(data)) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n } else if ('isFinal' in json) {\n for (const frame of bstream.flush()) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n sendLastFrame(segmentId, true);\n }\n });\n } catch {\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return Number(encoding.split('_')[1]);\n};\n"],"mappings":"AAGA,SAAS,oBAAoB,iBAAiB,KAAK,UAAU,WAAW;AAExE,SAAS,kBAAkB;AAC3B,SAAS,WAAW;AACpB,SAAuB,iBAAiB;AAiBxC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAc7B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,kBAAkB;AAAA,EAClB,eAAe,IAAI,SAAS,MAAM,cAAc,KAAK;AAAA,EACrD,qBAAqB,CAAC;AAAA,EACtB,mBAAmB;AACrB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,kBAAkB,QAAQ,GAAG,GAAG;AAAA,MAC1E,WAAW;AAAA,IACb,CAAC;AAED,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,IACL;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,MAAM,KAAK,KAAK;AAAA,EAC9C;AACF;AAEO,MAAM,yBAAyB,IAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,QAAQ;AAAA,EACC;AAAA,EAET,YAAYA,MAAU,MAAkB;AACtC,UAAMA,IAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,IAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,4BAA4B,GAAG,KAAK,gBAAgB;AAAA,MACpD,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,IAChD;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAEtE,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,WAAW,IAAI,mBAAwC;AAE7D,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAAqC;AACzC,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAChD,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,MAAM,OAAO,QAA6B,WAAW,GAAG;AACtD,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,UAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,YAAY,WAAW;AAC7B,UAAM,YAAY,WAAW;AAE7B,OAAG;AAAA,MACD,KAAK,UAAU;AAAA,QACb,MAAM;AAAA,QACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,QACjC,wBAAwB;AAAA,QACxB,uBAAuB,KAAK,MAAM;AAAA,MACpC,CAAC;AAAA,IACH;AACA,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAC3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,OAAO,KAAK;AAEhB,YAAK,KAAK,MAAM,qBAAqB,KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,WAAG,KAAK,KAAK,UAAU,EAAE,MAAM,OAAO,KAAK,wBAAwB,MAAM,CAAC,CAAC;AAAA,MAC7E;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAEA,SAAG,KAAK,KAAK,UAAU,EAAE,MAAM,GAAG,CAAC,CAAC;AACpC,gBAAU;AAAA,IACZ;AAEA,QAAI;AACJ,UAAM,gBAAgB,CAACC,YAAmB,UAAmB;AAC3D,UAAI,WAAW;AACb,aAAK,MAAM,IAAI,EAAE,WAAW,WAAAA,YAAW,OAAO,WAAW,MAAM,CAAC;AAChE,oBAAY;AAAA,MACd;AAAA,IACF;AAEA,UAAM,aAAa,YAAY;AAC7B,YAAM,UAAU,IAAI,gBAAgB,qBAAqB,KAAK,MAAM,QAAQ,GAAG,CAAC;AAChF,aAAO,CAAC,KAAK,QAAQ;AACnB,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,qBAAO;AAAA,YACT,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AACtC,gBAAI,WAAW,MAAM;AACnB,oBAAM,OAAO,IAAI,UAAU,OAAO,KAAK,KAAK,OAAO,QAAQ,EAAE,MAAM;AACnE,yBAAW,SAAS,QAAQ,MAAM,IAAI,GAAG;AACvC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AAAA,YACF,WAAW,aAAa,MAAM;AAC5B,yBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AACA,4BAAc,WAAW,IAAI;AAAA,YAC/B;AAAA,UACF,CAAC;AAAA,QACH,QAAQ;AACN;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":["tts","segmentId"]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-elevenlabs",
|
|
3
|
-
"version": "0.5.
|
|
3
|
+
"version": "0.5.2",
|
|
4
4
|
"description": "ElevenLabs plugin for LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"require": "dist/index.cjs",
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
},
|
|
37
37
|
"peerDependencies": {
|
|
38
38
|
"@livekit/rtc-node": "^0.12.1",
|
|
39
|
-
"@livekit/agents": "^0.
|
|
39
|
+
"@livekit/agents": "^0.6.0x"
|
|
40
40
|
},
|
|
41
41
|
"scripts": {
|
|
42
42
|
"build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
|
package/src/tts.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
import { AsyncIterableQueue, log, tokenize, tts } from '@livekit/agents';
|
|
5
|
-
import { AudioFrame } from '@livekit/rtc-node';
|
|
4
|
+
import { AsyncIterableQueue, AudioByteStream, log, tokenize, tts } from '@livekit/agents';
|
|
5
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
6
6
|
import { randomUUID } from 'node:crypto';
|
|
7
7
|
import { URL } from 'node:url';
|
|
8
8
|
import { type RawData, WebSocket } from 'ws';
|
|
@@ -63,6 +63,7 @@ const defaultTTSOptions: TTSOptions = {
|
|
|
63
63
|
|
|
64
64
|
export class TTS extends tts.TTS {
|
|
65
65
|
#opts: TTSOptions;
|
|
66
|
+
label = 'elevenlabs.TTS';
|
|
66
67
|
|
|
67
68
|
constructor(opts: Partial<TTSOptions> = {}) {
|
|
68
69
|
super(sampleRateFromFormat(opts.encoding || defaultTTSOptions.encoding), 1, {
|
|
@@ -109,17 +110,18 @@ export class TTS extends tts.TTS {
|
|
|
109
110
|
}
|
|
110
111
|
|
|
111
112
|
stream(): tts.SynthesizeStream {
|
|
112
|
-
return new SynthesizeStream(this.#opts);
|
|
113
|
+
return new SynthesizeStream(this, this.#opts);
|
|
113
114
|
}
|
|
114
115
|
}
|
|
115
116
|
|
|
116
117
|
export class SynthesizeStream extends tts.SynthesizeStream {
|
|
117
118
|
#opts: TTSOptions;
|
|
118
119
|
#logger = log();
|
|
120
|
+
label = 'elevenlabs.SynthesizeStream';
|
|
119
121
|
readonly streamURL: URL;
|
|
120
122
|
|
|
121
|
-
constructor(opts: TTSOptions) {
|
|
122
|
-
super();
|
|
123
|
+
constructor(tts: TTS, opts: TTSOptions) {
|
|
124
|
+
super(tts);
|
|
123
125
|
this.#opts = opts;
|
|
124
126
|
this.closed = false;
|
|
125
127
|
|
|
@@ -239,7 +241,16 @@ export class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
239
241
|
eosSent = true;
|
|
240
242
|
};
|
|
241
243
|
|
|
244
|
+
let lastFrame: AudioFrame | undefined;
|
|
245
|
+
const sendLastFrame = (segmentId: string, final: boolean) => {
|
|
246
|
+
if (lastFrame) {
|
|
247
|
+
this.queue.put({ requestId, segmentId, frame: lastFrame, final });
|
|
248
|
+
lastFrame = undefined;
|
|
249
|
+
}
|
|
250
|
+
};
|
|
251
|
+
|
|
242
252
|
const listenTask = async () => {
|
|
253
|
+
const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);
|
|
243
254
|
while (!this.closed) {
|
|
244
255
|
try {
|
|
245
256
|
await new Promise<RawData>((resolve, reject) => {
|
|
@@ -254,14 +265,17 @@ export class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
254
265
|
}).then((msg) => {
|
|
255
266
|
const json = JSON.parse(msg.toString());
|
|
256
267
|
if ('audio' in json) {
|
|
257
|
-
const data = new
|
|
258
|
-
const frame
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
)
|
|
264
|
-
|
|
268
|
+
const data = new Int8Array(Buffer.from(json.audio, 'base64').buffer);
|
|
269
|
+
for (const frame of bstream.write(data)) {
|
|
270
|
+
sendLastFrame(segmentId, false);
|
|
271
|
+
lastFrame = frame;
|
|
272
|
+
}
|
|
273
|
+
} else if ('isFinal' in json) {
|
|
274
|
+
for (const frame of bstream.flush()) {
|
|
275
|
+
sendLastFrame(segmentId, false);
|
|
276
|
+
lastFrame = frame;
|
|
277
|
+
}
|
|
278
|
+
sendLastFrame(segmentId, true);
|
|
265
279
|
}
|
|
266
280
|
});
|
|
267
281
|
} catch {
|