@livekit/agents-plugin-elevenlabs 1.0.17 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tts.cjs +32 -17
- package/dist/tts.cjs.map +1 -1
- package/dist/tts.d.cts +1 -1
- package/dist/tts.d.ts +1 -1
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +32 -17
- package/dist/tts.js.map +1 -1
- package/package.json +5 -5
- package/src/tts.ts +66 -22
package/dist/tts.cjs
CHANGED
|
@@ -39,13 +39,12 @@ const DEFAULT_VOICE = {
|
|
|
39
39
|
};
|
|
40
40
|
const API_BASE_URL_V1 = "https://api.elevenlabs.io/v1/";
|
|
41
41
|
const AUTHORIZATION_HEADER = "xi-api-key";
|
|
42
|
-
const
|
|
42
|
+
const defaultTTSOptionsBase = {
|
|
43
43
|
apiKey: process.env.ELEVEN_API_KEY,
|
|
44
44
|
voice: DEFAULT_VOICE,
|
|
45
45
|
modelID: "eleven_turbo_v2_5",
|
|
46
46
|
baseURL: API_BASE_URL_V1,
|
|
47
47
|
encoding: "pcm_22050",
|
|
48
|
-
wordTokenizer: new import_agents.tokenize.basic.WordTokenizer(false),
|
|
49
48
|
enableSsmlParsing: false,
|
|
50
49
|
inactivityTimeout: DEFAULT_INACTIVITY_TIMEOUT,
|
|
51
50
|
syncAlignment: true
|
|
@@ -54,12 +53,23 @@ class TTS extends import_agents.tts.TTS {
|
|
|
54
53
|
#opts;
|
|
55
54
|
label = "elevenlabs.TTS";
|
|
56
55
|
constructor(opts = {}) {
|
|
57
|
-
super(sampleRateFromFormat(opts.encoding ||
|
|
56
|
+
super(sampleRateFromFormat(opts.encoding || defaultTTSOptionsBase.encoding), 1, {
|
|
58
57
|
streaming: true
|
|
59
58
|
});
|
|
59
|
+
const autoMode = opts.autoMode !== void 0 ? opts.autoMode : false;
|
|
60
|
+
let wordTokenizer = opts.wordTokenizer;
|
|
61
|
+
if (!wordTokenizer) {
|
|
62
|
+
wordTokenizer = autoMode ? new import_agents.tokenize.basic.SentenceTokenizer() : new import_agents.tokenize.basic.WordTokenizer(false);
|
|
63
|
+
} else if (autoMode && !(wordTokenizer instanceof import_agents.tokenize.SentenceTokenizer)) {
|
|
64
|
+
(0, import_agents.log)().warn(
|
|
65
|
+
"autoMode is enabled, it expects full sentences or phrases. Please provide a SentenceTokenizer instead of a WordTokenizer."
|
|
66
|
+
);
|
|
67
|
+
}
|
|
60
68
|
this.#opts = {
|
|
61
|
-
...
|
|
62
|
-
...opts
|
|
69
|
+
...defaultTTSOptionsBase,
|
|
70
|
+
...opts,
|
|
71
|
+
autoMode,
|
|
72
|
+
wordTokenizer
|
|
63
73
|
};
|
|
64
74
|
if (this.#opts.apiKey === void 0) {
|
|
65
75
|
throw new Error(
|
|
@@ -180,17 +190,18 @@ class SynthesizeStream extends import_agents.tts.SynthesizeStream {
|
|
|
180
190
|
}
|
|
181
191
|
const requestId = (0, import_agents.shortuuid)();
|
|
182
192
|
const segmentId = (0, import_agents.shortuuid)();
|
|
183
|
-
|
|
184
|
-
JSON.stringify(
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
193
|
+
const wsSend = (data) => {
|
|
194
|
+
ws.send(JSON.stringify(data));
|
|
195
|
+
};
|
|
196
|
+
wsSend({
|
|
197
|
+
text: " ",
|
|
198
|
+
voice_settings: this.#opts.voice.settings,
|
|
199
|
+
...this.#opts.chunkLengthSchedule && {
|
|
200
|
+
generation_config: {
|
|
201
|
+
chunk_length_schedule: this.#opts.chunkLengthSchedule
|
|
191
202
|
}
|
|
192
|
-
}
|
|
193
|
-
);
|
|
203
|
+
}
|
|
204
|
+
});
|
|
194
205
|
let eosSent = false;
|
|
195
206
|
const sendTask = async () => {
|
|
196
207
|
let xmlContent = [];
|
|
@@ -208,12 +219,16 @@ class SynthesizeStream extends import_agents.tts.SynthesizeStream {
|
|
|
208
219
|
continue;
|
|
209
220
|
}
|
|
210
221
|
}
|
|
211
|
-
|
|
222
|
+
wsSend({
|
|
223
|
+
text: text + " "
|
|
224
|
+
// must always end with a space
|
|
225
|
+
// ...(flushOnChunk && { flush: true }),
|
|
226
|
+
});
|
|
212
227
|
}
|
|
213
228
|
if (xmlContent.length) {
|
|
214
229
|
this.#logger.warn("ElevenLabs stream ended with incomplete XML content");
|
|
215
230
|
}
|
|
216
|
-
|
|
231
|
+
wsSend({ text: "" });
|
|
217
232
|
eosSent = true;
|
|
218
233
|
};
|
|
219
234
|
let lastFrame;
|
package/dist/tts.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n AsyncIterableQueue,\n AudioByteStream,\n log,\n shortuuid,\n tokenize,\n tts,\n} from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\nconst DEFAULT_INACTIVITY_TIMEOUT = 300;\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'bIHbv24MWmeRgasZH58o',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels | string;\n languageCode?: string;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency?: number;\n wordTokenizer: tokenize.WordTokenizer;\n chunkLengthSchedule?: number[];\n enableSsmlParsing: boolean;\n inactivityTimeout: number;\n syncAlignment: boolean;\n autoMode?: boolean;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050',\n wordTokenizer: new tokenize.basic.WordTokenizer(false),\n enableSsmlParsing: false,\n inactivityTimeout: DEFAULT_INACTIVITY_TIMEOUT,\n syncAlignment: true,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n label = 'elevenlabs.TTS';\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || 
defaultTTSOptions.encoding), 1, {\n streaming: true,\n });\n\n this.#opts = {\n ...defaultTTSOptions,\n ...opts,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this, this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n label = 'elevenlabs.SynthesizeStream';\n readonly streamURL: URL;\n\n constructor(tts: TTS, opts: TTSOptions) {\n super(tts);\n this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? 
'' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n sync_alignment: `${opts.syncAlignment}`,\n ...(opts.autoMode !== undefined && { auto_mode: `${opts.autoMode}` }),\n ...(opts.languageCode && { language_code: opts.languageCode }),\n ...(opts.inactivityTimeout && { inactivity_timeout: `${opts.inactivityTimeout}` }),\n ...(opts.streamingLatency && { optimize_streaming_latency: `${opts.streamingLatency}` }),\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n }\n\n protected async run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | null = null;\n for await (const text of this.input) {\n if (this.abortController.signal.aborted) {\n break;\n }\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n if (this.abortController.signal.aborted) {\n break;\n }\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n }\n\n async #runWS(stream: tokenize.WordStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },\n });\n\n ws.on('error', (error) => {\n this.abortController.abort();\n this.#logger.error({ error }, 'Error connecting to ElevenLabs');\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', 
resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = shortuuid();\n const segmentId = shortuuid();\n\n ws.send(\n JSON.stringify({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n ...(this.#opts.chunkLengthSchedule && {\n generation_config: {\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n },\n }),\n }),\n );\n let eosSent = false;\n\n const sendTask = async () => {\n let xmlContent: string[] = [];\n for await (const data of stream) {\n if (this.abortController.signal.aborted) {\n break;\n }\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n ws.send(JSON.stringify({ text: text + ' ' })); // must always end with a space\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n // no more tokens, mark eos\n ws.send(JSON.stringify({ text: '' }));\n eosSent = true;\n };\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n const listenTask = async () => {\n let finalReceived = false;\n const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);\n while (!this.closed && 
!this.abortController.signal.aborted) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n if (!finalReceived) {\n reject(new Error('WebSocket closed'));\n }\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n // remove the \"audio\" field from the json object when printing\n if ('audio' in json && json.audio !== null) {\n const data = new Int8Array(Buffer.from(json.audio, 'base64'));\n for (const frame of bstream.write(data)) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n } else if (json.isFinal) {\n finalReceived = true;\n for (const frame of bstream.flush()) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n sendLastFrame(segmentId, true);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n\n if (segmentId === requestId || this.abortController.signal.aborted) {\n ws.close();\n return;\n }\n }\n });\n } catch (err) {\n // skip log error for normal websocket close\n if (err instanceof Error && !err.message.includes('WebSocket closed')) {\n this.#logger.error({ err }, 'Error in listenTask from ElevenLabs WebSocket');\n }\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return 
Number(encoding.split('_')[1]);\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AAEP,sBAAoB;AACpB,gBAAwC;AAGxC,MAAM,6BAA6B;AAgBnC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAkB7B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,eAAe,IAAI,uBAAS,MAAM,cAAc,KAAK;AAAA,EACrD,mBAAmB;AAAA,EACnB,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,kBAAkB,QAAQ,GAAG,GAAG;AAAA,MAC1E,WAAW;AAAA,IACb,CAAC;AAED,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,IACL;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,MAAM,KAAK,KAAK;AAAA,EAC9C;AACF;AAEO,MAAM,yBAAyB,kBAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,QAAQ;AAAA,EACC;AAAA,EAET,YAAYA,MAAU,MAAkB;AACtC,UAAMA,IAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,oBAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,MAC9C,gBAAgB,GAAG,KAAK,aAAa;AAAA,MACrC,GAAI,KAAK,aAAa,UAAa,EAAE,WAAW,GAAG,KAAK,QAAQ,GAAG;AAAA,MACnE,GAAI,KAAK,gBAAgB,EAAE,eAAe,KAAK,aAAa;AAAA,MAC5D,GAAI,KAAK,qBAAqB,EAAE,oBAAoB,GAAG,KAAK,iBAAiB,GAAG;AAAA,MAChF,GAAI,KAAK,oBAAoB,EAAE,4BAA4B,GAAG,KAAK,gB
AAgB,GAAG;AAAA,IACxF;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAAA,EACxE;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW,IAAI,iCAAwC;AAE7D,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAAqC;AACzC,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAAA,EAClD;AAAA,EAEA,MAAM,OAAO,QAA6B,WAAW,GAAG;AACtD,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,oBAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,SAAG,GAAG,SAAS,CAAC,UAAU;AACxB,aAAK,gBAAgB,MAAM;AAC3B,aAAK,QAAQ,MAAM,EAAE,MAAM,GAAG,gCAAgC;AAAA,MAChE,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,gBAAY,yBAAU;AAC5B,UAAM,gBAAY,yBAAU;AAE5B,OAAG;AAAA,MACD,KAAK,UAAU;AAAA,QACb,MAAM;AAAA,QACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,QACjC,GAAI,KAAK,MAAM,uBAAuB;AAAA,UACpC,mBAAmB;AAAA,YACjB,uBAAuB,KAAK,MAAM;AAAA,UACpC;AAAA,QACF;AAAA,MACF,CAAC;AAAA,IACH;AACA,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAC3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,OAAO,KAAK;AAEhB,YAAK,KAAK,MAAM,qBAAqB,
KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,WAAG,KAAK,KAAK,UAAU,EAAE,MAAM,OAAO,IAAI,CAAC,CAAC;AAAA,MAC9C;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAGA,SAAG,KAAK,KAAK,UAAU,EAAE,MAAM,GAAG,CAAC,CAAC;AACpC,gBAAU;AAAA,IACZ;AAEA,QAAI;AACJ,UAAM,gBAAgB,CAACC,YAAmB,UAAmB;AAC3D,UAAI,WAAW;AACb,aAAK,MAAM,IAAI,EAAE,WAAW,WAAAA,YAAW,OAAO,WAAW,MAAM,CAAC;AAChE,oBAAY;AAAA,MACd;AAAA,IACF;AAEA,UAAM,aAAa,YAAY;AAC7B,UAAI,gBAAgB;AACpB,YAAM,UAAU,IAAI,8BAAgB,qBAAqB,KAAK,MAAM,QAAQ,GAAG,CAAC;AAChF,aAAO,CAAC,KAAK,UAAU,CAAC,KAAK,gBAAgB,OAAO,SAAS;AAC3D,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,kBAAI,CAAC,eAAe;AAClB,uBAAO,IAAI,MAAM,kBAAkB,CAAC;AAAA,cACtC;AAAA,YACF,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AAEtC,gBAAI,WAAW,QAAQ,KAAK,UAAU,MAAM;AAC1C,oBAAM,OAAO,IAAI,UAAU,OAAO,KAAK,KAAK,OAAO,QAAQ,CAAC;AAC5D,yBAAW,SAAS,QAAQ,MAAM,IAAI,GAAG;AACvC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AAAA,YACF,WAAW,KAAK,SAAS;AACvB,8BAAgB;AAChB,yBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AACA,4BAAc,WAAW,IAAI;AAC7B,mBAAK,MAAM,IAAI,iBAAiB,aAAa;AAE7C,kBAAI,cAAc,aAAa,KAAK,gBAAgB,OAAO,SAAS;AAClE,mBAAG,MAAM;AACT;AAAA,cACF;AAAA,YACF;AAAA,UACF,CAAC;AAAA,QACH,SAAS,KAAK;AAEZ,cAAI,eAAe,SAAS,CAAC,IAAI,QAAQ,SAAS,kBAAkB,GAAG;AACrE,iBAAK,QAAQ,MAAM,EAAE,IAAI,GAAG,+CAA+C;AAAA,UAC7E;AACA;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":["tts","segmentId"]}
|
|
1
|
+
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n AsyncIterableQueue,\n AudioByteStream,\n log,\n shortuuid,\n tokenize,\n tts,\n} from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\nconst DEFAULT_INACTIVITY_TIMEOUT = 300;\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'bIHbv24MWmeRgasZH58o',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels | string;\n languageCode?: string;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency?: number;\n wordTokenizer: tokenize.WordTokenizer | tokenize.SentenceTokenizer;\n chunkLengthSchedule?: number[];\n enableSsmlParsing: boolean;\n inactivityTimeout: number;\n syncAlignment: boolean;\n autoMode?: boolean;\n}\n\nconst defaultTTSOptionsBase = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050' as TTSEncoding,\n enableSsmlParsing: false,\n inactivityTimeout: DEFAULT_INACTIVITY_TIMEOUT,\n syncAlignment: true,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n label = 'elevenlabs.TTS';\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || defaultTTSOptionsBase.encoding), 
1, {\n streaming: true,\n });\n\n // Set autoMode to true by default if not provided is Python behavior,\n // but to make it non-breaking, we keep false as default in typescript\n const autoMode = opts.autoMode !== undefined ? opts.autoMode : false;\n\n // Set default tokenizer based on autoMode if not provided\n let wordTokenizer = opts.wordTokenizer;\n if (!wordTokenizer) {\n wordTokenizer = autoMode\n ? new tokenize.basic.SentenceTokenizer()\n : new tokenize.basic.WordTokenizer(false);\n } else if (autoMode && !(wordTokenizer instanceof tokenize.SentenceTokenizer)) {\n // Warn if autoMode is enabled but a WordTokenizer was provided\n log().warn(\n 'autoMode is enabled, it expects full sentences or phrases. ' +\n 'Please provide a SentenceTokenizer instead of a WordTokenizer.',\n );\n }\n\n this.#opts = {\n ...defaultTTSOptionsBase,\n ...opts,\n autoMode,\n wordTokenizer,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this, this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n label = 'elevenlabs.SynthesizeStream';\n readonly streamURL: URL;\n\n constructor(tts: TTS, opts: TTSOptions) {\n super(tts);\n 
this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n sync_alignment: `${opts.syncAlignment}`,\n ...(opts.autoMode !== undefined && { auto_mode: `${opts.autoMode}` }),\n ...(opts.languageCode && { language_code: opts.languageCode }),\n ...(opts.inactivityTimeout && { inactivity_timeout: `${opts.inactivityTimeout}` }),\n ...(opts.streamingLatency && { optimize_streaming_latency: `${opts.streamingLatency}` }),\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n }\n\n protected async run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream | tokenize.SentenceStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | tokenize.SentenceStream | null = null;\n for await (const text of this.input) {\n if (this.abortController.signal.aborted) {\n break;\n }\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n if (this.abortController.signal.aborted) {\n break;\n }\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n }\n\n async #runWS(stream: tokenize.WordStream | tokenize.SentenceStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: 
this.#opts.apiKey },\n });\n\n ws.on('error', (error) => {\n this.abortController.abort();\n this.#logger.error({ error }, 'Error connecting to ElevenLabs');\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = shortuuid();\n const segmentId = shortuuid();\n\n // simple helper to make sure what we send to ws.send\n const wsSend = (data: {\n // (SynthesizeContent from python)\n text: string;\n // setting flush somehow never finishes the current speech generation\n // https://github.com/livekit/agents-js/pull/820#issuecomment-3517138706\n // flush?: boolean;\n // initialization\n voice_settings?: VoiceSettings;\n generation_config?: {\n chunk_length_schedule: number[];\n };\n }) => {\n ws.send(JSON.stringify(data));\n };\n\n wsSend({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n ...(this.#opts.chunkLengthSchedule && {\n generation_config: {\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n },\n }),\n });\n let eosSent = false;\n\n const sendTask = async () => {\n // Determine if we should flush on each chunk (sentence)\n /*const flushOnChunk =\n this.#opts.wordTokenizer instanceof tokenize.SentenceTokenizer &&\n this.#opts.autoMode !== undefined &&\n this.#opts.autoMode;*/\n\n let xmlContent: string[] = [];\n for await (const data of stream) {\n if (this.abortController.signal.aborted) {\n break;\n }\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && 
text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n wsSend({\n text: text + ' ', // must always end with a space\n // ...(flushOnChunk && { flush: true }),\n });\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n // no more tokens, mark eos with flush\n // setting flush somehow never finishes the current speech generation\n // wsSend({ text: '', flush: true });\n wsSend({ text: '' });\n eosSent = true;\n };\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n const listenTask = async () => {\n let finalReceived = false;\n const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);\n while (!this.closed && !this.abortController.signal.aborted) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n if (!finalReceived) {\n reject(new Error('WebSocket closed'));\n }\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n // remove the \"audio\" field from the json object when printing\n if ('audio' in json && json.audio !== null) {\n const data = new Int8Array(Buffer.from(json.audio, 'base64'));\n for (const frame of bstream.write(data)) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n } else if (json.isFinal) {\n finalReceived = true;\n for (const frame of bstream.flush()) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n sendLastFrame(segmentId, true);\n 
this.queue.put(SynthesizeStream.END_OF_STREAM);\n\n if (segmentId === requestId || this.abortController.signal.aborted) {\n ws.close();\n return;\n }\n }\n });\n } catch (err) {\n // skip log error for normal websocket close\n if (err instanceof Error && !err.message.includes('WebSocket closed')) {\n this.#logger.error({ err }, 'Error in listenTask from ElevenLabs WebSocket');\n }\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return Number(encoding.split('_')[1]);\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,oBAOO;AAEP,sBAAoB;AACpB,gBAAwC;AAGxC,MAAM,6BAA6B;AAgBnC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAkB7B,MAAM,wBAAwB;AAAA,EAC5B,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,mBAAmB;AAAA,EACnB,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAEO,MAAM,YAAY,kBAAI,IAAI;AAAA,EAC/B;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,sBAAsB,QAAQ,GAAG,GAAG;AAAA,MAC9E,WAAW;AAAA,IACb,CAAC;AAID,UAAM,WAAW,KAAK,aAAa,SAAY,KAAK,WAAW;AAG/D,QAAI,gBAAgB,KAAK;AACzB,QAAI,CAAC,eAAe;AAClB,sBAAgB,WACZ,IAAI,uBAAS,MAAM,kBAAkB,IACrC,IAAI,uBAAS,MAAM,cAAc,KAAK;AAAA,IAC5C,WAAW,YAAY,EAAE,yBAAyB,uBAAS,oBAAoB;AAE7E,6BAAI,EAAE;AAAA,QACJ;AAAA,MAEF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,IACF;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AACA,aAAO;AAAA
,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,MAAM,KAAK,KAAK;AAAA,EAC9C;AACF;AAEO,MAAM,yBAAyB,kBAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,cAAU,mBAAI;AAAA,EACd,QAAQ;AAAA,EACC;AAAA,EAET,YAAYA,MAAU,MAAkB;AACtC,UAAMA,IAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,oBAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,MAC9C,gBAAgB,GAAG,KAAK,aAAa;AAAA,MACrC,GAAI,KAAK,aAAa,UAAa,EAAE,WAAW,GAAG,KAAK,QAAQ,GAAG;AAAA,MACnE,GAAI,KAAK,gBAAgB,EAAE,eAAe,KAAK,aAAa;AAAA,MAC5D,GAAI,KAAK,qBAAqB,EAAE,oBAAoB,GAAG,KAAK,iBAAiB,GAAG;AAAA,MAChF,GAAI,KAAK,oBAAoB,EAAE,4BAA4B,GAAG,KAAK,gBAAgB,GAAG;AAAA,IACxF;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAAA,EACxE;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW,IAAI,iCAAkE;AAEvF,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAA+D;AACnE,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAAA,EAClD;AAAA,EAEA,MAAM,OAAO,QAAuD,WAAW,GAAG;AAChF,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,oBAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,SAAG,GAAG,SAAS,CAAC,UAAU;AACxB,aAAK,gBAAgB,MAAM;AAC3B,aAAK,QAAQ,MAAM,EAAE,MAAM,GAAG,gCAAgC;AAAA,MAChE,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAA
O,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,gBAAY,yBAAU;AAC5B,UAAM,gBAAY,yBAAU;AAG5B,UAAM,SAAS,CAAC,SAWV;AACJ,SAAG,KAAK,KAAK,UAAU,IAAI,CAAC;AAAA,IAC9B;AAEA,WAAO;AAAA,MACL,MAAM;AAAA,MACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,MACjC,GAAI,KAAK,MAAM,uBAAuB;AAAA,QACpC,mBAAmB;AAAA,UACjB,uBAAuB,KAAK,MAAM;AAAA,QACpC;AAAA,MACF;AAAA,IACF,CAAC;AACD,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAO3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,OAAO,KAAK;AAEhB,YAAK,KAAK,MAAM,qBAAqB,KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,eAAO;AAAA,UACL,MAAM,OAAO;AAAA;AAAA;AAAA,QAEf,CAAC;AAAA,MACH;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAKA,aAAO,EAAE,MAAM,GAAG,CAAC;AACnB,gBAAU;AAAA,IACZ;AAEA,QAAI;AACJ,UAAM,gBAAgB,CAACC,YAAmB,UAAmB;AAC3D,UAAI,WAAW;AACb,aAAK,MAAM,IAAI,EAAE,WAAW,WAAAA,YAAW,OAAO,WAAW,MAAM,CAAC;AAChE,oBAAY;AAAA,MACd;AAAA,IACF;AAEA,UAAM,aAAa,YAAY;AAC7B,UAAI,gBAAgB;AACpB,YAAM,UAAU,IAAI,8BAAgB,qBAAqB,KAAK,MAAM,QAAQ,GAAG,CAAC;AAChF,aAAO,CAAC,KAAK,UAAU,CAAC,KAAK,gBAAgB,OAAO,SAAS;AAC3D,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,kBAAI,CAAC,eAAe;AAClB,uBAAO,IAAI,MAAM,kBAAkB,CAAC;AAAA,cACtC;AAAA,YACF,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AAEtC,gBAAI,WAAW,QAAQ,KAAK,UAAU,MAAM;AAC1C,oBAAM,OAAO,IAAI,UAAU,OAAO,KAAK,KAAK,OAAO,QAAQ,CAAC;AAC5D,yBAAW,SAAS,QAAQ,MAAM,IAAI,GAAG;AACvC,8BAAc,WAAW,K
AAK;AAC9B,4BAAY;AAAA,cACd;AAAA,YACF,WAAW,KAAK,SAAS;AACvB,8BAAgB;AAChB,yBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AACA,4BAAc,WAAW,IAAI;AAC7B,mBAAK,MAAM,IAAI,iBAAiB,aAAa;AAE7C,kBAAI,cAAc,aAAa,KAAK,gBAAgB,OAAO,SAAS;AAClE,mBAAG,MAAM;AACT;AAAA,cACF;AAAA,YACF;AAAA,UACF,CAAC;AAAA,QACH,SAAS,KAAK;AAEZ,cAAI,eAAe,SAAS,CAAC,IAAI,QAAQ,SAAS,kBAAkB,GAAG;AACrE,iBAAK,QAAQ,MAAM,EAAE,IAAI,GAAG,+CAA+C;AAAA,UAC7E;AACA;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":["tts","segmentId"]}
|
package/dist/tts.d.cts
CHANGED
|
@@ -22,7 +22,7 @@ export interface TTSOptions {
|
|
|
22
22
|
baseURL: string;
|
|
23
23
|
encoding: TTSEncoding;
|
|
24
24
|
streamingLatency?: number;
|
|
25
|
-
wordTokenizer: tokenize.WordTokenizer;
|
|
25
|
+
wordTokenizer: tokenize.WordTokenizer | tokenize.SentenceTokenizer;
|
|
26
26
|
chunkLengthSchedule?: number[];
|
|
27
27
|
enableSsmlParsing: boolean;
|
|
28
28
|
inactivityTimeout: number;
|
package/dist/tts.d.ts
CHANGED
|
@@ -22,7 +22,7 @@ export interface TTSOptions {
|
|
|
22
22
|
baseURL: string;
|
|
23
23
|
encoding: TTSEncoding;
|
|
24
24
|
streamingLatency?: number;
|
|
25
|
-
wordTokenizer: tokenize.WordTokenizer;
|
|
25
|
+
wordTokenizer: tokenize.WordTokenizer | tokenize.SentenceTokenizer;
|
|
26
26
|
chunkLengthSchedule?: number[];
|
|
27
27
|
enableSsmlParsing: boolean;
|
|
28
28
|
inactivityTimeout: number;
|
package/dist/tts.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":";AAGA,OAAO,EAKL,QAAQ,EACR,GAAG,EACJ,MAAM,iBAAiB,CAAC;AAEzB,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,OAAO,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAI1D,KAAK,KAAK,GAAG;IACX,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,aAAa,CAAC;CAC1B,CAAC;AAEF,KAAK,aAAa,GAAG;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iBAAiB,EAAE,OAAO,CAAC;CAC5B,CAAC;AAiBF,MAAM,WAAW,UAAU;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,KAAK,CAAC;IACb,OAAO,EAAE,SAAS,GAAG,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,WAAW,CAAC;IACtB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,aAAa,EAAE,QAAQ,CAAC,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":";AAGA,OAAO,EAKL,QAAQ,EACR,GAAG,EACJ,MAAM,iBAAiB,CAAC;AAEzB,OAAO,EAAE,GAAG,EAAE,MAAM,UAAU,CAAC;AAE/B,OAAO,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAI1D,KAAK,KAAK,GAAG;IACX,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,aAAa,CAAC;CAC1B,CAAC;AAEF,KAAK,aAAa,GAAG;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,iBAAiB,EAAE,OAAO,CAAC;CAC5B,CAAC;AAiBF,MAAM,WAAW,UAAU;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,KAAK,CAAC;IACb,OAAO,EAAE,SAAS,GAAG,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,WAAW,CAAC;IACtB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,aAAa,EAAE,QAAQ,CAAC,aAAa,GAAG,QAAQ,CAAC,iBAAiB,CAAC;IACnE,mBAAmB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC/B,iBAAiB,EAAE,OAAO,CAAC;IAC3B,iBAAiB,EAAE,MAAM,CAAC;IAC1B,aAAa,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAaD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAE9B,KAAK,SAAoB;gBAEb,IAAI,GAAE,OAAO,CAAC,UAAU,CAAM;IAqCpC,UAAU,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;IAuBpC,UAAU,IAAI,GAAG,CAAC,aAAa;IAI/B,MAAM,IAAI,GAAG,CAAC,gBAAgB;CAG/B;AAED,qBAAa,gBAAiB,SAAQ,GAAG,CAAC,gBAAgB;;IAGxD,KAAK,SAAiC;IACtC,QAAQ,CAAC,SAAS,EAAE,GAAG,CAAC;gBAEZ,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU;cAuBtB,GAAG;CA0MpB"}
|
package/dist/tts.js
CHANGED
|
@@ -22,13 +22,12 @@ const DEFAULT_VOICE = {
|
|
|
22
22
|
};
|
|
23
23
|
const API_BASE_URL_V1 = "https://api.elevenlabs.io/v1/";
|
|
24
24
|
const AUTHORIZATION_HEADER = "xi-api-key";
|
|
25
|
-
const
|
|
25
|
+
const defaultTTSOptionsBase = {
|
|
26
26
|
apiKey: process.env.ELEVEN_API_KEY,
|
|
27
27
|
voice: DEFAULT_VOICE,
|
|
28
28
|
modelID: "eleven_turbo_v2_5",
|
|
29
29
|
baseURL: API_BASE_URL_V1,
|
|
30
30
|
encoding: "pcm_22050",
|
|
31
|
-
wordTokenizer: new tokenize.basic.WordTokenizer(false),
|
|
32
31
|
enableSsmlParsing: false,
|
|
33
32
|
inactivityTimeout: DEFAULT_INACTIVITY_TIMEOUT,
|
|
34
33
|
syncAlignment: true
|
|
@@ -37,12 +36,23 @@ class TTS extends tts.TTS {
|
|
|
37
36
|
#opts;
|
|
38
37
|
label = "elevenlabs.TTS";
|
|
39
38
|
constructor(opts = {}) {
|
|
40
|
-
super(sampleRateFromFormat(opts.encoding ||
|
|
39
|
+
super(sampleRateFromFormat(opts.encoding || defaultTTSOptionsBase.encoding), 1, {
|
|
41
40
|
streaming: true
|
|
42
41
|
});
|
|
42
|
+
const autoMode = opts.autoMode !== void 0 ? opts.autoMode : false;
|
|
43
|
+
let wordTokenizer = opts.wordTokenizer;
|
|
44
|
+
if (!wordTokenizer) {
|
|
45
|
+
wordTokenizer = autoMode ? new tokenize.basic.SentenceTokenizer() : new tokenize.basic.WordTokenizer(false);
|
|
46
|
+
} else if (autoMode && !(wordTokenizer instanceof tokenize.SentenceTokenizer)) {
|
|
47
|
+
log().warn(
|
|
48
|
+
"autoMode is enabled, it expects full sentences or phrases. Please provide a SentenceTokenizer instead of a WordTokenizer."
|
|
49
|
+
);
|
|
50
|
+
}
|
|
43
51
|
this.#opts = {
|
|
44
|
-
...
|
|
45
|
-
...opts
|
|
52
|
+
...defaultTTSOptionsBase,
|
|
53
|
+
...opts,
|
|
54
|
+
autoMode,
|
|
55
|
+
wordTokenizer
|
|
46
56
|
};
|
|
47
57
|
if (this.#opts.apiKey === void 0) {
|
|
48
58
|
throw new Error(
|
|
@@ -163,17 +173,18 @@ class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
163
173
|
}
|
|
164
174
|
const requestId = shortuuid();
|
|
165
175
|
const segmentId = shortuuid();
|
|
166
|
-
|
|
167
|
-
JSON.stringify(
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
176
|
+
const wsSend = (data) => {
|
|
177
|
+
ws.send(JSON.stringify(data));
|
|
178
|
+
};
|
|
179
|
+
wsSend({
|
|
180
|
+
text: " ",
|
|
181
|
+
voice_settings: this.#opts.voice.settings,
|
|
182
|
+
...this.#opts.chunkLengthSchedule && {
|
|
183
|
+
generation_config: {
|
|
184
|
+
chunk_length_schedule: this.#opts.chunkLengthSchedule
|
|
174
185
|
}
|
|
175
|
-
}
|
|
176
|
-
);
|
|
186
|
+
}
|
|
187
|
+
});
|
|
177
188
|
let eosSent = false;
|
|
178
189
|
const sendTask = async () => {
|
|
179
190
|
let xmlContent = [];
|
|
@@ -191,12 +202,16 @@ class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
191
202
|
continue;
|
|
192
203
|
}
|
|
193
204
|
}
|
|
194
|
-
|
|
205
|
+
wsSend({
|
|
206
|
+
text: text + " "
|
|
207
|
+
// must always end with a space
|
|
208
|
+
// ...(flushOnChunk && { flush: true }),
|
|
209
|
+
});
|
|
195
210
|
}
|
|
196
211
|
if (xmlContent.length) {
|
|
197
212
|
this.#logger.warn("ElevenLabs stream ended with incomplete XML content");
|
|
198
213
|
}
|
|
199
|
-
|
|
214
|
+
wsSend({ text: "" });
|
|
200
215
|
eosSent = true;
|
|
201
216
|
};
|
|
202
217
|
let lastFrame;
|
package/dist/tts.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n AsyncIterableQueue,\n AudioByteStream,\n log,\n shortuuid,\n tokenize,\n tts,\n} from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\nconst DEFAULT_INACTIVITY_TIMEOUT = 300;\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'bIHbv24MWmeRgasZH58o',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels | string;\n languageCode?: string;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency?: number;\n wordTokenizer: tokenize.WordTokenizer;\n chunkLengthSchedule?: number[];\n enableSsmlParsing: boolean;\n inactivityTimeout: number;\n syncAlignment: boolean;\n autoMode?: boolean;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050',\n wordTokenizer: new tokenize.basic.WordTokenizer(false),\n enableSsmlParsing: false,\n inactivityTimeout: DEFAULT_INACTIVITY_TIMEOUT,\n syncAlignment: true,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n label = 'elevenlabs.TTS';\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || 
defaultTTSOptions.encoding), 1, {\n streaming: true,\n });\n\n this.#opts = {\n ...defaultTTSOptions,\n ...opts,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this, this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n label = 'elevenlabs.SynthesizeStream';\n readonly streamURL: URL;\n\n constructor(tts: TTS, opts: TTSOptions) {\n super(tts);\n this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? 
'' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n sync_alignment: `${opts.syncAlignment}`,\n ...(opts.autoMode !== undefined && { auto_mode: `${opts.autoMode}` }),\n ...(opts.languageCode && { language_code: opts.languageCode }),\n ...(opts.inactivityTimeout && { inactivity_timeout: `${opts.inactivityTimeout}` }),\n ...(opts.streamingLatency && { optimize_streaming_latency: `${opts.streamingLatency}` }),\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n }\n\n protected async run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | null = null;\n for await (const text of this.input) {\n if (this.abortController.signal.aborted) {\n break;\n }\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n if (this.abortController.signal.aborted) {\n break;\n }\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n }\n\n async #runWS(stream: tokenize.WordStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: this.#opts.apiKey },\n });\n\n ws.on('error', (error) => {\n this.abortController.abort();\n this.#logger.error({ error }, 'Error connecting to ElevenLabs');\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', 
resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = shortuuid();\n const segmentId = shortuuid();\n\n ws.send(\n JSON.stringify({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n ...(this.#opts.chunkLengthSchedule && {\n generation_config: {\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n },\n }),\n }),\n );\n let eosSent = false;\n\n const sendTask = async () => {\n let xmlContent: string[] = [];\n for await (const data of stream) {\n if (this.abortController.signal.aborted) {\n break;\n }\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n ws.send(JSON.stringify({ text: text + ' ' })); // must always end with a space\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n // no more tokens, mark eos\n ws.send(JSON.stringify({ text: '' }));\n eosSent = true;\n };\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n const listenTask = async () => {\n let finalReceived = false;\n const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);\n while (!this.closed && 
!this.abortController.signal.aborted) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n if (!finalReceived) {\n reject(new Error('WebSocket closed'));\n }\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n // remove the \"audio\" field from the json object when printing\n if ('audio' in json && json.audio !== null) {\n const data = new Int8Array(Buffer.from(json.audio, 'base64'));\n for (const frame of bstream.write(data)) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n } else if (json.isFinal) {\n finalReceived = true;\n for (const frame of bstream.flush()) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n sendLastFrame(segmentId, true);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n\n if (segmentId === requestId || this.abortController.signal.aborted) {\n ws.close();\n return;\n }\n }\n });\n } catch (err) {\n // skip log error for normal websocket close\n if (err instanceof Error && !err.message.includes('WebSocket closed')) {\n this.#logger.error({ err }, 'Error in listenTask from ElevenLabs WebSocket');\n }\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return 
Number(encoding.split('_')[1]);\n};\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AAEP,SAAS,WAAW;AACpB,SAAuB,iBAAiB;AAGxC,MAAM,6BAA6B;AAgBnC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAkB7B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,eAAe,IAAI,SAAS,MAAM,cAAc,KAAK;AAAA,EACrD,mBAAmB;AAAA,EACnB,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,kBAAkB,QAAQ,GAAG,GAAG;AAAA,MAC1E,WAAW;AAAA,IACb,CAAC;AAED,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,IACL;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,MAAM,KAAK,KAAK;AAAA,EAC9C;AACF;AAEO,MAAM,yBAAyB,IAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,QAAQ;AAAA,EACC;AAAA,EAET,YAAYA,MAAU,MAAkB;AACtC,UAAMA,IAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,IAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,MAC9C,gBAAgB,GAAG,KAAK,aAAa;AAAA,MACrC,GAAI,KAAK,aAAa,UAAa,EAAE,WAAW,GAAG,KAAK,QAAQ,GAAG;AAAA,MACnE,GAAI,KAAK,gBAAgB,EAAE,eAAe,KAAK,aAAa;AAAA,MAC5D,GAAI,KAAK,qBAAqB,EAAE,oBAAoB,GAAG,KAAK,iBAAiB,GAAG;AAAA,MAChF,GAAI,KAAK,oBAAoB,EAAE
,4BAA4B,GAAG,KAAK,gBAAgB,GAAG;AAAA,IACxF;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAAA,EACxE;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW,IAAI,mBAAwC;AAE7D,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAAqC;AACzC,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAAA,EAClD;AAAA,EAEA,MAAM,OAAO,QAA6B,WAAW,GAAG;AACtD,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,UAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,SAAG,GAAG,SAAS,CAAC,UAAU;AACxB,aAAK,gBAAgB,MAAM;AAC3B,aAAK,QAAQ,MAAM,EAAE,MAAM,GAAG,gCAAgC;AAAA,MAChE,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,YAAY,UAAU;AAC5B,UAAM,YAAY,UAAU;AAE5B,OAAG;AAAA,MACD,KAAK,UAAU;AAAA,QACb,MAAM;AAAA,QACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,QACjC,GAAI,KAAK,MAAM,uBAAuB;AAAA,UACpC,mBAAmB;AAAA,YACjB,uBAAuB,KAAK,MAAM;AAAA,UACpC;AAAA,QACF;AAAA,MACF,CAAC;AAAA,IACH;AACA,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAC3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,OAAO,KAAK;AAEhB,YAAK,KA
AK,MAAM,qBAAqB,KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,WAAG,KAAK,KAAK,UAAU,EAAE,MAAM,OAAO,IAAI,CAAC,CAAC;AAAA,MAC9C;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAGA,SAAG,KAAK,KAAK,UAAU,EAAE,MAAM,GAAG,CAAC,CAAC;AACpC,gBAAU;AAAA,IACZ;AAEA,QAAI;AACJ,UAAM,gBAAgB,CAACC,YAAmB,UAAmB;AAC3D,UAAI,WAAW;AACb,aAAK,MAAM,IAAI,EAAE,WAAW,WAAAA,YAAW,OAAO,WAAW,MAAM,CAAC;AAChE,oBAAY;AAAA,MACd;AAAA,IACF;AAEA,UAAM,aAAa,YAAY;AAC7B,UAAI,gBAAgB;AACpB,YAAM,UAAU,IAAI,gBAAgB,qBAAqB,KAAK,MAAM,QAAQ,GAAG,CAAC;AAChF,aAAO,CAAC,KAAK,UAAU,CAAC,KAAK,gBAAgB,OAAO,SAAS;AAC3D,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,kBAAI,CAAC,eAAe;AAClB,uBAAO,IAAI,MAAM,kBAAkB,CAAC;AAAA,cACtC;AAAA,YACF,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AAEtC,gBAAI,WAAW,QAAQ,KAAK,UAAU,MAAM;AAC1C,oBAAM,OAAO,IAAI,UAAU,OAAO,KAAK,KAAK,OAAO,QAAQ,CAAC;AAC5D,yBAAW,SAAS,QAAQ,MAAM,IAAI,GAAG;AACvC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AAAA,YACF,WAAW,KAAK,SAAS;AACvB,8BAAgB;AAChB,yBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AACA,4BAAc,WAAW,IAAI;AAC7B,mBAAK,MAAM,IAAI,iBAAiB,aAAa;AAE7C,kBAAI,cAAc,aAAa,KAAK,gBAAgB,OAAO,SAAS;AAClE,mBAAG,MAAM;AACT;AAAA,cACF;AAAA,YACF;AAAA,UACF,CAAC;AAAA,QACH,SAAS,KAAK;AAEZ,cAAI,eAAe,SAAS,CAAC,IAAI,QAAQ,SAAS,kBAAkB,GAAG;AACrE,iBAAK,QAAQ,MAAM,EAAE,IAAI,GAAG,+CAA+C;AAAA,UAC7E;AACA;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":["tts","segmentId"]}
|
|
1
|
+
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport {\n AsyncIterableQueue,\n AudioByteStream,\n log,\n shortuuid,\n tokenize,\n tts,\n} from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { URL } from 'node:url';\nimport { type RawData, WebSocket } from 'ws';\nimport type { TTSEncoding, TTSModels } from './models.js';\n\nconst DEFAULT_INACTIVITY_TIMEOUT = 300;\n\ntype Voice = {\n id: string;\n name: string;\n category: string;\n settings?: VoiceSettings;\n};\n\ntype VoiceSettings = {\n stability: number; // 0..1\n similarity_boost: number; // 0..1\n style?: number; // 0..1\n use_speaker_boost: boolean;\n};\n\nconst DEFAULT_VOICE: Voice = {\n id: 'bIHbv24MWmeRgasZH58o',\n name: 'Bella',\n category: 'premade',\n settings: {\n stability: 0.71,\n similarity_boost: 0.5,\n style: 0.0,\n use_speaker_boost: true,\n },\n};\n\nconst API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1/';\nconst AUTHORIZATION_HEADER = 'xi-api-key';\n\nexport interface TTSOptions {\n apiKey?: string;\n voice: Voice;\n modelID: TTSModels | string;\n languageCode?: string;\n baseURL: string;\n encoding: TTSEncoding;\n streamingLatency?: number;\n wordTokenizer: tokenize.WordTokenizer | tokenize.SentenceTokenizer;\n chunkLengthSchedule?: number[];\n enableSsmlParsing: boolean;\n inactivityTimeout: number;\n syncAlignment: boolean;\n autoMode?: boolean;\n}\n\nconst defaultTTSOptionsBase = {\n apiKey: process.env.ELEVEN_API_KEY,\n voice: DEFAULT_VOICE,\n modelID: 'eleven_turbo_v2_5',\n baseURL: API_BASE_URL_V1,\n encoding: 'pcm_22050' as TTSEncoding,\n enableSsmlParsing: false,\n inactivityTimeout: DEFAULT_INACTIVITY_TIMEOUT,\n syncAlignment: true,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n label = 'elevenlabs.TTS';\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(sampleRateFromFormat(opts.encoding || defaultTTSOptionsBase.encoding), 
1, {\n streaming: true,\n });\n\n // Set autoMode to true by default if not provided is Python behavior,\n // but to make it non-breaking, we keep false as default in typescript\n const autoMode = opts.autoMode !== undefined ? opts.autoMode : false;\n\n // Set default tokenizer based on autoMode if not provided\n let wordTokenizer = opts.wordTokenizer;\n if (!wordTokenizer) {\n wordTokenizer = autoMode\n ? new tokenize.basic.SentenceTokenizer()\n : new tokenize.basic.WordTokenizer(false);\n } else if (autoMode && !(wordTokenizer instanceof tokenize.SentenceTokenizer)) {\n // Warn if autoMode is enabled but a WordTokenizer was provided\n log().warn(\n 'autoMode is enabled, it expects full sentences or phrases. ' +\n 'Please provide a SentenceTokenizer instead of a WordTokenizer.',\n );\n }\n\n this.#opts = {\n ...defaultTTSOptionsBase,\n ...opts,\n autoMode,\n wordTokenizer,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY',\n );\n }\n }\n\n async listVoices(): Promise<Voice[]> {\n return fetch(this.#opts.baseURL + '/voices', {\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n },\n })\n .then((data) => data.json())\n .then((data) => {\n const voices: Voice[] = [];\n for (const voice of (\n data as { voices: { voice_id: string; name: string; category: string }[] }\n ).voices) {\n voices.push({\n id: voice.voice_id,\n name: voice.name,\n category: voice.category,\n settings: undefined,\n });\n }\n return voices;\n });\n }\n\n synthesize(): tts.ChunkedStream {\n throw new Error('Chunked responses are not supported on ElevenLabs TTS');\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this, this.#opts);\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n label = 'elevenlabs.SynthesizeStream';\n readonly streamURL: URL;\n\n constructor(tts: TTS, opts: TTSOptions) {\n super(tts);\n 
this.#opts = opts;\n this.closed = false;\n\n // add trailing slash to URL if needed\n const baseURL = opts.baseURL + (opts.baseURL.endsWith('/') ? '' : '/');\n\n this.streamURL = new URL(`text-to-speech/${opts.voice.id}/stream-input`, baseURL);\n const params = {\n model_id: opts.modelID,\n output_format: opts.encoding,\n enable_ssml_parsing: `${opts.enableSsmlParsing}`,\n sync_alignment: `${opts.syncAlignment}`,\n ...(opts.autoMode !== undefined && { auto_mode: `${opts.autoMode}` }),\n ...(opts.languageCode && { language_code: opts.languageCode }),\n ...(opts.inactivityTimeout && { inactivity_timeout: `${opts.inactivityTimeout}` }),\n ...(opts.streamingLatency && { optimize_streaming_latency: `${opts.streamingLatency}` }),\n };\n Object.entries(params).forEach(([k, v]) => this.streamURL.searchParams.append(k, v));\n this.streamURL.protocol = this.streamURL.protocol.replace('http', 'ws');\n }\n\n protected async run() {\n const segments = new AsyncIterableQueue<tokenize.WordStream | tokenize.SentenceStream>();\n\n const tokenizeInput = async () => {\n let stream: tokenize.WordStream | tokenize.SentenceStream | null = null;\n for await (const text of this.input) {\n if (this.abortController.signal.aborted) {\n break;\n }\n if (text === SynthesizeStream.FLUSH_SENTINEL) {\n stream?.endInput();\n stream = null;\n } else {\n if (!stream) {\n stream = this.#opts.wordTokenizer.stream();\n segments.put(stream);\n }\n stream.pushText(text);\n }\n }\n segments.close();\n };\n\n const runStream = async () => {\n for await (const stream of segments) {\n if (this.abortController.signal.aborted) {\n break;\n }\n await this.#runWS(stream);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n }\n };\n\n await Promise.all([tokenizeInput(), runStream()]);\n }\n\n async #runWS(stream: tokenize.WordStream | tokenize.SentenceStream, maxRetry = 3) {\n let retries = 0;\n let ws: WebSocket;\n while (true) {\n ws = new WebSocket(this.streamURL, {\n headers: { [AUTHORIZATION_HEADER]: 
this.#opts.apiKey },\n });\n\n ws.on('error', (error) => {\n this.abortController.abort();\n this.#logger.error({ error }, 'Error connecting to ElevenLabs');\n });\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n break;\n } catch (e) {\n if (retries >= maxRetry) {\n throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`);\n }\n\n const delay = Math.min(retries * 5, 5);\n retries++;\n\n this.#logger.warn(\n `failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`,\n );\n await new Promise((resolve) => setTimeout(resolve, delay * 1000));\n }\n }\n\n const requestId = shortuuid();\n const segmentId = shortuuid();\n\n // simple helper to make sure what we send to ws.send\n const wsSend = (data: {\n // (SynthesizeContent from python)\n text: string;\n // setting flush somehow never finishes the current speech generation\n // https://github.com/livekit/agents-js/pull/820#issuecomment-3517138706\n // flush?: boolean;\n // initialization\n voice_settings?: VoiceSettings;\n generation_config?: {\n chunk_length_schedule: number[];\n };\n }) => {\n ws.send(JSON.stringify(data));\n };\n\n wsSend({\n text: ' ',\n voice_settings: this.#opts.voice.settings,\n ...(this.#opts.chunkLengthSchedule && {\n generation_config: {\n chunk_length_schedule: this.#opts.chunkLengthSchedule,\n },\n }),\n });\n let eosSent = false;\n\n const sendTask = async () => {\n // Determine if we should flush on each chunk (sentence)\n /*const flushOnChunk =\n this.#opts.wordTokenizer instanceof tokenize.SentenceTokenizer &&\n this.#opts.autoMode !== undefined &&\n this.#opts.autoMode;*/\n\n let xmlContent: string[] = [];\n for await (const data of stream) {\n if (this.abortController.signal.aborted) {\n break;\n }\n let text = data.token;\n\n if ((this.#opts.enableSsmlParsing && 
text.startsWith('<phoneme')) || xmlContent.length) {\n xmlContent.push(text);\n if (text.indexOf('</phoneme>') !== -1) {\n text = xmlContent.join(' ');\n xmlContent = [];\n } else {\n continue;\n }\n }\n\n wsSend({\n text: text + ' ', // must always end with a space\n // ...(flushOnChunk && { flush: true }),\n });\n }\n\n if (xmlContent.length) {\n this.#logger.warn('ElevenLabs stream ended with incomplete XML content');\n }\n\n // no more tokens, mark eos with flush\n // setting flush somehow never finishes the current speech generation\n // wsSend({ text: '', flush: true });\n wsSend({ text: '' });\n eosSent = true;\n };\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n const listenTask = async () => {\n let finalReceived = false;\n const bstream = new AudioByteStream(sampleRateFromFormat(this.#opts.encoding), 1);\n while (!this.closed && !this.abortController.signal.aborted) {\n try {\n await new Promise<RawData>((resolve, reject) => {\n ws.removeAllListeners();\n ws.on('message', (data) => resolve(data));\n ws.on('close', (code, reason) => {\n if (!eosSent) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n if (!finalReceived) {\n reject(new Error('WebSocket closed'));\n }\n });\n }).then((msg) => {\n const json = JSON.parse(msg.toString());\n // remove the \"audio\" field from the json object when printing\n if ('audio' in json && json.audio !== null) {\n const data = new Int8Array(Buffer.from(json.audio, 'base64'));\n for (const frame of bstream.write(data)) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n } else if (json.isFinal) {\n finalReceived = true;\n for (const frame of bstream.flush()) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n sendLastFrame(segmentId, true);\n 
this.queue.put(SynthesizeStream.END_OF_STREAM);\n\n if (segmentId === requestId || this.abortController.signal.aborted) {\n ws.close();\n return;\n }\n }\n });\n } catch (err) {\n // skip log error for normal websocket close\n if (err instanceof Error && !err.message.includes('WebSocket closed')) {\n this.#logger.error({ err }, 'Error in listenTask from ElevenLabs WebSocket');\n }\n break;\n }\n }\n };\n\n await Promise.all([sendTask(), listenTask()]);\n }\n}\n\nconst sampleRateFromFormat = (encoding: TTSEncoding): number => {\n return Number(encoding.split('_')[1]);\n};\n"],"mappings":"AAGA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AAEP,SAAS,WAAW;AACpB,SAAuB,iBAAiB;AAGxC,MAAM,6BAA6B;AAgBnC,MAAM,gBAAuB;AAAA,EAC3B,IAAI;AAAA,EACJ,MAAM;AAAA,EACN,UAAU;AAAA,EACV,UAAU;AAAA,IACR,WAAW;AAAA,IACX,kBAAkB;AAAA,IAClB,OAAO;AAAA,IACP,mBAAmB;AAAA,EACrB;AACF;AAEA,MAAM,kBAAkB;AACxB,MAAM,uBAAuB;AAkB7B,MAAM,wBAAwB;AAAA,EAC5B,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,SAAS;AAAA,EACT,SAAS;AAAA,EACT,UAAU;AAAA,EACV,mBAAmB;AAAA,EACnB,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,qBAAqB,KAAK,YAAY,sBAAsB,QAAQ,GAAG,GAAG;AAAA,MAC9E,WAAW;AAAA,IACb,CAAC;AAID,UAAM,WAAW,KAAK,aAAa,SAAY,KAAK,WAAW;AAG/D,QAAI,gBAAgB,KAAK;AACzB,QAAI,CAAC,eAAe;AAClB,sBAAgB,WACZ,IAAI,SAAS,MAAM,kBAAkB,IACrC,IAAI,SAAS,MAAM,cAAc,KAAK;AAAA,IAC5C,WAAW,YAAY,EAAE,yBAAyB,SAAS,oBAAoB;AAE7E,UAAI,EAAE;AAAA,QACJ;AAAA,MAEF;AAAA,IACF;AAEA,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,MACH;AAAA,MACA;AAAA,IACF;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,aAA+B;AACnC,WAAO,MAAM,KAAK,MAAM,UAAU,WAAW;AAAA,MAC3C,SAAS;AAAA,QACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,MACrC;AAAA,IACF,CAAC,EACE,KAAK,CAAC,SAAS,KAAK,KAAK,CAAC,EAC1B,KAAK,CAAC,SAAS;AACd,YAAM,SAAkB,CAAC;AACzB,iBAAW,SACT,KACA,QAAQ;AACR,eAAO,KAAK;AAAA,UACV,IAAI,MAAM;AAAA,UACV,MAAM,MAAM;AAAA,UACZ,UAAU,MAAM;AAAA,UAChB,UAAU;AAAA,QACZ,CAAC;AAAA
,MACH;AACA,aAAO;AAAA,IACT,CAAC;AAAA,EACL;AAAA,EAEA,aAAgC;AAC9B,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,MAAM,KAAK,KAAK;AAAA,EAC9C;AACF;AAEO,MAAM,yBAAyB,IAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,QAAQ;AAAA,EACC;AAAA,EAET,YAAYA,MAAU,MAAkB;AACtC,UAAMA,IAAG;AACT,SAAK,QAAQ;AACb,SAAK,SAAS;AAGd,UAAM,UAAU,KAAK,WAAW,KAAK,QAAQ,SAAS,GAAG,IAAI,KAAK;AAElE,SAAK,YAAY,IAAI,IAAI,kBAAkB,KAAK,MAAM,EAAE,iBAAiB,OAAO;AAChF,UAAM,SAAS;AAAA,MACb,UAAU,KAAK;AAAA,MACf,eAAe,KAAK;AAAA,MACpB,qBAAqB,GAAG,KAAK,iBAAiB;AAAA,MAC9C,gBAAgB,GAAG,KAAK,aAAa;AAAA,MACrC,GAAI,KAAK,aAAa,UAAa,EAAE,WAAW,GAAG,KAAK,QAAQ,GAAG;AAAA,MACnE,GAAI,KAAK,gBAAgB,EAAE,eAAe,KAAK,aAAa;AAAA,MAC5D,GAAI,KAAK,qBAAqB,EAAE,oBAAoB,GAAG,KAAK,iBAAiB,GAAG;AAAA,MAChF,GAAI,KAAK,oBAAoB,EAAE,4BAA4B,GAAG,KAAK,gBAAgB,GAAG;AAAA,IACxF;AACA,WAAO,QAAQ,MAAM,EAAE,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM,KAAK,UAAU,aAAa,OAAO,GAAG,CAAC,CAAC;AACnF,SAAK,UAAU,WAAW,KAAK,UAAU,SAAS,QAAQ,QAAQ,IAAI;AAAA,EACxE;AAAA,EAEA,MAAgB,MAAM;AACpB,UAAM,WAAW,IAAI,mBAAkE;AAEvF,UAAM,gBAAgB,YAAY;AAChC,UAAI,SAA+D;AACnE,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,2CAAQ;AACR,mBAAS;AAAA,QACX,OAAO;AACL,cAAI,CAAC,QAAQ;AACX,qBAAS,KAAK,MAAM,cAAc,OAAO;AACzC,qBAAS,IAAI,MAAM;AAAA,UACrB;AACA,iBAAO,SAAS,IAAI;AAAA,QACtB;AAAA,MACF;AACA,eAAS,MAAM;AAAA,IACjB;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,UAAU,UAAU;AACnC,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,cAAM,KAAK,OAAO,MAAM;AACxB,aAAK,MAAM,IAAI,iBAAiB,aAAa;AAAA,MAC/C;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC;AAAA,EAClD;AAAA,EAEA,MAAM,OAAO,QAAuD,WAAW,GAAG;AAChF,QAAI,UAAU;AACd,QAAI;AACJ,WAAO,MAAM;AACX,WAAK,IAAI,UAAU,KAAK,WAAW;AAAA,QACjC,SAAS,EAAE,CAAC,oBAAoB,GAAG,KAAK,MAAM,OAAO;AAAA,MACvD,CAAC;AAED,SAAG,GAAG,SAAS,CAAC,UAAU;AACxB,aAAK,gBAAgB,MAAM;AAC3B,aAAK,QAAQ,MAAM,EAAE,MAAM,GAAG,gCAAgC;AAAA,MAChE,CAAC;AAED,UAAI;AACF,cAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,aAAG,GAAG,QAAQ,OAAO;AACrB,aAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,aAAG,GAAG,SA
AS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,QAC/D,CAAC;AACD;AAAA,MACF,SAAS,GAAG;AACV,YAAI,WAAW,UAAU;AACvB,gBAAM,IAAI,MAAM,yCAAyC,OAAO,cAAc,CAAC,EAAE;AAAA,QACnF;AAEA,cAAM,QAAQ,KAAK,IAAI,UAAU,GAAG,CAAC;AACrC;AAEA,aAAK,QAAQ;AAAA,UACX,gDAAgD,KAAK,aAAa,CAAC,KAAK,OAAO,IAAI,QAAQ;AAAA,QAC7F;AACA,cAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,QAAQ,GAAI,CAAC;AAAA,MAClE;AAAA,IACF;AAEA,UAAM,YAAY,UAAU;AAC5B,UAAM,YAAY,UAAU;AAG5B,UAAM,SAAS,CAAC,SAWV;AACJ,SAAG,KAAK,KAAK,UAAU,IAAI,CAAC;AAAA,IAC9B;AAEA,WAAO;AAAA,MACL,MAAM;AAAA,MACN,gBAAgB,KAAK,MAAM,MAAM;AAAA,MACjC,GAAI,KAAK,MAAM,uBAAuB;AAAA,QACpC,mBAAmB;AAAA,UACjB,uBAAuB,KAAK,MAAM;AAAA,QACpC;AAAA,MACF;AAAA,IACF,CAAC;AACD,QAAI,UAAU;AAEd,UAAM,WAAW,YAAY;AAO3B,UAAI,aAAuB,CAAC;AAC5B,uBAAiB,QAAQ,QAAQ;AAC/B,YAAI,KAAK,gBAAgB,OAAO,SAAS;AACvC;AAAA,QACF;AACA,YAAI,OAAO,KAAK;AAEhB,YAAK,KAAK,MAAM,qBAAqB,KAAK,WAAW,UAAU,KAAM,WAAW,QAAQ;AACtF,qBAAW,KAAK,IAAI;AACpB,cAAI,KAAK,QAAQ,YAAY,MAAM,IAAI;AACrC,mBAAO,WAAW,KAAK,GAAG;AAC1B,yBAAa,CAAC;AAAA,UAChB,OAAO;AACL;AAAA,UACF;AAAA,QACF;AAEA,eAAO;AAAA,UACL,MAAM,OAAO;AAAA;AAAA;AAAA,QAEf,CAAC;AAAA,MACH;AAEA,UAAI,WAAW,QAAQ;AACrB,aAAK,QAAQ,KAAK,qDAAqD;AAAA,MACzE;AAKA,aAAO,EAAE,MAAM,GAAG,CAAC;AACnB,gBAAU;AAAA,IACZ;AAEA,QAAI;AACJ,UAAM,gBAAgB,CAACC,YAAmB,UAAmB;AAC3D,UAAI,WAAW;AACb,aAAK,MAAM,IAAI,EAAE,WAAW,WAAAA,YAAW,OAAO,WAAW,MAAM,CAAC;AAChE,oBAAY;AAAA,MACd;AAAA,IACF;AAEA,UAAM,aAAa,YAAY;AAC7B,UAAI,gBAAgB;AACpB,YAAM,UAAU,IAAI,gBAAgB,qBAAqB,KAAK,MAAM,QAAQ,GAAG,CAAC;AAChF,aAAO,CAAC,KAAK,UAAU,CAAC,KAAK,gBAAgB,OAAO,SAAS;AAC3D,YAAI;AACF,gBAAM,IAAI,QAAiB,CAAC,SAAS,WAAW;AAC9C,eAAG,mBAAmB;AACtB,eAAG,GAAG,WAAW,CAAC,SAAS,QAAQ,IAAI,CAAC;AACxC,eAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,kBAAI,CAAC,SAAS;AACZ,qBAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AAAA,cACpE;AACA,kBAAI,CAAC,eAAe;AAClB,uBAAO,IAAI,MAAM,kBAAkB,CAAC;AAAA,cACtC;AAAA,YACF,CAAC;AAAA,UACH,CAAC,EAAE,KAAK,CAAC,QAAQ;AACf,kBAAM,OAAO,KAAK,MAAM,IAAI,SAAS,CAAC;AAEtC,gBAAI,WAAW,QAAQ,KAAK,UAAU,MAAM;AAC1C,oBAAM,OAAO,IAAI,UAAU,OAAO,KAAK,KAAK,OAAO,QAAQ,CAAC;AAC5D,yBAAW,SAAS,QAAQ,MAAM,IAAI,GAAG;AACvC,
8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AAAA,YACF,WAAW,KAAK,SAAS;AACvB,8BAAgB;AAChB,yBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,8BAAc,WAAW,KAAK;AAC9B,4BAAY;AAAA,cACd;AACA,4BAAc,WAAW,IAAI;AAC7B,mBAAK,MAAM,IAAI,iBAAiB,aAAa;AAE7C,kBAAI,cAAc,aAAa,KAAK,gBAAgB,OAAO,SAAS;AAClE,mBAAG,MAAM;AACT;AAAA,cACF;AAAA,YACF;AAAA,UACF,CAAC;AAAA,QACH,SAAS,KAAK;AAEZ,cAAI,eAAe,SAAS,CAAC,IAAI,QAAQ,SAAS,kBAAkB,GAAG;AACrE,iBAAK,QAAQ,MAAM,EAAE,IAAI,GAAG,+CAA+C;AAAA,UAC7E;AACA;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,IAAI,CAAC,SAAS,GAAG,WAAW,CAAC,CAAC;AAAA,EAC9C;AACF;AAEA,MAAM,uBAAuB,CAAC,aAAkC;AAC9D,SAAO,OAAO,SAAS,MAAM,GAAG,EAAE,CAAC,CAAC;AACtC;","names":["tts","segmentId"]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-elevenlabs",
|
|
3
|
-
"version": "1.0.17",
|
|
3
|
+
"version": "1.0.18",
|
|
4
4
|
"description": "ElevenLabs plugin for LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"require": "dist/index.cjs",
|
|
@@ -30,16 +30,16 @@
|
|
|
30
30
|
"@types/ws": "^8.5.10",
|
|
31
31
|
"tsup": "^8.3.5",
|
|
32
32
|
"typescript": "^5.0.0",
|
|
33
|
-
"@livekit/agents": "1.0.
|
|
34
|
-
"@livekit/agents-plugin-openai": "1.0.
|
|
35
|
-
"@livekit/agents-plugins-test": "1.0.
|
|
33
|
+
"@livekit/agents": "1.0.18",
|
|
34
|
+
"@livekit/agents-plugin-openai": "1.0.18",
|
|
35
|
+
"@livekit/agents-plugins-test": "1.0.18"
|
|
36
36
|
},
|
|
37
37
|
"dependencies": {
|
|
38
38
|
"ws": "^8.16.0"
|
|
39
39
|
},
|
|
40
40
|
"peerDependencies": {
|
|
41
41
|
"@livekit/rtc-node": "^0.13.12",
|
|
42
|
-
"@livekit/agents": "1.0.
|
|
42
|
+
"@livekit/agents": "1.0.18"
|
|
43
43
|
},
|
|
44
44
|
"scripts": {
|
|
45
45
|
"build": "tsup --onSuccess \"pnpm build:types\"",
|
package/src/tts.ts
CHANGED
|
@@ -53,7 +53,7 @@ export interface TTSOptions {
|
|
|
53
53
|
baseURL: string;
|
|
54
54
|
encoding: TTSEncoding;
|
|
55
55
|
streamingLatency?: number;
|
|
56
|
-
wordTokenizer: tokenize.WordTokenizer;
|
|
56
|
+
wordTokenizer: tokenize.WordTokenizer | tokenize.SentenceTokenizer;
|
|
57
57
|
chunkLengthSchedule?: number[];
|
|
58
58
|
enableSsmlParsing: boolean;
|
|
59
59
|
inactivityTimeout: number;
|
|
@@ -61,13 +61,12 @@ export interface TTSOptions {
|
|
|
61
61
|
autoMode?: boolean;
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
-
const
|
|
64
|
+
const defaultTTSOptionsBase = {
|
|
65
65
|
apiKey: process.env.ELEVEN_API_KEY,
|
|
66
66
|
voice: DEFAULT_VOICE,
|
|
67
67
|
modelID: 'eleven_turbo_v2_5',
|
|
68
68
|
baseURL: API_BASE_URL_V1,
|
|
69
|
-
encoding: 'pcm_22050',
|
|
70
|
-
wordTokenizer: new tokenize.basic.WordTokenizer(false),
|
|
69
|
+
encoding: 'pcm_22050' as TTSEncoding,
|
|
71
70
|
enableSsmlParsing: false,
|
|
72
71
|
inactivityTimeout: DEFAULT_INACTIVITY_TIMEOUT,
|
|
73
72
|
syncAlignment: true,
|
|
@@ -78,13 +77,33 @@ export class TTS extends tts.TTS {
|
|
|
78
77
|
label = 'elevenlabs.TTS';
|
|
79
78
|
|
|
80
79
|
constructor(opts: Partial<TTSOptions> = {}) {
|
|
81
|
-
super(sampleRateFromFormat(opts.encoding ||
|
|
80
|
+
super(sampleRateFromFormat(opts.encoding || defaultTTSOptionsBase.encoding), 1, {
|
|
82
81
|
streaming: true,
|
|
83
82
|
});
|
|
84
83
|
|
|
84
|
+
// The Python implementation defaults autoMode to true when it is not provided,
|
|
85
|
+
// but to make it non-breaking, we keep false as default in typescript
|
|
86
|
+
const autoMode = opts.autoMode !== undefined ? opts.autoMode : false;
|
|
87
|
+
|
|
88
|
+
// Set default tokenizer based on autoMode if not provided
|
|
89
|
+
let wordTokenizer = opts.wordTokenizer;
|
|
90
|
+
if (!wordTokenizer) {
|
|
91
|
+
wordTokenizer = autoMode
|
|
92
|
+
? new tokenize.basic.SentenceTokenizer()
|
|
93
|
+
: new tokenize.basic.WordTokenizer(false);
|
|
94
|
+
} else if (autoMode && !(wordTokenizer instanceof tokenize.SentenceTokenizer)) {
|
|
95
|
+
// Warn if autoMode is enabled but a WordTokenizer was provided
|
|
96
|
+
log().warn(
|
|
97
|
+
'autoMode is enabled, it expects full sentences or phrases. ' +
|
|
98
|
+
'Please provide a SentenceTokenizer instead of a WordTokenizer.',
|
|
99
|
+
);
|
|
100
|
+
}
|
|
101
|
+
|
|
85
102
|
this.#opts = {
|
|
86
|
-
...
|
|
103
|
+
...defaultTTSOptionsBase,
|
|
87
104
|
...opts,
|
|
105
|
+
autoMode,
|
|
106
|
+
wordTokenizer,
|
|
88
107
|
};
|
|
89
108
|
|
|
90
109
|
if (this.#opts.apiKey === undefined) {
|
|
@@ -156,10 +175,10 @@ export class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
156
175
|
}
|
|
157
176
|
|
|
158
177
|
protected async run() {
|
|
159
|
-
const segments = new AsyncIterableQueue<tokenize.WordStream>();
|
|
178
|
+
const segments = new AsyncIterableQueue<tokenize.WordStream | tokenize.SentenceStream>();
|
|
160
179
|
|
|
161
180
|
const tokenizeInput = async () => {
|
|
162
|
-
let stream: tokenize.WordStream | null = null;
|
|
181
|
+
let stream: tokenize.WordStream | tokenize.SentenceStream | null = null;
|
|
163
182
|
for await (const text of this.input) {
|
|
164
183
|
if (this.abortController.signal.aborted) {
|
|
165
184
|
break;
|
|
@@ -191,7 +210,7 @@ export class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
191
210
|
await Promise.all([tokenizeInput(), runStream()]);
|
|
192
211
|
}
|
|
193
212
|
|
|
194
|
-
async #runWS(stream: tokenize.WordStream, maxRetry = 3) {
|
|
213
|
+
async #runWS(stream: tokenize.WordStream | tokenize.SentenceStream, maxRetry = 3) {
|
|
195
214
|
let retries = 0;
|
|
196
215
|
let ws: WebSocket;
|
|
197
216
|
while (true) {
|
|
@@ -229,20 +248,40 @@ export class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
229
248
|
const requestId = shortuuid();
|
|
230
249
|
const segmentId = shortuuid();
|
|
231
250
|
|
|
232
|
-
ws.send
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
251
|
+
// simple helper that type-checks the payload before handing it to ws.send
|
|
252
|
+
const wsSend = (data: {
|
|
253
|
+
// (SynthesizeContent from python)
|
|
254
|
+
text: string;
|
|
255
|
+
// setting flush somehow never finishes the current speech generation
|
|
256
|
+
// https://github.com/livekit/agents-js/pull/820#issuecomment-3517138706
|
|
257
|
+
// flush?: boolean;
|
|
258
|
+
// initialization
|
|
259
|
+
voice_settings?: VoiceSettings;
|
|
260
|
+
generation_config?: {
|
|
261
|
+
chunk_length_schedule: number[];
|
|
262
|
+
};
|
|
263
|
+
}) => {
|
|
264
|
+
ws.send(JSON.stringify(data));
|
|
265
|
+
};
|
|
266
|
+
|
|
267
|
+
wsSend({
|
|
268
|
+
text: ' ',
|
|
269
|
+
voice_settings: this.#opts.voice.settings,
|
|
270
|
+
...(this.#opts.chunkLengthSchedule && {
|
|
271
|
+
generation_config: {
|
|
272
|
+
chunk_length_schedule: this.#opts.chunkLengthSchedule,
|
|
273
|
+
},
|
|
241
274
|
}),
|
|
242
|
-
);
|
|
275
|
+
});
|
|
243
276
|
let eosSent = false;
|
|
244
277
|
|
|
245
278
|
const sendTask = async () => {
|
|
279
|
+
// Determine if we should flush on each chunk (sentence)
|
|
280
|
+
/*const flushOnChunk =
|
|
281
|
+
this.#opts.wordTokenizer instanceof tokenize.SentenceTokenizer &&
|
|
282
|
+
this.#opts.autoMode !== undefined &&
|
|
283
|
+
this.#opts.autoMode;*/
|
|
284
|
+
|
|
246
285
|
let xmlContent: string[] = [];
|
|
247
286
|
for await (const data of stream) {
|
|
248
287
|
if (this.abortController.signal.aborted) {
|
|
@@ -260,15 +299,20 @@ export class SynthesizeStream extends tts.SynthesizeStream {
|
|
|
260
299
|
}
|
|
261
300
|
}
|
|
262
301
|
|
|
263
|
-
|
|
302
|
+
wsSend({
|
|
303
|
+
text: text + ' ', // must always end with a space
|
|
304
|
+
// ...(flushOnChunk && { flush: true }),
|
|
305
|
+
});
|
|
264
306
|
}
|
|
265
307
|
|
|
266
308
|
if (xmlContent.length) {
|
|
267
309
|
this.#logger.warn('ElevenLabs stream ended with incomplete XML content');
|
|
268
310
|
}
|
|
269
311
|
|
|
270
|
-
// no more tokens, mark eos
|
|
271
|
-
|
|
312
|
+
// no more tokens, mark eos (flush is intentionally not sent, see below)
|
|
313
|
+
// setting flush somehow never finishes the current speech generation
|
|
314
|
+
// wsSend({ text: '', flush: true });
|
|
315
|
+
wsSend({ text: '' });
|
|
272
316
|
eosSent = true;
|
|
273
317
|
};
|
|
274
318
|
|