@livekit/agents-plugin-cartesia 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/tts.js ADDED
@@ -0,0 +1,234 @@
1
+ import { AudioByteStream, log, tokenize, tts } from "@livekit/agents";
2
+ import { randomUUID } from "node:crypto";
3
+ import { request } from "node:https";
4
+ import { WebSocket } from "ws";
5
+ import {
6
+ TTSDefaultVoiceId
7
+ } from "./models.js";
8
// Header that carries the API key on the HTTPS request made by ChunkedStream.
const AUTHORIZATION_HEADER = "X-API-Key";
// Header that carries the API version on the HTTPS request made by ChunkedStream.
const VERSION_HEADER = "Cartesia-Version";
// API version string; sent in VERSION_HEADER and in the WebSocket query string.
const VERSION = "2024-06-10";
// Channel count passed to the base TTS class and to every AudioByteStream.
const NUM_CHANNELS = 1;
// Second constructor argument of the sentence tokenizer used by SynthesizeStream.
const BUFFERED_WORDS_COUNT = 8;

// Baseline options. The TTS constructor spreads caller-supplied options on top
// of this object, so every key here acts as a fallback value.
const defaultTTSOptions = {
  model: "sonic-english",
  encoding: "pcm_s16le",
  sampleRate: 24000,
  voice: TTSDefaultVoiceId,
  // Read once, when this module is first evaluated.
  apiKey: process.env.CARTESIA_API_KEY,
  language: "en"
};
21
/**
 * Cartesia text-to-speech engine.
 *
 * Options are resolved once in the constructor and the same options object is
 * handed to every ChunkedStream / SynthesizeStream created afterwards.
 */
class TTS extends tts.TTS {
  #opts;
  label = "cartesia.TTS";

  /**
   * @param opts - Overrides for `defaultTTSOptions`. Keys whose value is
   *   `undefined` are ignored, so they fall back to the default instead of
   *   erasing it.
   * @throws Error when no API key is available from `opts.apiKey` or from the
   *   default (`$CARTESIA_API_KEY`).
   */
  constructor(opts = {}) {
    super(opts.sampleRate || defaultTTSOptions.sampleRate, NUM_CHANNELS, {
      streaming: true
    });

    // A plain spread would let `{ sampleRate: undefined }` or
    // `{ apiKey: undefined }` overwrite a perfectly good default.
    const overrides = { ...opts };
    for (const key of Object.keys(overrides)) {
      if (overrides[key] === void 0) {
        delete overrides[key];
      }
    }

    this.#opts = {
      ...defaultTTSOptions,
      ...overrides,
      // Same expression as the one given to super(), so the rate advertised by
      // the base class and the rate used by the streams cannot diverge.
      sampleRate: opts.sampleRate || defaultTTSOptions.sampleRate
    };

    // An empty string is as unusable as a missing key; reject both up front.
    if (!this.#opts.apiKey) {
      throw new Error(
        "Cartesia API key is required, whether as an argument or as $CARTESIA_API_KEY"
      );
    }
  }

  // TODO(nbsp): updateOptions

  /** Creates a ChunkedStream that synthesizes `text` with a single HTTPS request. */
  synthesize(text) {
    return new ChunkedStream(this, text, this.#opts);
  }

  /** Creates a SynthesizeStream that synthesizes incrementally over a WebSocket. */
  stream() {
    return new SynthesizeStream(this, this.#opts);
  }
}
46
/**
 * One-shot synthesis: POSTs the whole text to `api.cartesia.ai/tts/bytes` and
 * pushes the returned bytes onto `this.queue` as audio frames.
 */
class ChunkedStream extends tts.ChunkedStream {
  label = "cartesia.ChunkedStream";
  #opts;
  #text;
  #logger = log();

  /**
   * @param tts2 - The TTS instance this stream belongs to.
   * @param text - Text to synthesize.
   * @param opts - Resolved TTS options.
   */
  constructor(tts2, text, opts) {
    super(text, tts2);
    this.#text = text;
    this.#opts = opts;
    // Fire-and-forget: every failure inside #run is handled by closing the queue.
    this.#run();
  }

  async #run() {
    const requestId = randomUUID();
    const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS);
    const payload = JSON.stringify({
      ...toCartesiaOptions(this.#opts),
      transcript: this.#text
    });

    // The queue can be closed from several event handlers; make that idempotent.
    let queueClosed = false;
    const closeQueue = () => {
      if (!queueClosed) {
        queueClosed = true;
        this.queue.close();
      }
    };
    const enqueue = (frames) => {
      if (queueClosed) {
        return;
      }
      for (const frame of frames) {
        this.queue.put({
          requestId,
          frame,
          final: false,
          segmentId: requestId
        });
      }
    };

    const req = request(
      {
        hostname: "api.cartesia.ai",
        port: 443,
        path: "/tts/bytes",
        method: "POST",
        headers: {
          [AUTHORIZATION_HEADER]: this.#opts.apiKey,
          [VERSION_HEADER]: VERSION,
          // The body is JSON; say so explicitly.
          "Content-Type": "application/json"
        }
      },
      (res) => {
        const status = res.statusCode ?? 0;
        if (status < 200 || status >= 300) {
          // An error body is not audio; never feed it to the byte stream.
          this.#logger.error(`Cartesia returned HTTP ${status} for /tts/bytes`);
          res.resume();
          closeQueue();
          return;
        }
        res.on("data", (chunk) => enqueue(bstream.write(chunk)));
        // Without a listener an 'error' event would be thrown; 'close' still
        // follows and finishes the queue.
        res.on("error", (error) => {
          this.#logger.error(`Cartesia response stream failed: ${error}`);
        });
        res.on("close", () => {
          enqueue(bstream.flush());
          closeQueue();
        });
      }
    );

    // Connection-level failures (DNS, TLS, reset) arrive here, possibly before
    // any response exists; close the queue so consumers are not left waiting.
    req.on("error", (error) => {
      this.#logger.error(`Cartesia request failed: ${error}`);
      closeQueue();
    });

    req.write(payload);
    req.end();
  }
}
101
/**
 * Incremental synthesis over Cartesia's WebSocket endpoint.
 *
 * Text pushed into `this.input` is split into sentences by the tokenizer; each
 * sentence is sent to the socket under one context id and the audio that comes
 * back is pushed onto `this.queue`.
 */
class SynthesizeStream extends tts.SynthesizeStream {
  #opts;
  #logger = log();
  #tokenizer = new tokenize.basic.SentenceTokenizer(void 0, BUFFERED_WORDS_COUNT).stream();
  label = "cartesia.SynthesizeStream";

  /**
   * @param tts2 - The TTS instance this stream belongs to.
   * @param opts - Resolved TTS options.
   */
  constructor(tts2, opts) {
    super(tts2);
    this.#opts = opts;
    // NOTE(review): #run is not awaited, so an error thrown from it surfaces as
    // an unhandled promise rejection rather than to the caller.
    this.#run();
  }

  async #run() {
    const requestId = randomUUID();
    // Set right before we close the socket ourselves, so the 'close' handler
    // can tell an expected shutdown from an unexpected one.
    let closing = false;

    // Forwards tokenized sentences to the socket, then marks the context as finished.
    const sentenceStreamTask = async (ws2) => {
      const packet = toCartesiaOptions(this.#opts);
      const sendTranscript = (transcript, more) => {
        ws2.send(
          JSON.stringify({
            ...packet,
            context_id: requestId,
            transcript,
            continue: more
          })
        );
      };
      for await (const event of this.#tokenizer) {
        sendTranscript(event.token + " ", true);
      }
      sendTranscript(" ", false);
    };

    // Feeds caller input into the tokenizer, honouring flush requests.
    const inputTask = async () => {
      for await (const data of this.input) {
        if (data === SynthesizeStream.FLUSH_SENTINEL) {
          this.#tokenizer.flush();
          continue;
        }
        this.#tokenizer.pushText(data);
      }
      this.#tokenizer.endInput();
      this.#tokenizer.close();
    };

    // Registers the socket listeners that turn server messages into queued frames.
    const recvTask = async (ws2) => {
      const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS);

      // Frames are emitted one step late so that the last frame of a segment
      // can be flagged `final: true` once the server reports completion.
      let lastFrame;
      const sendLastFrame = (segmentId, final) => {
        if (lastFrame) {
          this.queue.put({ requestId, segmentId, frame: lastFrame, final });
          lastFrame = void 0;
        }
      };
      const stageFrames = (segmentId, frames) => {
        for (const frame of frames) {
          sendLastFrame(segmentId, false);
          lastFrame = frame;
        }
      };

      ws2.on("message", (raw) => {
        // An exception thrown inside an event listener is uncaught, so guard the parse.
        let json;
        try {
          json = JSON.parse(raw.toString());
        } catch (error) {
          this.#logger.error(`ignoring malformed message from Cartesia: ${error}`);
          return;
        }
        if (typeof json !== "object" || json === null) {
          this.#logger.error("ignoring non-object message from Cartesia");
          return;
        }

        const segmentId = json.context_id;
        if (typeof json.data === "string") {
          stageFrames(segmentId, bstream.write(new Int8Array(Buffer.from(json.data, "base64"))));
        } else if ("done" in json) {
          stageFrames(segmentId, bstream.flush());
          sendLastFrame(segmentId, true);
          this.queue.put(SynthesizeStream.END_OF_STREAM);
          if (segmentId === requestId) {
            closing = true;
            ws2.close();
          }
        }
      });

      ws2.on("close", (code, reason) => {
        if (!closing) {
          this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);
        }
        ws2.removeAllListeners();
      });
    };

    // The key goes into a query string, so it must be percent-encoded.
    const url = `wss://api.cartesia.ai/tts/websocket?api_key=${encodeURIComponent(this.#opts.apiKey)}&cartesia_version=${VERSION}`;
    const ws = new WebSocket(url);
    const describe = (error) => (error instanceof Error ? error.message : String(error));

    let opened = false;
    try {
      await new Promise((resolve, reject) => {
        ws.once("open", () => {
          opened = true;
          resolve();
        });
        // Stays attached for the socket's lifetime: an 'error' event with no
        // listener would be thrown. After the handshake it can only be logged.
        ws.on("error", (error) => {
          if (opened) {
            this.#logger.error(`Cartesia WebSocket error: ${describe(error)}`);
          } else {
            reject(error);
          }
        });
        ws.on("close", (code) => {
          if (!opened) {
            reject(new Error(`WebSocket returned ${code}`));
          }
        });
      });
    } catch (e) {
      throw new Error(`failed to connect to Cartesia: ${describe(e)}`);
    }

    try {
      await Promise.all([inputTask(), sentenceStreamTask(ws), recvTask(ws)]);
    } catch (e) {
      // The connection itself succeeded; do not report this as a connect failure.
      throw new Error(`Cartesia synthesis stream failed: ${describe(e)}`);
    }
  }
}
199
+ const toCartesiaOptions = (opts) => {
200
+ const voice = {};
201
+ if (typeof opts.voice === "string") {
202
+ voice.mode = "id";
203
+ voice.id = opts.voice;
204
+ } else {
205
+ voice.mode = "embedding";
206
+ voice.embedding = opts.voice;
207
+ }
208
+ const voiceControls = {};
209
+ if (opts.speed) {
210
+ voiceControls.speed = opts.speed;
211
+ }
212
+ if (opts.emotion) {
213
+ voiceControls.emotion = opts.emotion;
214
+ }
215
+ if (Object.keys({}).length) {
216
+ voice.__experimental_controls = voiceControls;
217
+ }
218
+ return {
219
+ model_id: opts.model,
220
+ voice,
221
+ output_format: {
222
+ container: "raw",
223
+ encoding: opts.encoding,
224
+ sample_rate: opts.sampleRate
225
+ },
226
+ language: opts.language
227
+ };
228
+ };
229
+ export {
230
+ ChunkedStream,
231
+ SynthesizeStream,
232
+ TTS
233
+ };
234
+ //# sourceMappingURL=tts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { AudioByteStream, log, tokenize, tts } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { randomUUID } from 'node:crypto';\nimport { request } from 'node:https';\nimport { WebSocket } from 'ws';\nimport {\n TTSDefaultVoiceId,\n type TTSEncoding,\n type TTSModels,\n type TTSVoiceEmotion,\n type TTSVoiceSpeed,\n} from './models.js';\n\nconst AUTHORIZATION_HEADER = 'X-API-Key';\nconst VERSION_HEADER = 'Cartesia-Version';\nconst VERSION = '2024-06-10';\nconst NUM_CHANNELS = 1;\nconst BUFFERED_WORDS_COUNT = 8;\n\nexport interface TTSOptions {\n model: TTSModels | string;\n encoding: TTSEncoding;\n sampleRate: number;\n voice: string | number[];\n speed?: TTSVoiceSpeed | number;\n emotion?: (TTSVoiceEmotion | string)[];\n apiKey?: string;\n language: string;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n model: 'sonic-english',\n encoding: 'pcm_s16le',\n sampleRate: 24000,\n voice: TTSDefaultVoiceId,\n apiKey: process.env.CARTESIA_API_KEY,\n language: 'en',\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n label = 'cartesia.TTS';\n\n constructor(opts: Partial<TTSOptions> = {}) {\n super(opts.sampleRate || defaultTTSOptions.sampleRate, NUM_CHANNELS, {\n streaming: true,\n });\n\n this.#opts = {\n ...defaultTTSOptions,\n ...opts,\n };\n\n if (this.#opts.apiKey === undefined) {\n throw new Error(\n 'Cartesia API key is required, whether as an argument or as $CARTESIA_API_KEY',\n );\n }\n }\n\n // TODO(nbsp): updateOptions\n\n synthesize(text: string): tts.ChunkedStream {\n return new ChunkedStream(this, text, this.#opts);\n }\n\n stream(): tts.SynthesizeStream {\n return new SynthesizeStream(this, this.#opts);\n }\n}\n\nexport class ChunkedStream extends tts.ChunkedStream {\n label = 'cartesia.ChunkedStream';\n #opts: TTSOptions;\n #text: string;\n\n // set 
Promise<T> to any because OpenAI returns an annoying Response type\n constructor(tts: TTS, text: string, opts: TTSOptions) {\n super(text, tts);\n this.#text = text;\n this.#opts = opts;\n this.#run();\n }\n\n async #run() {\n const requestId = randomUUID();\n const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS);\n const json = toCartesiaOptions(this.#opts);\n json.transcript = this.#text;\n\n const req = request(\n {\n hostname: 'api.cartesia.ai',\n port: 443,\n path: '/tts/bytes',\n method: 'POST',\n headers: {\n [AUTHORIZATION_HEADER]: this.#opts.apiKey!,\n [VERSION_HEADER]: VERSION,\n },\n },\n (res) => {\n res.on('data', (chunk) => {\n for (const frame of bstream.write(chunk)) {\n this.queue.put({\n requestId,\n frame,\n final: false,\n segmentId: requestId,\n });\n }\n });\n res.on('close', () => {\n for (const frame of bstream.flush()) {\n this.queue.put({\n requestId,\n frame,\n final: false,\n segmentId: requestId,\n });\n }\n this.queue.close();\n });\n },\n );\n\n req.write(JSON.stringify(json));\n req.end();\n }\n}\n\nexport class SynthesizeStream extends tts.SynthesizeStream {\n #opts: TTSOptions;\n #logger = log();\n #tokenizer = new tokenize.basic.SentenceTokenizer(undefined, BUFFERED_WORDS_COUNT).stream();\n label = 'cartesia.SynthesizeStream';\n\n constructor(tts: TTS, opts: TTSOptions) {\n super(tts);\n this.#opts = opts;\n this.#run();\n }\n\n async #run() {\n const requestId = randomUUID();\n let closing = false;\n\n const sentenceStreamTask = async (ws: WebSocket) => {\n const packet = toCartesiaOptions(this.#opts);\n for await (const event of this.#tokenizer) {\n ws.send(\n JSON.stringify({\n ...packet,\n context_id: requestId,\n transcript: event.token + ' ',\n continue: true,\n }),\n );\n }\n\n ws.send(\n JSON.stringify({\n ...packet,\n context_id: requestId,\n transcript: ' ',\n continue: false,\n }),\n );\n };\n\n const inputTask = async () => {\n for await (const data of this.input) {\n if (data === 
SynthesizeStream.FLUSH_SENTINEL) {\n this.#tokenizer.flush();\n continue;\n }\n this.#tokenizer.pushText(data);\n }\n this.#tokenizer.endInput();\n this.#tokenizer.close();\n };\n\n const recvTask = async (ws: WebSocket) => {\n const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS);\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n ws.on('message', (data) => {\n const json = JSON.parse(data.toString());\n const segmentId = json.context_id;\n if ('data' in json) {\n const data = new Int8Array(Buffer.from(json.data, 'base64'));\n for (const frame of bstream.write(data)) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n } else if ('done' in json) {\n for (const frame of bstream.flush()) {\n sendLastFrame(segmentId, false);\n lastFrame = frame;\n }\n sendLastFrame(segmentId, true);\n this.queue.put(SynthesizeStream.END_OF_STREAM);\n\n if (segmentId === requestId) {\n closing = true;\n ws.close();\n return;\n }\n }\n });\n ws.on('close', (code, reason) => {\n if (!closing) {\n this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);\n }\n ws.removeAllListeners();\n });\n };\n\n const url = `wss://api.cartesia.ai/tts/websocket?api_key=${this.#opts.apiKey}&cartesia_version=${VERSION}`;\n const ws = new WebSocket(url);\n\n try {\n await new Promise((resolve, reject) => {\n ws.on('open', resolve);\n ws.on('error', (error) => reject(error));\n ws.on('close', (code) => reject(`WebSocket returned ${code}`));\n });\n\n await Promise.all([inputTask(), sentenceStreamTask(ws), recvTask(ws)]);\n } catch (e) {\n throw new Error(`failed to connect to Cartesia: ${e}`);\n }\n }\n}\n\nconst toCartesiaOptions = (opts: TTSOptions): { [id: string]: unknown } => {\n const voice: { [id: string]: unknown } = {};\n if (typeof opts.voice === 'string') {\n voice.mode = 
'id';\n voice.id = opts.voice;\n } else {\n voice.mode = 'embedding';\n voice.embedding = opts.voice;\n }\n\n const voiceControls: { [id: string]: unknown } = {};\n if (opts.speed) {\n voiceControls.speed = opts.speed;\n }\n if (opts.emotion) {\n voiceControls.emotion = opts.emotion;\n }\n\n if (Object.keys({}).length) {\n voice.__experimental_controls = voiceControls;\n }\n\n return {\n model_id: opts.model,\n voice,\n output_format: {\n container: 'raw',\n encoding: opts.encoding,\n sample_rate: opts.sampleRate,\n },\n language: opts.language,\n };\n};\n"],"mappings":"AAGA,SAAS,iBAAiB,KAAK,UAAU,WAAW;AAEpD,SAAS,kBAAkB;AAC3B,SAAS,eAAe;AACxB,SAAS,iBAAiB;AAC1B;AAAA,EACE;AAAA,OAKK;AAEP,MAAM,uBAAuB;AAC7B,MAAM,iBAAiB;AACvB,MAAM,UAAU;AAChB,MAAM,eAAe;AACrB,MAAM,uBAAuB;AAa7B,MAAM,oBAAgC;AAAA,EACpC,OAAO;AAAA,EACP,UAAU;AAAA,EACV,YAAY;AAAA,EACZ,OAAO;AAAA,EACP,QAAQ,QAAQ,IAAI;AAAA,EACpB,UAAU;AACZ;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA,QAAQ;AAAA,EAER,YAAY,OAA4B,CAAC,GAAG;AAC1C,UAAM,KAAK,cAAc,kBAAkB,YAAY,cAAc;AAAA,MACnE,WAAW;AAAA,IACb,CAAC;AAED,SAAK,QAAQ;AAAA,MACX,GAAG;AAAA,MACH,GAAG;AAAA,IACL;AAEA,QAAI,KAAK,MAAM,WAAW,QAAW;AACnC,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAIA,WAAW,MAAiC;AAC1C,WAAO,IAAI,cAAc,MAAM,MAAM,KAAK,KAAK;AAAA,EACjD;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,iBAAiB,MAAM,KAAK,KAAK;AAAA,EAC9C;AACF;AAEO,MAAM,sBAAsB,IAAI,cAAc;AAAA,EACnD,QAAQ;AAAA,EACR;AAAA,EACA;AAAA;AAAA,EAGA,YAAYA,MAAU,MAAc,MAAkB;AACpD,UAAM,MAAMA,IAAG;AACf,SAAK,QAAQ;AACb,SAAK,QAAQ;AACb,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,YAAY,WAAW;AAC7B,UAAM,UAAU,IAAI,gBAAgB,KAAK,MAAM,YAAY,YAAY;AACvE,UAAM,OAAO,kBAAkB,KAAK,KAAK;AACzC,SAAK,aAAa,KAAK;AAEvB,UAAM,MAAM;AAAA,MACV;AAAA,QACE,UAAU;AAAA,QACV,MAAM;AAAA,QACN,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,SAAS;AAAA,UACP,CAAC,oBAAoB,GAAG,KAAK,MAAM;AAAA,UACnC,CAAC,cAAc,GAAG;AAAA,QACpB;AAAA,MACF;AAAA,MACA,CAAC,QAAQ;AACP,YAAI,GAAG,QAAQ,CAAC,UAAU;AACxB,qBAAW,SAAS,QAAQ,MAAM,KAAK,GAAG;AACxC,iBAAK,MAAM,IAAI;AAAA,cACb;AAAA,cACA;AAAA,cACA,OAAO;AAAA,cACP,WAAW;
AAAA,YACb,CAAC;AAAA,UACH;AAAA,QACF,CAAC;AACD,YAAI,GAAG,SAAS,MAAM;AACpB,qBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,iBAAK,MAAM,IAAI;AAAA,cACb;AAAA,cACA;AAAA,cACA,OAAO;AAAA,cACP,WAAW;AAAA,YACb,CAAC;AAAA,UACH;AACA,eAAK,MAAM,MAAM;AAAA,QACnB,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,MAAM,KAAK,UAAU,IAAI,CAAC;AAC9B,QAAI,IAAI;AAAA,EACV;AACF;AAEO,MAAM,yBAAyB,IAAI,iBAAiB;AAAA,EACzD;AAAA,EACA,UAAU,IAAI;AAAA,EACd,aAAa,IAAI,SAAS,MAAM,kBAAkB,QAAW,oBAAoB,EAAE,OAAO;AAAA,EAC1F,QAAQ;AAAA,EAER,YAAYA,MAAU,MAAkB;AACtC,UAAMA,IAAG;AACT,SAAK,QAAQ;AACb,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,YAAY,WAAW;AAC7B,QAAI,UAAU;AAEd,UAAM,qBAAqB,OAAOC,QAAkB;AAClD,YAAM,SAAS,kBAAkB,KAAK,KAAK;AAC3C,uBAAiB,SAAS,KAAK,YAAY;AACzC,QAAAA,IAAG;AAAA,UACD,KAAK,UAAU;AAAA,YACb,GAAG;AAAA,YACH,YAAY;AAAA,YACZ,YAAY,MAAM,QAAQ;AAAA,YAC1B,UAAU;AAAA,UACZ,CAAC;AAAA,QACH;AAAA,MACF;AAEA,MAAAA,IAAG;AAAA,QACD,KAAK,UAAU;AAAA,UACb,GAAG;AAAA,UACH,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,UAAU;AAAA,QACZ,CAAC;AAAA,MACH;AAAA,IACF;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,QAAQ,KAAK,OAAO;AACnC,YAAI,SAAS,iBAAiB,gBAAgB;AAC5C,eAAK,WAAW,MAAM;AACtB;AAAA,QACF;AACA,aAAK,WAAW,SAAS,IAAI;AAAA,MAC/B;AACA,WAAK,WAAW,SAAS;AACzB,WAAK,WAAW,MAAM;AAAA,IACxB;AAEA,UAAM,WAAW,OAAOA,QAAkB;AACxC,YAAM,UAAU,IAAI,gBAAgB,KAAK,MAAM,YAAY,YAAY;AAEvE,UAAI;AACJ,YAAM,gBAAgB,CAAC,WAAmB,UAAmB;AAC3D,YAAI,WAAW;AACb,eAAK,MAAM,IAAI,EAAE,WAAW,WAAW,OAAO,WAAW,MAAM,CAAC;AAChE,sBAAY;AAAA,QACd;AAAA,MACF;AAEA,MAAAA,IAAG,GAAG,WAAW,CAAC,SAAS;AACzB,cAAM,OAAO,KAAK,MAAM,KAAK,SAAS,CAAC;AACvC,cAAM,YAAY,KAAK;AACvB,YAAI,UAAU,MAAM;AAClB,gBAAMC,QAAO,IAAI,UAAU,OAAO,KAAK,KAAK,MAAM,QAAQ,CAAC;AAC3D,qBAAW,SAAS,QAAQ,MAAMA,KAAI,GAAG;AACvC,0BAAc,WAAW,KAAK;AAC9B,wBAAY;AAAA,UACd;AAAA,QACF,WAAW,UAAU,MAAM;AACzB,qBAAW,SAAS,QAAQ,MAAM,GAAG;AACnC,0BAAc,WAAW,KAAK;AAC9B,wBAAY;AAAA,UACd;AACA,wBAAc,WAAW,IAAI;AAC7B,eAAK,MAAM,IAAI,iBAAiB,aAAa;AAE7C,cAAI,cAAc,WAAW;AAC3B,sBAAU;AACV,YAAAD,IAAG,MAAM;AACT;AAAA,UACF;AAAA,QACF;AAAA,MACF,CAAC;AACD,MAAAA,IAAG,GAAG,SAAS,CAAC,MAAM,WAAW;AAC/B,YAAI,CAAC,SAAS;AACZ,eAAK,QAAQ,MAAM,8BAA8B,IAAI,KAAK,MAAM,EAAE;AA
AA,QACpE;AACA,QAAAA,IAAG,mBAAmB;AAAA,MACxB,CAAC;AAAA,IACH;AAEA,UAAM,MAAM,+CAA+C,KAAK,MAAM,MAAM,qBAAqB,OAAO;AACxG,UAAM,KAAK,IAAI,UAAU,GAAG;AAE5B,QAAI;AACF,YAAM,IAAI,QAAQ,CAAC,SAAS,WAAW;AACrC,WAAG,GAAG,QAAQ,OAAO;AACrB,WAAG,GAAG,SAAS,CAAC,UAAU,OAAO,KAAK,CAAC;AACvC,WAAG,GAAG,SAAS,CAAC,SAAS,OAAO,sBAAsB,IAAI,EAAE,CAAC;AAAA,MAC/D,CAAC;AAED,YAAM,QAAQ,IAAI,CAAC,UAAU,GAAG,mBAAmB,EAAE,GAAG,SAAS,EAAE,CAAC,CAAC;AAAA,IACvE,SAAS,GAAG;AACV,YAAM,IAAI,MAAM,kCAAkC,CAAC,EAAE;AAAA,IACvD;AAAA,EACF;AACF;AAEA,MAAM,oBAAoB,CAAC,SAAgD;AACzE,QAAM,QAAmC,CAAC;AAC1C,MAAI,OAAO,KAAK,UAAU,UAAU;AAClC,UAAM,OAAO;AACb,UAAM,KAAK,KAAK;AAAA,EAClB,OAAO;AACL,UAAM,OAAO;AACb,UAAM,YAAY,KAAK;AAAA,EACzB;AAEA,QAAM,gBAA2C,CAAC;AAClD,MAAI,KAAK,OAAO;AACd,kBAAc,QAAQ,KAAK;AAAA,EAC7B;AACA,MAAI,KAAK,SAAS;AAChB,kBAAc,UAAU,KAAK;AAAA,EAC/B;AAEA,MAAI,OAAO,KAAK,CAAC,CAAC,EAAE,QAAQ;AAC1B,UAAM,0BAA0B;AAAA,EAClC;AAEA,SAAO;AAAA,IACL,UAAU,KAAK;AAAA,IACf;AAAA,IACA,eAAe;AAAA,MACb,WAAW;AAAA,MACX,UAAU,KAAK;AAAA,MACf,aAAa,KAAK;AAAA,IACpB;AAAA,IACA,UAAU,KAAK;AAAA,EACjB;AACF;","names":["tts","ws","data"]}
@@ -0,0 +1,9 @@
1
+ "use strict";
2
+ var import_agents_plugin_openai = require("@livekit/agents-plugin-openai");
3
+ var import_agents_plugins_test = require("@livekit/agents-plugins-test");
4
+ var import_vitest = require("vitest");
5
+ var import_tts = require("./tts.cjs");
6
// Runs the shared TTS plugin test helper against a default-configured Cartesia
// TTS, passing an OpenAI STT instance as the helper's second argument.
(0, import_vitest.describe)("Cartesia", async () => {
  const synthesizer = new import_tts.TTS();
  const transcriber = new import_agents_plugin_openai.STT();
  await (0, import_agents_plugins_test.tts)(synthesizer, transcriber);
});
9
+ //# sourceMappingURL=tts.test.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/tts.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { STT } from '@livekit/agents-plugin-openai';\nimport { tts } from '@livekit/agents-plugins-test';\nimport { describe } from 'vitest';\nimport { TTS } from './tts.js';\n\ndescribe('Cartesia', async () => {\n await tts(new TTS(), new STT());\n});\n"],"mappings":";AAGA,kCAAoB;AACpB,iCAAoB;AACpB,oBAAyB;AACzB,iBAAoB;AAAA,IAEpB,wBAAS,YAAY,YAAY;AAC/B,YAAM,gCAAI,IAAI,eAAI,GAAG,IAAI,gCAAI,CAAC;AAChC,CAAC;","names":[]}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=tts.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tts.test.d.ts","sourceRoot":"","sources":["../src/tts.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,8 @@
1
+ import { STT } from "@livekit/agents-plugin-openai";
2
+ import { tts } from "@livekit/agents-plugins-test";
3
+ import { describe } from "vitest";
4
+ import { TTS } from "./tts.js";
5
// Runs the shared TTS plugin test helper against a default-configured Cartesia
// TTS, passing an OpenAI STT instance as the helper's second argument.
describe("Cartesia", async () => {
  const synthesizer = new TTS();
  const transcriber = new STT();
  await tts(synthesizer, transcriber);
});
8
+ //# sourceMappingURL=tts.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/tts.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { STT } from '@livekit/agents-plugin-openai';\nimport { tts } from '@livekit/agents-plugins-test';\nimport { describe } from 'vitest';\nimport { TTS } from './tts.js';\n\ndescribe('Cartesia', async () => {\n await tts(new TTS(), new STT());\n});\n"],"mappings":"AAGA,SAAS,WAAW;AACpB,SAAS,WAAW;AACpB,SAAS,gBAAgB;AACzB,SAAS,WAAW;AAEpB,SAAS,YAAY,YAAY;AAC/B,QAAM,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,CAAC;AAChC,CAAC;","names":[]}
package/package.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "name": "@livekit/agents-plugin-cartesia",
3
+ "version": "0.1.0",
4
+ "description": "Cartesia plugin for LiveKit Node Agents",
5
+ "main": "dist/index.js",
6
+ "require": "dist/index.cjs",
7
+ "types": "dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "import": "./dist/index.js",
12
+ "require": "./dist/index.cjs"
13
+ }
14
+ },
15
+ "author": "LiveKit",
16
+ "type": "module",
17
+ "repository": "git@github.com:livekit/agents-js.git",
18
+ "license": "Apache-2.0",
19
+ "files": [
20
+ "dist",
21
+ "src",
22
+ "README.md"
23
+ ],
24
+ "devDependencies": {
25
+ "@livekit/agents": "^x",
26
+ "@livekit/agents-plugin-openai": "^x",
27
+ "@livekit/agents-plugins-test": "^x",
28
+ "@livekit/rtc-node": "^0.13.1",
29
+ "@microsoft/api-extractor": "^7.35.0",
30
+ "@types/ws": "^8.5.10",
31
+ "tsup": "^8.3.5",
32
+ "typescript": "^5.0.0"
33
+ },
34
+ "dependencies": {
35
+ "ws": "^8.16.0"
36
+ },
37
+ "peerDependencies": {
38
+ "@livekit/rtc-node": "^0.13.1",
39
+ "@livekit/agents": "^0.6.1"
40
+ },
41
+ "scripts": {
42
+ "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
43
+ "clean": "rm -rf dist",
44
+ "clean:build": "pnpm clean && pnpm build",
45
+ "lint": "eslint -f unix \"src/**/*.{ts,js}\"",
46
+ "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript",
47
+ "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose"
48
+ }
49
+ }
package/src/index.ts ADDED
@@ -0,0 +1,5 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ export * from './tts.js';
package/src/models.ts ADDED
@@ -0,0 +1,45 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
/** Model identifiers known to this plugin. */
export type TTSModels = 'sonic-english' | 'sonic-multilingual';

/** Language codes known to this plugin. */
export type TTSLanguages = 'en' | 'es' | 'fr' | 'de' | 'pt' | 'zh' | 'ja';

/** Voice id used when the caller does not choose a voice. */
export const TTSDefaultVoiceId = 'c2ac25f9-ecc4-4f56-9095-651354df60c0';

/** Named speaking speeds. */
export type TTSVoiceSpeed = 'fastest' | 'fast' | 'normal' | 'slow' | 'slowest';

/**
 * Emotion tags: a bare emotion name, or the name followed by one of the level
 * suffixes `:lowest`, `:low`, `:high`, `:highest`.
 */
export type TTSVoiceEmotion =
  | 'anger:lowest' | 'anger:low' | 'anger' | 'anger:high' | 'anger:highest'
  | 'positivity:lowest' | 'positivity:low' | 'positivity' | 'positivity:high' | 'positivity:highest'
  | 'surprise:lowest' | 'surprise:low' | 'surprise' | 'surprise:high' | 'surprise:highest'
  | 'sadness:lowest' | 'sadness:low' | 'sadness' | 'sadness:high' | 'sadness:highest'
  | 'curiosity:lowest' | 'curiosity:low' | 'curiosity' | 'curiosity:high' | 'curiosity:highest';

/** Audio sample encodings accepted by this plugin. */
export type TTSEncoding =
  // XXX(nbsp): not yet supported
  // | 'pcm_f32le'
  // | 'pcm_mulaw'
  // | 'pcm_alaw'
  'pcm_s16le';
@@ -0,0 +1,11 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { STT } from '@livekit/agents-plugin-openai';
5
+ import { tts } from '@livekit/agents-plugins-test';
6
+ import { describe } from 'vitest';
7
+ import { TTS } from './tts.js';
8
+
9
// Runs the shared TTS plugin test helper against a default-configured Cartesia
// TTS, passing an OpenAI STT instance as the helper's second argument.
describe('Cartesia', async () => {
  const synthesizer = new TTS();
  const transcriber = new STT();
  await tts(synthesizer, transcriber);
});
package/src/tts.ts ADDED
@@ -0,0 +1,278 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AudioByteStream, log, tokenize, tts } from '@livekit/agents';
5
+ import type { AudioFrame } from '@livekit/rtc-node';
6
+ import { randomUUID } from 'node:crypto';
7
+ import { request } from 'node:https';
8
+ import { WebSocket } from 'ws';
9
+ import {
10
+ TTSDefaultVoiceId,
11
+ type TTSEncoding,
12
+ type TTSModels,
13
+ type TTSVoiceEmotion,
14
+ type TTSVoiceSpeed,
15
+ } from './models.js';
16
+
17
/** Header that carries the API key on the HTTPS request made by ChunkedStream. */
const AUTHORIZATION_HEADER = 'X-API-Key';
/** Header that carries the API version on the HTTPS request made by ChunkedStream. */
const VERSION_HEADER = 'Cartesia-Version';
/** API version string; sent in VERSION_HEADER and in the WebSocket query string. */
const VERSION = '2024-06-10';
/** Channel count passed to the base TTS class and to every AudioByteStream. */
const NUM_CHANNELS = 1;
/** Second constructor argument of the sentence tokenizer used by SynthesizeStream. */
const BUFFERED_WORDS_COUNT = 8;

/** Options accepted by the Cartesia TTS plugin. */
export interface TTSOptions {
  /** Model identifier; sent as `model_id`. */
  model: TTSModels | string;
  /** Sample encoding; sent as `output_format.encoding`. */
  encoding: TTSEncoding;
  /** Sample rate; sent as `output_format.sample_rate` and used to frame the audio. */
  sampleRate: number;
  /** A voice id (string) or a voice embedding (number array). */
  voice: string | number[];
  /** Optional speed control: a named speed or a number. */
  speed?: TTSVoiceSpeed | number;
  /** Optional list of emotion tags. */
  emotion?: (TTSVoiceEmotion | string)[];
  /** API key; the default comes from `$CARTESIA_API_KEY`. */
  apiKey?: string;
  /** Language code; sent as `language`. */
  language: string;
}

/**
 * Baseline options. The TTS constructor spreads caller-supplied options on top
 * of this object, so every key here acts as a fallback value.
 */
const defaultTTSOptions: TTSOptions = {
  model: 'sonic-english',
  encoding: 'pcm_s16le',
  sampleRate: 24_000,
  voice: TTSDefaultVoiceId,
  // Read once, when this module is first evaluated.
  apiKey: process.env.CARTESIA_API_KEY,
  language: 'en',
};
42
+
43
/**
 * Cartesia text-to-speech engine.
 *
 * Options are resolved once in the constructor and the same options object is
 * handed to every ChunkedStream / SynthesizeStream created afterwards.
 */
export class TTS extends tts.TTS {
  #opts: TTSOptions;
  label = 'cartesia.TTS';

  /**
   * @param opts - Overrides for `defaultTTSOptions`. Keys whose value is
   *   `undefined` are ignored, so they fall back to the default instead of
   *   erasing it.
   * @throws Error when no API key is available from `opts.apiKey` or from the
   *   default (`$CARTESIA_API_KEY`).
   */
  constructor(opts: Partial<TTSOptions> = {}) {
    super(opts.sampleRate || defaultTTSOptions.sampleRate, NUM_CHANNELS, {
      streaming: true,
    });

    // A plain spread would let `{ sampleRate: undefined }` or
    // `{ apiKey: undefined }` overwrite a perfectly good default.
    const overrides: Partial<TTSOptions> = { ...opts };
    for (const key of Object.keys(overrides) as (keyof TTSOptions)[]) {
      if (overrides[key] === undefined) {
        delete overrides[key];
      }
    }

    this.#opts = {
      ...defaultTTSOptions,
      ...overrides,
      // Same expression as the one given to super(), so the rate advertised by
      // the base class and the rate used by the streams cannot diverge.
      sampleRate: opts.sampleRate || defaultTTSOptions.sampleRate,
    };

    // An empty string is as unusable as a missing key; reject both up front.
    if (!this.#opts.apiKey) {
      throw new Error(
        'Cartesia API key is required, whether as an argument or as $CARTESIA_API_KEY',
      );
    }
  }

  // TODO(nbsp): updateOptions

  /** Creates a ChunkedStream that synthesizes `text` with a single HTTPS request. */
  synthesize(text: string): tts.ChunkedStream {
    return new ChunkedStream(this, text, this.#opts);
  }

  /** Creates a SynthesizeStream that synthesizes incrementally over a WebSocket. */
  stream(): tts.SynthesizeStream {
    return new SynthesizeStream(this, this.#opts);
  }
}
74
+
75
/**
 * One-shot synthesis: POSTs the whole text to `api.cartesia.ai/tts/bytes` and
 * pushes the returned bytes onto `this.queue` as audio frames.
 */
export class ChunkedStream extends tts.ChunkedStream {
  label = 'cartesia.ChunkedStream';
  #opts: TTSOptions;
  #text: string;
  #logger = log();

  /**
   * @param tts - The TTS instance this stream belongs to.
   * @param text - Text to synthesize.
   * @param opts - Resolved TTS options.
   */
  constructor(tts: TTS, text: string, opts: TTSOptions) {
    super(text, tts);
    this.#text = text;
    this.#opts = opts;
    // Fire-and-forget: every failure inside #run is handled by closing the queue.
    this.#run();
  }

  async #run() {
    const requestId = randomUUID();
    const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS);
    const payload = JSON.stringify({
      ...toCartesiaOptions(this.#opts),
      transcript: this.#text,
    });

    // The queue can be closed from several event handlers; make that idempotent.
    let queueClosed = false;
    const closeQueue = () => {
      if (!queueClosed) {
        queueClosed = true;
        this.queue.close();
      }
    };
    const enqueue = (frames: Iterable<AudioFrame>) => {
      if (queueClosed) {
        return;
      }
      for (const frame of frames) {
        this.queue.put({
          requestId,
          frame,
          final: false,
          segmentId: requestId,
        });
      }
    };

    const req = request(
      {
        hostname: 'api.cartesia.ai',
        port: 443,
        path: '/tts/bytes',
        method: 'POST',
        headers: {
          [AUTHORIZATION_HEADER]: this.#opts.apiKey!,
          [VERSION_HEADER]: VERSION,
          // The body is JSON; say so explicitly.
          'Content-Type': 'application/json',
        },
      },
      (res) => {
        const status = res.statusCode ?? 0;
        if (status < 200 || status >= 300) {
          // An error body is not audio; never feed it to the byte stream.
          this.#logger.error(`Cartesia returned HTTP ${status} for /tts/bytes`);
          res.resume();
          closeQueue();
          return;
        }
        res.on('data', (chunk) => enqueue(bstream.write(chunk)));
        // Without a listener an 'error' event would be thrown; 'close' still
        // follows and finishes the queue.
        res.on('error', (error) => {
          this.#logger.error(`Cartesia response stream failed: ${error}`);
        });
        res.on('close', () => {
          enqueue(bstream.flush());
          closeQueue();
        });
      },
    );

    // Connection-level failures (DNS, TLS, reset) arrive here, possibly before
    // any response exists; close the queue so consumers are not left waiting.
    req.on('error', (error) => {
      this.#logger.error(`Cartesia request failed: ${error}`);
      closeQueue();
    });

    req.write(payload);
    req.end();
  }
}
134
+
135
/**
 * Incremental synthesis over Cartesia's WebSocket endpoint.
 *
 * Text pushed into `this.input` is split into sentences by the tokenizer; each
 * sentence is sent to the socket under one context id and the audio that comes
 * back is pushed onto `this.queue`.
 */
export class SynthesizeStream extends tts.SynthesizeStream {
  #opts: TTSOptions;
  #logger = log();
  #tokenizer = new tokenize.basic.SentenceTokenizer(undefined, BUFFERED_WORDS_COUNT).stream();
  label = 'cartesia.SynthesizeStream';

  /**
   * @param tts - The TTS instance this stream belongs to.
   * @param opts - Resolved TTS options.
   */
  constructor(tts: TTS, opts: TTSOptions) {
    super(tts);
    this.#opts = opts;
    // NOTE(review): #run is not awaited, so an error thrown from it surfaces as
    // an unhandled promise rejection rather than to the caller.
    this.#run();
  }

  async #run() {
    const requestId = randomUUID();
    // Set right before we close the socket ourselves, so the 'close' handler
    // can tell an expected shutdown from an unexpected one.
    let closing = false;

    // Forwards tokenized sentences to the socket, then marks the context as finished.
    const sentenceStreamTask = async (ws: WebSocket) => {
      const packet = toCartesiaOptions(this.#opts);
      const sendTranscript = (transcript: string, more: boolean) => {
        ws.send(
          JSON.stringify({
            ...packet,
            context_id: requestId,
            transcript,
            continue: more,
          }),
        );
      };
      for await (const event of this.#tokenizer) {
        sendTranscript(event.token + ' ', true);
      }
      sendTranscript(' ', false);
    };

    // Feeds caller input into the tokenizer, honouring flush requests.
    const inputTask = async () => {
      for await (const data of this.input) {
        if (data === SynthesizeStream.FLUSH_SENTINEL) {
          this.#tokenizer.flush();
          continue;
        }
        this.#tokenizer.pushText(data);
      }
      this.#tokenizer.endInput();
      this.#tokenizer.close();
    };

    // Registers the socket listeners that turn server messages into queued frames.
    const recvTask = async (ws: WebSocket) => {
      const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS);

      // Frames are emitted one step late so that the last frame of a segment
      // can be flagged `final: true` once the server reports completion.
      let lastFrame: AudioFrame | undefined;
      const sendLastFrame = (segmentId: string, final: boolean) => {
        if (lastFrame) {
          this.queue.put({ requestId, segmentId, frame: lastFrame, final });
          lastFrame = undefined;
        }
      };
      const stageFrames = (segmentId: string, frames: Iterable<AudioFrame>) => {
        for (const frame of frames) {
          sendLastFrame(segmentId, false);
          lastFrame = frame;
        }
      };

      ws.on('message', (raw) => {
        // An exception thrown inside an event listener is uncaught, so guard the parse.
        let parsed: unknown;
        try {
          parsed = JSON.parse(raw.toString());
        } catch (error) {
          this.#logger.error(`ignoring malformed message from Cartesia: ${error}`);
          return;
        }
        if (typeof parsed !== 'object' || parsed === null) {
          this.#logger.error('ignoring non-object message from Cartesia');
          return;
        }

        // presumably every server message carries `context_id` — confirm against the API docs
        const json = parsed as { context_id: string; data?: unknown };
        const segmentId = json.context_id;
        if (typeof json.data === 'string') {
          stageFrames(segmentId, bstream.write(new Int8Array(Buffer.from(json.data, 'base64'))));
        } else if ('done' in json) {
          stageFrames(segmentId, bstream.flush());
          sendLastFrame(segmentId, true);
          this.queue.put(SynthesizeStream.END_OF_STREAM);

          if (segmentId === requestId) {
            closing = true;
            ws.close();
          }
        }
      });

      ws.on('close', (code, reason) => {
        if (!closing) {
          this.#logger.error(`WebSocket closed with code ${code}: ${reason}`);
        }
        ws.removeAllListeners();
      });
    };

    // The key goes into a query string, so it must be percent-encoded.
    const url = `wss://api.cartesia.ai/tts/websocket?api_key=${encodeURIComponent(this.#opts.apiKey!)}&cartesia_version=${VERSION}`;
    const ws = new WebSocket(url);
    const describe = (error: unknown) => (error instanceof Error ? error.message : String(error));

    let opened = false;
    try {
      await new Promise<void>((resolve, reject) => {
        ws.once('open', () => {
          opened = true;
          resolve();
        });
        // Stays attached for the socket's lifetime: an 'error' event with no
        // listener would be thrown. After the handshake it can only be logged.
        ws.on('error', (error) => {
          if (opened) {
            this.#logger.error(`Cartesia WebSocket error: ${describe(error)}`);
          } else {
            reject(error);
          }
        });
        ws.on('close', (code) => {
          if (!opened) {
            reject(new Error(`WebSocket returned ${code}`));
          }
        });
      });
    } catch (e) {
      throw new Error(`failed to connect to Cartesia: ${describe(e)}`);
    }

    try {
      await Promise.all([inputTask(), sentenceStreamTask(ws), recvTask(ws)]);
    } catch (e) {
      // The connection itself succeeded; do not report this as a connect failure.
      throw new Error(`Cartesia synthesis stream failed: ${describe(e)}`);
    }
  }
}
245
+
246
/**
 * Translates plugin options into the request fields sent to Cartesia.
 *
 * @param opts - Resolved TTS options.
 * @returns An object with `model_id`, `voice`, `output_format` and `language`.
 *   `voice` uses mode "id" when `opts.voice` is a string and mode "embedding"
 *   otherwise. When a speed and/or emotion is set, it also carries them under
 *   `__experimental_controls`.
 */
const toCartesiaOptions = (opts: TTSOptions): { [id: string]: unknown } => {
  const voice: { [id: string]: unknown } =
    typeof opts.voice === 'string'
      ? { mode: 'id', id: opts.voice }
      : { mode: 'embedding', embedding: opts.voice };

  const voiceControls: { [id: string]: unknown } = {};
  if (opts.speed) {
    voiceControls.speed = opts.speed;
  }
  if (opts.emotion) {
    voiceControls.emotion = opts.emotion;
  }

  // Inspect the controls that were actually collected. The previous check
  // looked at a fresh `{}`, so the controls were never attached.
  if (Object.keys(voiceControls).length > 0) {
    voice.__experimental_controls = voiceControls;
  }

  return {
    model_id: opts.model,
    voice,
    output_format: {
      container: 'raw',
      encoding: opts.encoding,
      sample_rate: opts.sampleRate,
    },
    language: opts.language,
  };
};