@livekit/agents-plugin-deepgram 1.0.10 → 1.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tts.ts ADDED
@@ -0,0 +1,352 @@
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+ //
+ // SPDX-License-Identifier: Apache-2.0
+ import { AudioByteStream, shortuuid, tokenize, tts } from '@livekit/agents';
+ import type { AudioFrame } from '@livekit/rtc-node';
+ import { request } from 'node:https';
+ import { type RawData, WebSocket } from 'ws';
+ import type { TTSEncoding, TTSModels } from './models.js';
+
+ const AUTHORIZATION_HEADER = 'Authorization';
+ const NUM_CHANNELS = 1;
+ const MIN_SENTENCE_LENGTH = 8;
+
+ export interface TTSOptions {
+   model: TTSModels | string;
+   encoding: TTSEncoding;
+   sampleRate: number;
+   apiKey?: string;
+   baseUrl?: string;
+   sentenceTokenizer: tokenize.SentenceTokenizer;
+   capabilities: tts.TTSCapabilities;
+ }
+
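+ // Defaults: Deepgram Aura (aura-asteria-en) voice, 24 kHz mono linear16 PCM,
+ // API key read from $DEEPGRAM_API_KEY, sentences batched to at least 8 characters.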
+ const defaultTTSOptions: TTSOptions = {
+   model: 'aura-asteria-en',
+   encoding: 'linear16',
+   sampleRate: 24000,
+   apiKey: process.env.DEEPGRAM_API_KEY,
+   baseUrl: 'https://api.deepgram.com',
+   capabilities: {
+     streaming: true,
+   },
+   sentenceTokenizer: new tokenize.basic.SentenceTokenizer({
+     minSentenceLength: MIN_SENTENCE_LENGTH,
+   }),
+ };
+
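+ // Plugin entry point: synthesize() runs a one-shot HTTPS request against /v1/speak,
+ // stream() keeps a WebSocket session open and speaks sentences as they arrive.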
+ export class TTS extends tts.TTS {
+   private opts: TTSOptions;
+   label = 'deepgram.TTS';
+
+   constructor(opts: Partial<TTSOptions> = {}) {
+     super(opts.sampleRate || defaultTTSOptions.sampleRate, NUM_CHANNELS, {
+       streaming: opts.capabilities?.streaming ?? defaultTTSOptions.capabilities.streaming,
+     });
+
+     this.opts = {
+       ...defaultTTSOptions,
+       ...opts,
+     };
+
+     if (this.opts.apiKey === undefined) {
+       throw new Error(
+         'Deepgram API key is required, whether as an argument or as $DEEPGRAM_API_KEY',
+       );
+     }
+   }
+
+   synthesize(text: string): tts.ChunkedStream {
+     return new ChunkedStream(this, text, this.opts);
+   }
+
+   stream(): tts.SynthesizeStream {
+     return new SynthesizeStream(this, this.opts);
+   }
+ }
+
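+ // Non-streaming path: POSTs the full text to /v1/speak and repackages the raw PCM
+ // response into AudioFrames on the stream's queue.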
+ export class ChunkedStream extends tts.ChunkedStream {
+   label = 'deepgram.ChunkedStream';
+   private opts: TTSOptions;
+   private text: string;
+
+   constructor(tts: TTS, text: string, opts: TTSOptions) {
+     super(text, tts);
+     this.text = text;
+     this.opts = opts;
+   }
+
+   protected async run() {
+     const requestId = shortuuid();
+     const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
+     const json = { text: this.text };
+     const url = new URL(`${this.opts.baseUrl!}/v1/speak`);
+     url.searchParams.append('sample_rate', this.opts.sampleRate.toString());
+     url.searchParams.append('model', this.opts.model);
+     url.searchParams.append('encoding', this.opts.encoding);
+
+     await new Promise<void>((resolve, reject) => {
+       const req = request(
+         {
+           hostname: url.hostname,
+           port: 443,
+           path: url.pathname + url.search,
+           method: 'POST',
+           headers: {
+             [AUTHORIZATION_HEADER]: `Token ${this.opts.apiKey!}`,
+             'Content-Type': 'application/json',
+           },
+         },
+         (res) => {
+           if (res.statusCode !== 200) {
+             reject(
+               new Error(`Deepgram TTS HTTP request failed: ${res.statusCode} ${res.statusMessage}`),
+             );
+             return;
+           }
+
+           res.on('data', (chunk) => {
+             for (const frame of bstream.write(chunk)) {
+               if (!this.queue.closed) {
+                 this.queue.put({
+                   requestId,
+                   frame,
+                   final: false,
+                   segmentId: requestId,
+                 });
+               }
+             }
+           });
+
+           res.on('error', (err) => {
+             reject(err);
+           });
+
+           res.on('close', () => {
+             for (const frame of bstream.flush()) {
+               if (!this.queue.closed) {
+                 this.queue.put({
+                   requestId,
+                   frame,
+                   final: false,
+                   segmentId: requestId,
+                 });
+               }
+             }
+             if (!this.queue.closed) {
+               this.queue.close();
+             }
+             resolve();
+           });
+         },
+       );
+
+       req.on('error', (err) => {
+         reject(err);
+       });
+
+       req.write(JSON.stringify(json));
+       req.end();
+     });
+   }
+ }
+
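+ // Streaming path: buffers incoming text into sentences, speaks each sentence over the
+ // /v1/speak WebSocket, and converts the binary audio messages back into AudioFrames.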
+ export class SynthesizeStream extends tts.SynthesizeStream {
+   private opts: TTSOptions;
+   private tokenizer: tokenize.SentenceStream;
+   label = 'deepgram.SynthesizeStream';
+
+   private static readonly FLUSH_MSG = JSON.stringify({ type: 'Flush' });
+   private static readonly CLOSE_MSG = JSON.stringify({ type: 'Close' });
+
+   constructor(tts: TTS, opts: TTSOptions) {
+     super(tts);
+     this.opts = opts;
+     this.tokenizer = opts.sentenceTokenizer.stream();
+   }
+
+   private async closeWebSocket(ws: WebSocket): Promise<void> {
+     try {
+       // Send Flush and Close messages to ensure Deepgram processes all remaining audio
+       // and properly terminates the session, preventing lingering TTS sessions
+       if (ws.readyState === WebSocket.OPEN) {
+         ws.send(SynthesizeStream.FLUSH_MSG);
+         ws.send(SynthesizeStream.CLOSE_MSG);
+
+         // Wait for server acknowledgment to prevent race conditions and ensure
+         // proper cleanup, avoiding 429 Too Many Requests errors from lingering sessions
+         try {
+           await new Promise<void>((resolve, _reject) => {
+             const timeout = setTimeout(() => {
+               resolve();
+             }, 1000);
+
+             ws.once('message', () => {
+               clearTimeout(timeout);
+               resolve();
+             });
+
+             ws.once('close', () => {
+               clearTimeout(timeout);
+               resolve();
+             });
+
+             ws.once('error', () => {
+               clearTimeout(timeout);
+               resolve();
+             });
+           });
+         } catch (e) {
+           // Ignore timeout or other errors during close sequence
+         }
+       }
+     } catch (e) {
+       console.warn(`Error during WebSocket close sequence: ${e}`);
+     } finally {
+       if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) {
+         ws.close();
+       }
+     }
+   }
+
+   protected async run() {
+     const requestId = shortuuid();
+     const segmentId = shortuuid();
+
+     const wsUrl = this.opts.baseUrl!.replace(/^http/, 'ws');
+     const url = new URL(`${wsUrl}/v1/speak`);
+     url.searchParams.append('sample_rate', this.opts.sampleRate.toString());
+     url.searchParams.append('model', this.opts.model);
+     url.searchParams.append('encoding', this.opts.encoding);
+
+     const ws = new WebSocket(url, {
+       headers: {
+         [AUTHORIZATION_HEADER]: `Token ${this.opts.apiKey!}`,
+       },
+     });
+
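+     // Wait for the socket to open before starting the pump tasks; a connection
+     // error or an early close rejects and aborts the run.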
+     await new Promise((resolve, reject) => {
+       ws.on('open', resolve);
+       ws.on('error', (error) => reject(error));
+       ws.on('close', (code) => reject(`WebSocket returned ${code}`));
+     });
+
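+     // Three concurrent tasks: inputTask feeds incoming text into the sentence tokenizer,
+     // sendTask forwards each sentence as a Speak message, and recvTask collects audio
+     // until the server reports Flushed (or the socket closes).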
+     const inputTask = async () => {
+       for await (const data of this.input) {
+         if (data === SynthesizeStream.FLUSH_SENTINEL) {
+           this.tokenizer.flush();
+           continue;
+         }
+         this.tokenizer.pushText(data);
+       }
+       this.tokenizer.endInput();
+       this.tokenizer.close();
+     };
+
+     const sendTask = async () => {
+       for await (const event of this.tokenizer) {
+         if (this.abortController.signal.aborted) break;
+
+         let text = event.token;
+         if (!text.endsWith(' ')) {
+           text += ' ';
+         }
+
+         const message = JSON.stringify({
+           type: 'Speak',
+           text: text,
+         });
+
+         ws.send(message);
+       }
+
+       if (!this.abortController.signal.aborted) {
+         ws.send(SynthesizeStream.FLUSH_MSG);
+       }
+     };
+
+     const recvTask = async () => {
+       const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
+       let finalReceived = false;
+       let timeout: NodeJS.Timeout | null = null;
+       let lastFrame: AudioFrame | undefined;
+
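+       // Hold back the most recent frame so the last one emitted for a segment
+       // can be flagged with final: true.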
+       const sendLastFrame = (segmentId: string, final: boolean) => {
+         if (lastFrame && !this.queue.closed) {
+           this.queue.put({ requestId, segmentId, frame: lastFrame, final });
+           lastFrame = undefined;
+         }
+       };
+
+       const clearMessageTimeout = () => {
+         if (timeout) {
+           clearTimeout(timeout);
+           timeout = null;
+         }
+       };
+
+       return new Promise<void>((resolve, reject) => {
+         ws.on('message', (data: RawData, isBinary: boolean) => {
+           clearMessageTimeout();
+
+           if (!isBinary) {
+             const message = JSON.parse(data.toString());
+             if (message.type === 'Flushed') {
+               finalReceived = true;
+               clearMessageTimeout();
+               for (const frame of bstream.flush()) {
+                 sendLastFrame(segmentId, false);
+                 lastFrame = frame;
+               }
+               sendLastFrame(segmentId, true);
+
+               if (!this.queue.closed) {
+                 this.queue.put(SynthesizeStream.END_OF_STREAM);
+               }
+               resolve();
+             }
+
+             return;
+           }
+
+           const buffer =
+             data instanceof Buffer
+               ? data.buffer.slice(data.byteOffset, data.byteOffset + data.byteLength)
+               : (data as ArrayBuffer);
+           for (const frame of bstream.write(buffer as ArrayBuffer)) {
+             sendLastFrame(segmentId, false);
+             lastFrame = frame;
+           }
+         });
+
+         ws.on('close', (_code, _reason) => {
+           if (!finalReceived) {
+             for (const frame of bstream.flush()) {
+               sendLastFrame(segmentId, false);
+               lastFrame = frame;
+             }
+             sendLastFrame(segmentId, true);
+
+             if (!this.queue.closed) {
+               this.queue.put(SynthesizeStream.END_OF_STREAM);
+             }
+           }
+           resolve();
+         });
+
+         ws.on('error', (error) => {
+           clearMessageTimeout();
+           reject(error);
+         });
+       });
+     };
+
+     try {
+       await Promise.all([inputTask(), sendTask(), recvTask()]);
+     } catch (e) {
+       throw new Error(`failed in main task: ${e}`);
+     } finally {
+       await this.closeWebSocket(ws);
+     }
+   }
+ }
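
For orientation, a minimal usage sketch of the new module (editorial addition, not part of the published diff; it assumes the plugin exports TTS from this file and that tts.ChunkedStream is async-iterable, as elsewhere in @livekit/agents):

// Hypothetical example; requires DEEPGRAM_API_KEY in the environment.
import { TTS } from '@livekit/agents-plugin-deepgram';

const deepgramTTS = new TTS({ model: 'aura-asteria-en', sampleRate: 24000 });

// One-shot synthesis over HTTPS; each item carries an AudioFrame of linear16 PCM.
const stream = deepgramTTS.synthesize('Hello from the Deepgram TTS plugin.');
for await (const audio of stream) {
  console.log(`received frame with ${audio.frame.samplesPerChannel} samples`);
}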