@livekit/agents-plugin-deepgram 1.0.30 → 1.0.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/stt_v2.ts ADDED
@@ -0,0 +1,489 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import {
5
+ type APIConnectOptions,
6
+ AudioByteStream,
7
+ Event,
8
+ calculateAudioDurationSeconds,
9
+ log,
10
+ stt,
11
+ } from '@livekit/agents';
12
+ import type { AudioFrame } from '@livekit/rtc-node';
13
+ import * as queryString from 'node:querystring';
14
+ import { WebSocket } from 'ws';
15
+ import { PeriodicCollector } from './_utils.js';
16
+ import type { V2Models } from './models.js';
17
+
18
// Control message telling Deepgram the client is done sending audio;
// sent by the stream's send task on normal (non-reconnect) shutdown.
const _CLOSE_MSG = JSON.stringify({ type: 'CloseStream' });
19
+
20
+ // --- Configuration ---
21
+
22
/**
 * Configuration options for STTv2 (Deepgram Flux model).
 *
 * Most fields map directly onto query parameters of the `/v2/listen`
 * WebSocket endpoint (see `#getDeepgramUrl` in the stream implementation).
 */
export interface STTv2Options {
  /** Deepgram API key; falls back to the `DEEPGRAM_API_KEY` env var when omitted. */
  apiKey?: string;
  /** Model identifier (e.g. `flux-general-en`); sent as the `model` param. */
  model: V2Models | string;
  /** Input sample rate in Hz; audio is sent as `linear16` PCM at this rate. */
  sampleRate: number;
  /** Key terms to bias recognition; sent as repeated `keyterm` params. */
  keyterms: string[];
  /** Endpoint URL; `http(s)` schemes are rewritten to `ws(s)` before connecting. */
  endpointUrl: string;
  /** Language code forwarded as the `language` param when set. */
  language?: string;
  /** Eager end-of-turn threshold; when set, enables PREFLIGHT_TRANSCRIPT events. */
  eagerEotThreshold?: number;
  /** End-of-turn detection threshold (`eot_threshold` param). */
  eotThreshold?: number;
  /** End-of-turn timeout in milliseconds (`eot_timeout_ms` param). */
  eotTimeoutMs?: number;
  /** Opt out of Deepgram's data-collection program (`mip_opt_out` param). */
  mipOptOut?: boolean;
  /** Tags for usage reporting; each must be at most 128 characters. */
  tags?: string[];
}
38
+
39
// Defaults merged under caller-supplied options in the STTv2 constructor.
// `apiKey` deliberately has no default: it is resolved separately from the
// DEEPGRAM_API_KEY environment variable.
const defaultSTTv2Options: Omit<STTv2Options, 'apiKey'> = {
  model: 'flux-general-en',
  sampleRate: 16000,
  keyterms: [],
  endpointUrl: 'wss://api.deepgram.com/v2/listen',
  language: 'en',
  mipOptOut: false,
};
47
+
48
+ function validateTags(tags: string[]): string[] {
49
+ for (const tag of tags) {
50
+ if (tag.length > 128) {
51
+ throw new Error('tag must be no more than 128 characters');
52
+ }
53
+ }
54
+ return tags;
55
+ }
56
+
57
+ /**
58
+ * Deepgram STTv2 using the Flux model for streaming speech-to-text.
59
+ *
60
+ * This uses Deepgram's V2 API (`/v2/listen`) which provides turn-based
61
+ * transcription with support for preemptive generation.
62
+ *
63
+ * @remarks
64
+ * Key differences from STT (V1):
65
+ * - Uses `TurnInfo` events instead of `SpeechStarted`/`Results`
66
+ * - Supports `eagerEotThreshold` for preemptive LLM generation
67
+ * - Sends `PREFLIGHT_TRANSCRIPT` events when eager end-of-turn is detected
68
+ *
69
+ * @example
70
+ * ```typescript
71
+ * import { STTv2 } from '@livekit/agents-plugin-deepgram';
72
+ *
73
+ * const stt = new STTv2({
74
+ * model: 'flux-general-en',
75
+ * eagerEotThreshold: 0.5, // Enable preemptive generation
76
+ * });
77
+ *
78
+ * const stream = stt.stream();
79
+ * stream.pushFrame(audioFrame);
80
+ *
81
+ * for await (const event of stream) {
82
+ * if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {
83
+ * console.log(event.alternatives?.[0]?.text);
84
+ * }
85
+ * }
86
+ * ```
87
+ */
88
+ export class STTv2 extends stt.STT {
89
+ readonly label = 'deepgram.STTv2';
90
+ #opts: STTv2Options;
91
+ #apiKey: string;
92
+ #logger = log();
93
+
94
+ /**
95
+ * Create a new Deepgram STTv2 instance.
96
+ *
97
+ * @param opts - Configuration options
98
+ * @param opts.apiKey - Deepgram API key (defaults to `DEEPGRAM_API_KEY` env var)
99
+ * @param opts.model - Model to use (default: `flux-general-en`)
100
+ * @param opts.eagerEotThreshold - Threshold (0.3-0.9) for preemptive generation
101
+ * @param opts.eotThreshold - End-of-turn detection threshold (default: 0.7)
102
+ * @param opts.eotTimeoutMs - End-of-turn timeout in ms (default: 3000)
103
+ * @param opts.keyterms - List of key terms to improve recognition
104
+ * @param opts.tags - Tags for usage reporting (max 128 chars each)
105
+ *
106
+ * @throws Error if no API key is provided
107
+ */
108
+ constructor(opts: Partial<STTv2Options> = {}) {
109
+ super({
110
+ streaming: true,
111
+ interimResults: true,
112
+ });
113
+
114
+ this.#opts = { ...defaultSTTv2Options, ...opts };
115
+
116
+ const apiKey = opts.apiKey || process.env.DEEPGRAM_API_KEY;
117
+ if (!apiKey) {
118
+ throw new Error('Deepgram API key is required');
119
+ }
120
+ this.#apiKey = apiKey;
121
+
122
+ if (this.#opts.tags) {
123
+ this.#opts.tags = validateTags(this.#opts.tags);
124
+ }
125
+ }
126
+
127
+ /** The model being used for transcription */
128
+ get model(): string {
129
+ return this.#opts.model;
130
+ }
131
+
132
+ /** The STT provider name */
133
+ get provider(): string {
134
+ return 'Deepgram';
135
+ }
136
+
137
+ protected async _recognize(
138
+ _frame: AudioFrame | AudioFrame[],
139
+ _abortSignal?: AbortSignal,
140
+ ): Promise<stt.SpeechEvent> {
141
+ throw new Error('V2 API does not support non-streaming recognize. Use .stream()');
142
+ }
143
+
144
+ /**
145
+ * Create a new streaming transcription session.
146
+ *
147
+ * @param options - Stream options
148
+ * @returns A SpeechStream that emits transcription events
149
+ */
150
+ stream(options?: { connOptions?: APIConnectOptions }): stt.SpeechStream {
151
+ const streamOpts = { ...this.#opts, apiKey: this.#apiKey };
152
+ return new SpeechStreamv2(this, streamOpts, options?.connOptions);
153
+ }
154
+
155
+ /**
156
+ * Update STT options. Changes will take effect on the next stream.
157
+ *
158
+ * @param opts - Partial options to update
159
+ */
160
+ updateOptions(opts: Partial<STTv2Options>) {
161
+ this.#opts = { ...this.#opts, ...opts };
162
+ if (opts.tags) this.#opts.tags = validateTags(opts.tags);
163
+ this.#logger.debug('Updated STTv2 options');
164
+ }
165
+ }
166
+
167
+ // --- Stream Implementation ---
168
+
169
/**
 * Streaming session against Deepgram's V2 (`/v2/listen`) WebSocket API.
 *
 * Input audio is re-chunked into 50 ms frames and sent over the socket;
 * incoming `TurnInfo` messages are translated into framework speech events.
 * An internal reconnect event lets `updateOptions` tear the socket down so
 * the outer loop in `run()` reconnects with the new parameters.
 */
class SpeechStreamv2 extends stt.SpeechStream {
  readonly label = 'deepgram.SpeechStreamv2';
  #opts: STTv2Options & { apiKey: string };
  #logger = log();
  // Active socket; null until run() creates one. Replaced on reconnect.
  #ws: WebSocket | null = null;

  // Aggregates pushed audio duration and reports usage every 5 seconds.
  #audioDurationCollector: PeriodicCollector<number>;
  // Last request_id echoed by Deepgram; attached to every emitted event.
  #requestId = '';
  // True between StartOfTurn and EndOfTurn; gates transcript emission.
  #speaking = false;

  // Parity: _reconnect_event - using existing Event class from @livekit/agents
  #reconnectEvent = new Event();

  constructor(
    sttInstance: STTv2,
    opts: STTv2Options & { apiKey: string },
    connOptions?: APIConnectOptions,
  ) {
    super(sttInstance, opts.sampleRate, connOptions);
    this.#opts = opts;

    this.#audioDurationCollector = new PeriodicCollector(
      (duration) => this.#onAudioDurationReport(duration),
      { duration: 5.0 },
    );
  }

  /**
   * Apply new options to this live stream. Sets the reconnect event so the
   * send loop aborts and `run()` reopens the socket with the merged options.
   */
  updateOptions(opts: Partial<STTv2Options>) {
    this.#logger.debug('Stream received option update', opts);
    this.#opts = { ...this.#opts, ...opts };
    if (opts.tags) this.#opts.tags = validateTags(opts.tags);

    // Trigger reconnection loop
    this.#reconnectEvent.set();
  }

  /**
   * Main task: connect, run send/receive concurrently, and loop on
   * reconnect requests until the stream is closed or an error escapes.
   */
  protected async run() {
    // Outer Loop: Handles reconnections (Configuration updates)
    while (!this.closed) {
      try {
        this.#reconnectEvent.clear();

        const url = this.#getDeepgramUrl();
        this.#logger.debug(`Connecting to Deepgram: ${url}`);

        this.#ws = new WebSocket(url, {
          headers: { Authorization: `Token ${this.#opts.apiKey}` },
        });

        // 1. Wait for Connection Open
        await new Promise<void>((resolve, reject) => {
          if (!this.#ws) return reject(new Error('WebSocket not initialized'));

          // Pair the one-shot listeners so whichever fires first detaches
          // the other; avoids a dangling handler after settle.
          const onOpen = () => {
            this.#ws?.off('error', onError);
            resolve();
          };
          const onError = (err: Error) => {
            this.#ws?.off('open', onOpen);
            reject(err);
          };

          this.#ws.once('open', onOpen);
          this.#ws.once('error', onError);
        });

        // 2. Run Concurrent Tasks (Send & Receive)
        const sendPromise = this.#sendTask();
        const recvPromise = this.#recvTask();
        const reconnectWait = this.#reconnectEvent.wait();

        // 3. Race: Normal Completion vs Reconnect Signal
        const result = await Promise.race([
          Promise.all([sendPromise, recvPromise]),
          reconnectWait.then(() => 'RECONNECT'),
        ]);

        if (result === 'RECONNECT') {
          this.#logger.debug('Reconnecting stream due to option update...');
          // Close current socket; loop will restart and open a new one
          this.#ws.close();
        } else {
          // Normal finish (Stream ended or Error thrown)
          break;
        }
      } catch (error) {
        this.#logger.error('Deepgram stream error', { error });
        throw error; // Let Base Class handle retry logic
      } finally {
        if (this.#ws?.readyState === WebSocket.OPEN) {
          this.#ws.close();
        }
      }
    }
    this.close();
  }

  /**
   * Pull frames from the input, buffer them into 50 ms chunks, and send
   * them over the socket. Exits on input end, flush-after-send, or when
   * the reconnect event fires mid-wait.
   */
  async #sendTask() {
    if (!this.#ws) return;

    // Buffer audio into 50ms chunks (Parity)
    const samples50ms = Math.floor(this.#opts.sampleRate / 20);
    const audioBstream = new AudioByteStream(this.#opts.sampleRate, 1, samples50ms);

    let hasEnded = false;

    // Manual Iterator to allow racing against Reconnect Signal
    const iterator = this.input[Symbol.asyncIterator]();

    while (true) {
      const nextPromise = iterator.next();
      // If reconnect signal fires, abort the wait
      const abortPromise = this.#reconnectEvent.wait().then(() => ({ abort: true }) as const);

      const result = await Promise.race([nextPromise, abortPromise]);

      // Check if we need to abort (Reconnect) or if stream is done
      if ('abort' in result || result.done) {
        if (!('abort' in result) && result.done) {
          // Normal stream end
          hasEnded = true;
        } else {
          // Reconnect triggered - break loop immediately
          break;
        }
      }

      // If we broke above, we don't process data. If not, 'result' is IteratorResult
      // NOTE(review): if the iterator reports done with a 'value' key present
      // (value === undefined, as async generators do), control falls into the
      // branch below and `(data as AudioFrame).data` would be read off
      // undefined — confirm the input iterator's done-shape, or that the
      // stream always ends via FLUSH_SENTINEL.
      if (hasEnded && !('value' in result)) {
        // Process flush below
      } else if ('value' in result) {
        const data = result.value;
        const frames: AudioFrame[] = [];

        if (data === SpeechStreamv2.FLUSH_SENTINEL) {
          frames.push(...audioBstream.flush());
          hasEnded = true;
        } else {
          frames.push(...audioBstream.write((data as AudioFrame).data.buffer as ArrayBuffer));
        }

        for (const frame of frames) {
          this.#audioDurationCollector.push(calculateAudioDurationSeconds(frame));

          if (this.#ws!.readyState === WebSocket.OPEN) {
            this.#ws!.send(frame.data);
          }

          // Flush usage once the last buffered frame of a flush has been
          // pushed; resetting hasEnded keeps the loop alive so the stream
          // can continue after a mid-stream flush sentinel.
          if (hasEnded) {
            this.#audioDurationCollector.flush();
            hasEnded = false;
          }
        }
      }

      if (hasEnded) break;
    }

    // Only send CloseStream if we are exiting normally (not reconnecting)
    if (!this.#reconnectEvent.isSet && this.#ws!.readyState === WebSocket.OPEN) {
      this.#logger.debug('Sending CloseStream message to Deepgram');
      this.#ws!.send(_CLOSE_MSG);
    }
  }

  /**
   * Consume messages from the socket until it closes. Resolves (never
   * rejects) so run()'s Promise.all settles on socket close/error.
   */
  async #recvTask() {
    if (!this.#ws) return;

    return new Promise<void>((resolve) => {
      if (!this.#ws) return resolve();

      this.#ws.on('message', (data: Buffer, isBinary: boolean) => {
        if (isBinary) {
          this.#logger.warn('Received unexpected binary message from Deepgram');
          return;
        }
        // NOTE(review): #processStreamEvent throws on Deepgram 'Error'
        // messages, but that throw is caught here and logged as a parse
        // failure — it never reaches run()'s retry logic. Confirm whether
        // API errors should instead terminate the stream.
        try {
          const msg = JSON.parse(data.toString());
          this.#processStreamEvent(msg);
        } catch (error) {
          this.#logger.error('Failed to parse Deepgram message', { error });
        }
      });

      this.#ws.on('close', (code, reason) => {
        this.#logger.debug(`Deepgram WebSocket closed: ${code} ${reason}`);
        resolve();
      });

      // Errors are caught by run() listener, resolve here to clean up task
      this.#ws.on('error', () => resolve());
    });
  }

  /**
   * Translate a Deepgram V2 message into framework speech events.
   * `#speaking` guards against duplicate start/end events and drops
   * transcript updates that arrive outside a turn.
   */
  #processStreamEvent(data: Record<string, unknown>) {
    if (data.request_id) {
      this.#requestId = data.request_id as string;
    }

    if (data.type === 'TurnInfo') {
      const eventType = data.event;

      if (eventType === 'StartOfTurn') {
        if (this.#speaking) return;

        this.#speaking = true;
        this.queue.put({
          type: stt.SpeechEventType.START_OF_SPEECH,
          requestId: this.#requestId,
        });

        this.#sendTranscriptEvent(stt.SpeechEventType.INTERIM_TRANSCRIPT, data);
      } else if (eventType === 'Update') {
        if (!this.#speaking) return;
        this.#sendTranscriptEvent(stt.SpeechEventType.INTERIM_TRANSCRIPT, data);
      } else if (eventType === 'EagerEndOfTurn') {
        if (!this.#speaking) return;
        // Preflight lets downstream start generating before the turn is final.
        this.#sendTranscriptEvent(stt.SpeechEventType.PREFLIGHT_TRANSCRIPT, data);
      } else if (eventType === 'TurnResumed') {
        this.#sendTranscriptEvent(stt.SpeechEventType.INTERIM_TRANSCRIPT, data);
      } else if (eventType === 'EndOfTurn') {
        if (!this.#speaking) return;

        this.#speaking = false;
        this.#sendTranscriptEvent(stt.SpeechEventType.FINAL_TRANSCRIPT, data);

        this.queue.put({
          type: stt.SpeechEventType.END_OF_SPEECH,
          requestId: this.#requestId,
        });
      }
    } else if (data.type === 'Error') {
      this.#logger.warn('deepgram sent an error', { data });
      const desc = (data.description as string) || 'unknown error from deepgram';
      throw new Error(`Deepgram API Error: ${desc}`);
    }
  }

  // Parse the payload into alternatives and enqueue an event of the given
  // type; payloads with no recognized words are dropped silently.
  #sendTranscriptEvent(eventType: stt.SpeechEventType, data: Record<string, unknown>) {
    // Note: start_time_offset is not yet available in the TypeScript base class
    // Using 0.0 for now - full parity would require base class changes
    const alts = parseTranscription(this.#opts.language || 'en', data, 0.0);

    if (alts.length > 0) {
      this.queue.put({
        type: eventType,
        requestId: this.#requestId,
        alternatives: [alts[0]!, ...alts.slice(1)],
      });
    }
  }

  // Callback for the periodic collector: emit a usage event with the audio
  // duration (seconds) accumulated since the last report.
  #onAudioDurationReport(duration: number) {
    const usageEvent: stt.SpeechEvent = {
      type: stt.SpeechEventType.RECOGNITION_USAGE,
      requestId: this.#requestId,
      recognitionUsage: {
        audioDuration: duration,
      },
    };
    this.queue.put(usageEvent);
  }

  /**
   * Build the WebSocket URL from the current options.
   * NOTE(review): the truthiness checks below drop falsy values — an
   * eagerEotThreshold/eotThreshold of 0 would be silently omitted. Confirm
   * 0 is not a meaningful value for these parameters.
   */
  #getDeepgramUrl(): string {
    const params: Record<string, string | string[]> = {
      model: this.#opts.model,
      sample_rate: this.#opts.sampleRate.toString(),
      encoding: 'linear16',
      mip_opt_out: String(this.#opts.mipOptOut),
    };

    if (this.#opts.language) params.language = this.#opts.language;
    if (this.#opts.eagerEotThreshold)
      params.eager_eot_threshold = this.#opts.eagerEotThreshold.toString();
    if (this.#opts.eotThreshold) params.eot_threshold = this.#opts.eotThreshold.toString();
    if (this.#opts.eotTimeoutMs) params.eot_timeout_ms = this.#opts.eotTimeoutMs.toString();

    // Array values produce repeated query params via querystring.stringify.
    if (this.#opts.keyterms.length > 0) params.keyterm = this.#opts.keyterms;
    if (this.#opts.tags && this.#opts.tags.length > 0) params.tag = this.#opts.tags;

    // Allow http(s) endpoints in config; the socket needs ws(s).
    const baseUrl = this.#opts.endpointUrl.replace(/^http/, 'ws');
    const qs = queryString.stringify(params);
    return `${baseUrl}?${qs}`;
  }

  // Close the base stream and tear down the socket if one is open.
  override close() {
    super.close();
    this.#ws?.close();
  }
}
459
+
460
+ // --- Helpers ---
461
+
462
+ function parseTranscription(
463
+ language: string,
464
+ data: Record<string, unknown>,
465
+ startTimeOffset: number,
466
+ ): stt.SpeechData[] {
467
+ const transcript = data.transcript as string | undefined;
468
+ const words = (data.words as Array<Record<string, unknown>>) || [];
469
+
470
+ if (!words || words.length === 0) {
471
+ return [];
472
+ }
473
+
474
+ let confidence = 0;
475
+ if (words.length > 0) {
476
+ const sum = words.reduce((acc: number, w) => acc + ((w.confidence as number) || 0), 0);
477
+ confidence = sum / words.length;
478
+ }
479
+
480
+ const sd: stt.SpeechData = {
481
+ language: language,
482
+ startTime: ((data.audio_window_start as number) || 0) + startTimeOffset,
483
+ endTime: ((data.audio_window_end as number) || 0) + startTimeOffset,
484
+ confidence: confidence,
485
+ text: transcript || '',
486
+ };
487
+
488
+ return [sd];
489
+ }