@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +382 -364
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/mcp/toolRegistry.js +7 -1
  20. package/dist/lib/neurolink.d.ts +19 -0
  21. package/dist/lib/neurolink.js +252 -14
  22. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  23. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  24. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  25. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  26. package/dist/lib/server/voice/tokenCompare.js +23 -0
  27. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  29. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  30. package/dist/lib/types/generate.d.ts +47 -0
  31. package/dist/lib/types/hitl.d.ts +3 -0
  32. package/dist/lib/types/index.d.ts +1 -1
  33. package/dist/lib/types/index.js +1 -1
  34. package/dist/lib/types/realtime.d.ts +243 -0
  35. package/dist/lib/types/realtime.js +70 -0
  36. package/dist/lib/types/server.d.ts +68 -0
  37. package/dist/lib/types/span.d.ts +2 -0
  38. package/dist/lib/types/span.js +2 -0
  39. package/dist/lib/types/stream.d.ts +36 -14
  40. package/dist/lib/types/stt.d.ts +585 -0
  41. package/dist/lib/types/stt.js +90 -0
  42. package/dist/lib/types/tools.d.ts +2 -0
  43. package/dist/lib/types/tts.d.ts +23 -11
  44. package/dist/lib/types/tts.js +7 -0
  45. package/dist/lib/types/voice.d.ts +272 -0
  46. package/dist/lib/types/voice.js +137 -0
  47. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  48. package/dist/lib/utils/audioFormatDetector.js +34 -0
  49. package/dist/lib/utils/errorHandling.js +4 -0
  50. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  51. package/dist/lib/utils/sttProcessor.js +295 -0
  52. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  53. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  54. package/dist/lib/voice/audio-utils.d.ts +135 -0
  55. package/dist/lib/voice/audio-utils.js +435 -0
  56. package/dist/lib/voice/errors.d.ts +123 -0
  57. package/dist/lib/voice/errors.js +386 -0
  58. package/dist/lib/voice/index.d.ts +26 -0
  59. package/dist/lib/voice/index.js +55 -0
  60. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  61. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  62. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  63. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  64. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  65. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  66. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  67. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  68. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  69. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  70. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  71. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  72. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  73. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  74. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  75. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  76. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  77. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  78. package/dist/lib/voice/stream-handler.d.ts +166 -0
  79. package/dist/lib/voice/stream-handler.js +514 -0
  80. package/dist/mcp/toolRegistry.js +7 -1
  81. package/dist/neurolink.d.ts +19 -0
  82. package/dist/neurolink.js +252 -14
  83. package/dist/observability/exporters/laminarExporter.js +1 -0
  84. package/dist/observability/exporters/posthogExporter.js +1 -0
  85. package/dist/observability/utils/spanSerializer.js +1 -0
  86. package/dist/server/voice/tokenCompare.d.ts +14 -0
  87. package/dist/server/voice/tokenCompare.js +22 -0
  88. package/dist/server/voice/voiceServerApp.js +62 -3
  89. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  90. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  91. package/dist/types/generate.d.ts +47 -0
  92. package/dist/types/hitl.d.ts +3 -0
  93. package/dist/types/index.d.ts +1 -1
  94. package/dist/types/index.js +1 -1
  95. package/dist/types/realtime.d.ts +243 -0
  96. package/dist/types/realtime.js +69 -0
  97. package/dist/types/server.d.ts +68 -0
  98. package/dist/types/span.d.ts +2 -0
  99. package/dist/types/span.js +2 -0
  100. package/dist/types/stream.d.ts +36 -14
  101. package/dist/types/stt.d.ts +585 -0
  102. package/dist/types/stt.js +89 -0
  103. package/dist/types/tools.d.ts +2 -0
  104. package/dist/types/tts.d.ts +23 -11
  105. package/dist/types/tts.js +7 -0
  106. package/dist/types/voice.d.ts +272 -0
  107. package/dist/types/voice.js +136 -0
  108. package/dist/utils/audioFormatDetector.d.ts +15 -0
  109. package/dist/utils/audioFormatDetector.js +33 -0
  110. package/dist/utils/errorHandling.js +4 -0
  111. package/dist/utils/sttProcessor.d.ts +115 -0
  112. package/dist/utils/sttProcessor.js +294 -0
  113. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  114. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  115. package/dist/voice/audio-utils.d.ts +135 -0
  116. package/dist/voice/audio-utils.js +434 -0
  117. package/dist/voice/errors.d.ts +123 -0
  118. package/dist/voice/errors.js +385 -0
  119. package/dist/voice/index.d.ts +26 -0
  120. package/dist/voice/index.js +54 -0
  121. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  122. package/dist/voice/providers/AzureSTT.js +344 -0
  123. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  124. package/dist/voice/providers/AzureTTS.js +348 -0
  125. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  126. package/dist/voice/providers/DeepgramSTT.js +549 -0
  127. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  128. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  129. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  130. package/dist/voice/providers/GeminiLive.js +371 -0
  131. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  132. package/dist/voice/providers/GoogleSTT.js +453 -0
  133. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  134. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  135. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  136. package/dist/voice/providers/OpenAISTT.js +285 -0
  137. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  138. package/dist/voice/providers/OpenAITTS.js +270 -0
  139. package/dist/voice/stream-handler.d.ts +166 -0
  140. package/dist/voice/stream-handler.js +513 -0
  141. package/package.json +5 -2
@@ -0,0 +1,549 @@
/**
 * Deepgram Speech-to-Text Handler
 *
 * Implementation of STT using Deepgram's Speech Recognition API.
 *
 * @module voice/providers/DeepgramSTT
 */
import { logger } from "../../utils/logger.js";
import { STTError } from "../errors.js";
/**
 * Deepgram Speech-to-Text Handler
 *
 * Supports real-time streaming, speaker diarization, and smart formatting.
 *
 * @see https://developers.deepgram.com/docs
 */
export class DeepgramSTT {
    apiKey;
    baseUrl = "https://api.deepgram.com/v1";
    /**
     * Maximum audio duration in seconds (2 hours)
     */
    maxAudioDuration = 7200;
    /**
     * Deepgram supports streaming
     */
    supportsStreaming = true;
    /**
     * @param apiKey Optional explicit API key; falls back to the
     *   DEEPGRAM_API_KEY environment variable when omitted.
     */
    constructor(apiKey) {
        // Normalize: trim surrounding whitespace and treat empty string as null
        // so isConfigured() and transcribe()/transcribeStream() agree on the
        // same contract (matches the other voice providers in this package).
        const resolvedKey = (apiKey ?? process.env.DEEPGRAM_API_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }
    /** True when a non-empty API key was resolved at construction time. */
    isConfigured() {
        return this.apiKey !== null;
    }
    /** Audio container formats this handler will label via getMimeType(). */
    getSupportedFormats() {
        return ["mp3", "wav", "ogg", "opus"];
    }
    /**
     * Static capability list for the languages this handler advertises.
     * Deepgram supports 40+ languages; this is the curated subset exposed here.
     */
    async getSupportedLanguages() {
        return [
            {
                code: "en",
                name: "English",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "en-US",
                name: "English (US)",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "en-GB",
                name: "English (UK)",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "es",
                name: "Spanish",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "fr",
                name: "French",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "de",
                name: "German",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "it",
                name: "Italian",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "pt",
                name: "Portuguese",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "nl",
                name: "Dutch",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "ja",
                name: "Japanese",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "ko",
                name: "Korean",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "zh",
                name: "Chinese",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "hi",
                name: "Hindi",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
            {
                code: "ru",
                name: "Russian",
                supportsDiarization: true,
                supportsPunctuation: true,
            },
        ];
    }
    /**
     * Transcribe a complete audio buffer via Deepgram's pre-recorded
     * `/listen` endpoint.
     *
     * @param audio Buffer (or buffer-like) containing the full audio payload.
     * @param options STT options; Deepgram-specific fields (model, smartFormat,
     *   utterances, keywords, redact, …) are read from the same object.
     * @returns Transcription result with text, confidence, optional word
     *   timings/speakers/segments, and latency metadata.
     * @throws STTError when unconfigured, on empty audio, on HTTP failure,
     *   or when the request times out (30 s).
     */
    async transcribe(audio, options = {}) {
        if (!this.apiKey) {
            throw STTError.providerNotConfigured("deepgram");
        }
        const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
        if (audioBuffer.length === 0) {
            throw STTError.audioEmpty("deepgram");
        }
        const deepgramOptions = options;
        const startTime = Date.now();
        try {
            // Build query parameters for the /listen request.
            const params = new URLSearchParams();
            // Model (defaults to nova-2)
            params.set("model", deepgramOptions.model ?? "nova-2");
            // Language
            if (options.language) {
                params.set("language", options.language);
            }
            // Punctuation is on by default; only an explicit `false` disables it.
            if (options.punctuation !== false) {
                params.set("punctuate", "true");
            }
            // Speaker diarization
            if (options.speakerDiarization) {
                params.set("diarize", "true");
                if (options.speakerCount) {
                    params.set("diarize_version", "latest");
                }
            }
            // Smart format
            if (deepgramOptions.smartFormat) {
                params.set("smart_format", "true");
            }
            // Utterances (+ optional split threshold)
            if (deepgramOptions.utterances) {
                params.set("utterances", "true");
                if (deepgramOptions.uttSplit !== undefined) {
                    params.set("utt_split", deepgramOptions.uttSplit.toString());
                }
            }
            // Paragraphs
            if (deepgramOptions.paragraphs) {
                params.set("paragraphs", "true");
            }
            // Filler words
            if (deepgramOptions.fillerWords) {
                params.set("filler_words", "true");
            }
            // Keywords (repeated param) with optional boost
            if (deepgramOptions.keywords && deepgramOptions.keywords.length > 0) {
                for (const keyword of deepgramOptions.keywords) {
                    params.append("keywords", keyword);
                }
                if (deepgramOptions.keywordBoost) {
                    params.set("keyword_boost", deepgramOptions.keywordBoost);
                }
            }
            // Redaction (repeated param)
            if (deepgramOptions.redact && deepgramOptions.redact.length > 0) {
                for (const redactType of deepgramOptions.redact) {
                    params.append("redact", redactType);
                }
            }
            // Profanity filter
            if (options.profanityFilter) {
                params.set("profanity_filter", "true");
            }
            const url = `${this.baseUrl}/listen?${params.toString()}`;
            // Abort the upload if Deepgram does not answer within 30 seconds.
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), 30000);
            let response;
            try {
                response = await fetch(url, {
                    method: "POST",
                    headers: {
                        Authorization: `Token ${this.apiKey}`,
                        "Content-Type": this.getMimeType(options.format ?? "wav"),
                    },
                    body: new Uint8Array(audioBuffer),
                    signal: controller.signal,
                });
            }
            catch (fetchErr) {
                // Translate the AbortController's AbortError into a typed STTError.
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw STTError.transcriptionFailed("Deepgram STT request timed out after 30 seconds", "deepgram", fetchErr);
                }
                throw fetchErr;
            }
            finally {
                clearTimeout(timeoutId);
            }
            if (!response.ok) {
                // Prototype-less object so a malformed error body can't shadow
                // Object.prototype members.
                const errorData = await response
                    .json()
                    .catch(() => Object.create(null));
                const errorMessage = errorData.err_msg ||
                    `HTTP ${response.status}`;
                throw STTError.transcriptionFailed(errorMessage, "deepgram");
            }
            const data = (await response.json());
            const latency = Date.now() - startTime;
            // Handle empty results: no channels or no alternatives at all.
            if (!data.results?.channels ||
                data.results.channels.length === 0 ||
                !data.results.channels[0].alternatives ||
                data.results.channels[0].alternatives.length === 0) {
                return {
                    text: "",
                    confidence: 0,
                    language: options.language,
                    duration: data.metadata?.duration,
                    metadata: {
                        latency,
                        provider: "deepgram",
                        requestId: data.metadata?.request_id,
                    },
                };
            }
            const firstChannel = data.results.channels[0];
            const firstAlternative = firstChannel.alternatives[0];
            // Build the result from the first alternative of the first channel.
            const result = {
                text: firstAlternative.transcript,
                confidence: firstAlternative.confidence,
                language: options.language,
                duration: data.metadata?.duration,
                metadata: {
                    latency,
                    provider: "deepgram",
                    model: deepgramOptions.model ?? "nova-2",
                    requestId: data.metadata?.request_id,
                },
            };
            // Word-level timings (prefer the punctuated form of each word),
            // collecting distinct speaker labels when diarization was on.
            if (firstAlternative.words && firstAlternative.words.length > 0) {
                const speakers = new Set();
                result.words = firstAlternative.words.map((word) => {
                    const wordTiming = {
                        word: word.punctuated_word ?? word.word,
                        startTime: word.start,
                        endTime: word.end,
                        confidence: word.confidence,
                    };
                    if (word.speaker !== undefined) {
                        wordTiming.speaker = `Speaker ${word.speaker}`;
                        speakers.add(wordTiming.speaker);
                    }
                    return wordTiming;
                });
                if (speakers.size > 0) {
                    result.speakers = Array.from(speakers);
                }
            }
            // Surface Deepgram utterances as final segments.
            if (data.results.utterances && data.results.utterances.length > 0) {
                result.segments = data.results.utterances.map((utt, index) => ({
                    index,
                    text: utt.transcript,
                    isFinal: true,
                    confidence: utt.confidence,
                    startTime: utt.start,
                    endTime: utt.end,
                    speaker: utt.speaker !== undefined ? `Speaker ${utt.speaker}` : undefined,
                }));
            }
            logger.info(`[DeepgramSTTHandler] Transcribed ${data.metadata?.duration?.toFixed(1) ?? "?"}s audio in ${latency}ms`);
            return result;
        }
        catch (err) {
            // Re-throw typed errors untouched; wrap everything else.
            if (err instanceof STTError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[DeepgramSTTHandler] Transcription failed: ${errorMessage}`);
            throw STTError.transcriptionFailed(errorMessage, "deepgram", err instanceof Error ? err : undefined);
        }
    }
    /**
     * Streaming transcription using Deepgram's live WebSocket endpoint.
     *
     * Pumps `audioStream` chunks to the socket in the background while this
     * async generator yields interim/final transcript segments as they arrive.
     *
     * @param audioStream Async iterable of audio chunks to transcribe.
     * @param options STT options (defaults to {}; fix: previously a missing
     *   options object crashed on `options.language`, unlike transcribe()).
     * @throws STTError when unconfigured, on connection timeout (10 s), or
     *   when the socket/audio pump errors mid-stream.
     */
    async *transcribeStream(audioStream, options = {}) {
        if (!this.apiKey) {
            throw STTError.providerNotConfigured("deepgram");
        }
        const deepgramOptions = options;
        // Build query parameters (subset of the pre-recorded options that the
        // live endpoint accepts).
        const params = new URLSearchParams();
        params.set("model", deepgramOptions.model ?? "nova-2");
        if (options.language) {
            params.set("language", options.language);
        }
        if (options.punctuation !== false) {
            params.set("punctuate", "true");
        }
        if (options.speakerDiarization) {
            params.set("diarize", "true");
        }
        if (deepgramOptions.smartFormat) {
            params.set("smart_format", "true");
        }
        // Ask for interim (non-final) results so callers get low-latency text.
        params.set("interim_results", "true");
        const wsUrl = `wss://api.deepgram.com/v1/listen?${params.toString()}`;
        // Lazy-import `ws` so the dependency is only loaded when streaming.
        const WebSocket = (await import("ws")).default;
        const ws = new WebSocket(wsUrl, {
            headers: {
                Authorization: `Token ${this.apiKey}`,
            },
        });
        let segmentIndex = 0;
        const messageQueue = [];
        let resolveNext = null;
        let done = false;
        let error = null;
        // The three permanent handlers are named so the connection-timeout
        // cleanup can detach each (event, handler) pair with ws.off() instead
        // of removeAllListeners(event) — that way any listener attached later
        // by other code survives the teardown.
        const onMessage = (data) => {
            try {
                const response = JSON.parse(data.toString());
                if (response.type === "Results" && response.channel?.alternatives) {
                    const alt = response.channel.alternatives[0];
                    if (alt && alt.transcript) {
                        const segment = {
                            index: segmentIndex++,
                            text: alt.transcript,
                            isFinal: response.is_final ?? false,
                            confidence: alt.confidence ?? 0,
                        };
                        // Hand the segment straight to a waiting consumer, or
                        // queue it until the generator loop catches up.
                        if (resolveNext) {
                            resolveNext({ value: segment, done: false });
                            resolveNext = null;
                        }
                        else {
                            messageQueue.push(segment);
                        }
                    }
                }
            }
            catch {
                logger.warn(`[DeepgramSTTHandler] Failed to parse WebSocket message`);
            }
        };
        const onError = (err) => {
            error = err;
            // Wake a blocked consumer so the loop can observe `error` and throw.
            if (resolveNext) {
                resolveNext({
                    value: undefined,
                    done: true,
                });
                resolveNext = null;
            }
        };
        const onClose = () => {
            done = true;
            if (resolveNext) {
                resolveNext({
                    value: undefined,
                    done: true,
                });
                resolveNext = null;
            }
        };
        ws.on("message", onMessage);
        ws.on("error", onError);
        ws.on("close", onClose);
        // Wait for connection (10-second timeout to avoid hanging indefinitely)
        await new Promise((resolve, reject) => {
            const openHandler = () => {
                clearTimeout(connectionTimeout);
                ws.off("error", openErrorHandler);
                resolve();
            };
            const openErrorHandler = (err) => {
                clearTimeout(connectionTimeout);
                ws.off("open", openHandler);
                reject(err);
            };
            const connectionTimeout = setTimeout(() => {
                // Surgical .off() per (event, handlerRef); see note above.
                ws.off("message", onMessage);
                ws.off("error", onError);
                ws.off("close", onClose);
                ws.off("open", openHandler);
                ws.off("error", openErrorHandler);
                ws.terminate();
                reject(STTError.streamError("WebSocket connection to Deepgram timed out after 10 seconds", "deepgram"));
            }, 10000);
            ws.on("open", openHandler);
            ws.on("error", openErrorHandler);
        });
        // Background pump: forward audio chunks to the socket.
        const sendAudio = async () => {
            try {
                for await (const chunk of audioStream) {
                    if (ws.readyState === WebSocket.OPEN) {
                        ws.send(chunk);
                    }
                }
            }
            catch (sendError) {
                logger.error(`[DeepgramSTTHandler] Error sending audio: ${sendError instanceof Error ? sendError.message : String(sendError)}`);
                // Surface the error so the generator loop can exit instead of hanging.
                error = sendError;
                if (resolveNext) {
                    resolveNext({
                        value: undefined,
                        done: true,
                    });
                    resolveNext = null;
                }
            }
            finally {
                // Always send CloseStream so Deepgram closes the WS even on send error;
                // otherwise `done` is never set and the generator hangs.
                if (ws.readyState === WebSocket.OPEN) {
                    try {
                        ws.send(JSON.stringify({ type: "CloseStream" }));
                    }
                    catch {
                        /* WS already broken */
                    }
                }
            }
        };
        // Start sending audio in background — explicitly fire-and-forget with .catch
        // to surface unhandled rejections instead of crashing the process.
        void sendAudio().catch((err) => {
            logger.error(`[DeepgramSTTHandler] sendAudio rejected: ${err instanceof Error ? err.message : String(err)}`);
        });
        // Teardown helper: tell the caller's audio iterator to stop producing
        // once the consumer breaks out of the for-await loop or the WS errors.
        // Without this an infinite/live producer keeps running and leaks the
        // upstream resource.
        const stopProducerEarly = () => {
            const ret = audioStream
                .return;
            if (typeof ret === "function") {
                try {
                    void Promise.resolve(ret.call(audioStream)).catch(() => undefined);
                }
                catch {
                    // Best-effort — ignore if the iterator's return() throws.
                }
            }
        };
        // Yield segments — wrapped in try/finally so the WebSocket is always
        // closed and a CloseStream message sent, even when the consumer breaks
        // out of the for-await loop early (previously the WS would leak and
        // sendAudio would keep running in the background).
        try {
            while (!done) {
                if (error) {
                    // Fix: `error` may hold a non-Error value (the audio
                    // iterator can throw anything, and the raw value is stored
                    // above), so narrow before reading `.message` — mirroring
                    // the narrowing already done in sendAudio's logging.
                    const streamMessage = error instanceof Error ? error.message : String(error);
                    throw STTError.streamError(streamMessage, "deepgram");
                }
                if (messageQueue.length > 0) {
                    // `length > 0` proves shift() returns a value, but TypeScript
                    // can't tie the two together; narrow without `!`.
                    const next = messageQueue.shift();
                    if (next !== undefined) {
                        yield next;
                    }
                }
                else {
                    // Wait for next message — capture and yield the resolved segment
                    const result = await new Promise((resolve) => {
                        resolveNext = resolve;
                    });
                    if (!result.done && result.value) {
                        yield result.value;
                    }
                }
            }
            // Yield any messages that arrived between close and now.
            while (messageQueue.length > 0) {
                const next = messageQueue.shift();
                if (next !== undefined) {
                    yield next;
                }
            }
        }
        finally {
            // Tell the upstream producer (the caller's audioStream iterator) to
            // stop — sendAudio() is the only consumer of that iterator, so once
            // we're tearing down it should not be pulling more chunks.
            stopProducerEarly();
            // Always close the socket — send Deepgram's CloseStream sentinel
            // when reachable, then close/terminate depending on state.
            if (ws.readyState === WebSocket.OPEN) {
                try {
                    ws.send(JSON.stringify({ type: "CloseStream" }));
                }
                catch {
                    // Ignore — socket may have been closed by the server
                }
                ws.close();
            }
            else if (ws.readyState === WebSocket.CONNECTING ||
                ws.readyState === WebSocket.CLOSING) {
                ws.terminate();
            }
        }
    }
    /**
     * Get MIME type for audio format (defaults to audio/wav for unknown
     * formats).
     */
    getMimeType(format) {
        const mimeTypes = {
            mp3: "audio/mpeg",
            wav: "audio/wav",
            ogg: "audio/ogg",
            opus: "audio/opus",
        };
        return mimeTypes[format] ?? "audio/wav";
    }
}
@@ -0,0 +1,53 @@
/**
 * ElevenLabs Text-to-Speech Handler
 *
 * Implementation of TTS using ElevenLabs API.
 *
 * @module voice/providers/ElevenLabsTTS
 */
import type { TTSHandler, TTSOptions, TTSResult, TTSVoice } from "../../types/index.js";
/**
 * ElevenLabs Text-to-Speech Handler
 *
 * Supports high-quality multilingual TTS with voice cloning.
 *
 * @see https://elevenlabs.io/docs/api-reference
 */
export declare class ElevenLabsTTS implements TTSHandler {
    private readonly apiKey;
    private readonly baseUrl;
    // Cache for getVoices() results; presumably invalidated after
    // CACHE_TTL_MS elapses — confirm against the implementation.
    private voicesCache;
    // TTL (milliseconds) governing voicesCache freshness.
    private static readonly CACHE_TTL_MS;
    /**
     * Maximum text length (5000 characters)
     */
    readonly maxTextLength = 5000;
    constructor(apiKey?: string);
    isConfigured(): boolean;
    getVoices(languageCode?: string): Promise<TTSVoice[]>;
    synthesize(text: string, options?: TTSOptions): Promise<TTSResult>;
    /**
     * Map gender string to standard type
     */
    private mapGender;
    /**
     * Map TTSAudioFormat to ElevenLabs output format
     */
    private mapFormat;
    /**
     * Get sample rate from format string
     */
    private getSampleRate;
    /**
     * Map the ElevenLabs `output_format` string back to a canonical
     * TTSAudioFormat. mapFormat() falls back to mp3_44100_128 for unsupported
     * inputs, so this is needed to keep TTSResult.format honest.
     *
     * NOTE: ElevenLabs `pcm_*` outputs are RAW 16-bit signed-LE PCM samples
     * with no RIFF/WAV header. We surface that as `pcm16` (which exists in the
     * `TTSAudioFormat` union exactly for this case) — labeling it as `wav`
     * would cause consumers writing the buffer to a `.wav` file or feeding it
     * to a WAV parser to produce unplayable output (CodeRabbit review).
     */
    private effectiveFormat;
}