@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
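
The headline change in 9.62.0 is a new voice subsystem: STT/TTS provider handlers (Deepgram, OpenAI, Azure, Google, ElevenLabs), realtime voice types, and a WebSocket voice server. For orientation before the diff excerpts below, here is a hedged sketch of driving the new `DeepgramSTT` handler for batch transcription. It is not from the package: the deep-import path is inferred from the dist layout listed above (it may not match the package's export map), and the file name is illustrative.

```ts
// Hypothetical usage sketch. Import path is an assumption from the file list
// above, not a documented entry point; check package.json "exports" first.
import { DeepgramSTT } from "@juspay/neurolink/dist/lib/voice/providers/DeepgramSTT.js";
import { readFile } from "node:fs/promises";

const stt = new DeepgramSTT(); // falls back to process.env.DEEPGRAM_API_KEY (per the constructor below)
if (stt.isConfigured()) {
  const audio = await readFile("meeting.wav"); // placeholder file
  // Option names mirror the fields transcribe() reads in the diff below:
  // language, punctuation, speakerDiarization, plus extras like model/smartFormat.
  const result = await stt.transcribe(audio, {
    language: "en-US",
    speakerDiarization: true, // maps to Deepgram's diarize=true
  });
  console.log(result.text, result.confidence, result.speakers);
}
```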
@@ -0,0 +1,550 @@
+/**
+ * Deepgram Speech-to-Text Handler
+ *
+ * Implementation of STT using Deepgram's Speech Recognition API.
+ *
+ * @module voice/providers/DeepgramSTT
+ */
+import { logger } from "../../utils/logger.js";
+import { STTError } from "../errors.js";
+/**
+ * Deepgram Speech-to-Text Handler
+ *
+ * Supports real-time streaming, speaker diarization, and smart formatting.
+ *
+ * @see https://developers.deepgram.com/docs
+ */
+export class DeepgramSTT {
+    apiKey;
+    baseUrl = "https://api.deepgram.com/v1";
+    /**
+     * Maximum audio duration in seconds (2 hours)
+     */
+    maxAudioDuration = 7200;
+    /**
+     * Deepgram supports streaming
+     */
+    supportsStreaming = true;
+    constructor(apiKey) {
+        // Normalize: trim surrounding whitespace and treat empty string as null
+        // so isConfigured() and transcribe()/transcribeStream() agree on the
+        // contract (other voice providers all do this — Deepgram was missed).
+        const resolvedKey = (apiKey ?? process.env.DEEPGRAM_API_KEY ?? "").trim();
+        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
+    }
+    isConfigured() {
+        return this.apiKey !== null;
+    }
+    getSupportedFormats() {
+        return ["mp3", "wav", "ogg", "opus"];
+    }
+    async getSupportedLanguages() {
+        // Deepgram supports 40+ languages
+        return [
+            {
+                code: "en",
+                name: "English",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "en-US",
+                name: "English (US)",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "en-GB",
+                name: "English (UK)",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "es",
+                name: "Spanish",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "fr",
+                name: "French",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "de",
+                name: "German",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "it",
+                name: "Italian",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "pt",
+                name: "Portuguese",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "nl",
+                name: "Dutch",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "ja",
+                name: "Japanese",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "ko",
+                name: "Korean",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "zh",
+                name: "Chinese",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "hi",
+                name: "Hindi",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+            {
+                code: "ru",
+                name: "Russian",
+                supportsDiarization: true,
+                supportsPunctuation: true,
+            },
+        ];
+    }
+    async transcribe(audio, options = {}) {
+        if (!this.apiKey) {
+            throw STTError.providerNotConfigured("deepgram");
+        }
+        const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
+        if (audioBuffer.length === 0) {
+            throw STTError.audioEmpty("deepgram");
+        }
+        const deepgramOptions = options;
+        const startTime = Date.now();
+        try {
+            // Build query parameters
+            const params = new URLSearchParams();
+            // Add model
+            params.set("model", deepgramOptions.model ?? "nova-2");
+            // Add language
+            if (options.language) {
+                params.set("language", options.language);
+            }
+            // Add punctuation
+            if (options.punctuation !== false) {
+                params.set("punctuate", "true");
+            }
+            // Add diarization
+            if (options.speakerDiarization) {
+                params.set("diarize", "true");
+                if (options.speakerCount) {
+                    params.set("diarize_version", "latest");
+                }
+            }
+            // Add smart format
+            if (deepgramOptions.smartFormat) {
+                params.set("smart_format", "true");
+            }
+            // Add utterances
+            if (deepgramOptions.utterances) {
+                params.set("utterances", "true");
+                if (deepgramOptions.uttSplit !== undefined) {
+                    params.set("utt_split", deepgramOptions.uttSplit.toString());
+                }
+            }
+            // Add paragraphs
+            if (deepgramOptions.paragraphs) {
+                params.set("paragraphs", "true");
+            }
+            // Add filler words
+            if (deepgramOptions.fillerWords) {
+                params.set("filler_words", "true");
+            }
+            // Add keywords
+            if (deepgramOptions.keywords && deepgramOptions.keywords.length > 0) {
+                for (const keyword of deepgramOptions.keywords) {
+                    params.append("keywords", keyword);
+                }
+                if (deepgramOptions.keywordBoost) {
+                    params.set("keyword_boost", deepgramOptions.keywordBoost);
+                }
+            }
+            // Add redaction
+            if (deepgramOptions.redact && deepgramOptions.redact.length > 0) {
+                for (const redactType of deepgramOptions.redact) {
+                    params.append("redact", redactType);
+                }
+            }
+            // Add profanity filter
+            if (options.profanityFilter) {
+                params.set("profanity_filter", "true");
+            }
+            const url = `${this.baseUrl}/listen?${params.toString()}`;
+            const controller = new AbortController();
+            const timeoutId = setTimeout(() => controller.abort(), 30000);
+            let response;
+            try {
+                response = await fetch(url, {
+                    method: "POST",
+                    headers: {
+                        Authorization: `Token ${this.apiKey}`,
+                        "Content-Type": this.getMimeType(options.format ?? "wav"),
+                    },
+                    body: new Uint8Array(audioBuffer),
+                    signal: controller.signal,
+                });
+            }
+            catch (fetchErr) {
+                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
+                    throw STTError.transcriptionFailed("Deepgram STT request timed out after 30 seconds", "deepgram", fetchErr);
+                }
+                throw fetchErr;
+            }
+            finally {
+                clearTimeout(timeoutId);
+            }
+            if (!response.ok) {
+                const errorData = await response
+                    .json()
+                    .catch(() => Object.create(null));
+                const errorMessage = errorData.err_msg ||
+                    `HTTP ${response.status}`;
+                throw STTError.transcriptionFailed(errorMessage, "deepgram");
+            }
+            const data = (await response.json());
+            const latency = Date.now() - startTime;
+            // Handle empty results
+            if (!data.results?.channels ||
+                data.results.channels.length === 0 ||
+                !data.results.channels[0].alternatives ||
+                data.results.channels[0].alternatives.length === 0) {
+                return {
+                    text: "",
+                    confidence: 0,
+                    language: options.language,
+                    duration: data.metadata?.duration,
+                    metadata: {
+                        latency,
+                        provider: "deepgram",
+                        requestId: data.metadata?.request_id,
+                    },
+                };
+            }
+            const firstChannel = data.results.channels[0];
+            const firstAlternative = firstChannel.alternatives[0];
+            // Build result
+            const result = {
+                text: firstAlternative.transcript,
+                confidence: firstAlternative.confidence,
+                language: options.language,
+                duration: data.metadata?.duration,
+                metadata: {
+                    latency,
+                    provider: "deepgram",
+                    model: deepgramOptions.model ?? "nova-2",
+                    requestId: data.metadata?.request_id,
+                },
+            };
+            // Add word timings
+            if (firstAlternative.words && firstAlternative.words.length > 0) {
+                const speakers = new Set();
+                result.words = firstAlternative.words.map((word) => {
+                    const wordTiming = {
+                        word: word.punctuated_word ?? word.word,
+                        startTime: word.start,
+                        endTime: word.end,
+                        confidence: word.confidence,
+                    };
+                    if (word.speaker !== undefined) {
+                        wordTiming.speaker = `Speaker ${word.speaker}`;
+                        speakers.add(wordTiming.speaker);
+                    }
+                    return wordTiming;
+                });
+                if (speakers.size > 0) {
+                    result.speakers = Array.from(speakers);
+                }
+            }
+            // Add utterances as segments
+            if (data.results.utterances && data.results.utterances.length > 0) {
+                result.segments = data.results.utterances.map((utt, index) => ({
+                    index,
+                    text: utt.transcript,
+                    isFinal: true,
+                    confidence: utt.confidence,
+                    startTime: utt.start,
+                    endTime: utt.end,
+                    speaker: utt.speaker !== undefined ? `Speaker ${utt.speaker}` : undefined,
+                }));
+            }
+            logger.info(`[DeepgramSTTHandler] Transcribed ${data.metadata?.duration?.toFixed(1) ?? "?"}s audio in ${latency}ms`);
+            return result;
+        }
+        catch (err) {
+            if (err instanceof STTError) {
+                throw err;
+            }
+            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
+            logger.error(`[DeepgramSTTHandler] Transcription failed: ${errorMessage}`);
+            throw STTError.transcriptionFailed(errorMessage, "deepgram", err instanceof Error ? err : undefined);
+        }
+    }
+    /**
+     * Streaming transcription using WebSocket
+     */
+    async *transcribeStream(audioStream, options) {
+        if (!this.apiKey) {
+            throw STTError.providerNotConfigured("deepgram");
+        }
+        const deepgramOptions = options;
+        // Build query parameters
+        const params = new URLSearchParams();
+        params.set("model", deepgramOptions.model ?? "nova-2");
+        if (options.language) {
+            params.set("language", options.language);
+        }
+        if (options.punctuation !== false) {
+            params.set("punctuate", "true");
+        }
+        if (options.speakerDiarization) {
+            params.set("diarize", "true");
+        }
+        if (deepgramOptions.smartFormat) {
+            params.set("smart_format", "true");
+        }
+        // Indicate interim results
+        params.set("interim_results", "true");
+        const wsUrl = `wss://api.deepgram.com/v1/listen?${params.toString()}`;
+        // Create WebSocket connection
+        const WebSocket = (await import("ws")).default;
+        const ws = new WebSocket(wsUrl, {
+            headers: {
+                Authorization: `Token ${this.apiKey}`,
+            },
+        });
+        let segmentIndex = 0;
+        const messageQueue = [];
+        let resolveNext = null;
+        let done = false;
+        let error = null;
+        // Bug 4 fix: name the three permanent handlers so timeout cleanup can call
+        // ws.off(event, ref) per pair instead of removeAllListeners(event). The
+        // surgical .off() pattern survives any future code that attaches more
+        // listeners between this block and the connection-timeout firing.
+        const onMessage = (data) => {
+            try {
+                const response = JSON.parse(data.toString());
+                if (response.type === "Results" && response.channel?.alternatives) {
+                    const alt = response.channel.alternatives[0];
+                    if (alt && alt.transcript) {
+                        const segment = {
+                            index: segmentIndex++,
+                            text: alt.transcript,
+                            isFinal: response.is_final ?? false,
+                            confidence: alt.confidence ?? 0,
+                        };
+                        if (resolveNext) {
+                            resolveNext({ value: segment, done: false });
+                            resolveNext = null;
+                        }
+                        else {
+                            messageQueue.push(segment);
+                        }
+                    }
+                }
+            }
+            catch {
+                logger.warn(`[DeepgramSTTHandler] Failed to parse WebSocket message`);
+            }
+        };
+        const onError = (err) => {
+            error = err;
+            if (resolveNext) {
+                resolveNext({
+                    value: undefined,
+                    done: true,
+                });
+                resolveNext = null;
+            }
+        };
+        const onClose = () => {
+            done = true;
+            if (resolveNext) {
+                resolveNext({
+                    value: undefined,
+                    done: true,
+                });
+                resolveNext = null;
+            }
+        };
+        ws.on("message", onMessage);
+        ws.on("error", onError);
+        ws.on("close", onClose);
+        // Wait for connection (10-second timeout to avoid hanging indefinitely)
+        await new Promise((resolve, reject) => {
+            const openHandler = () => {
+                clearTimeout(connectionTimeout);
+                ws.off("error", openErrorHandler);
+                resolve();
+            };
+            const openErrorHandler = (err) => {
+                clearTimeout(connectionTimeout);
+                ws.off("open", openHandler);
+                reject(err);
+            };
+            const connectionTimeout = setTimeout(() => {
+                // Bug 4 fix: surgical .off() per (event, handlerRef) so any future
+                // listener attached to this socket survives the timeout cleanup.
+                ws.off("message", onMessage);
+                ws.off("error", onError);
+                ws.off("close", onClose);
+                ws.off("open", openHandler);
+                ws.off("error", openErrorHandler);
+                ws.terminate();
+                reject(STTError.streamError("WebSocket connection to Deepgram timed out after 10 seconds", "deepgram"));
+            }, 10000);
+            ws.on("open", openHandler);
+            ws.on("error", openErrorHandler);
+        });
+        // Send audio chunks
+        const sendAudio = async () => {
+            try {
+                for await (const chunk of audioStream) {
+                    if (ws.readyState === WebSocket.OPEN) {
+                        ws.send(chunk);
+                    }
+                }
+            }
+            catch (sendError) {
+                logger.error(`[DeepgramSTTHandler] Error sending audio: ${sendError instanceof Error ? sendError.message : String(sendError)}`);
+                // Surface the error so the generator loop can exit instead of hanging.
+                error = sendError;
+                if (resolveNext) {
+                    resolveNext({
+                        value: undefined,
+                        done: true,
+                    });
+                    resolveNext = null;
+                }
+            }
+            finally {
+                // Always send CloseStream so Deepgram closes the WS even on send error;
+                // otherwise `done` is never set and the generator hangs.
+                if (ws.readyState === WebSocket.OPEN) {
+                    try {
+                        ws.send(JSON.stringify({ type: "CloseStream" }));
+                    }
+                    catch {
+                        /* WS already broken */
+                    }
+                }
+            }
+        };
+        // Start sending audio in background — explicitly fire-and-forget with .catch
+        // to surface unhandled rejections instead of crashing the process.
+        void sendAudio().catch((err) => {
+            logger.error(`[DeepgramSTTHandler] sendAudio rejected: ${err instanceof Error ? err.message : String(err)}`);
+        });
+        // Track teardown so the audio-pump generator can stop pulling from
+        // `audioStream` after the consumer breaks out of the for-await loop or
+        // the WS errors. Without this, an infinite/live producer keeps running
+        // and leaks the upstream resource (CodeRabbit review).
+        const stopProducerEarly = () => {
+            const ret = audioStream
+                .return;
+            if (typeof ret === "function") {
+                try {
+                    void Promise.resolve(ret.call(audioStream)).catch(() => undefined);
+                }
+                catch {
+                    // Best-effort — ignore if the iterator's return() throws.
+                }
+            }
+        };
+        // Yield segments — wrapped in try/finally so the WebSocket is always
+        // closed and a CloseStream message sent, even when the consumer breaks
+        // out of the for-await loop early (C2: previously the WS would leak and
+        // sendAudio would keep running in the background).
+        try {
+            while (!done) {
+                if (error) {
+                    throw STTError.streamError(error.message, "deepgram");
+                }
+                if (messageQueue.length > 0) {
+                    // Issue 9: explicit narrowing — `length > 0` proves shift returns a
+                    // value, but TypeScript can't tie the two; narrow without `!`.
+                    const next = messageQueue.shift();
+                    if (next !== undefined) {
+                        yield next;
+                    }
+                }
+                else {
+                    // Wait for next message — capture and yield the resolved segment
+                    const result = await new Promise((resolve) => {
+                        resolveNext = resolve;
+                    });
+                    if (!result.done && result.value) {
+                        yield result.value;
+                    }
+                }
+            }
+            // Yield remaining messages
+            while (messageQueue.length > 0) {
+                const next = messageQueue.shift();
+                if (next !== undefined) {
+                    yield next;
+                }
+            }
+        }
+        finally {
+            // Tell the upstream producer (the caller's audioStream iterator) to
+            // stop — sendAudio() is the only consumer of that iterator, so once
+            // we're tearing down it should not be pulling more chunks.
+            stopProducerEarly();
+            // C2: always close the socket — sends Deepgram's CloseStream sentinel
+            // when reachable, then terminates if still open after a short window.
+            if (ws.readyState === WebSocket.OPEN) {
+                try {
+                    ws.send(JSON.stringify({ type: "CloseStream" }));
+                }
+                catch {
+                    // Ignore — socket may have been closed by the server
+                }
+                ws.close();
+            }
+            else if (ws.readyState === WebSocket.CONNECTING ||
+                ws.readyState === WebSocket.CLOSING) {
+                ws.terminate();
+            }
+        }
+    }
+    /**
+     * Get MIME type for audio format
+     */
+    getMimeType(format) {
+        const mimeTypes = {
+            mp3: "audio/mpeg",
+            wav: "audio/wav",
+            ogg: "audio/ogg",
+            opus: "audio/opus",
+        };
+        return mimeTypes[format] ?? "audio/wav";
+    }
+}
+//# sourceMappingURL=DeepgramSTT.js.map
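
The `transcribeStream()` generator above yields interim and final segments from a live Deepgram WebSocket. A hedged consumer sketch, for orientation only: the chunk producer is a placeholder, and the deep-import path is again inferred from the dist layout rather than from a documented export.

```ts
// Hypothetical consumer of DeepgramSTT.transcribeStream(); illustrative only.
import { DeepgramSTT } from "@juspay/neurolink/dist/lib/voice/providers/DeepgramSTT.js";

async function* micChunks(): AsyncIterable<Buffer> {
  // Placeholder producer: real code would yield audio chunks from a mic or socket.
  yield Buffer.alloc(3200); // e.g. 100ms of 16kHz 16-bit mono silence
}

const stt = new DeepgramSTT(process.env.DEEPGRAM_API_KEY);
for await (const segment of stt.transcribeStream(micChunks(), { language: "en" })) {
  // interim_results=true is always set above, so non-final segments arrive too.
  if (segment.isFinal) {
    console.log(`#${segment.index}: ${segment.text} (${segment.confidence})`);
  }
}
```

Breaking out of the for-await loop early is safe: the generator's finally block calls `stopProducerEarly()` and closes the socket.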
@@ -0,0 +1,53 @@
+/**
+ * ElevenLabs Text-to-Speech Handler
+ *
+ * Implementation of TTS using ElevenLabs API.
+ *
+ * @module voice/providers/ElevenLabsTTS
+ */
+import type { TTSHandler, TTSOptions, TTSResult, TTSVoice } from "../../types/index.js";
+/**
+ * ElevenLabs Text-to-Speech Handler
+ *
+ * Supports high-quality multilingual TTS with voice cloning.
+ *
+ * @see https://elevenlabs.io/docs/api-reference
+ */
+export declare class ElevenLabsTTS implements TTSHandler {
+    private readonly apiKey;
+    private readonly baseUrl;
+    private voicesCache;
+    private static readonly CACHE_TTL_MS;
+    /**
+     * Maximum text length (5000 characters)
+     */
+    readonly maxTextLength = 5000;
+    constructor(apiKey?: string);
+    isConfigured(): boolean;
+    getVoices(languageCode?: string): Promise<TTSVoice[]>;
+    synthesize(text: string, options?: TTSOptions): Promise<TTSResult>;
+    /**
+     * Map gender string to standard type
+     */
+    private mapGender;
+    /**
+     * Map TTSAudioFormat to ElevenLabs output format
+     */
+    private mapFormat;
+    /**
+     * Get sample rate from format string
+     */
+    private getSampleRate;
+    /**
+     * Map the ElevenLabs `output_format` string back to a canonical
+     * TTSAudioFormat. mapFormat() falls back to mp3_44100_128 for unsupported
+     * inputs, so this is needed to keep TTSResult.format honest.
+     *
+     * NOTE: ElevenLabs `pcm_*` outputs are RAW 16-bit signed-LE PCM samples
+     * with no RIFF/WAV header. We surface that as `pcm16` (which exists in the
+     * `TTSAudioFormat` union exactly for this case) — labeling it as `wav`
+     * would cause consumers writing the buffer to a `.wav` file or feeding it
+     * to a WAV parser to produce unplayable output (CodeRabbit review).
+     */
+    private effectiveFormat;
+}
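
On the TTS side, a hedged sketch against the `ElevenLabsTTS` declaration above. The import path, the `ELEVENLABS_API_KEY` env-var name, and the `TTSOptions`/`TTSResult` field names are all assumptions (those types are not shown in this diff excerpt); note also the declaration's warning that raw `pcm_*` output is reported as `pcm16`, not `wav`.

```ts
// Hypothetical usage sketch; names marked below are assumptions, not package API.
import { ElevenLabsTTS } from "@juspay/neurolink/dist/lib/voice/providers/ElevenLabsTTS.js";
import { writeFile } from "node:fs/promises";

const tts = new ElevenLabsTTS(process.env.ELEVENLABS_API_KEY); // env-var name is an assumption
if (tts.isConfigured()) {
  const voices = await tts.getVoices("en"); // Promise<TTSVoice[]> per the declaration
  // `voice` and the TTSVoice `id` field are assumed option/field names.
  const result = await tts.synthesize("Hello from NeuroLink!", { voice: voices[0]?.id });
  // `result.audio` (the output buffer field) is likewise an assumed name.
  await writeFile("speech.mp3", result.audio);
}
```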