@juspay/neurolink 9.61.2 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +373 -355
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +248 -12
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +248 -12
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +3 -1
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Azure Cognitive Services Text-to-Speech Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of TTS using Azure Speech Services.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/AzureTTS
|
|
7
|
+
*/
|
|
8
|
+
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
|
|
9
|
+
import { logger } from "../../utils/logger.js";
|
|
10
|
+
import { TTS_ERROR_CODES, TTSError } from "../../utils/ttsProcessor.js";
|
|
11
|
+
/**
|
|
12
|
+
* Azure Cognitive Services Text-to-Speech Handler
|
|
13
|
+
*
|
|
14
|
+
* Supports neural voices with SSML and custom voice styles.
|
|
15
|
+
*
|
|
16
|
+
* @see https://docs.microsoft.com/azure/cognitive-services/speech-service/
|
|
17
|
+
*/
|
|
18
|
+
export class AzureTTS {
    apiKey;
    region;
    voicesCache = null;
    static CACHE_TTL_MS = 30 * 60 * 1000; // 30 minutes
    /**
     * Maximum text length (10000 characters for Azure)
     */
    maxTextLength = 10000;
    /**
     * @param {string} [apiKey] - Azure Speech key; falls back to AZURE_SPEECH_KEY.
     * @param {string} [region] - Azure region; falls back to AZURE_SPEECH_REGION, then "eastus".
     */
    constructor(apiKey, region) {
        const resolvedKey = (apiKey ?? process.env.AZURE_SPEECH_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
        const resolvedRegion = (region ??
            process.env.AZURE_SPEECH_REGION ??
            "").trim();
        this.region = resolvedRegion.length > 0 ? resolvedRegion : "eastus";
    }
    /** @returns {boolean} True when a non-empty key and region are available. */
    isConfigured() {
        return this.apiKey !== null && this.region.length > 0;
    }
    /**
     * List available Azure voices, optionally filtered by language.
     *
     * The unfiltered list is cached for CACHE_TTL_MS; filtered requests always
     * hit the API (and are never cached) so the cache only ever holds the full list.
     *
     * @param {string} [languageCode] - e.g. "en" or "en-US" (prefix match, case-insensitive).
     * @throws {TTSError} PROVIDER_NOT_CONFIGURED when no key, SYNTHESIS_FAILED on network/timeout errors.
     */
    async getVoices(languageCode) {
        if (!this.apiKey) {
            throw new TTSError({
                code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
                message: "Azure Speech key not configured",
                category: ErrorCategory.CONFIGURATION,
                severity: ErrorSeverity.HIGH,
                retriable: false,
            });
        }
        // Return cached voices if valid
        if (this.voicesCache &&
            Date.now() - this.voicesCache.timestamp < AzureTTS.CACHE_TTL_MS &&
            !languageCode) {
            return this.voicesCache.voices;
        }
        try {
            const voicesController = new AbortController();
            const voicesTimeoutId = setTimeout(() => voicesController.abort(), 30000);
            let response;
            try {
                response = await fetch(`https://${this.region}.tts.speech.microsoft.com/cognitiveservices/voices/list`, {
                    method: "GET",
                    headers: {
                        "Ocp-Apim-Subscription-Key": this.apiKey,
                    },
                    signal: voicesController.signal,
                });
            }
            catch (fetchErr) {
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw new TTSError({
                        code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                        message: "Azure TTS voices request timed out after 30 seconds",
                        category: ErrorCategory.NETWORK,
                        severity: ErrorSeverity.MEDIUM,
                        retriable: true,
                        originalError: fetchErr,
                    });
                }
                throw fetchErr;
            }
            finally {
                clearTimeout(voicesTimeoutId);
            }
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}`);
            }
            const data = (await response.json());
            let voices = data.map((voice) => ({
                id: voice.ShortName,
                name: voice.DisplayName,
                languageCode: voice.Locale,
                languageCodes: [voice.Locale],
                gender: this.mapGender(voice.Gender),
                type: voice.VoiceType.toLowerCase().includes("neural")
                    ? "neural"
                    : "standard",
                description: voice.LocaleName,
            }));
            // Filter by language if specified
            if (languageCode) {
                voices = voices.filter((v) => v.languageCode
                    .toLowerCase()
                    .startsWith(languageCode.toLowerCase()) ||
                    v.languageCode.toLowerCase() === languageCode.toLowerCase());
            }
            // Cache full list
            if (!languageCode) {
                this.voicesCache = { voices, timestamp: Date.now() };
            }
            return voices;
        }
        catch (err) {
            // Don't double-wrap an already-typed TTSError (the inner try-block
            // throws TTSError on AbortError timeouts) — preserves the clean error chain.
            if (err instanceof TTSError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[AzureTTSHandler] Failed to get voices: ${errorMessage}`);
            throw new TTSError({
                code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                message: `Failed to get voices: ${errorMessage}`,
                category: ErrorCategory.NETWORK,
                severity: ErrorSeverity.MEDIUM,
                retriable: true,
                originalError: err instanceof Error ? err : undefined,
            });
        }
    }
    /**
     * Synthesize speech from text via the Azure TTS REST endpoint.
     *
     * @param {string} text - Text to synthesize (escaped into SSML unless allowRawSSML).
     * @param {object} [options] - voice, speed, pitch, format, plus Azure-specific
     *   extras (outputFormat, ssmlTemplate, allowRawSSML).
     * @returns Result with audio buffer, effective format, sample rate and metadata.
     * @throws {TTSError} PROVIDER_NOT_CONFIGURED when no key, SYNTHESIS_FAILED otherwise.
     */
    async synthesize(text, options = {}) {
        if (!this.apiKey) {
            throw new TTSError({
                code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
                message: "Azure Speech key not configured",
                category: ErrorCategory.CONFIGURATION,
                severity: ErrorSeverity.HIGH,
                retriable: false,
            });
        }
        const startTime = Date.now();
        const azureOptions = options;
        try {
            // Get voice (default to a common neural voice)
            const voice = options.voice ?? "en-US-JennyNeural";
            // Determine output format
            const outputFormat = azureOptions.outputFormat ?? this.mapFormat(options.format ?? "mp3");
            // Build SSML
            const ssml = this.buildSSML(text, voice, options);
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), 30000);
            let response;
            try {
                response = await fetch(`https://${this.region}.tts.speech.microsoft.com/cognitiveservices/v1`, {
                    method: "POST",
                    headers: {
                        "Ocp-Apim-Subscription-Key": this.apiKey,
                        "Content-Type": "application/ssml+xml",
                        "X-Microsoft-OutputFormat": outputFormat,
                    },
                    body: ssml,
                    signal: controller.signal,
                });
            }
            catch (fetchErr) {
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw new TTSError({
                        code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                        message: "Azure TTS request timed out after 30 seconds",
                        category: ErrorCategory.NETWORK,
                        severity: ErrorSeverity.HIGH,
                        retriable: true,
                        originalError: fetchErr,
                    });
                }
                throw fetchErr;
            }
            finally {
                clearTimeout(timeoutId);
            }
            if (!response.ok) {
                const errorText = await response.text();
                throw new Error(`HTTP ${response.status}: ${errorText}`);
            }
            const latency = Date.now() - startTime;
            // Get audio buffer
            const arrayBuffer = await response.arrayBuffer();
            const audioBuffer = Buffer.from(arrayBuffer);
            const result = {
                buffer: audioBuffer,
                // Use the *effective* output format derived from outputFormat, not the
                // requested format — otherwise unsupported requests that fell back to
                // mp3 would mislabel the buffer.
                format: this.effectiveFormat(outputFormat),
                size: audioBuffer.length,
                voice,
                sampleRate: this.getSampleRate(outputFormat),
                metadata: {
                    latency,
                    provider: "azure-tts",
                    requestedFormat: options.format,
                    outputFormat,
                    region: this.region,
                },
            };
            logger.info(`[AzureTTSHandler] Synthesized ${audioBuffer.length} bytes in ${latency}ms`);
            return result;
        }
        catch (err) {
            if (err instanceof TTSError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[AzureTTSHandler] Synthesis failed: ${errorMessage}`);
            throw new TTSError({
                code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                message: `Synthesis failed: ${errorMessage}`,
                category: ErrorCategory.EXECUTION,
                severity: ErrorSeverity.HIGH,
                retriable: true,
                context: { textLength: text.length },
                originalError: err instanceof Error ? err : undefined,
            });
        }
    }
    /**
     * Build SSML from text and options.
     *
     * Precedence: explicit ssmlTemplate > raw SSML passthrough (opt-in) > generated
     * <speak>/<voice>/<prosody> envelope with escaped text.
     */
    buildSSML(text, voice, options) {
        const azureOptions = options;
        // If custom SSML template provided, use it.
        // NOTE: only the first "{text}"/"{voice}" occurrence is substituted.
        if (azureOptions.ssmlTemplate) {
            return azureOptions.ssmlTemplate
                .replace("{text}", this.escapeXml(text))
                .replace("{voice}", this.escapeXml(voice));
        }
        // Only pass raw SSML through when the caller explicitly opted in via
        // `allowRawSSML`. Otherwise escape — `text` from untrusted sources that
        // happens (or is crafted) to begin with `<speak` would otherwise enable
        // SSML injection (arbitrary voice changes, external content references).
        if (azureOptions.allowRawSSML && text.trim().startsWith("<speak")) {
            return text;
        }
        // Build rate string: speed 1.0 is "0%", 1.5 is "+50%"-style relative rate.
        const rate = options.speed
            ? `${Math.round((options.speed - 1) * 100)}%`
            : "0%";
        // Build pitch string. `TTSOptions.pitch` is documented as semitones, and
        // Azure SSML supports semitone units directly via `<n>st` (e.g. "+2st",
        // "-3st"). Emitting `<n>%` instead would be interpreted as a relative
        // percentage — wrong magnitude.
        const pitchValue = options.pitch ?? 0;
        const pitch = pitchValue >= 0
            ? `+${Math.round(pitchValue)}st`
            : `${Math.round(pitchValue)}st`;
        // Build SSML
        return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${this.escapeXml(this.extractLanguage(voice))}">
  <voice name="${this.escapeXml(voice)}">
    <prosody rate="${rate}" pitch="${pitch}">
      ${this.escapeXml(text)}
    </prosody>
  </voice>
</speak>`;
    }
    /**
     * Extract language from voice name; falls back to "en-US".
     */
    extractLanguage(voice) {
        // Voice names are like "en-US-JennyNeural"
        const match = voice.match(/^([a-z]{2}-[A-Z]{2})/);
        return match ? match[1] : "en-US";
    }
    /**
     * Escape XML special characters so user text cannot break out of the SSML
     * envelope. `&` must be replaced first, otherwise the entities produced by
     * the later replacements would themselves be re-escaped.
     */
    escapeXml(text) {
        return text
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&apos;");
    }
    /**
     * Map Azure's Gender field to the standard male/female/neutral type.
     */
    mapGender(gender) {
        switch (gender?.toLowerCase()) {
            case "male":
                return "male";
            case "female":
                return "female";
            default:
                return "neutral";
        }
    }
    /**
     * Map TTSAudioFormat to Azure output format; unknown formats fall back to mp3.
     */
    mapFormat(format) {
        const formats = {
            mp3: "audio-24khz-96kbitrate-mono-mp3",
            wav: "riff-24khz-16bit-mono-pcm",
            ogg: "ogg-24khz-16bit-mono-opus",
            opus: "ogg-24khz-16bit-mono-opus",
        };
        return formats[format] ?? "audio-24khz-96kbitrate-mono-mp3";
    }
    /**
     * Get sample rate (Hz) from the Azure format string; defaults to 24000.
     */
    getSampleRate(format) {
        if (format.includes("24khz")) {
            return 24000;
        }
        if (format.includes("16khz")) {
            return 16000;
        }
        if (format.includes("48khz")) {
            return 48000;
        }
        return 24000;
    }
    /**
     * Map the Azure outputFormat string back to a canonical TTSAudioFormat so
     * TTSResult.format matches what the API actually returned (mapFormat() can
     * coerce unsupported requests to mp3).
     */
    effectiveFormat(outputFormat) {
        if (outputFormat.includes("mp3")) {
            return "mp3";
        }
        if (outputFormat.includes("opus")) {
            return "opus";
        }
        // Raw PCM (no RIFF/WAV header) must not be labeled as "wav" — downstream
        // WAV parsers would misread the buffer. Azure uses the `raw-*` prefix
        // for headerless PCM (e.g. `raw-16khz-16bit-mono-pcm`).
        if (outputFormat.startsWith("raw") && outputFormat.includes("pcm")) {
            return "pcm16";
        }
        if (outputFormat.includes("riff") ||
            outputFormat.includes("pcm") ||
            outputFormat.includes("wav")) {
            return "wav";
        }
        return "mp3";
    }
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deepgram Speech-to-Text Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of STT using Deepgram's Speech Recognition API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/DeepgramSTT
|
|
7
|
+
*/
|
|
8
|
+
import type { TTSAudioFormat, STTHandler, STTLanguage, STTOptions, STTResult, TranscriptionSegment } from "../../types/index.js";
|
|
9
|
+
/**
|
|
10
|
+
* Deepgram Speech-to-Text Handler
|
|
11
|
+
*
|
|
12
|
+
* Supports real-time streaming, speaker diarization, and smart formatting.
|
|
13
|
+
*
|
|
14
|
+
* @see https://developers.deepgram.com/docs
|
|
15
|
+
*/
|
|
16
|
+
export declare class DeepgramSTT implements STTHandler {
    /** Key used to authenticate requests to Deepgram (resolution handled in the implementation). */
    private readonly apiKey;
    /** Base URL of the Deepgram API (presumably https://api.deepgram.com — confirm in DeepgramSTT.js). */
    private readonly baseUrl;
    /**
     * Maximum audio duration in seconds (2 hours)
     */
    readonly maxAudioDuration = 7200;
    /**
     * Deepgram supports streaming
     */
    readonly supportsStreaming = true;
    /**
     * @param apiKey - Optional Deepgram API key; when omitted the implementation
     *   presumably falls back to environment configuration — confirm against DeepgramSTT.js.
     */
    constructor(apiKey?: string);
    /** True when this handler has the credentials it needs to call Deepgram. */
    isConfigured(): boolean;
    /** Audio formats this handler accepts for transcription. */
    getSupportedFormats(): TTSAudioFormat[];
    /** Languages available for transcription. */
    getSupportedLanguages(): Promise<STTLanguage[]>;
    /**
     * Transcribe a complete audio buffer in a single request.
     *
     * @param audio - Raw audio bytes.
     * @param options - Language, formatting and diarization options.
     * @returns The full transcription result.
     */
    transcribe(audio: Buffer | ArrayBuffer, options?: STTOptions): Promise<STTResult>;
    /**
     * Streaming transcription using WebSocket
     *
     * @param audioStream - Async iterable of audio chunks to send as they arrive.
     * @param options - Transcription options applied to the streaming session.
     * @returns Async iterable of transcription segments as they are produced.
     */
    transcribeStream(audioStream: AsyncIterable<Buffer>, options: STTOptions): AsyncIterable<TranscriptionSegment>;
    /**
     * Get MIME type for audio format
     */
    private getMimeType;
}
|