@juspay/neurolink 9.61.2 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +373 -355
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +248 -12
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +248 -12
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +3 -1
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Azure Cognitive Services Speech-to-Text Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of STT using Azure Speech Services.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/AzureSTT
|
|
7
|
+
*/
|
|
8
|
+
import { logger } from "../../utils/logger.js";
|
|
9
|
+
import { STTError } from "../errors.js";
|
|
10
|
+
/**
|
|
11
|
+
* Azure Cognitive Services Speech-to-Text Handler
|
|
12
|
+
*
|
|
13
|
+
* Supports speech recognition with custom models and detailed output.
|
|
14
|
+
*
|
|
15
|
+
* @see https://docs.microsoft.com/azure/cognitive-services/speech-service/
|
|
16
|
+
*/
|
|
17
|
+
export class AzureSTT {
    apiKey;
    region;
    /**
     * Maximum audio duration in seconds (60s — Azure's REST API for short audio
     * documented limit on `/speech/recognition/conversation/cognitiveservices/v1`).
     * For longer audio, use Azure Batch Transcription (not yet implemented) or
     * pre-segment the input.
     */
    maxAudioDuration = 60;
    /**
     * Azure STT implementation buffers chunks via REST — not true streaming
     */
    supportsStreaming = false;
    /**
     * Error-message pattern for failures that will not succeed on retry
     * (auth/subscription problems, unknown endpoint, unsupported audio format).
     * Hoisted to a static so both streaming catch paths share one definition
     * instead of duplicating the literal, and so the regex is compiled once
     * rather than per failed chunk.
     */
    static PERMANENT_ERROR_PATTERN = /\b(401|403|404|Forbidden|Unauthorized|Invalid.*subscription|Invalid.*key|Wrong.*key|InvalidAudioFormat)\b/i;
    /**
     * @param {string} [apiKey] Azure Speech key; falls back to AZURE_SPEECH_KEY.
     * @param {string} [region] Azure region; falls back to AZURE_SPEECH_REGION,
     *   then "eastus". Blank/whitespace-only values are treated as absent.
     */
    constructor(apiKey, region) {
        const resolvedKey = (apiKey ?? process.env.AZURE_SPEECH_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
        const resolvedRegion = (region ??
            process.env.AZURE_SPEECH_REGION ??
            "").trim();
        this.region = resolvedRegion.length > 0 ? resolvedRegion : "eastus";
    }
    /** True when both an API key and a region are available. */
    isConfigured() {
        return this.apiKey !== null && this.region.length > 0;
    }
    getSupportedFormats() {
        // Azure's "Speech-to-text REST API for short audio" only accepts uncompressed
        // PCM WAV (16kHz/16-bit/mono recommended) and Ogg/Opus. MP3 is NOT decoded
        // by this endpoint (it returns Success with empty text). For MP3 input use
        // the Batch Transcription API (not yet implemented) or convert to WAV first.
        return ["wav", "ogg", "opus"];
    }
    async getSupportedLanguages() {
        // Azure supports 100+ languages; this is the commonly used subset the
        // handler advertises. Every listed locale supports diarization and
        // punctuation, so entries are generated from a compact table instead of
        // repeating the two boolean flags fourteen times.
        const locales = [
            ["en-US", "English (US)"],
            ["en-GB", "English (UK)"],
            ["es-ES", "Spanish (Spain)"],
            ["es-MX", "Spanish (Mexico)"],
            ["fr-FR", "French"],
            ["de-DE", "German"],
            ["it-IT", "Italian"],
            ["pt-BR", "Portuguese (Brazil)"],
            ["ja-JP", "Japanese"],
            ["ko-KR", "Korean"],
            ["zh-CN", "Chinese (Simplified)"],
            ["hi-IN", "Hindi"],
            ["ar-SA", "Arabic"],
            ["ru-RU", "Russian"],
        ];
        return locales.map(([code, name]) => ({
            code,
            name,
            supportsDiarization: true,
            supportsPunctuation: true,
        }));
    }
    /**
     * Transcribe a complete audio buffer via Azure's short-audio REST endpoint.
     *
     * @param {Buffer|Uint8Array} audio Raw audio bytes (see getSupportedFormats()).
     * @param {object} [options] language, format, sampleRate, wordTimestamps,
     *   profanityFilter, plus Azure extras (detailed, profanityMode,
     *   customEndpointId).
     * @returns Result with text, confidence, optional word timings and metadata.
     * @throws {STTError} when unconfigured, on empty audio, on the 30s timeout,
     *   and on HTTP or recognition failures (NoMatch is returned, not thrown).
     */
    async transcribe(audio, options = {}) {
        if (!this.apiKey) {
            throw STTError.providerNotConfigured("azure-stt");
        }
        const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
        if (audioBuffer.length === 0) {
            throw STTError.audioEmpty("azure-stt");
        }
        const azureOptions = options;
        const startTime = Date.now();
        try {
            // Build the URL with query parameters
            const params = new URLSearchParams();
            params.set("language", options.language ?? "en-US");
            // "detailed" adds NBest hypotheses with confidences and word timings
            if (azureOptions.detailed || options.wordTimestamps) {
                params.set("format", "detailed");
            }
            // An explicit Azure profanity mode wins over the generic boolean flag
            if (azureOptions.profanityMode) {
                params.set("profanity", azureOptions.profanityMode);
            }
            else if (options.profanityFilter) {
                params.set("profanity", "masked");
            }
            const baseUrl = `https://${this.region}.stt.speech.microsoft.com`;
            // Route to a Custom Speech deployment when an endpoint id is given
            if (azureOptions.customEndpointId) {
                params.set("cid", azureOptions.customEndpointId);
            }
            const url = `${baseUrl}/speech/recognition/conversation/cognitiveservices/v1?${params.toString()}`;
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), 30000);
            let response;
            try {
                response = await fetch(url, {
                    method: "POST",
                    headers: {
                        "Ocp-Apim-Subscription-Key": this.apiKey,
                        // Declare the caller's actual sample rate (default 16 kHz)
                        // instead of always claiming 16000, so Azure decodes
                        // non-16kHz PCM WAV bodies correctly.
                        "Content-Type": this.getContentType(options.format ?? "wav", options.sampleRate),
                        Accept: "application/json",
                    },
                    body: new Uint8Array(audioBuffer),
                    signal: controller.signal,
                });
            }
            catch (fetchErr) {
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw STTError.transcriptionFailed("Azure STT request timed out after 30 seconds", "azure-stt", fetchErr);
                }
                throw fetchErr;
            }
            finally {
                clearTimeout(timeoutId);
            }
            if (!response.ok) {
                const errorText = await response.text();
                throw STTError.transcriptionFailed(`HTTP ${response.status}: ${errorText}`, "azure-stt");
            }
            const data = (await response.json());
            const latency = Date.now() - startTime;
            // NoMatch (silence/noise) is a valid empty result, not an error
            if (data.RecognitionStatus !== "Success") {
                if (data.RecognitionStatus === "NoMatch") {
                    return {
                        text: "",
                        confidence: 0,
                        language: options.language,
                        metadata: {
                            latency,
                            provider: "azure-stt",
                            status: data.RecognitionStatus,
                        },
                    };
                }
                throw STTError.transcriptionFailed(`Recognition failed: ${data.RecognitionStatus}`, "azure-stt");
            }
            // Build result from NBest or DisplayText
            const result = {
                text: data.DisplayText ?? "",
                confidence: 0.9, // Default confidence if not available
                language: options.language,
                duration: this.ticksToSeconds(data.Duration ?? 0),
                metadata: {
                    latency,
                    provider: "azure-stt",
                    status: data.RecognitionStatus,
                },
            };
            // Prefer the top NBest hypothesis: it carries a real confidence score
            // and (in detailed mode) per-word offsets/durations in 100ns ticks.
            if (data.NBest && data.NBest.length > 0) {
                const best = data.NBest[0];
                result.text = best.Display;
                result.confidence = best.Confidence;
                // Add word timings
                if (best.Words && best.Words.length > 0) {
                    result.words = best.Words.map((word) => ({
                        word: word.Word,
                        startTime: this.ticksToSeconds(word.Offset),
                        endTime: this.ticksToSeconds(word.Offset + word.Duration),
                        confidence: word.Confidence,
                    }));
                }
            }
            logger.info(`[AzureSTTHandler] Transcribed audio in ${latency}ms`);
            return result;
        }
        catch (err) {
            if (err instanceof STTError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[AzureSTTHandler] Transcription failed: ${errorMessage}`);
            throw STTError.transcriptionFailed(errorMessage, "azure-stt", err instanceof Error ? err : undefined);
        }
    }
    /**
     * Streaming transcription (placeholder — true streaming requires the
     * Microsoft Speech SDK). Buffers ~5 seconds of audio at a time and
     * transcribes each window via transcribe(); the final partial window is
     * flagged isFinal. Permanent errors (see PERMANENT_ERROR_PATTERN) abort the
     * stream; transient ones skip the affected window.
     */
    async *transcribeStream(audioStream, options) {
        const chunks = [];
        let chunkIndex = 0;
        // Track buffered byte count incrementally — `chunks.reduce()` per incoming
        // chunk is O(n²) over long streams (Copilot/CodeRabbit review). Reset to 0
        // every time we flush.
        let bufferedBytes = 0;
        // Hoisted out of the loop: the threshold is loop-invariant, so don't
        // recompute it per chunk. 2 bytes/sample assumes 16-bit mono PCM — the
        // rate this handler recommends; TODO confirm against callers.
        const bytesPerSecond = (options.sampleRate ?? 16000) * 2;
        const flushThreshold = bytesPerSecond * 5;
        for await (const chunk of audioStream) {
            chunks.push(chunk);
            bufferedBytes += chunk.length;
            // Process every ~5 seconds of audio
            if (bufferedBytes >= flushThreshold) {
                const audio = Buffer.concat(chunks);
                chunks.length = 0;
                bufferedBytes = 0;
                const result = await this.transcribeWindow(audio, options, false);
                if (result) {
                    yield {
                        index: chunkIndex++,
                        text: result.text,
                        isFinal: false,
                        confidence: result.confidence,
                    };
                }
            }
        }
        // Process remaining audio as the final window
        if (chunks.length > 0) {
            const result = await this.transcribeWindow(Buffer.concat(chunks), options, true);
            if (result) {
                yield {
                    index: chunkIndex,
                    text: result.text,
                    isFinal: true,
                    confidence: result.confidence,
                };
            }
        }
    }
    /**
     * Transcribe one buffered stream window, classifying failures.
     *
     * M5: distinguish permanent (auth, schema, 4xx) from transient (5xx, 429,
     * network) errors. Without this, an expired API key would silently retry
     * every chunk for the entire stream, and auth/format failures on short
     * streams (≤5s buffer flush) would masquerade as an empty transcription.
     * Shared by both flush sites in transcribeStream() so the split is defined
     * in exactly one place.
     *
     * @returns The transcription result, or null when the window should be
     *   skipped (transient error). Permanent errors are rethrown.
     */
    async transcribeWindow(audio, options, isFinal) {
        try {
            return await this.transcribe(audio, options);
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            if (AzureSTT.PERMANENT_ERROR_PATTERN.test(msg)) {
                logger.error(isFinal
                    ? `[AzureSTTHandler] Permanent final-chunk error — surfacing: ${msg}`
                    : `[AzureSTTHandler] Permanent chunk error — terminating stream: ${msg}`);
                throw err;
            }
            logger.warn(isFinal
                ? `[AzureSTTHandler] Final chunk transcription failed (transient): ${msg}`
                : `[AzureSTTHandler] Transient chunk failure (skipping): ${msg}`);
            return null;
        }
    }
    /**
     * Get Content-Type header for audio format.
     *
     * @param {string} format Audio format key ("wav", "ogg", "opus").
     * @param {number} [sampleRate=16000] Sample rate declared for PCM WAV
     *   bodies. Previously hard-coded to 16000 even when the caller supplied a
     *   different options.sampleRate.
     */
    getContentType(format, sampleRate = 16000) {
        // Note: MP3 is intentionally not in this map even though Azure won't reject
        // the Content-Type — the short-audio REST endpoint silently returns empty
        // text for MP3 bodies. See getSupportedFormats() for the supported list.
        const contentTypes = {
            wav: `audio/wav; codecs=audio/pcm; samplerate=${sampleRate}`,
            ogg: "audio/ogg; codecs=opus",
            opus: "audio/ogg; codecs=opus",
        };
        return contentTypes[format] ?? "audio/wav";
    }
    /**
     * Convert Azure ticks (100ns units) to seconds
     */
    ticksToSeconds(ticks) {
        return ticks / 10000000;
    }
}
|
|
345
|
+
//# sourceMappingURL=AzureSTT.js.map
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Azure Cognitive Services Text-to-Speech Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of TTS using Azure Speech Services.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/AzureTTS
|
|
7
|
+
*/
|
|
8
|
+
import type { TTSHandler, TTSOptions, TTSResult, TTSVoice } from "../../types/index.js";
|
|
9
|
+
/**
 * Azure Cognitive Services Text-to-Speech Handler
 *
 * Supports neural voices with SSML and custom voice styles.
 *
 * Declaration file for the runtime implementation in AzureTTS.js; the private
 * members below document the shape only — behavior lives in the implementation.
 *
 * @see https://docs.microsoft.com/azure/cognitive-services/speech-service/
 */
export declare class AzureTTS implements TTSHandler {
    /** Azure Speech subscription key (resolution/fallback handled by the constructor). */
    private readonly apiKey;
    /** Azure service region used to build the endpoint host. */
    private readonly region;
    /** Cached voice list — presumably the last getVoices() response; verify against the implementation. */
    private voicesCache;
    /** Time-to-live for voicesCache, in milliseconds. */
    private static readonly CACHE_TTL_MS;
    /**
     * Maximum text length (10000 characters for Azure)
     */
    readonly maxTextLength = 10000;
    /**
     * @param apiKey Optional subscription key; when omitted the implementation
     *   is expected to fall back to environment configuration — TODO confirm.
     * @param region Optional Azure region.
     */
    constructor(apiKey?: string, region?: string);
    /** Whether enough configuration (key/region) is present to make API calls. */
    isConfigured(): boolean;
    /**
     * List available voices, optionally filtered by language code
     * (results may be served from voicesCache).
     */
    getVoices(languageCode?: string): Promise<TTSVoice[]>;
    /** Synthesize speech audio for the given text. */
    synthesize(text: string, options?: TTSOptions): Promise<TTSResult>;
    /**
     * Build SSML from text and options
     */
    private buildSSML;
    /**
     * Extract language from voice name
     */
    private extractLanguage;
    /**
     * Escape XML special characters
     */
    private escapeXml;
    /**
     * Map gender string to standard type
     */
    private mapGender;
    /**
     * Map TTSAudioFormat to Azure output format
     */
    private mapFormat;
    /**
     * Get sample rate from format string
     */
    private getSampleRate;
    /**
     * Map the Azure outputFormat string back to a canonical TTSAudioFormat so
     * TTSResult.format matches what the API actually returned (mapFormat() can
     * coerce unsupported requests to mp3).
     */
    private effectiveFormat;
}
|