@juspay/neurolink 9.61.2 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +373 -355
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +248 -12
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +248 -12
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +3 -1
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI Whisper Speech-to-Text Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of STT using OpenAI's Whisper model.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/OpenAISTT
|
|
7
|
+
*/
|
|
8
|
+
import { logger } from "../../utils/logger.js";
|
|
9
|
+
import { STTError } from "../errors.js";
|
|
10
|
+
/**
|
|
11
|
+
* OpenAI Whisper Speech-to-Text Handler
|
|
12
|
+
*
|
|
13
|
+
* Supports transcription and translation using OpenAI's Whisper model.
|
|
14
|
+
*
|
|
15
|
+
* @see https://platform.openai.com/docs/api-reference/audio
|
|
16
|
+
*/
|
|
17
|
+
export class OpenAISTT {
    /** Resolved API key, or null when not configured. */
    apiKey;
    /** OpenAI REST API base URL. */
    baseUrl = "https://api.openai.com/v1";
    /**
     * Maximum audio duration in seconds (25 minutes)
     */
    maxAudioDuration = 25 * 60;
    /**
     * Whisper does not support streaming
     */
    supportsStreaming = false;
    /**
     * @param {string} [apiKey] - Explicit API key. Falls back to the
     *   OPENAI_API_KEY environment variable; blank or whitespace-only
     *   values are treated as "not configured".
     */
    constructor(apiKey) {
        const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }
    /** @returns {boolean} true when a usable API key was resolved. */
    isConfigured() {
        return this.apiKey !== null;
    }
    /**
     * @returns {string[]} Audio container/codec identifiers accepted by the
     *   Whisper transcription endpoint.
     */
    getSupportedFormats() {
        // OpenAI Whisper transcription API accepts: flac, m4a, mp3, mp4, mpeg,
        // mpga, oga, ogg, opus, wav, webm. Keep this in sync with TTSAudioFormat
        // — formats not listed in TTSAudioFormat are filtered out by the type.
        return [
            "mp3",
            "wav",
            "ogg",
            "opus",
            "m4a",
            "flac",
            "webm",
            "mp4",
            "mpeg",
            "mpga",
        ];
    }
    /**
     * @returns {Promise<Array<{code: string, name: string,
     *   supportsDiarization: boolean, supportsPunctuation: boolean}>>}
     *   A curated subset of Whisper's 100+ supported languages.
     */
    async getSupportedLanguages() {
        // Whisper supports 100+ languages
        // Return the most common ones
        return [
            {
                code: "en",
                name: "English",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "es",
                name: "Spanish",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "fr",
                name: "French",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "de",
                name: "German",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "it",
                name: "Italian",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "pt",
                name: "Portuguese",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "ru",
                name: "Russian",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "ja",
                name: "Japanese",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "ko",
                name: "Korean",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "zh",
                name: "Chinese",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "ar",
                name: "Arabic",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
            {
                code: "hi",
                name: "Hindi",
                supportsDiarization: false,
                supportsPunctuation: true,
            },
        ];
    }
    /**
     * Transcribe (or, with `translate: true`, translate-to-English) audio
     * with OpenAI Whisper.
     *
     * @param {Buffer|Uint8Array} audio - Raw audio bytes.
     * @param {object} [options] - STT options. Recognized fields visible in
     *   this handler: `format`, `language`, `wordTimestamps`, plus
     *   Whisper-specific `model`, `prompt`, `temperature`, `responseFormat`,
     *   `translate`, and `timeoutMs` (request timeout in ms, default 30000).
     *   NOTE: the 30s default can be too short for long recordings —
     *   `maxAudioDuration` allows up to 25 minutes of audio, so callers
     *   transcribing long files should raise `timeoutMs`.
     * @returns {Promise<object>} Result with `text`, `confidence` (fixed at
     *   0.95 — Whisper returns no overall confidence), optional `language`,
     *   `duration`, `words`, `segments`, and request `metadata`.
     * @throws {STTError} when unconfigured, audio is empty, the request
     *   times out, or the API responds with a non-OK status.
     */
    async transcribe(audio, options = {}) {
        if (!this.apiKey) {
            throw STTError.providerNotConfigured("whisper");
        }
        const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
        if (audioBuffer.length === 0) {
            throw STTError.audioEmpty("whisper");
        }
        const whisperOptions = options;
        const startTime = Date.now();
        try {
            // Prepare form data
            const formData = new FormData();
            // Add audio file - convert Buffer to Uint8Array for compatibility
            const audioBlob = new Blob([new Uint8Array(audioBuffer)], {
                type: this.getMimeType(options.format ?? "wav"),
            });
            formData.append("file", audioBlob, `audio.${options.format ?? "wav"}`);
            // Add model
            formData.append("model", whisperOptions.model ?? "whisper-1");
            // Add optional parameters
            if (options.language) {
                formData.append("language", options.language);
            }
            if (whisperOptions.prompt) {
                formData.append("prompt", whisperOptions.prompt);
            }
            if (whisperOptions.temperature !== undefined) {
                formData.append("temperature", whisperOptions.temperature.toString());
            }
            // Request verbose_json for detailed response
            const responseFormat = whisperOptions.responseFormat ?? "verbose_json";
            formData.append("response_format", responseFormat);
            // Add timestamp granularities for word-level timestamps
            if (options.wordTimestamps && responseFormat === "verbose_json") {
                formData.append("timestamp_granularities[]", "word");
                formData.append("timestamp_granularities[]", "segment");
            }
            // Choose endpoint based on translation option
            const endpoint = whisperOptions.translate
                ? `${this.baseUrl}/audio/translations`
                : `${this.baseUrl}/audio/transcriptions`;
            // Timeout is configurable because transcription latency scales
            // with audio length; the previous fixed 30s ceiling aborted any
            // long-audio request even though 25-minute inputs are allowed.
            const timeoutMs = whisperOptions.timeoutMs ?? 30000;
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
            let response;
            try {
                response = await fetch(endpoint, {
                    method: "POST",
                    headers: {
                        Authorization: `Bearer ${this.apiKey}`,
                    },
                    body: formData,
                    signal: controller.signal,
                });
            }
            catch (fetchErr) {
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw STTError.transcriptionFailed(`OpenAI STT request timed out after ${Math.round(timeoutMs / 1000)} seconds`, "whisper", fetchErr);
                }
                throw fetchErr;
            }
            finally {
                // Always clear the timer so a fast response doesn't leave a
                // pending abort firing into a completed request.
                clearTimeout(timeoutId);
            }
            if (!response.ok) {
                // Null-prototype fallback avoids prototype-pollution style
                // surprises when the error body isn't JSON.
                const errorData = await response
                    .json()
                    .catch(() => Object.create(null));
                const errorMessage = errorData.error?.message ||
                    `HTTP ${response.status}`;
                throw STTError.transcriptionFailed(errorMessage, "whisper");
            }
            const latency = Date.now() - startTime;
            // Parse response based on format
            if (responseFormat === "text") {
                const text = await response.text();
                return {
                    text,
                    confidence: 0.95, // Whisper doesn't return confidence
                    metadata: {
                        latency,
                        provider: "whisper",
                        model: whisperOptions.model ?? "whisper-1",
                    },
                };
            }
            const data = (await response.json());
            // Build result
            const result = {
                text: data.text,
                confidence: 0.95, // Whisper doesn't return per-result confidence
                language: data.language,
                duration: data.duration,
                metadata: {
                    latency,
                    provider: "whisper",
                    model: whisperOptions.model ?? "whisper-1",
                    task: data.task,
                },
            };
            // Add word timings if available
            if (data.words && data.words.length > 0) {
                result.words = data.words.map((word) => ({
                    word: word.word,
                    startTime: word.start,
                    endTime: word.end,
                }));
            }
            // Add segments
            if (data.segments && data.segments.length > 0) {
                result.segments = data.segments.map((segment, index) => ({
                    index,
                    text: segment.text,
                    isFinal: true,
                    confidence: Math.exp(segment.avg_logprob), // Convert log prob to confidence
                    startTime: segment.start,
                    endTime: segment.end,
                }));
            }
            logger.info(`[WhisperSTTHandler] Transcribed ${data.duration?.toFixed(1) ?? "?"}s audio in ${latency}ms`);
            return result;
        }
        catch (err) {
            // Re-throw domain errors untouched; wrap everything else so
            // callers always see an STTError from this handler.
            if (err instanceof STTError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[WhisperSTTHandler] Transcription failed: ${errorMessage}`);
            throw STTError.transcriptionFailed(errorMessage, "whisper", err instanceof Error ? err : undefined);
        }
    }
    /**
     * Get MIME type for audio format. Whisper auto-detects from headers, but
     * sending a correct MIME helps providers / proxies that sniff Content-Type.
     * Must stay aligned with `getSupportedFormats()`.
     *
     * @param {string} format - Format identifier (e.g. "mp3").
     * @returns {string} MIME type; unknown formats fall back to "audio/wav".
     */
    getMimeType(format) {
        const mimeTypes = {
            mp3: "audio/mpeg",
            wav: "audio/wav",
            ogg: "audio/ogg",
            opus: "audio/opus",
            m4a: "audio/mp4",
            flac: "audio/flac",
            webm: "audio/webm",
            mp4: "audio/mp4",
            mpeg: "audio/mpeg",
            mpga: "audio/mpeg",
        };
        return mimeTypes[format] ?? "audio/wav";
    }
}
|
|
282
|
+
// Export as named exports for compatibility
|
|
283
|
+
export { OpenAISTT as WhisperSTT };
|
|
284
|
+
export { OpenAISTT as WhisperSTTHandler };
|
|
285
|
+
export { OpenAISTT as OpenAISTTHandler };
|
|
286
|
+
//# sourceMappingURL=OpenAISTT.js.map
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI Text-to-Speech Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of TTS using OpenAI's TTS API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/OpenAITTS
|
|
7
|
+
*/
|
|
8
|
+
import type { TTSHandler, TTSOptions, TTSResult, TTSVoice } from "../../types/index.js";
|
|
9
|
+
/**
|
|
10
|
+
* OpenAI Text-to-Speech Handler
|
|
11
|
+
*
|
|
12
|
+
* Supports high-quality neural TTS with multiple voices.
|
|
13
|
+
*
|
|
14
|
+
* @see https://platform.openai.com/docs/api-reference/audio/createSpeech
|
|
15
|
+
*/
|
|
16
|
+
export declare class OpenAITTS implements TTSHandler {
    /** Resolved API key; null when no usable key was provided or found in env. */
    private readonly apiKey;
    /** OpenAI REST API base URL. */
    private readonly baseUrl;
    /**
     * Maximum text length (4096 characters)
     */
    readonly maxTextLength = 4096;
    /**
     * Available voices
     */
    private static readonly VOICES;
    /**
     * @param apiKey - Explicit API key; when omitted, the implementation
     *   falls back to the OPENAI_API_KEY environment variable.
     */
    constructor(apiKey?: string);
    /** True when an API key was resolved and synthesize() can be called. */
    isConfigured(): boolean;
    /**
     * List available voices. OpenAI's voice set is fixed; the languageCode
     * parameter exists for TTSHandler interface compatibility.
     */
    getVoices(languageCode?: string): Promise<TTSVoice[]>;
    /**
     * Synthesize speech for `text`. Rejects with TTSError when the handler
     * is unconfigured or the API call fails.
     */
    synthesize(text: string, options?: TTSOptions): Promise<TTSResult>;
    /**
     * Map TTSAudioFormat to OpenAI response_format.
     * OpenAI TTS supports: mp3, wav, opus (ogg maps to opus).
     * Unsupported formats are coerced to mp3 with a warning.
     */
    private mapFormat;
    /**
     * Get sample rate for format
     */
    private getSampleRate;
    /**
     * Map the OpenAI `response_format` string back to the canonical
     * `TTSAudioFormat` so `TTSResult.format` reflects what the API actually
     * returned (mapFormat() coerces unsupported requests to "mp3"). Note:
     * OpenAI returns Ogg-Opus for both "ogg" and "opus" requests — both
     * surface as "opus" since the bytes are an .ogg/Opus container.
     */
    private effectiveFormat;
}
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI Text-to-Speech Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of TTS using OpenAI's TTS API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/OpenAITTS
|
|
7
|
+
*/
|
|
8
|
+
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
|
|
9
|
+
import { logger } from "../../utils/logger.js";
|
|
10
|
+
import { TTS_ERROR_CODES, TTSError } from "../../utils/ttsProcessor.js";
|
|
11
|
+
/**
|
|
12
|
+
* OpenAI Text-to-Speech Handler
|
|
13
|
+
*
|
|
14
|
+
* Supports high-quality neural TTS with multiple voices.
|
|
15
|
+
*
|
|
16
|
+
* @see https://platform.openai.com/docs/api-reference/audio/createSpeech
|
|
17
|
+
*/
|
|
18
|
+
export class OpenAITTS {
    /** Resolved API key, or null when not configured. */
    apiKey;
    /** OpenAI REST API base URL. */
    baseUrl = "https://api.openai.com/v1";
    /**
     * Maximum text length (4096 characters)
     */
    maxTextLength = 4096;
    /**
     * Available voices
     */
    static VOICES = [
        {
            id: "alloy",
            name: "Alloy",
            languageCode: "en",
            languageCodes: ["en"],
            gender: "neutral",
            type: "neural",
        },
        {
            id: "echo",
            name: "Echo",
            languageCode: "en",
            languageCodes: ["en"],
            gender: "male",
            type: "neural",
        },
        {
            id: "fable",
            name: "Fable",
            languageCode: "en",
            languageCodes: ["en"],
            gender: "neutral",
            type: "neural",
        },
        {
            id: "onyx",
            name: "Onyx",
            languageCode: "en",
            languageCodes: ["en"],
            gender: "male",
            type: "neural",
        },
        {
            id: "nova",
            name: "Nova",
            languageCode: "en",
            languageCodes: ["en"],
            gender: "female",
            type: "neural",
        },
        {
            id: "shimmer",
            name: "Shimmer",
            languageCode: "en",
            languageCodes: ["en"],
            gender: "female",
            type: "neural",
        },
    ];
    /**
     * @param {string} [apiKey] - Explicit API key. Falls back to the
     *   OPENAI_API_KEY environment variable; blank or whitespace-only
     *   values are treated as "not configured".
     */
    constructor(apiKey) {
        const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }
    /** @returns {boolean} true when a usable API key was resolved. */
    isConfigured() {
        return this.apiKey !== null;
    }
    /**
     * List available voices.
     *
     * OpenAI's voice list is fixed and English-labelled, but the TTS models
     * can speak many languages with the same voices, so the full set is
     * returned regardless of `languageCode` (the parameter is kept for
     * TTSHandler interface compatibility). The previous implementation had
     * a dead conditional whose both branches returned the same list.
     *
     * @param {string} [languageCode] - Ignored; kept for interface parity.
     * @returns {Promise<Array>} The static voice catalogue.
     */
    async getVoices(languageCode) {
        void languageCode;
        return OpenAITTS.VOICES;
    }
    /**
     * Synthesize speech for `text` via the OpenAI speech endpoint.
     *
     * @param {string} text - Input text (API limit: maxTextLength chars).
     * @param {object} [options] - TTS options; recognized fields visible in
     *   this handler: `voice`, `format`, `quality`, `speed`, plus
     *   OpenAI-specific `model`.
     * @returns {Promise<object>} Result with `buffer`, effective `format`,
     *   `size`, `voice`, `sampleRate`, and request `metadata`.
     * @throws {TTSError} when unconfigured, the request times out, or the
     *   API responds with a non-OK status.
     */
    async synthesize(text, options = {}) {
        if (!this.apiKey) {
            throw new TTSError({
                code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
                message: "OpenAI TTS API key not configured",
                category: ErrorCategory.CONFIGURATION,
                severity: ErrorSeverity.HIGH,
                retriable: false,
            });
        }
        const startTime = Date.now();
        const openaiOptions = options;
        try {
            // Determine model based on quality
            const model = openaiOptions.model ??
                (options.quality === "hd" ? "tts-1-hd" : "tts-1");
            // Determine voice
            const voice = options.voice ?? "alloy";
            // Determine format
            const responseFormat = this.mapFormat(options.format ?? "mp3");
            // Build request
            const requestBody = {
                model,
                input: text,
                voice,
                response_format: responseFormat,
                speed: options.speed ?? 1.0,
            };
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), 30000);
            let response;
            try {
                response = await fetch(`${this.baseUrl}/audio/speech`, {
                    method: "POST",
                    headers: {
                        Authorization: `Bearer ${this.apiKey}`,
                        "Content-Type": "application/json",
                    },
                    body: JSON.stringify(requestBody),
                    signal: controller.signal,
                });
            }
            catch (fetchErr) {
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw new TTSError({
                        code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                        message: "OpenAI TTS request timed out after 30 seconds",
                        category: ErrorCategory.NETWORK,
                        severity: ErrorSeverity.HIGH,
                        retriable: true,
                        originalError: fetchErr,
                    });
                }
                throw fetchErr;
            }
            finally {
                // Always clear the timer so a fast response doesn't leave a
                // pending abort firing into a completed request.
                clearTimeout(timeoutId);
            }
            if (!response.ok) {
                const errorData = await response
                    .json()
                    .catch(() => Object.create(null));
                const errorMessage = errorData.error?.message ||
                    `HTTP ${response.status}`;
                // Preserve HTTP status so the outer catch doesn't mark a permanent
                // 4xx (auth, bad input) as retriable and trigger pointless retry loops.
                const retriable = response.status === 408 ||
                    response.status === 429 ||
                    response.status >= 500;
                throw new TTSError({
                    code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                    message: errorMessage,
                    category: retriable ? ErrorCategory.NETWORK : ErrorCategory.EXECUTION,
                    severity: ErrorSeverity.HIGH,
                    retriable,
                    context: { status: response.status, model, responseFormat },
                });
            }
            const latency = Date.now() - startTime;
            // Get audio buffer
            const arrayBuffer = await response.arrayBuffer();
            const audioBuffer = Buffer.from(arrayBuffer);
            // Use the *effective* output format (post-mapFormat fallback), not the
            // requested format — otherwise mp3-coerced "m4a" requests would mislabel
            // the buffer and break consumer file-extension routing.
            const effectiveFormat = this.effectiveFormat(responseFormat);
            const result = {
                buffer: audioBuffer,
                format: effectiveFormat,
                size: audioBuffer.length,
                voice,
                sampleRate: this.getSampleRate(effectiveFormat),
                metadata: {
                    latency,
                    provider: "openai-tts",
                    model,
                    requestedFormat: options.format,
                    responseFormat,
                },
            };
            logger.info(`[OpenAITTSHandler] Synthesized ${audioBuffer.length} bytes in ${latency}ms`);
            return result;
        }
        catch (err) {
            // Re-throw domain errors untouched; wrap everything else so
            // callers always see a TTSError from this handler.
            if (err instanceof TTSError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[OpenAITTSHandler] Synthesis failed: ${errorMessage}`);
            throw new TTSError({
                code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                message: `Synthesis failed: ${errorMessage}`,
                category: ErrorCategory.EXECUTION,
                severity: ErrorSeverity.HIGH,
                retriable: true,
                context: { textLength: text.length },
                originalError: err instanceof Error ? err : undefined,
            });
        }
    }
    /**
     * Map TTSAudioFormat to OpenAI response_format.
     * OpenAI TTS supports: mp3, wav, opus (ogg maps to opus).
     * Unsupported formats are coerced to mp3 with a warning.
     *
     * @param {string} format - Requested TTSAudioFormat.
     * @returns {string} OpenAI `response_format` value.
     */
    mapFormat(format) {
        const formats = {
            mp3: "mp3",
            wav: "wav",
            ogg: "opus", // OpenAI uses opus for ogg
            opus: "opus",
            // OpenAI's "pcm" is raw 16-bit signed LE @ 24kHz (no header) — maps to
            // canonical pcm16 in TTSResult.format. See effectiveFormat() below.
            pcm16: "pcm",
        };
        const mapped = formats[format];
        if (mapped === undefined) {
            logger.warn(`[OpenAITTSHandler] Unsupported format "${format}" — falling back to "mp3". Supported formats: mp3, wav, ogg, opus, pcm16.`);
            return "mp3";
        }
        return mapped;
    }
    /**
     * Get sample rate for format
     *
     * @param {string} format - Effective output format.
     * @returns {number} Sample rate in Hz (48kHz for Opus, else 24kHz).
     */
    getSampleRate(format) {
        switch (format) {
            case "opus":
            case "ogg":
                return 48000;
            default:
                return 24000;
        }
    }
    /**
     * Map the OpenAI `response_format` string back to the canonical
     * `TTSAudioFormat` so `TTSResult.format` reflects what the API actually
     * returned (mapFormat() coerces unsupported requests to "mp3"). Note:
     * OpenAI returns Ogg-Opus for both "ogg" and "opus" requests — both
     * surface as "opus" since the bytes are an .ogg/Opus container.
     *
     * @param {string} responseFormat - Value sent as `response_format`.
     * @returns {string} Canonical TTSAudioFormat for the returned bytes.
     */
    effectiveFormat(responseFormat) {
        switch (responseFormat) {
            case "mp3":
                return "mp3";
            case "wav":
                return "wav";
            case "opus":
                return "opus";
            // Raw PCM (16-bit signed LE @ 24kHz, no header) — keep semantics in
            // TTSResult.format so consumers don't write raw bytes to a .wav file.
            case "pcm":
                return "pcm16";
            default:
                return "mp3";
        }
    }
}
|
|
271
|
+
//# sourceMappingURL=OpenAITTS.js.map
|