@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +382 -364
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/mcp/toolRegistry.js +7 -1
  20. package/dist/lib/neurolink.d.ts +19 -0
  21. package/dist/lib/neurolink.js +252 -14
  22. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  23. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  24. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  25. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  26. package/dist/lib/server/voice/tokenCompare.js +23 -0
  27. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  29. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  30. package/dist/lib/types/generate.d.ts +47 -0
  31. package/dist/lib/types/hitl.d.ts +3 -0
  32. package/dist/lib/types/index.d.ts +1 -1
  33. package/dist/lib/types/index.js +1 -1
  34. package/dist/lib/types/realtime.d.ts +243 -0
  35. package/dist/lib/types/realtime.js +70 -0
  36. package/dist/lib/types/server.d.ts +68 -0
  37. package/dist/lib/types/span.d.ts +2 -0
  38. package/dist/lib/types/span.js +2 -0
  39. package/dist/lib/types/stream.d.ts +36 -14
  40. package/dist/lib/types/stt.d.ts +585 -0
  41. package/dist/lib/types/stt.js +90 -0
  42. package/dist/lib/types/tools.d.ts +2 -0
  43. package/dist/lib/types/tts.d.ts +23 -11
  44. package/dist/lib/types/tts.js +7 -0
  45. package/dist/lib/types/voice.d.ts +272 -0
  46. package/dist/lib/types/voice.js +137 -0
  47. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  48. package/dist/lib/utils/audioFormatDetector.js +34 -0
  49. package/dist/lib/utils/errorHandling.js +4 -0
  50. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  51. package/dist/lib/utils/sttProcessor.js +295 -0
  52. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  53. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  54. package/dist/lib/voice/audio-utils.d.ts +135 -0
  55. package/dist/lib/voice/audio-utils.js +435 -0
  56. package/dist/lib/voice/errors.d.ts +123 -0
  57. package/dist/lib/voice/errors.js +386 -0
  58. package/dist/lib/voice/index.d.ts +26 -0
  59. package/dist/lib/voice/index.js +55 -0
  60. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  61. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  62. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  63. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  64. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  65. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  66. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  67. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  68. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  69. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  70. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  71. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  72. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  73. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  74. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  75. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  76. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  77. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  78. package/dist/lib/voice/stream-handler.d.ts +166 -0
  79. package/dist/lib/voice/stream-handler.js +514 -0
  80. package/dist/mcp/toolRegistry.js +7 -1
  81. package/dist/neurolink.d.ts +19 -0
  82. package/dist/neurolink.js +252 -14
  83. package/dist/observability/exporters/laminarExporter.js +1 -0
  84. package/dist/observability/exporters/posthogExporter.js +1 -0
  85. package/dist/observability/utils/spanSerializer.js +1 -0
  86. package/dist/server/voice/tokenCompare.d.ts +14 -0
  87. package/dist/server/voice/tokenCompare.js +22 -0
  88. package/dist/server/voice/voiceServerApp.js +62 -3
  89. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  90. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  91. package/dist/types/generate.d.ts +47 -0
  92. package/dist/types/hitl.d.ts +3 -0
  93. package/dist/types/index.d.ts +1 -1
  94. package/dist/types/index.js +1 -1
  95. package/dist/types/realtime.d.ts +243 -0
  96. package/dist/types/realtime.js +69 -0
  97. package/dist/types/server.d.ts +68 -0
  98. package/dist/types/span.d.ts +2 -0
  99. package/dist/types/span.js +2 -0
  100. package/dist/types/stream.d.ts +36 -14
  101. package/dist/types/stt.d.ts +585 -0
  102. package/dist/types/stt.js +89 -0
  103. package/dist/types/tools.d.ts +2 -0
  104. package/dist/types/tts.d.ts +23 -11
  105. package/dist/types/tts.js +7 -0
  106. package/dist/types/voice.d.ts +272 -0
  107. package/dist/types/voice.js +136 -0
  108. package/dist/utils/audioFormatDetector.d.ts +15 -0
  109. package/dist/utils/audioFormatDetector.js +33 -0
  110. package/dist/utils/errorHandling.js +4 -0
  111. package/dist/utils/sttProcessor.d.ts +115 -0
  112. package/dist/utils/sttProcessor.js +294 -0
  113. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  114. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  115. package/dist/voice/audio-utils.d.ts +135 -0
  116. package/dist/voice/audio-utils.js +434 -0
  117. package/dist/voice/errors.d.ts +123 -0
  118. package/dist/voice/errors.js +385 -0
  119. package/dist/voice/index.d.ts +26 -0
  120. package/dist/voice/index.js +54 -0
  121. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  122. package/dist/voice/providers/AzureSTT.js +344 -0
  123. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  124. package/dist/voice/providers/AzureTTS.js +348 -0
  125. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  126. package/dist/voice/providers/DeepgramSTT.js +549 -0
  127. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  128. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  129. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  130. package/dist/voice/providers/GeminiLive.js +371 -0
  131. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  132. package/dist/voice/providers/GoogleSTT.js +453 -0
  133. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  134. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  135. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  136. package/dist/voice/providers/OpenAISTT.js +285 -0
  137. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  138. package/dist/voice/providers/OpenAITTS.js +270 -0
  139. package/dist/voice/stream-handler.d.ts +166 -0
  140. package/dist/voice/stream-handler.js +513 -0
  141. package/package.json +5 -2
@@ -0,0 +1,285 @@
1
+ /**
2
+ * OpenAI Whisper Speech-to-Text Handler
3
+ *
4
+ * Implementation of STT using OpenAI's Whisper model.
5
+ *
6
+ * @module voice/providers/OpenAISTT
7
+ */
8
+ import { logger } from "../../utils/logger.js";
9
+ import { STTError } from "../errors.js";
10
+ /**
11
+ * OpenAI Whisper Speech-to-Text Handler
12
+ *
13
+ * Supports transcription and translation using OpenAI's Whisper model.
14
+ *
15
+ * @see https://platform.openai.com/docs/api-reference/audio
16
+ */
17
+ export class OpenAISTT {
18
+ apiKey;
19
+ baseUrl = "https://api.openai.com/v1";
20
+ /**
21
+ * Maximum audio duration in seconds (25 minutes)
22
+ */
23
+ maxAudioDuration = 25 * 60;
24
+ /**
25
+ * Whisper does not support streaming
26
+ */
27
+ supportsStreaming = false;
28
+ constructor(apiKey) {
29
+ const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
30
+ this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
31
+ }
32
+ isConfigured() {
33
+ return this.apiKey !== null;
34
+ }
35
+ getSupportedFormats() {
36
+ // OpenAI Whisper transcription API accepts: flac, m4a, mp3, mp4, mpeg,
37
+ // mpga, oga, ogg, opus, wav, webm. Keep this in sync with TTSAudioFormat
38
+ // — formats not listed in TTSAudioFormat are filtered out by the type.
39
+ return [
40
+ "mp3",
41
+ "wav",
42
+ "ogg",
43
+ "opus",
44
+ "m4a",
45
+ "flac",
46
+ "webm",
47
+ "mp4",
48
+ "mpeg",
49
+ "mpga",
50
+ ];
51
+ }
52
+ async getSupportedLanguages() {
53
+ // Whisper supports 100+ languages
54
+ // Return the most common ones
55
+ return [
56
+ {
57
+ code: "en",
58
+ name: "English",
59
+ supportsDiarization: false,
60
+ supportsPunctuation: true,
61
+ },
62
+ {
63
+ code: "es",
64
+ name: "Spanish",
65
+ supportsDiarization: false,
66
+ supportsPunctuation: true,
67
+ },
68
+ {
69
+ code: "fr",
70
+ name: "French",
71
+ supportsDiarization: false,
72
+ supportsPunctuation: true,
73
+ },
74
+ {
75
+ code: "de",
76
+ name: "German",
77
+ supportsDiarization: false,
78
+ supportsPunctuation: true,
79
+ },
80
+ {
81
+ code: "it",
82
+ name: "Italian",
83
+ supportsDiarization: false,
84
+ supportsPunctuation: true,
85
+ },
86
+ {
87
+ code: "pt",
88
+ name: "Portuguese",
89
+ supportsDiarization: false,
90
+ supportsPunctuation: true,
91
+ },
92
+ {
93
+ code: "ru",
94
+ name: "Russian",
95
+ supportsDiarization: false,
96
+ supportsPunctuation: true,
97
+ },
98
+ {
99
+ code: "ja",
100
+ name: "Japanese",
101
+ supportsDiarization: false,
102
+ supportsPunctuation: true,
103
+ },
104
+ {
105
+ code: "ko",
106
+ name: "Korean",
107
+ supportsDiarization: false,
108
+ supportsPunctuation: true,
109
+ },
110
+ {
111
+ code: "zh",
112
+ name: "Chinese",
113
+ supportsDiarization: false,
114
+ supportsPunctuation: true,
115
+ },
116
+ {
117
+ code: "ar",
118
+ name: "Arabic",
119
+ supportsDiarization: false,
120
+ supportsPunctuation: true,
121
+ },
122
+ {
123
+ code: "hi",
124
+ name: "Hindi",
125
+ supportsDiarization: false,
126
+ supportsPunctuation: true,
127
+ },
128
+ ];
129
+ }
130
+ async transcribe(audio, options = {}) {
131
+ if (!this.apiKey) {
132
+ throw STTError.providerNotConfigured("whisper");
133
+ }
134
+ const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
135
+ if (audioBuffer.length === 0) {
136
+ throw STTError.audioEmpty("whisper");
137
+ }
138
+ const whisperOptions = options;
139
+ const startTime = Date.now();
140
+ try {
141
+ // Prepare form data
142
+ const formData = new FormData();
143
+ // Add audio file - convert Buffer to Uint8Array for compatibility
144
+ const audioBlob = new Blob([new Uint8Array(audioBuffer)], {
145
+ type: this.getMimeType(options.format ?? "wav"),
146
+ });
147
+ formData.append("file", audioBlob, `audio.${options.format ?? "wav"}`);
148
+ // Add model
149
+ formData.append("model", whisperOptions.model ?? "whisper-1");
150
+ // Add optional parameters
151
+ if (options.language) {
152
+ formData.append("language", options.language);
153
+ }
154
+ if (whisperOptions.prompt) {
155
+ formData.append("prompt", whisperOptions.prompt);
156
+ }
157
+ if (whisperOptions.temperature !== undefined) {
158
+ formData.append("temperature", whisperOptions.temperature.toString());
159
+ }
160
+ // Request verbose_json for detailed response
161
+ const responseFormat = whisperOptions.responseFormat ?? "verbose_json";
162
+ formData.append("response_format", responseFormat);
163
+ // Add timestamp granularities for word-level timestamps
164
+ if (options.wordTimestamps && responseFormat === "verbose_json") {
165
+ formData.append("timestamp_granularities[]", "word");
166
+ formData.append("timestamp_granularities[]", "segment");
167
+ }
168
+ // Choose endpoint based on translation option
169
+ const endpoint = whisperOptions.translate
170
+ ? `${this.baseUrl}/audio/translations`
171
+ : `${this.baseUrl}/audio/transcriptions`;
172
+ const controller = new AbortController();
173
+ const timeoutId = setTimeout(() => controller.abort(), 30000);
174
+ let response;
175
+ try {
176
+ response = await fetch(endpoint, {
177
+ method: "POST",
178
+ headers: {
179
+ Authorization: `Bearer ${this.apiKey}`,
180
+ },
181
+ body: formData,
182
+ signal: controller.signal,
183
+ });
184
+ }
185
+ catch (fetchErr) {
186
+ if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
187
+ throw STTError.transcriptionFailed("OpenAI STT request timed out after 30 seconds", "whisper", fetchErr);
188
+ }
189
+ throw fetchErr;
190
+ }
191
+ finally {
192
+ clearTimeout(timeoutId);
193
+ }
194
+ if (!response.ok) {
195
+ const errorData = await response
196
+ .json()
197
+ .catch(() => Object.create(null));
198
+ const errorMessage = errorData.error?.message ||
199
+ `HTTP ${response.status}`;
200
+ throw STTError.transcriptionFailed(errorMessage, "whisper");
201
+ }
202
+ const latency = Date.now() - startTime;
203
+ // Parse response based on format
204
+ if (responseFormat === "text") {
205
+ const text = await response.text();
206
+ return {
207
+ text,
208
+ confidence: 0.95, // Whisper doesn't return confidence
209
+ metadata: {
210
+ latency,
211
+ provider: "whisper",
212
+ model: whisperOptions.model ?? "whisper-1",
213
+ },
214
+ };
215
+ }
216
+ const data = (await response.json());
217
+ // Build result
218
+ const result = {
219
+ text: data.text,
220
+ confidence: 0.95, // Whisper doesn't return per-result confidence
221
+ language: data.language,
222
+ duration: data.duration,
223
+ metadata: {
224
+ latency,
225
+ provider: "whisper",
226
+ model: whisperOptions.model ?? "whisper-1",
227
+ task: data.task,
228
+ },
229
+ };
230
+ // Add word timings if available
231
+ if (data.words && data.words.length > 0) {
232
+ result.words = data.words.map((word) => ({
233
+ word: word.word,
234
+ startTime: word.start,
235
+ endTime: word.end,
236
+ }));
237
+ }
238
+ // Add segments
239
+ if (data.segments && data.segments.length > 0) {
240
+ result.segments = data.segments.map((segment, index) => ({
241
+ index,
242
+ text: segment.text,
243
+ isFinal: true,
244
+ confidence: Math.exp(segment.avg_logprob), // Convert log prob to confidence
245
+ startTime: segment.start,
246
+ endTime: segment.end,
247
+ }));
248
+ }
249
+ logger.info(`[WhisperSTTHandler] Transcribed ${data.duration?.toFixed(1) ?? "?"}s audio in ${latency}ms`);
250
+ return result;
251
+ }
252
+ catch (err) {
253
+ if (err instanceof STTError) {
254
+ throw err;
255
+ }
256
+ const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
257
+ logger.error(`[WhisperSTTHandler] Transcription failed: ${errorMessage}`);
258
+ throw STTError.transcriptionFailed(errorMessage, "whisper", err instanceof Error ? err : undefined);
259
+ }
260
+ }
261
+ /**
262
+ * Get MIME type for audio format. Whisper auto-detects from headers, but
263
+ * sending a correct MIME helps providers / proxies that sniff Content-Type.
264
+ * Must stay aligned with `getSupportedFormats()`.
265
+ */
266
+ getMimeType(format) {
267
+ const mimeTypes = {
268
+ mp3: "audio/mpeg",
269
+ wav: "audio/wav",
270
+ ogg: "audio/ogg",
271
+ opus: "audio/opus",
272
+ m4a: "audio/mp4",
273
+ flac: "audio/flac",
274
+ webm: "audio/webm",
275
+ mp4: "audio/mp4",
276
+ mpeg: "audio/mpeg",
277
+ mpga: "audio/mpeg",
278
+ };
279
+ return mimeTypes[format] ?? "audio/wav";
280
+ }
281
+ }
282
+ // Export as named exports for compatibility
283
+ export { OpenAISTT as WhisperSTT };
284
+ export { OpenAISTT as WhisperSTTHandler };
285
+ export { OpenAISTT as OpenAISTTHandler };
@@ -0,0 +1,49 @@
1
+ /**
2
+ * OpenAI Text-to-Speech Handler
3
+ *
4
+ * Implementation of TTS using OpenAI's TTS API.
5
+ *
6
+ * @module voice/providers/OpenAITTS
7
+ */
8
+ import type { TTSHandler, TTSOptions, TTSResult, TTSVoice } from "../../types/index.js";
9
+ /**
10
+ * OpenAI Text-to-Speech Handler
11
+ *
12
+ * Supports high-quality neural TTS with multiple voices.
13
+ *
14
+ * @see https://platform.openai.com/docs/api-reference/audio/createSpeech
15
+ */
16
+ export declare class OpenAITTS implements TTSHandler {
17
+ private readonly apiKey;
18
+ private readonly baseUrl;
19
+ /**
20
+ * Maximum text length (4096 characters)
21
+ */
22
+ readonly maxTextLength = 4096;
23
+ /**
24
+ * Available voices
25
+ */
26
+ private static readonly VOICES;
27
+ constructor(apiKey?: string);
28
+ isConfigured(): boolean;
29
+ getVoices(languageCode?: string): Promise<TTSVoice[]>;
30
+ synthesize(text: string, options?: TTSOptions): Promise<TTSResult>;
31
+ /**
32
+ * Map TTSAudioFormat to OpenAI response_format.
33
+ * OpenAI TTS supports: mp3, wav, opus (ogg maps to opus).
34
+ * Unsupported formats are coerced to mp3 with a warning.
35
+ */
36
+ private mapFormat;
37
+ /**
38
+ * Get sample rate for format
39
+ */
40
+ private getSampleRate;
41
+ /**
42
+ * Map the OpenAI `response_format` string back to the canonical
43
+ * `TTSAudioFormat` so `TTSResult.format` reflects what the API actually
44
+ * returned (mapFormat() coerces unsupported requests to "mp3"). Note:
45
+ * OpenAI returns Ogg-Opus for both "ogg" and "opus" requests — both
46
+ * surface as "opus" since the bytes are an .ogg/Opus container.
47
+ */
48
+ private effectiveFormat;
49
+ }
@@ -0,0 +1,270 @@
1
+ /**
2
+ * OpenAI Text-to-Speech Handler
3
+ *
4
+ * Implementation of TTS using OpenAI's TTS API.
5
+ *
6
+ * @module voice/providers/OpenAITTS
7
+ */
8
+ import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
9
+ import { logger } from "../../utils/logger.js";
10
+ import { TTS_ERROR_CODES, TTSError } from "../../utils/ttsProcessor.js";
11
+ /**
12
+ * OpenAI Text-to-Speech Handler
13
+ *
14
+ * Supports high-quality neural TTS with multiple voices.
15
+ *
16
+ * @see https://platform.openai.com/docs/api-reference/audio/createSpeech
17
+ */
18
+ export class OpenAITTS {
19
+ apiKey;
20
+ baseUrl = "https://api.openai.com/v1";
21
+ /**
22
+ * Maximum text length (4096 characters)
23
+ */
24
+ maxTextLength = 4096;
25
+ /**
26
+ * Available voices
27
+ */
28
+ static VOICES = [
29
+ {
30
+ id: "alloy",
31
+ name: "Alloy",
32
+ languageCode: "en",
33
+ languageCodes: ["en"],
34
+ gender: "neutral",
35
+ type: "neural",
36
+ },
37
+ {
38
+ id: "echo",
39
+ name: "Echo",
40
+ languageCode: "en",
41
+ languageCodes: ["en"],
42
+ gender: "male",
43
+ type: "neural",
44
+ },
45
+ {
46
+ id: "fable",
47
+ name: "Fable",
48
+ languageCode: "en",
49
+ languageCodes: ["en"],
50
+ gender: "neutral",
51
+ type: "neural",
52
+ },
53
+ {
54
+ id: "onyx",
55
+ name: "Onyx",
56
+ languageCode: "en",
57
+ languageCodes: ["en"],
58
+ gender: "male",
59
+ type: "neural",
60
+ },
61
+ {
62
+ id: "nova",
63
+ name: "Nova",
64
+ languageCode: "en",
65
+ languageCodes: ["en"],
66
+ gender: "female",
67
+ type: "neural",
68
+ },
69
+ {
70
+ id: "shimmer",
71
+ name: "Shimmer",
72
+ languageCode: "en",
73
+ languageCodes: ["en"],
74
+ gender: "female",
75
+ type: "neural",
76
+ },
77
+ ];
78
+ constructor(apiKey) {
79
+ const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
80
+ this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
81
+ }
82
+ isConfigured() {
83
+ return this.apiKey !== null;
84
+ }
85
+ async getVoices(languageCode) {
86
+ // OpenAI voices are pre-defined, filter by language if provided
87
+ if (languageCode && !languageCode.startsWith("en")) {
88
+ // OpenAI TTS works with multiple languages but voices are English-named
89
+ return OpenAITTS.VOICES;
90
+ }
91
+ return OpenAITTS.VOICES;
92
+ }
93
+ async synthesize(text, options = {}) {
94
+ if (!this.apiKey) {
95
+ throw new TTSError({
96
+ code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
97
+ message: "OpenAI TTS API key not configured",
98
+ category: ErrorCategory.CONFIGURATION,
99
+ severity: ErrorSeverity.HIGH,
100
+ retriable: false,
101
+ });
102
+ }
103
+ const startTime = Date.now();
104
+ const openaiOptions = options;
105
+ try {
106
+ // Determine model based on quality
107
+ const model = openaiOptions.model ??
108
+ (options.quality === "hd" ? "tts-1-hd" : "tts-1");
109
+ // Determine voice
110
+ const voice = options.voice ?? "alloy";
111
+ // Determine format
112
+ const responseFormat = this.mapFormat(options.format ?? "mp3");
113
+ // Build request
114
+ const requestBody = {
115
+ model,
116
+ input: text,
117
+ voice,
118
+ response_format: responseFormat,
119
+ speed: options.speed ?? 1.0,
120
+ };
121
+ const controller = new AbortController();
122
+ const timeoutId = setTimeout(() => controller.abort(), 30000);
123
+ let response;
124
+ try {
125
+ response = await fetch(`${this.baseUrl}/audio/speech`, {
126
+ method: "POST",
127
+ headers: {
128
+ Authorization: `Bearer ${this.apiKey}`,
129
+ "Content-Type": "application/json",
130
+ },
131
+ body: JSON.stringify(requestBody),
132
+ signal: controller.signal,
133
+ });
134
+ }
135
+ catch (fetchErr) {
136
+ if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
137
+ throw new TTSError({
138
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
139
+ message: "OpenAI TTS request timed out after 30 seconds",
140
+ category: ErrorCategory.NETWORK,
141
+ severity: ErrorSeverity.HIGH,
142
+ retriable: true,
143
+ originalError: fetchErr,
144
+ });
145
+ }
146
+ throw fetchErr;
147
+ }
148
+ finally {
149
+ clearTimeout(timeoutId);
150
+ }
151
+ if (!response.ok) {
152
+ const errorData = await response
153
+ .json()
154
+ .catch(() => Object.create(null));
155
+ const errorMessage = errorData.error?.message ||
156
+ `HTTP ${response.status}`;
157
+ // Preserve HTTP status so the outer catch doesn't mark a permanent
158
+ // 4xx (auth, bad input) as retriable and trigger pointless retry loops.
159
+ const retriable = response.status === 408 ||
160
+ response.status === 429 ||
161
+ response.status >= 500;
162
+ throw new TTSError({
163
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
164
+ message: errorMessage,
165
+ category: retriable ? ErrorCategory.NETWORK : ErrorCategory.EXECUTION,
166
+ severity: ErrorSeverity.HIGH,
167
+ retriable,
168
+ context: { status: response.status, model, responseFormat },
169
+ });
170
+ }
171
+ const latency = Date.now() - startTime;
172
+ // Get audio buffer
173
+ const arrayBuffer = await response.arrayBuffer();
174
+ const audioBuffer = Buffer.from(arrayBuffer);
175
+ // Use the *effective* output format (post-mapFormat fallback), not the
176
+ // requested format — otherwise mp3-coerced "m4a" requests would mislabel
177
+ // the buffer and break consumer file-extension routing.
178
+ const effectiveFormat = this.effectiveFormat(responseFormat);
179
+ const result = {
180
+ buffer: audioBuffer,
181
+ format: effectiveFormat,
182
+ size: audioBuffer.length,
183
+ voice,
184
+ sampleRate: this.getSampleRate(effectiveFormat),
185
+ metadata: {
186
+ latency,
187
+ provider: "openai-tts",
188
+ model,
189
+ requestedFormat: options.format,
190
+ responseFormat,
191
+ },
192
+ };
193
+ logger.info(`[OpenAITTSHandler] Synthesized ${audioBuffer.length} bytes in ${latency}ms`);
194
+ return result;
195
+ }
196
+ catch (err) {
197
+ if (err instanceof TTSError) {
198
+ throw err;
199
+ }
200
+ const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
201
+ logger.error(`[OpenAITTSHandler] Synthesis failed: ${errorMessage}`);
202
+ throw new TTSError({
203
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
204
+ message: `Synthesis failed: ${errorMessage}`,
205
+ category: ErrorCategory.EXECUTION,
206
+ severity: ErrorSeverity.HIGH,
207
+ retriable: true,
208
+ context: { textLength: text.length },
209
+ originalError: err instanceof Error ? err : undefined,
210
+ });
211
+ }
212
+ }
213
+ /**
214
+ * Map TTSAudioFormat to OpenAI response_format.
215
+ * OpenAI TTS supports: mp3, wav, opus (ogg maps to opus).
216
+ * Unsupported formats are coerced to mp3 with a warning.
217
+ */
218
+ mapFormat(format) {
219
+ const formats = {
220
+ mp3: "mp3",
221
+ wav: "wav",
222
+ ogg: "opus", // OpenAI uses opus for ogg
223
+ opus: "opus",
224
+ // OpenAI's "pcm" is raw 16-bit signed LE @ 24kHz (no header) — maps to
225
+ // canonical pcm16 in TTSResult.format. See effectiveFormat() below.
226
+ pcm16: "pcm",
227
+ };
228
+ const mapped = formats[format];
229
+ if (mapped === undefined) {
230
+ logger.warn(`[OpenAITTSHandler] Unsupported format "${format}" — falling back to "mp3". Supported formats: mp3, wav, ogg, opus, pcm16.`);
231
+ return "mp3";
232
+ }
233
+ return mapped;
234
+ }
235
+ /**
236
+ * Get sample rate for format
237
+ */
238
+ getSampleRate(format) {
239
+ switch (format) {
240
+ case "opus":
241
+ case "ogg":
242
+ return 48000;
243
+ default:
244
+ return 24000;
245
+ }
246
+ }
247
+ /**
248
+ * Map the OpenAI `response_format` string back to the canonical
249
+ * `TTSAudioFormat` so `TTSResult.format` reflects what the API actually
250
+ * returned (mapFormat() coerces unsupported requests to "mp3"). Note:
251
+ * OpenAI returns Ogg-Opus for both "ogg" and "opus" requests — both
252
+ * surface as "opus" since the bytes are an .ogg/Opus container.
253
+ */
254
+ effectiveFormat(responseFormat) {
255
+ switch (responseFormat) {
256
+ case "mp3":
257
+ return "mp3";
258
+ case "wav":
259
+ return "wav";
260
+ case "opus":
261
+ return "opus";
262
+ // Raw PCM (16-bit signed LE @ 24kHz, no header) — keep semantics in
263
+ // TTSResult.format so consumers don't write raw bytes to a .wav file.
264
+ case "pcm":
265
+ return "pcm16";
266
+ default:
267
+ return "mp3";
268
+ }
269
+ }
270
+ }