@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ElevenLabs Text-to-Speech Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of TTS using ElevenLabs API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/ElevenLabsTTS
|
|
7
|
+
*/
|
|
8
|
+
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
|
|
9
|
+
import { logger } from "../../utils/logger.js";
|
|
10
|
+
import { TTS_ERROR_CODES, TTSError } from "../../utils/ttsProcessor.js";
|
|
11
|
+
/**
|
|
12
|
+
* ElevenLabs Text-to-Speech Handler
|
|
13
|
+
*
|
|
14
|
+
* Supports high-quality multilingual TTS with voice cloning.
|
|
15
|
+
*
|
|
16
|
+
* @see https://elevenlabs.io/docs/api-reference
|
|
17
|
+
*/
|
|
18
|
+
export class ElevenLabsTTS {
    /** Trimmed API key, or `null` when neither the constructor argument nor `ELEVENLABS_API_KEY` yields a non-empty value. */
    apiKey;
    /** Root URL every request path is appended to. */
    baseUrl = "https://api.elevenlabs.io/v1";
    /** `{ voices, timestamp }` for the last fetched (unfiltered) voice catalog, or `null` before the first fetch. */
    voicesCache = null;
    static CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
    /** Abort budget for one HTTP exchange, including reading the response body. */
    static REQUEST_TIMEOUT_MS = 30000;
    /** Output format used when the requested format has no mapping. */
    static DEFAULT_OUTPUT_FORMAT = "mp3_44100_128";
    /**
     * Requested audio format -> ElevenLabs `output_format` query value.
     * A Map (rather than an object literal) so that keys such as
     * "constructor" or "toString" cannot resolve to inherited members.
     *
     * NOTE(review): confirm that "ogg_22050" is an `output_format` value the
     * ElevenLabs API actually accepts; it is kept unchanged here.
     */
    static OUTPUT_FORMATS = new Map([
        ["mp3", "mp3_44100_128"],
        ["wav", "pcm_44100"],
        ["ogg", "ogg_22050"],
        ["opus", "ogg_22050"],
    ]);
    /** Language list attached to every voice (ElevenLabs supports multiple languages per voice). */
    static VOICE_LANGUAGE_CODES = [
        "en",
        "es",
        "fr",
        "de",
        "it",
        "pt",
        "pl",
        "hi",
        "ar",
        "zh",
        "ja",
        "ko",
    ];
    /**
     * Maximum text length (5000 characters)
     */
    maxTextLength = 5000;
    /**
     * @param apiKey Optional API key. Falls back to `process.env.ELEVENLABS_API_KEY`
     *   when `null`/`undefined`. Whitespace-only values count as "not configured".
     */
    constructor(apiKey) {
        const resolvedKey = (apiKey ?? process.env.ELEVENLABS_API_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }
    /**
     * @returns `true` when a non-empty API key was resolved at construction time.
     */
    isConfigured() {
        return this.apiKey !== null;
    }
    /**
     * List available voices, optionally restricted to a language.
     *
     * The unfiltered catalog is cached for `CACHE_TTL_MS`; language filtering
     * is applied locally to that catalog, so filtered calls reuse the cache too.
     *
     * @param languageCode Optional language tag (e.g. "en" or "en-US"). A voice
     *   matches when the primary subtag of one of its language codes equals the
     *   primary subtag of the requested tag (case-insensitive).
     * @returns A fresh array of voice descriptors (safe for the caller to mutate).
     * @throws TTSError `PROVIDER_NOT_CONFIGURED` when no API key is set;
     *   `SYNTHESIS_FAILED` on timeout, HTTP error, or malformed response.
     */
    async getVoices(languageCode) {
        if (!this.apiKey) {
            throw this.notConfiguredError();
        }
        try {
            const catalog = await this.loadVoiceCatalog();
            if (!languageCode) {
                // Copy so callers cannot mutate the cached array.
                return [...catalog];
            }
            const requestedBase = languageCode.toLowerCase().split("-")[0];
            return catalog.filter((voice) => voice.languageCodes?.some((code) => code.toLowerCase().split("-")[0] === requestedBase));
        }
        catch (err) {
            // Already-classified errors (e.g. the timeout) must not be re-wrapped.
            if (err instanceof TTSError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[ElevenLabsTTSHandler] Failed to get voices: ${errorMessage}`);
            throw new TTSError({
                code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                message: `Failed to get voices: ${errorMessage}`,
                category: ErrorCategory.NETWORK,
                severity: ErrorSeverity.MEDIUM,
                retriable: true,
                originalError: err instanceof Error ? err : undefined,
            });
        }
    }
    /**
     * Convert text to audio through `POST /text-to-speech/{voiceId}`.
     *
     * @param text Text to synthesize.
     * @param options Optional settings read by this method: `voice`, `model`,
     *   `format`, `stability`, `similarityBoost`, `style`, `useSpeakerBoost`.
     * @returns Result object with the audio `buffer`, the effective `format`,
     *   `size`, `voice`, `sampleRate` and request `metadata`.
     * @throws TTSError `PROVIDER_NOT_CONFIGURED` when no API key is set;
     *   `SYNTHESIS_FAILED` on timeout, HTTP error, or any other failure.
     */
    async synthesize(text, options = {}) {
        if (!this.apiKey) {
            throw this.notConfiguredError();
        }
        const startTime = Date.now();
        try {
            // Get voice ID (use default if not specified)
            const voiceId = options.voice ?? "21m00Tcm4TlvDq8ikWAM"; // Rachel voice as default
            // Determine model
            const model = options.model ?? "eleven_multilingual_v2";
            // Build request body
            const requestBody = {
                text,
                model_id: model,
                voice_settings: {
                    stability: options.stability ?? 0.5,
                    similarity_boost: options.similarityBoost ?? 0.75,
                    style: options.style ?? 0.0,
                    use_speaker_boost: options.useSpeakerBoost ?? true,
                },
            };
            // Determine output format
            const outputFormat = this.mapFormat(options.format ?? "mp3");
            // The voice id is caller-supplied: encode it so it can never alter
            // the path or query of the request URL.
            const url = `${this.baseUrl}/text-to-speech/${encodeURIComponent(voiceId)}?output_format=${outputFormat}`;
            const { audioBuffer, latency } = await this.withTimeout(async (signal) => {
                const response = await fetch(url, {
                    method: "POST",
                    headers: {
                        "xi-api-key": this.apiKey,
                        "Content-Type": "application/json",
                    },
                    body: JSON.stringify(requestBody),
                    signal,
                });
                if (!response.ok) {
                    throw new Error(await this.readErrorMessage(response));
                }
                // Latency is the time until the response headers arrived,
                // i.e. it excludes downloading the audio body.
                const elapsed = Date.now() - startTime;
                const arrayBuffer = await response.arrayBuffer();
                return { audioBuffer: Buffer.from(arrayBuffer), latency: elapsed };
            }, "ElevenLabs TTS request timed out after 30 seconds", ErrorSeverity.HIGH);
            const result = {
                buffer: audioBuffer,
                // Report the *effective* output format, not the requested one:
                // unsupported requests fall back to mp3_44100_128 and must not
                // be mislabeled.
                format: this.effectiveFormat(outputFormat),
                size: audioBuffer.length,
                voice: voiceId,
                sampleRate: this.getSampleRate(outputFormat),
                metadata: {
                    latency,
                    provider: "elevenlabs-tts",
                    model,
                    requestedFormat: options.format,
                    outputFormat,
                },
            };
            logger.info(`[ElevenLabsTTSHandler] Synthesized ${audioBuffer.length} bytes in ${latency}ms`);
            return result;
        }
        catch (err) {
            if (err instanceof TTSError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[ElevenLabsTTSHandler] Synthesis failed: ${errorMessage}`);
            throw new TTSError({
                code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                message: `Synthesis failed: ${errorMessage}`,
                category: ErrorCategory.EXECUTION,
                severity: ErrorSeverity.HIGH,
                retriable: true,
                // Guarded so a non-string `text` cannot throw here and mask the real error.
                context: { textLength: typeof text === "string" ? text.length : 0 },
                originalError: err instanceof Error ? err : undefined,
            });
        }
    }
    /**
     * Build the error thrown by every public call made without an API key.
     */
    notConfiguredError() {
        return new TTSError({
            code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
            message: "ElevenLabs API key not configured",
            category: ErrorCategory.CONFIGURATION,
            severity: ErrorSeverity.HIGH,
            retriable: false,
        });
    }
    /**
     * Run `request` with an AbortSignal that fires after `REQUEST_TIMEOUT_MS`.
     * The timer stays armed until `request` settles, so reading the response
     * body is covered by the timeout as well as the initial fetch.
     *
     * @param request Callback receiving the AbortSignal to pass to `fetch`.
     * @param timeoutMessage Message of the TTSError raised on abort.
     * @param severity Severity of the TTSError raised on abort.
     * @returns Whatever `request` resolves to.
     * @throws TTSError `SYNTHESIS_FAILED` (retriable) on abort; any other error is rethrown untouched.
     */
    async withTimeout(request, timeoutMessage, severity) {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), ElevenLabsTTS.REQUEST_TIMEOUT_MS);
        try {
            return await request(controller.signal);
        }
        catch (err) {
            if (err instanceof Error && err.name === "AbortError") {
                throw new TTSError({
                    code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
                    message: timeoutMessage,
                    category: ErrorCategory.NETWORK,
                    severity,
                    retriable: true,
                    originalError: err,
                });
            }
            throw err;
        }
        finally {
            clearTimeout(timer);
        }
    }
    /**
     * Return the unfiltered voice catalog, from cache while it is younger than
     * `CACHE_TTL_MS`, otherwise from `GET /voices` (refreshing the cache).
     */
    async loadVoiceCatalog() {
        const cached = this.voicesCache;
        if (cached && Date.now() - cached.timestamp < ElevenLabsTTS.CACHE_TTL_MS) {
            return cached.voices;
        }
        const data = await this.withTimeout(async (signal) => {
            const response = await fetch(`${this.baseUrl}/voices`, {
                method: "GET",
                headers: {
                    "xi-api-key": this.apiKey,
                },
                signal,
            });
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}`);
            }
            return await response.json();
        }, "ElevenLabs voices request timed out after 30 seconds", ErrorSeverity.MEDIUM);
        if (!Array.isArray(data?.voices)) {
            throw new Error("Unexpected response: missing voices array");
        }
        const voices = data.voices.map((voice) => ({
            id: voice.voice_id,
            name: voice.name,
            languageCode: "en", // ElevenLabs supports multiple languages per voice
            languageCodes: [...ElevenLabsTTS.VOICE_LANGUAGE_CODES],
            gender: this.mapGender(voice.labels?.gender),
            type: "neural",
            description: voice.labels?.description,
        }));
        this.voicesCache = { voices, timestamp: Date.now() };
        return voices;
    }
    /**
     * Extract a human-readable message from a non-OK response.
     * Uses `detail` when it is a string, `detail.message` when present, and
     * falls back to `HTTP <status>` (also when the body is not valid JSON).
     */
    async readErrorMessage(response) {
        const payload = await response.json().catch(() => null);
        const detail = payload?.detail;
        if (typeof detail === "string" && detail.length > 0) {
            return detail;
        }
        return detail?.message || `HTTP ${response.status}`;
    }
    /**
     * Map gender string to standard type
     *
     * @param gender Free-form gender label; may be missing.
     * @returns "male", "female", or "neutral" (the default for missing or unrecognized labels).
     */
    mapGender(gender) {
        if (!gender) {
            return "neutral";
        }
        const lower = gender.toLowerCase();
        // "female" contains "male", so it has to be tested first.
        if (lower.includes("female")) {
            return "female";
        }
        if (lower.includes("male")) {
            return "male";
        }
        return "neutral";
    }
    /**
     * Map TTSAudioFormat to ElevenLabs output format
     *
     * @param format Requested format name ("mp3", "wav", "ogg", "opus").
     * @returns The ElevenLabs `output_format` value; "mp3_44100_128" for anything unmapped.
     */
    mapFormat(format) {
        return (ElevenLabsTTS.OUTPUT_FORMATS.get(format) ??
            ElevenLabsTTS.DEFAULT_OUTPUT_FORMAT);
    }
    /**
     * Get sample rate from format string
     *
     * @param format ElevenLabs `output_format` value.
     * @returns 44100, 22050 or 24000 when the string contains that number; 44100 otherwise.
     */
    getSampleRate(format) {
        for (const rate of [44100, 22050, 24000]) {
            if (format.includes(String(rate))) {
                return rate;
            }
        }
        return 44100;
    }
    /**
     * Map the ElevenLabs `output_format` string back to a canonical
     * TTSAudioFormat. mapFormat() falls back to mp3_44100_128 for unsupported
     * inputs, so this is needed to keep TTSResult.format honest.
     *
     * NOTE: ElevenLabs `pcm_*` outputs are RAW 16-bit signed-LE PCM samples
     * with no RIFF/WAV header. They are surfaced as `pcm16`; labeling them
     * `wav` would make consumers that write a `.wav` file or feed a WAV parser
     * produce unplayable output.
     *
     * @param outputFormat ElevenLabs `output_format` value.
     * @returns "mp3", "pcm16" or "opus"; "mp3" for unrecognized prefixes.
     */
    effectiveFormat(outputFormat) {
        if (outputFormat.startsWith("pcm")) {
            return "pcm16";
        }
        if (outputFormat.startsWith("ogg")) {
            return "opus";
        }
        // Covers the "mp3*" prefix as well as the unknown-format fallback.
        return "mp3";
    }
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Gemini Live Voice API Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of bidirectional voice communication using Gemini's Live API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/GeminiLive
|
|
7
|
+
*/
|
|
8
|
+
import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js";
|
|
9
|
+
import type { TTSAudioFormat, RealtimeAudioChunk, RealtimeConfig, RealtimeSession } from "../../types/index.js";
|
|
10
|
+
/**
|
|
11
|
+
* Google Gemini Live Voice API Handler
|
|
12
|
+
*
|
|
13
|
+
* Implements bidirectional voice communication with Gemini's Live API.
|
|
14
|
+
*
|
|
15
|
+
* @see https://ai.google.dev/gemini-api/docs/live
|
|
16
|
+
*/
|
|
17
|
+
export declare class GeminiLive extends BaseRealtimeHandler {
    /** Handler identifier; typed as the literal "gemini-live". */
    readonly name = "gemini-live";

    // ---- internal state (types are not part of the public surface) ----
    private readonly apiKey;
    private ws;
    private audioChunkIndex;
    private pendingFunctionCalls;

    /**
     * @param apiKey Optional API key.
     *   NOTE(review): any environment-variable fallback lives in the
     *   implementation and is not visible from this declaration — confirm there.
     */
    constructor(apiKey?: string);

    /** Reports whether the handler considers itself configured. */
    isConfigured(): boolean;

    /** Lists the audio formats this handler reports as supported. */
    getSupportedFormats(): TTSAudioFormat[];

    /**
     * Opens a session using the given realtime configuration.
     * @returns A promise resolving to the session descriptor.
     */
    connect(config: RealtimeConfig): Promise<RealtimeSession>;

    /** Ends the current session; resolves once done. */
    disconnect(): Promise<void>;

    /**
     * Sends audio to the model.
     * @param audio Either a raw `Buffer` or a `RealtimeAudioChunk`.
     */
    sendAudio(audio: Buffer | RealtimeAudioChunk): Promise<void>;

    /** Sends a text message to the model. */
    sendText(text: string): Promise<void>;

    /** Asks the model to start producing a response. */
    triggerResponse(): Promise<void>;

    /** Cancels the response currently in progress. */
    cancelResponse(): Promise<void>;

    /** Sends the setup message carrying the session configuration. */
    private sendSetup;

    /** Waits for the setup-complete message. */
    private waitForSetupComplete;

    /** Handles incoming WebSocket messages. */
    private handleMessage;

    /** Derives the audio format from a MIME type. */
    private parseAudioFormat;

    /** Handles a function call issued by the model. */
    private handleFunctionCall;
}
|